In [2]:
import os
from pathlib import Path
import cv2
import numpy as np
import fitz  # PyMuPDF
from ultralytics import YOLO
from PIL import Image
import json
import time  # for timing

# ---------------------- PATHS ----------------------
ROOT_DIR = Path("/home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon")

MODEL_PATH = ROOT_DIR / "models" / "best_yolo11_merged_4datasets.pt"
# IMPORTANT: make PDF_FOLDER a Path, not a string
PDF_FOLDER = ROOT_DIR / "data" / "initial" / "pdfs-20251115T105755Z-1-001" / "pdfs"

OUTPUT_FOLDER = ROOT_DIR / "data/initial/pdfs-20251115T092645Z-1-001"
OUTPUT_IMAGE_FOLDER = OUTPUT_FOLDER / "output_images"
OUTPUT_JSON_FOLDER  = OUTPUT_FOLDER / "output_json"
OUTPUT_PDF_FOLDER   = OUTPUT_FOLDER / "output_annotated_pdfs"

# Create output dirs if not exist
OUTPUT_IMAGE_FOLDER.mkdir(parents=True, exist_ok=True)
OUTPUT_JSON_FOLDER.mkdir(parents=True, exist_ok=True)
OUTPUT_PDF_FOLDER.mkdir(parents=True, exist_ok=True)

# ---------------------- STRESS SETTINGS ----------------------
# How many times to process the whole PDF folder
STRESS_RUNS = 3   # set to 1 for normal run, >1 for stress test

# ---------------------- MODEL ----------------------
model = YOLO(str(MODEL_PATH))
CLASS_NAMES = model.names  # e.g. {0: "signature", 1: "stamp", ...}
print("Loaded model with classes:", CLASS_NAMES)


# ---------------------- HELPERS ----------------------
def pdf_to_images(pdf_path: Path, zoom: float = 2.0):
    """
    Convert a PDF into a list of (page_index, image_bgr) with given zoom.
    """
    doc = fitz.open(str(pdf_path))
    images = []
    for page_index in range(len(doc)):
        page = doc.load_page(page_index)
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)
        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
        if pix.n == 4:
            img = cv2.cvtColor(img, cv2.COLOR_BGRA2BGR)
        else:
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        images.append((page_index, img))
    doc.close()
    return images


def preprocess_image(img_bgr, max_width=2000):
    """
    Downscale image if too wide, keeping aspect ratio.
    """
    h, w = img_bgr.shape[:2]
    if w > max_width:
        scale = max_width / w
        img_bgr = cv2.resize(
            img_bgr,
            (int(w * scale), int(h * scale)),
            interpolation=cv2.INTER_AREA
        )
    return img_bgr


# ---------------------- OVERALL TIMER ----------------------
overall_start = time.perf_counter()

# ===== MAIN STRESS LOOP: repeat whole batch STRESS_RUNS times =====
for run_idx in range(STRESS_RUNS):
    print(f"\n========== STRESS RUN {run_idx + 1} / {STRESS_RUNS} ==========")

    # Per-run storage
    selected = {}    # { "file.pdf": { "page_1": {...}, ... }, ... }
    file_times = []  # list of (filename, seconds)

    run_start = time.perf_counter()

    # ---------------------- MAIN LOOP OVER PDFs ----------------------
    for pdf_file in PDF_FOLDER.glob("*.pdf"):

        file_start = time.perf_counter()     # Start timer for this file
        print(f"\nProcessing PDF: {pdf_file.name}")

        pdf_key = pdf_file.name
        pages = pdf_to_images(pdf_file, zoom=2.0)

        annotated_image_paths = []

        for (page_idx, img_bgr) in pages:
            orig_h, orig_w = img_bgr.shape[:2]

            page_key = f"page_{page_idx + 1}"
            page_size = {"width": orig_w, "height": orig_h}

            img_bgr_proc = preprocess_image(img_bgr, max_width=2000)

            results = model.predict(
                source=img_bgr_proc,
                imgsz=1024,
                conf=0.25,
                iou=0.45,
                verbose=False
            )[0]

            # Build annotations
            sel_ann = []

            if results.boxes is not None:
                boxes = results.boxes.xyxy.cpu().numpy()
                cls_ids = results.boxes.cls.cpu().numpy().astype(int)

                for i, (box, cls_id) in enumerate(zip(boxes, cls_ids)):
                    x1, y1, x2, y2 = box.tolist()
                    width  = float(x2 - x1)
                    height = float(y2 - y1)
                    area   = width * height

                    category = CLASS_NAMES[cls_id]
                    ann_id = f"annotation_{page_idx * 1000 + i + 1}"

                    sel_ann.append({
                        ann_id: {
                            "category": category,
                            "bbox": {
                                "x": float(x1),
                                "y": float(y1),
                                "width": width,
                                "height": height
                            },
                            "area": area
                        }
                    })

            selected.setdefault(pdf_key, {})[page_key] = {
                "annotations": sel_ann,
                "page_size": page_size
            }

            # Save annotated JPG (will be overwritten each run, which is fine for stress)
            vis = results.plot()
            out_img_path = OUTPUT_IMAGE_FOLDER / f"{pdf_file.stem}_page{page_idx + 1}.jpg"
            cv2.imwrite(str(out_img_path), vis)
            annotated_image_paths.append(out_img_path)

            print(f"  Saved image: {out_img_path.name}")

        # Save annotated PDF (also overwritten each run)
        if annotated_image_paths:
            imgs = [Image.open(str(p)) for p in annotated_image_paths]
            out_pdf_path = OUTPUT_PDF_FOLDER / f"{pdf_file.stem}_annotated.pdf"
            imgs[0].save(str(out_pdf_path), save_all=True, append_images=imgs[1:])
            print(f"  Saved annotated PDF: {out_pdf_path}")

        # End timing for this file
        duration = time.perf_counter() - file_start
        file_times.append((pdf_file.name, duration))

        print(f"Finished {pdf_file.name} in {duration:.2f} seconds")

    # ---------------------- SAVE JSON FOR THIS RUN ----------------------
    json_out_path = OUTPUT_JSON_FOLDER / f"output_selected_annotations_run{run_idx + 1}.json"
    with open(json_out_path, "w", encoding="utf-8") as f:
        json.dump(selected, f, ensure_ascii=False, indent=2)

    run_duration = time.perf_counter() - run_start

    # ---------------------- SUMMARY FOR THIS RUN ----------------------
    print("\n================== RUN SUMMARY ==================")
    print(f"Run {run_idx + 1} / {STRESS_RUNS}")
    print("Processed files:")
    for i, (filename, seconds) in enumerate(file_times, start=1):
        print(f"{i}. {filename} → {seconds:.2f} seconds")

    print(f"\nRun {run_idx + 1} total time: {run_duration:.2f} seconds")
    print(f"Selected annotations JSON saved to: {json_out_path}")
    print("=================================================\n")

# ---------------------- OVERALL SUMMARY ----------------------
overall_duration = time.perf_counter() - overall_start
print(f"ALL RUNS FINISHED. Total wall-clock time for {STRESS_RUNS} run(s): {overall_duration:.2f} seconds")

Loaded model with classes: {0: 'signature', 1: 'stamp', 2: 'qr'}


Processing PDF: ТУ-31-.pdf
  Saved image: ТУ-31-_page1.jpg
  Saved image: ТУ-31-_page2.jpg
  Saved annotated PDF: /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/initial/pdfs-20251115T092645Z-1-001/output_annotated_pdfs/ТУ-31-_annotated.pdf
Finished ТУ-31-.pdf in 1.14 seconds

Processing PDF: АПЗ-2.pdf
  Saved image: АПЗ-2_page1.jpg
  Saved image: АПЗ-2_page2.jpg
  Saved image: АПЗ-2_page3.jpg
  Saved image: АПЗ-2_page4.jpg
  Saved image: АПЗ-2_page5.jpg
  Saved image: АПЗ-2_page6.jpg
  Saved image: АПЗ-2_page7.jpg
  Saved image: АПЗ-2_page8.jpg
  Saved image: АПЗ-2_page9.jpg
  Saved annotated PDF: /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/initial/pdfs-20251115T092645Z-1-001/output_annotated_pdfs/АПЗ-2_annotated.pdf
Finished АПЗ-2.pdf in 1.50 seconds

Processing PDF: Локалсмета-2.pdf
  Saved image: Локалсмета-2_page1.jpg
  Saved image: Локалсмета-2_page2.jpg
  Saved image: Локалсмета-2_page3.jpg
  Saved 