In [1]:
# Cell: Process ALL PDFs in a folder → annotated PDFs + global JSONs

import os
from pathlib import Path
import cv2
import numpy as np
import fitz
from PIL import Image
from ultralytics import YOLO
import json

# ----------------------------
# Paths and model
# ----------------------------
# Everything is resolved relative to the project root.
ROOT_DIR = Path("/home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon")
MODEL_PATH = ROOT_DIR / "models" / "best_yolo11_merged_4datasets.pt"

# Input PDFs are read from PDF_DIR; all artifacts are written below OUTPUT_FOLDER.
PDF_DIR = ROOT_DIR / "data/extra/pdfs"
OUTPUT_FOLDER = ROOT_DIR / "data/extra"
OUTPUT_IMAGE_FOLDER = OUTPUT_FOLDER / "output_images"       # one annotated image per page
OUTPUT_JSON_FOLDER  = OUTPUT_FOLDER / "output_json"         # JSON files covering all PDFs
ANNOTATED_PDF_FOLDER = OUTPUT_FOLDER / "annotated_pdfs"     # merged annotated PDFs

# Make sure every output directory exists before anything is written.
for _out_dir in (OUTPUT_IMAGE_FOLDER, OUTPUT_JSON_FOLDER, ANNOTATED_PDF_FOLDER):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Load the detector once; CLASS_NAMES is the model's class-id -> name mapping.
model = YOLO(str(MODEL_PATH))
CLASS_NAMES = model.names
print("Loaded model with classes:", CLASS_NAMES)

# ----------------------------
# Helpers
# ----------------------------
def pdf_to_images(pdf_path: Path, zoom: float = 1.0):
    """Render every page of a PDF to a BGR image (OpenCV channel order).

    Args:
        pdf_path: Location of the PDF file to rasterize.
        zoom: Scale factor applied to both axes when rendering each page.

    Returns:
        A list of ``(page_idx, image)`` tuples in page order, where
        ``page_idx`` is zero-based and ``image`` is a uint8 array of shape
        ``(height, width, 3)``.

    Raises:
        ValueError: If a page renders with a channel count other than 1, 3 or 4.
    """
    # PyMuPDF delivers samples in gray / RGB / RGBA order, so the 4-channel
    # case must be converted from RGBA (not BGRA) to avoid swapping red/blue.
    conversions = {
        1: cv2.COLOR_GRAY2BGR,
        3: cv2.COLOR_RGB2BGR,
        4: cv2.COLOR_RGBA2BGR,
    }

    doc = fitz.open(str(pdf_path))
    try:
        mat = fitz.Matrix(zoom, zoom)  # identical for every page, build once
        images = []
        for page_idx in range(len(doc)):
            pix = doc.load_page(page_idx).get_pixmap(matrix=mat)
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                pix.height, pix.width, pix.n
            )
            if pix.n not in conversions:
                raise ValueError(
                    f"Unsupported channel count {pix.n} on page {page_idx + 1} "
                    f"of {pdf_path}"
                )
            # cvtColor allocates a new array, so the result does not alias
            # the pixmap's buffer once the pixmap goes out of scope.
            images.append((page_idx, cv2.cvtColor(img, conversions[pix.n])))
        return images
    finally:
        # Release the document even when rendering a page fails.
        doc.close()

def preprocess_image(img_bgr, max_width=2000):
    """Shrink an image to ``max_width`` columns when it is wider than that.

    Images whose width is at most ``max_width`` are returned unchanged (the
    same object). Wider images are resized with area interpolation, using one
    scale factor for both axes.
    """
    height, width = img_bgr.shape[:2]
    if width <= max_width:
        return img_bgr

    ratio = max_width / width
    target_size = (int(width * ratio), int(height * ratio))  # cv2 expects (w, h)
    return cv2.resize(img_bgr, target_size, interpolation=cv2.INTER_AREA)

# ----------------------------
# Global JSON containers
# ----------------------------
selected = {}   # pdf_name -> page_k -> {annotations, page_size}
masked   = {}   # pdf_name -> page_k -> {annotations, page_size}


def _annotation_entry(ann_id, category, x, y, width, height, confidence):
    """Build one ``{ann_id: {...}}`` record in the layout used by both JSON files."""
    return {
        ann_id: {
            "category": category,
            "bbox": {"x": x, "y": y, "width": width, "height": height},
            "area": width * height,
            "confidence": confidence,
        }
    }


# ----------------------------
# Main loop over all PDFs
# ----------------------------
pdf_files = sorted(PDF_DIR.glob("*.pdf"))
print(f"Found {len(pdf_files)} PDFs in {PDF_DIR}")

for pdf_path in pdf_files:
    print("=" * 60)
    print("Processing PDF:", pdf_path.name)

    pdf_key = pdf_path.name
    pages = pdf_to_images(pdf_path, zoom=1.0)

    annotated_image_paths = []  # reset for each pdf

    for (page_idx, img_bgr) in pages:
        img_proc = preprocess_image(img_bgr, max_width=2000)
        results  = model.predict(
            source=img_proc,
            imgsz=1024,
            conf=0.25,
            iou=0.45,
            verbose=False
        )[0]

        orig_h, orig_w = img_bgr.shape[:2]
        proc_h, proc_w = img_proc.shape[:2]
        # Detections come back in img_proc pixels, while page_size reports the
        # original render. Map boxes back so both use the same coordinates.
        # When no downscaling happened both factors are exactly 1.0.
        scale_x = orig_w / proc_w
        scale_y = orig_h / proc_h

        page_key  = f"page_{page_idx+1}"
        page_size = {"width": orig_w, "height": orig_h}

        sel_ann  = []
        mask_ann = []

        if results.boxes is not None:
            boxes   = results.boxes.xyxy.cpu().numpy()
            cls_ids = results.boxes.cls.cpu().numpy().astype(int)
            confs   = results.boxes.conf.cpu().numpy()

            for i, (box, cls_id, conf) in enumerate(zip(boxes, cls_ids, confs)):
                x1, y1, x2, y2 = box.tolist()
                x      = float(x1 * scale_x)
                y      = float(y1 * scale_y)
                width  = float((x2 - x1) * scale_x)
                height = float((y2 - y1) * scale_y)
                cls_id = int(cls_id)
                score  = float(conf)

                # "selected" keeps the real class name.
                sel_ann.append(_annotation_entry(
                    f"annotation_{page_idx*1000 + i + 1}",
                    CLASS_NAMES[cls_id],
                    x, y, width, height, score,
                ))

                # "masked" replaces the class name with a generic label_<n>.
                mask_ann.append(_annotation_entry(
                    f"annotation_{page_idx*1000 + i + 10000}",
                    f"label_{cls_id+1}",  # simple label mapping
                    x, y, width, height, score,
                ))

        # Attach to global JSON structures
        selected.setdefault(pdf_key, {})[page_key] = {
            "annotations": sel_ann,
            "page_size": page_size
        }
        masked.setdefault(pdf_key, {})[page_key] = {
            "annotations": mask_ann,
            "page_size": page_size
        }

        # Save annotated page image
        vis = results.plot()
        out_img_path = OUTPUT_IMAGE_FOLDER / f"{pdf_path.stem}_page{page_idx+1}.jpg"
        if not cv2.imwrite(str(out_img_path), vis):
            # imwrite reports failure via its return value; skip the page in
            # the merged PDF instead of failing later on a missing file.
            print(f"  Page {page_idx+1}: {len(sel_ann)} detections, FAILED to save image: {out_img_path}")
            continue
        annotated_image_paths.append(out_img_path)

        print(f"  Page {page_idx+1}: {len(sel_ann)} detections, saved image: {out_img_path}")

    # Merge this PDF's page images into a single annotated PDF
    if annotated_image_paths:
        pdf_save_path = ANNOTATED_PDF_FOLDER / f"{pdf_path.stem}_annotated.pdf"
        imgs = []
        try:
            for p in annotated_image_paths:
                imgs.append(Image.open(str(p)))
            imgs[0].save(str(pdf_save_path), save_all=True, append_images=imgs[1:])
        finally:
            # Release the file handles PIL keeps open for lazily loaded images.
            for im in imgs:
                im.close()
        print("  Saved annotated PDF:", pdf_save_path)
    else:
        print("  No pages/images to merge for:", pdf_path.name)

# ----------------------------
# Save global JSONs for ALL PDFs
# ----------------------------
with open(OUTPUT_JSON_FOLDER / "output_selected_annotations.json", "w", encoding="utf-8") as f:
    json.dump(selected, f, ensure_ascii=False, indent=2)

with open(OUTPUT_JSON_FOLDER / "output_masked_annotations.json", "w", encoding="utf-8") as f:
    json.dump(masked, f, ensure_ascii=False, indent=2)

print("=" * 60)
print("Saved JSON files to:", OUTPUT_JSON_FOLDER)
print("Annotated PDFs saved to:", ANNOTATED_PDF_FOLDER)


Loaded model with classes: {0: 'signature', 1: 'stamp', 2: 'qr'}
Found 1 PDFs in /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/extra/pdfs
Processing PDF: Scanned Documents.pdf
  Page 1: 11 detections, saved image: /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/extra/output_images/Scanned Documents_page1.jpg
  Saved annotated PDF: /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/extra/annotated_pdfs/Scanned Documents_annotated.pdf
Saved JSON files to: /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/extra/output_json
Annotated PDFs saved to: /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/extra/annotated_pdfs
