## jpg and one json

In [6]:
# New Cell: Single-PDF inference → images → model → save outputs

import os
from pathlib import Path
import cv2
import numpy as np
import fitz  # PyMuPDF
from ultralytics import YOLO
import json

# --- Locations of the weights file, the input PDF and the two output folders ---
ROOT_DIR = Path("/home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon")
MODEL_PATH = ROOT_DIR.joinpath("models", "best_yolo11_multi_doc.pt")  # adjust if the weights file is named differently
PDF_PATH = ROOT_DIR.joinpath("data", "initial", "pdfs-20251115T105755Z-1-001", "pdfs", "АПЗ-.pdf")  # change to target file
OUTPUT_IMAGE_FOLDER = ROOT_DIR.joinpath("data", "initial", "pdfs-20251115T105755Z-1-001", "output_images")
OUTPUT_JSON_FOLDER = ROOT_DIR.joinpath("data", "initial", "pdfs-20251115T105755Z-1-001", "output_json")

# Make sure both output folders exist (a no-op when they are already there)
OUTPUT_IMAGE_FOLDER.mkdir(parents=True, exist_ok=True)
OUTPUT_JSON_FOLDER.mkdir(parents=True, exist_ok=True)

# Load the detector. The class-id -> name mapping is read from the weights;
# the recorded run printed three classes: signature, stamp, qr_code.
model = YOLO(str(MODEL_PATH))
CLASS_NAMES = model.names
print("Loaded model with classes:", CLASS_NAMES)

def pdf_to_images(pdf_path: Path, zoom: float = 2.0):
    """Render every page of a PDF to a BGR ``uint8`` image.

    Args:
        pdf_path: Location of the PDF file to open.
        zoom: Scale factor applied to both axes when rasterising a page.

    Returns:
        A list of ``(page_index, image)`` tuples in page order.  ``page_index``
        is 0-based and ``image`` is a writable ``(H, W, 3)`` array in OpenCV's
        BGR channel order.
    """
    doc = fitz.open(str(pdf_path))
    images = []
    try:
        mat = fitz.Matrix(zoom, zoom)  # the same transform is used for every page
        for page_index in range(len(doc)):
            pix = doc.load_page(page_index).get_pixmap(matrix=mat)
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
            if pix.n >= 3:
                # PyMuPDF samples are RGB(A): reverse the first three channels
                # to obtain BGR, which also drops an alpha channel if present.
                img = img[:, :, 2::-1]
            else:
                # Grayscale (optionally with alpha): replicate the gray channel.
                img = np.repeat(img[:, :, :1], 3, axis=2)
            # Copy into a contiguous, writable array (frombuffer views are read-only).
            images.append((page_index, np.ascontiguousarray(img)))
    finally:
        # Release the document even when rendering a page fails.
        doc.close()
    return images

def preprocess_image(img_bgr, max_width=2000):
    """Shrink an image to ``max_width`` pixels wide, keeping its aspect ratio.

    Images that are at most ``max_width`` wide are returned unchanged.
    """
    height, width = img_bgr.shape[:2]
    if width <= max_width:
        return img_bgr
    ratio = max_width / width
    target_size = (int(width*ratio), int(height*ratio))  # cv2 expects (width, height)
    return cv2.resize(img_bgr, target_size, interpolation=cv2.INTER_AREA)

# Process the single PDF: one annotated JPG and one JSON file per page
print("Processing PDF:", PDF_PATH.name)
images = pdf_to_images(PDF_PATH, zoom=1.0)
for (page_idx, img_bgr) in images:
    page_number = page_idx + 1  # 1-based page number, used in file names AND in the JSON
    img_proc = preprocess_image(img_bgr, max_width=1024)
    results = model.predict(source=img_proc, imgsz=640, conf=0.25, iou=0.45, verbose=False)[0]

    # Box coordinates are pixels of img_proc, i.e. of the annotated JPG saved below.
    detections = []
    if results.boxes is not None:
        boxes = results.boxes.xyxy.cpu().numpy()
        cls_ids = results.boxes.cls.cpu().numpy().astype(int)
        confs = results.boxes.conf.cpu().numpy()
        for box, cls_id, conf in zip(boxes, cls_ids, confs):
            x1, y1, x2, y2 = box.tolist()
            detections.append({
                # Same 1-based numbering as the file-level "page" entry below.
                "page": page_number,
                "class_id": int(cls_id),
                "class_name": CLASS_NAMES[int(cls_id)],
                "confidence": float(conf),
                "bbox": [float(x1), float(y1), float(x2), float(y2)]
            })

    # Save image with boxes; cv2.imwrite reports failure via its return value
    vis = results.plot()
    out_img_name = f"{PDF_PATH.stem}_page{page_number}.jpg"
    out_img_path = OUTPUT_IMAGE_FOLDER / out_img_name
    image_saved = cv2.imwrite(str(out_img_path), vis)

    # Save JSON (UTF-8, so the Cyrillic PDF name stays readable in the file)
    out_json_name = f"{PDF_PATH.stem}_page{page_number}.json"
    out_json_path = OUTPUT_JSON_FOLDER / out_json_name
    with open(out_json_path, "w", encoding="utf-8") as f:
        json.dump({"pdf": PDF_PATH.name, "page": page_number, "detections": detections}, f, ensure_ascii=False, indent=2)

    if image_saved:
        print(f"  Saved image: {out_img_name}")
    else:
        print(f"  [WARN] Could not write image: {out_img_path}")
    print(f"  Saved JSON: {out_json_name}")

print("Done processing the single PDF.")


Loaded model with classes: {0: 'signature', 1: 'stamp', 2: 'qr_code'}
Processing PDF: АПЗ-.pdf
  Saved image: АПЗ-_page1.jpg
  Saved JSON: АПЗ-_page1.json
  Saved image: АПЗ-_page2.jpg
  Saved JSON: АПЗ-_page2.json
  Saved image: АПЗ-_page3.jpg
  Saved JSON: АПЗ-_page3.json
  Saved image: АПЗ-_page4.jpg
  Saved JSON: АПЗ-_page4.json
  Saved image: АПЗ-_page5.jpg
  Saved JSON: АПЗ-_page5.json
  Saved image: АПЗ-_page6.jpg
  Saved JSON: АПЗ-_page6.json
  Saved image: АПЗ-_page7.jpg
  Saved JSON: АПЗ-_page7.json
  Saved image: АПЗ-_page8.jpg
  Saved JSON: АПЗ-_page8.json
  Saved image: АПЗ-_page9.jpg
  Saved JSON: АПЗ-_page9.json
Done processing the single PDF.


## annotated pdf and 2 jsons

In [7]:
# Updated Cell: Process a single PDF and output annotated PDF + JSONs

import os
from pathlib import Path
import cv2
import numpy as np
import fitz
from PIL import Image
from ultralytics import YOLO
import json

# --- Inputs: model weights and the PDF to annotate ---
ROOT_DIR = Path("/home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon")
MODEL_PATH = ROOT_DIR.joinpath("models", "best_yolo11_merged_4datasets.pt")
# Change the filename as needed (another sample in the same folder: АПЗ-.pdf)
PDF_PATH  = ROOT_DIR.joinpath("data/initial/pdfs-20251115T105755Z-1-001/pdfs/дефект-2.pdf")

# --- Outputs: page images, JSON files and the merged annotated PDF ---
OUTPUT_FOLDER = ROOT_DIR.joinpath("data/initial/pdfs-20251115T105755Z-1-001")
OUTPUT_IMAGE_FOLDER = OUTPUT_FOLDER.joinpath("output_images")
OUTPUT_JSON_FOLDER  = OUTPUT_FOLDER.joinpath("output_json")
OUTPUT_ANNOT_PDF    = OUTPUT_FOLDER.joinpath(f"{PDF_PATH.stem}_annotated.pdf")

# Create the output folders when they are missing
OUTPUT_IMAGE_FOLDER.mkdir(parents=True, exist_ok=True)
OUTPUT_JSON_FOLDER.mkdir(parents=True, exist_ok=True)

# Load the detector and keep its class-id -> name mapping
model = YOLO(str(MODEL_PATH))
CLASS_NAMES = model.names
print("Loaded model with classes:", CLASS_NAMES)

def pdf_to_images(pdf_path: Path, zoom: float = 1.0):
    """Render every page of a PDF to a BGR ``uint8`` image.

    Args:
        pdf_path: Location of the PDF file to open.
        zoom: Scale factor applied to both axes when rasterising a page.

    Returns:
        A list of ``(page_idx, image)`` tuples in page order.  ``page_idx`` is
        0-based and ``image`` is a writable ``(H, W, 3)`` array in OpenCV's BGR
        channel order.
    """
    doc = fitz.open(str(pdf_path))
    images = []
    try:
        mat = fitz.Matrix(zoom, zoom)  # the same transform is used for every page
        for page_idx in range(len(doc)):
            pix = doc.load_page(page_idx).get_pixmap(matrix=mat)
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
            if pix.n >= 3:
                # PyMuPDF samples are RGB(A): reverse the first three channels
                # to obtain BGR, which also drops an alpha channel if present.
                img = img[:, :, 2::-1]
            else:
                # Grayscale (optionally with alpha): replicate the gray channel.
                img = np.repeat(img[:, :, :1], 3, axis=2)
            # Copy into a contiguous, writable array (frombuffer views are read-only).
            images.append((page_idx, np.ascontiguousarray(img)))
    finally:
        # Release the document even when rendering a page fails.
        doc.close()
    return images

def preprocess_image(img_bgr, max_width=2000):
    """Return ``img_bgr`` downscaled to ``max_width`` pixels wide if it is wider.

    The aspect ratio is kept; narrower images are handed back untouched.
    """
    rows, cols = img_bgr.shape[:2]
    needs_resize = cols > max_width
    if needs_resize:
        factor = max_width / cols
        new_dims = (int(cols*factor), int(rows*factor))  # (width, height) for cv2
        img_bgr = cv2.resize(img_bgr, new_dims, interpolation=cv2.INTER_AREA)
    return img_bgr

# Storage for the two JSON documents: pdf name -> page key -> {annotations, page_size}
selected = {}
masked   = {}
pdf_key = PDF_PATH.name   # use the pdf filename as key

# Read PDF pages to images
pages = pdf_to_images(PDF_PATH, zoom=1.0)

annotated_image_paths = []

for (page_idx, img_bgr) in pages:
    img_proc = preprocess_image(img_bgr, max_width=2000)
    results  = model.predict(source=img_proc, imgsz=1024, conf=0.25, iou=0.45, verbose=False)[0]

    page_key = f"page_{page_idx+1}"
    page_size = {"width": img_bgr.shape[1], "height": img_bgr.shape[0]}

    # The detector ran on img_proc, which may be a downscaled copy of the page.
    # Boxes are mapped back to rendered-page pixels so they agree with page_size
    # (both factors are exactly 1.0 when no resize happened).
    scale_x = img_bgr.shape[1] / img_proc.shape[1]
    scale_y = img_bgr.shape[0] / img_proc.shape[0]

    # Per-page annotation lists for the two JSON files
    sel_ann = []
    mask_ann = []

    if results.boxes is not None:
        boxes   = results.boxes.xyxy.cpu().numpy()
        cls_ids = results.boxes.cls.cpu().numpy().astype(int)
        for i, (box, cls_id) in enumerate(zip(boxes, cls_ids)):
            x1, y1, x2, y2 = box.tolist()
            x1, x2 = x1 * scale_x, x2 * scale_x
            y1, y2 = y1 * scale_y, y2 * scale_y
            width  = float(x2 - x1)
            height = float(y2 - y1)
            area   = width * height
            category = CLASS_NAMES[int(cls_id)]
            # Selected ids count up from page_idx*1000 + 1
            ann_id_selected = f"annotation_{page_idx*1000 + i + 1}"
            sel_ann.append({
                ann_id_selected: {
                    "category": category,
                    "bbox": {"x": float(x1), "y": float(y1), "width": width, "height": height},
                    "area": area
                }
            })
            # Masked ids count up from page_idx*1000 + 10000; categories use a
            # placeholder "label_<class id + 1>" mapping – adapt as needed.
            ann_id_masked = f"annotation_{page_idx*1000 + i + 10000}"
            mask_ann.append({
                ann_id_masked: {
                    "category": f"label_{cls_id+1}",   # example mapping
                    "bbox": {"x": float(x1), "y": float(y1), "width": width, "height": height},
                    "area": area
                }
            })

    # Add page-level entry
    selected.setdefault(pdf_key, {})[page_key] = {
        "annotations": sel_ann,
        "page_size": page_size
    }
    masked.setdefault(pdf_key, {})[page_key] = {
        "annotations": mask_ann,
        "page_size": page_size
    }

    # Save annotated image (drawn on img_proc, so it has img_proc's size)
    vis = results.plot()
    out_img_path = OUTPUT_IMAGE_FOLDER / f"{PDF_PATH.stem}_page{page_idx+1}.jpg"
    cv2.imwrite(str(out_img_path), vis)
    annotated_image_paths.append(out_img_path)

    print(f"Processed page {page_idx+1}, saved image: {out_img_path}")

# Save JSONs
with open(OUTPUT_JSON_FOLDER / "output_selected_annotations.json", "w", encoding="utf-8") as f:
    json.dump(selected, f, ensure_ascii=False, indent=2)
with open(OUTPUT_JSON_FOLDER / "output_masked_annotations.json", "w", encoding="utf-8") as f:
    json.dump(masked, f, ensure_ascii=False, indent=2)

print("Saved JSON files to:", OUTPUT_JSON_FOLDER)

# Merge annotated images into a single PDF
pdf_save_path = OUTPUT_ANNOT_PDF
if annotated_image_paths:
    imgs = [Image.open(str(p)) for p in annotated_image_paths]
    try:
        imgs[0].save(str(pdf_save_path), save_all=True, append_images=imgs[1:])
    finally:
        # Release the file handles held by the opened page images.
        for opened in imgs:
            opened.close()
    print("Saved annotated PDF:", pdf_save_path)
else:
    # A PDF without pages yields no images; there is nothing to merge.
    print("No pages rendered, annotated PDF not written for:", PDF_PATH.name)


Loaded model with classes: {0: 'signature', 1: 'stamp', 2: 'qr'}
Processed page 1, saved image: /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/initial/pdfs-20251115T105755Z-1-001/output_images/дефект-2_page1.jpg
Processed page 2, saved image: /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/initial/pdfs-20251115T105755Z-1-001/output_images/дефект-2_page2.jpg
Saved JSON files to: /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/initial/pdfs-20251115T105755Z-1-001/output_json
Saved annotated PDF: /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/initial/pdfs-20251115T105755Z-1-001/дефект-2_annotated.pdf


## annotated pdf + 2 json + qr

In [5]:
# Cell: Process ALL PDFs in a folder → annotated PDFs + global JSONs

import os
from pathlib import Path
import cv2
import numpy as np
import fitz
from PIL import Image
from ultralytics import YOLO
import json

# ----------------------------
# Paths and model
# ----------------------------
ROOT_DIR = Path("/home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon")
MODEL_PATH = ROOT_DIR.joinpath("models", "best_yolo11_merged_4datasets_v2.pt")

# Folder scanned for input PDFs, and the folders that receive the results
PDF_DIR = ROOT_DIR.joinpath("data/initial/pdfs-20251115T105755Z-1-001/pdfs")
OUTPUT_FOLDER = ROOT_DIR.joinpath("data/initial/pdfs-20251115T105755Z-1-001")
OUTPUT_IMAGE_FOLDER = OUTPUT_FOLDER.joinpath("output_images")       # page-level images
OUTPUT_JSON_FOLDER  = OUTPUT_FOLDER.joinpath("output_json")         # global JSONs
ANNOTATED_PDF_FOLDER = OUTPUT_FOLDER.joinpath("annotated_pdfs")     # final annotated PDFs

# Create the three output folders when they are missing
OUTPUT_IMAGE_FOLDER.mkdir(parents=True, exist_ok=True)
OUTPUT_JSON_FOLDER.mkdir(parents=True, exist_ok=True)
ANNOTATED_PDF_FOLDER.mkdir(parents=True, exist_ok=True)

# Load the detector and keep its class-id -> name mapping
model = YOLO(str(MODEL_PATH))
CLASS_NAMES = model.names
print("Loaded model with classes:", CLASS_NAMES)

# ----------------------------
# Helpers
# ----------------------------
def pdf_to_images(pdf_path: Path, zoom: float = 1.0):
    """Convert each page of a PDF to a BGR image (OpenCV format).

    Args:
        pdf_path: Location of the PDF file to open.
        zoom: Scale factor applied to both axes when rasterising a page.

    Returns:
        A list of ``(page_idx, image)`` tuples in page order.  ``page_idx`` is
        0-based and ``image`` is a writable ``(H, W, 3)`` ``uint8`` array.
    """
    doc = fitz.open(str(pdf_path))
    images = []
    try:
        mat = fitz.Matrix(zoom, zoom)  # the same transform is used for every page
        for page_idx in range(len(doc)):
            pix = doc.load_page(page_idx).get_pixmap(matrix=mat)
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
            if pix.n >= 3:
                # PyMuPDF samples are RGB(A): reverse the first three channels
                # to obtain BGR, which also drops an alpha channel if present.
                img = img[:, :, 2::-1]
            else:
                # Grayscale (optionally with alpha): replicate the gray channel.
                img = np.repeat(img[:, :, :1], 3, axis=2)
            # Copy into a contiguous, writable array (frombuffer views are read-only).
            images.append((page_idx, np.ascontiguousarray(img)))
    finally:
        # Release the document even when rendering a page fails.
        doc.close()
    return images

def preprocess_image(img_bgr, max_width=2000):
    """Downscale pages wider than ``max_width``; leave all others as they are."""
    src_h, src_w = img_bgr.shape[:2]
    if not src_w > max_width:
        return img_bgr
    shrink = max_width / src_w
    dst_w = int(src_w * shrink)
    dst_h = int(src_h * shrink)
    return cv2.resize(img_bgr, (dst_w, dst_h), interpolation=cv2.INTER_AREA)

# ----------------------------
# Global JSON containers
# ----------------------------
selected = {}   # pdf_name -> page_k -> {annotations, page_size}
masked   = {}   # pdf_name -> page_k -> {annotations, page_size}

# ----------------------------
# Main loop over all PDFs
# ----------------------------
pdf_files = sorted(PDF_DIR.glob("*.pdf"))
print(f"Found {len(pdf_files)} PDFs in {PDF_DIR}")

for pdf_path in pdf_files:
    print("=" * 60)
    print("Processing PDF:", pdf_path.name)

    pdf_key = pdf_path.name
    pages = pdf_to_images(pdf_path, zoom=1.0)

    annotated_image_paths = []  # reset for each pdf

    for (page_idx, img_bgr) in pages:
        img_proc = preprocess_image(img_bgr, max_width=2000)
        results  = model.predict(
            source=img_proc,
            imgsz=1024,
            conf=0.25,
            iou=0.45,
            verbose=False
        )[0]

        page_key  = f"page_{page_idx+1}"
        page_size = {"width": img_bgr.shape[1], "height": img_bgr.shape[0]}

        # The detector ran on img_proc, which may be a downscaled copy of the
        # page.  Boxes are mapped back to rendered-page pixels so they agree
        # with page_size (both factors are exactly 1.0 without a resize).
        scale_x = img_bgr.shape[1] / img_proc.shape[1]
        scale_y = img_bgr.shape[0] / img_proc.shape[0]

        sel_ann  = []
        mask_ann = []

        if results.boxes is not None:
            boxes   = results.boxes.xyxy.cpu().numpy()
            cls_ids = results.boxes.cls.cpu().numpy().astype(int)
            confs   = results.boxes.conf.cpu().numpy()

            for i, (box, cls_id, conf) in enumerate(zip(boxes, cls_ids, confs)):
                x1, y1, x2, y2 = box.tolist()
                x1, x2 = x1 * scale_x, x2 * scale_x
                y1, y2 = y1 * scale_y, y2 * scale_y
                width  = float(x2 - x1)
                height = float(y2 - y1)
                area   = width * height
                category = CLASS_NAMES[int(cls_id)]
                bbox = {
                    "x": float(x1),
                    "y": float(y1),
                    "width": width,
                    "height": height
                }

                # Selected ids count up from page_idx*1000 + 1
                ann_id_selected = f"annotation_{page_idx*1000 + i + 1}"
                sel_ann.append({
                    ann_id_selected: {
                        "category": category,
                        "bbox": dict(bbox),
                        "area": area,
                        "confidence": float(conf),
                    }
                })

                # Masked ids count up from page_idx*1000 + 10000
                ann_id_masked = f"annotation_{page_idx*1000 + i + 10000}"
                mask_ann.append({
                    ann_id_masked: {
                        "category": f"label_{cls_id+1}",  # simple label mapping
                        "bbox": dict(bbox),
                        "area": area,
                        "confidence": float(conf),
                    }
                })

        # Attach to global JSON structures
        selected.setdefault(pdf_key, {})[page_key] = {
            "annotations": sel_ann,
            "page_size": page_size
        }
        masked.setdefault(pdf_key, {})[page_key] = {
            "annotations": mask_ann,
            "page_size": page_size
        }

        # Save annotated page image (drawn on img_proc, so it has img_proc's size)
        vis = results.plot()
        out_img_path = OUTPUT_IMAGE_FOLDER / f"{pdf_path.stem}_page{page_idx+1}.jpg"
        cv2.imwrite(str(out_img_path), vis)
        annotated_image_paths.append(out_img_path)

        print(f"  Page {page_idx+1}: {len(sel_ann)} detections, saved image: {out_img_path}")

    # Merge this PDF's page images into a single annotated PDF
    if annotated_image_paths:
        imgs = [Image.open(str(p)) for p in annotated_image_paths]
        pdf_save_path = ANNOTATED_PDF_FOLDER / f"{pdf_path.stem}_annotated.pdf"
        try:
            imgs[0].save(str(pdf_save_path), save_all=True, append_images=imgs[1:])
        finally:
            # Release file handles; with many PDFs they would otherwise pile up.
            for opened in imgs:
                opened.close()
        print("  Saved annotated PDF:", pdf_save_path)
    else:
        print("  No pages/images to merge for:", pdf_path.name)

# ----------------------------
# Save global JSONs for ALL PDFs
# ----------------------------
with open(OUTPUT_JSON_FOLDER / "output_selected_annotations.json", "w", encoding="utf-8") as f:
    json.dump(selected, f, ensure_ascii=False, indent=2)

with open(OUTPUT_JSON_FOLDER / "output_masked_annotations.json", "w", encoding="utf-8") as f:
    json.dump(masked, f, ensure_ascii=False, indent=2)

print("=" * 60)
print("Saved JSON files to:", OUTPUT_JSON_FOLDER)
print("Annotated PDFs saved to:", ANNOTATED_PDF_FOLDER)


Loaded model with classes: {0: 'signature', 1: 'stamp', 2: 'qr'}
Found 45 PDFs in /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/initial/pdfs-20251115T105755Z-1-001/pdfs
Processing PDF: АПЗ-.pdf
  Page 1: 1 detections, saved image: /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/initial/pdfs-20251115T105755Z-1-001/output_images/АПЗ-_page1.jpg
  Page 2: 0 detections, saved image: /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/initial/pdfs-20251115T105755Z-1-001/output_images/АПЗ-_page2.jpg
  Page 3: 1 detections, saved image: /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/initial/pdfs-20251115T105755Z-1-001/output_images/АПЗ-_page3.jpg
  Page 4: 0 detections, saved image: /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/initial/pdfs-20251115T105755Z-1-001/output_images/АПЗ-_page4.jpg
  Page 5: 0 detections, saved image: /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/initial/pdfs-20251115T105755Z-1-001/output_images/АПЗ-_page5.jpg
  Page 6: 1 detections, 

## creating pdfs for ground truth

In [9]:
import json
from pathlib import Path
import fitz          # PyMuPDF
import cv2
import numpy as np
from PIL import Image

# ---- Paths ----
ROOT_DIR   = Path("/home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon")
DATA_DIR   = ROOT_DIR.joinpath("data", "initial")
PDF_DIR    = DATA_DIR.joinpath("pdfs-20251115T105755Z-1-001", "pdfs")

# Ground-truth annotations, keyed by PDF file name
SELECTED_JSON = DATA_DIR.joinpath("selected_annotations.json")

# Output: merged ground-truth PDFs, plus the page images they are built from
GT_PDF_DIR = DATA_DIR.joinpath("ground_truth_pdfs")
GT_IMG_DIR = GT_PDF_DIR.joinpath("images")

GT_PDF_DIR.mkdir(parents=True, exist_ok=True)
GT_IMG_DIR.mkdir(parents=True, exist_ok=True)

print("PDF dir      :", PDF_DIR)
print("Selected JSON:", SELECTED_JSON)
print("Output PDFs  :", GT_PDF_DIR)

# Only annotations with one of these categories are drawn (add "qr" to draw QR codes too)
TARGET_CATEGORIES = {"signature", "stamp"}


# ---- Helper: annotate a single PDF using its GT from JSON ----
def annotate_pdf(pdf_name: str, pdf_gt: dict):
    """Draw ground-truth boxes on every page of a PDF and save the result as a PDF.

    Each page is rendered at PyMuPDF's default resolution, boxes whose
    category is in ``TARGET_CATEGORIES`` are drawn on it, the page is written
    to ``GT_IMG_DIR`` as a JPG, and all pages are merged into
    ``GT_PDF_DIR / "<stem>_gt.pdf"``.

    Args:
        pdf_name: File name inside ``PDF_DIR``, e.g. 'локалсмета-.pdf'.
        pdf_gt: ``selected[pdf_name]`` – dict of pages
            ``{'page_1': {'annotations': [...], ...}, ...}`` where each
            annotation is ``{annotation_id: {'category', 'bbox': {x, y, width, height}}}``.

    Returns:
        None. A missing PDF or a PDF without pages is reported and skipped.
    """
    pdf_path = PDF_DIR / pdf_name
    if not pdf_path.exists():
        print(f"[WARN] PDF not found on disk, skipping: {pdf_path}")
        return

    print(f"\nProcessing ground truth for: {pdf_name}")

    # BGR colours per category; anything else in TARGET_CATEGORIES is drawn green.
    category_colors = {"signature": (0, 0, 255), "stamp": (255, 0, 0)}
    default_color = (0, 255, 0)

    annotated_image_paths = []

    doc = fitz.open(str(pdf_path))
    try:
        for page_index in range(len(doc)):
            page = doc.load_page(page_index)
            # render page to image (no resize so coords remain in same space as page_size)
            pix = page.get_pixmap()
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
            if pix.n == 4:
                # PyMuPDF samples are RGBA (not BGRA), so convert from RGBA.
                img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)
            else:
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

            page_key = f"page_{page_index+1}"
            page_info = pdf_gt.get(page_key)

            if page_info:
                for ann_obj in page_info.get("annotations", []):
                    for ann_id, ann_data in ann_obj.items():
                        cat = ann_data.get("category", "")
                        if cat not in TARGET_CATEGORIES:
                            continue

                        bbox = ann_data.get("bbox", {})
                        x = float(bbox.get("x", 0.0))
                        y = float(bbox.get("y", 0.0))
                        w_box = float(bbox.get("width", 0.0))
                        h_box = float(bbox.get("height", 0.0))

                        x1 = int(round(x))
                        y1 = int(round(y))
                        x2 = int(round(x + w_box))
                        y2 = int(round(y + h_box))

                        color = category_colors.get(cat, default_color)

                        cv2.rectangle(img, (x1, y1), (x2, y2), color, 3)
                        cv2.putText(
                            img,
                            cat,
                            # keep the label inside the image for boxes near the top edge
                            (x1, max(y1 - 10, 20)),
                            cv2.FONT_HERSHEY_SIMPLEX,
                            0.8,
                            color,
                            2,
                            lineType=cv2.LINE_AA
                        )

            # Save the page image (pages without annotations are saved unmodified)
            out_img_path = GT_IMG_DIR / f"{pdf_path.stem}_page{page_index+1}_gt.jpg"
            cv2.imwrite(str(out_img_path), img)
            annotated_image_paths.append(out_img_path)
    finally:
        # Release the document even when rendering or drawing fails.
        doc.close()

    if not annotated_image_paths:
        # Every rendered page is appended above, so this means the PDF had no pages.
        print(f"  No pages rendered for {pdf_name}, no PDF created.")
        return

    # Merge annotated images into a single PDF
    pil_imgs = []
    for img_path in annotated_image_paths:
        with Image.open(str(img_path)) as opened:
            # convert() returns an in-memory copy, so the file can be closed right away
            pil_imgs.append(opened.convert("RGB"))
    out_pdf_path = GT_PDF_DIR / f"{pdf_path.stem}_gt.pdf"
    pil_imgs[0].save(str(out_pdf_path), save_all=True, append_images=pil_imgs[1:])
    print(f"  ✅ Saved ground truth PDF: {out_pdf_path}")


# ---- Main: load JSON and process all PDFs it contains ----
# Read the ground-truth JSON, then build one annotated PDF per listed file
with open(SELECTED_JSON, "r", encoding="utf-8") as f:
    selected = json.load(f)

print(f"\nFound {len(selected)} PDFs in selected_annotations.json")

for pdf_name in selected:
    pdf_gt = selected[pdf_name]
    annotate_pdf(pdf_name, pdf_gt)

print("\nDone. Ground-truth PDFs are in:", GT_PDF_DIR)


PDF dir      : /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/initial/pdfs-20251115T105755Z-1-001/pdfs
Selected JSON: /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/initial/selected_annotations.json
Output PDFs  : /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/initial/ground_truth_pdfs

Found 45 PDFs in selected_annotations.json

Processing ground truth for: локалсмета-.pdf
  ✅ Saved ground truth PDF: /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/initial/ground_truth_pdfs/локалсмета-_gt.pdf

Processing ground truth for: Разрешназемлю-41-.pdf
  ✅ Saved ground truth PDF: /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/initial/ground_truth_pdfs/Разрешназемлю-41-_gt.pdf

Processing ground truth for: письмо-11.pdf
  ✅ Saved ground truth PDF: /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/initial/ground_truth_pdfs/письмо-11_gt.pdf

Processing ground truth for: дефект-41-чб.pdf
  ✅ Saved ground truth PDF: /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data

## Comparing ground truth with model.pt


In [1]:
# Cell 1: setup & parse ground-truth from selected_annotations.json

import json
from pathlib import Path
from collections import defaultdict

import numpy as np
import cv2
import fitz  # PyMuPDF
from ultralytics import YOLO

# ---------- Paths ----------
ROOT_DIR = Path("/home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon")

DATA_DIR = ROOT_DIR.joinpath("data", "initial")
PDF_DIR  = DATA_DIR.joinpath("pdfs-20251115T105755Z-1-001", "pdfs")

SELECTED_JSON = DATA_DIR.joinpath("selected_annotations.json")
MODEL_PATH    = ROOT_DIR.joinpath("models", "best_yolo11_merged_4datasets_v2.pt")

print("ROOT_DIR   :", ROOT_DIR)
print("PDF_DIR    :", PDF_DIR)
print("SELECTED   :", SELECTED_JSON)
print("MODEL_PATH :", MODEL_PATH)

# ---------- Load model ----------
model = YOLO(str(MODEL_PATH))
CLASS_NAMES = model.names  # class id -> name, e.g. {0: 'signature', 1: 'stamp', 2: 'qr'}
print("Model classes:", CLASS_NAMES)

# ---------- Category mapping ----------
# Maps a ground-truth category name to the canonical class name used for
# evaluation; categories missing from this dict are ignored.
# NOTE: a "qr_code" -> "qr" alias is currently disabled, so GT boxes labelled
# "qr_code" are not evaluated. Add `"qr_code": "qr"` here to enable it.
GT_TO_CANON = dict(
    signature="signature",
    stamp="stamp",
    qr="qr",
)

# Only evaluate for these canonical classes
EVAL_CLASSES = ["signature", "stamp", "qr"]  # you can drop 'qr' if you want


def bbox_dict_to_xyxy(bbox_dict):
    """Turn an ``{'x', 'y', 'width', 'height'}`` mapping into ``[x1, y1, x2, y2]``.

    All four values are converted with ``float``; ``x2``/``y2`` are the
    top-left corner plus width/height.
    """
    left, top, box_w, box_h = (
        float(bbox_dict[key]) for key in ("x", "y", "width", "height")
    )
    return [left, top, left + box_w, top + box_h]


def compute_iou(boxA, boxB):
    """Intersection-over-union of two ``[x1, y1, x2, y2]`` boxes.

    Returns 0.0 when the boxes do not overlap or the union area is not
    positive.
    """
    # Width and height of the overlap rectangle, clamped at zero
    overlap_w = max(0.0, min(boxA[2], boxB[2]) - max(boxA[0], boxB[0]))
    overlap_h = max(0.0, min(boxA[3], boxB[3]) - max(boxA[1], boxB[1]))
    overlap = overlap_w * overlap_h
    if overlap <= 0:
        return 0.0

    area_a = max(0.0, (boxA[2] - boxA[0]) * (boxA[3] - boxA[1]))
    area_b = max(0.0, (boxB[2] - boxB[0]) * (boxB[3] - boxB[1]))
    union = area_a + area_b - overlap

    return 0.0 if union <= 0 else overlap / union


def pdf_to_images(pdf_path: Path, zoom: float = 1.0):
    """
    Render each page of a PDF to a BGR image (no resizing afterwards).

    The ground-truth coordinates are expected to be in this pixel space when
    ``zoom`` is 1.0.

    Args:
        pdf_path: Location of the PDF file to open.
        zoom: Scale factor applied to both axes when rasterising a page.

    Returns:
        A list of ``(page_idx, image)`` tuples in page order.  ``page_idx`` is
        0-based and ``image`` is a writable ``(H, W, 3)`` ``uint8`` array in
        OpenCV's BGR channel order.
    """
    doc = fitz.open(str(pdf_path))
    pages = []
    try:
        mat = fitz.Matrix(zoom, zoom)  # the same transform is used for every page
        for page_idx in range(len(doc)):
            pix = doc.load_page(page_idx).get_pixmap(matrix=mat)
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
            if pix.n >= 3:
                # PyMuPDF samples are RGB(A): reverse the first three channels
                # to obtain BGR, which also drops an alpha channel if present.
                img = img[:, :, 2::-1]
            else:
                # Grayscale (optionally with alpha): replicate the gray channel.
                img = np.repeat(img[:, :, :1], 3, axis=2)
            # Copy into a contiguous, writable array (frombuffer views are read-only).
            pages.append((page_idx, np.ascontiguousarray(img)))
    finally:
        # Release the document even when rendering a page fails.
        doc.close()
    return pages


# ---------- Parse selected_annotations.json into GT dict ----------
# Structure:
# {
#   "file.pdf": {
#       "page_3": {
#          "annotations": [ { "annotation_117": { "category": "...", "bbox": {...}, ... }}, ...],
#          "page_size": { "width": ..., "height": ... }
#       },
#       ...
#   },
#   ...
# }

with open(SELECTED_JSON, "r", encoding="utf-8") as f:
    selected = json.load(f)

# gt[pdf_name][page_idx] = list of { "bbox": [x1,y1,x2,y2], "category": canon_class, "ann_id": id }
# Every PDF and every parsable page listed in the JSON is registered, even when
# it has no evaluated annotations: an empty list marks the page as "known to
# contain no target objects", so detections on it can be counted as false
# positives instead of being skipped.
gt = {}

for pdf_name, pdf_data in selected.items():
    pdf_pages = gt.setdefault(pdf_name, {})

    for page_key, page_info in pdf_data.items():
        # page_key like "page_3" -> 0-based page index 2
        try:
            page_idx = int(page_key.split("_")[1]) - 1
        except (IndexError, ValueError):
            # Report malformed keys instead of dropping them silently.
            print(f"[WARN] unexpected page key {page_key!r} in {pdf_name}, skipping")
            continue

        gt_annots = []
        for ann_obj in page_info.get("annotations", []):
            for ann_id, ann_data in ann_obj.items():
                cat = ann_data.get("category", "")
                cat_canon = GT_TO_CANON.get(cat)
                if cat_canon is None:
                    # ignore categories we don't evaluate
                    continue

                bbox = ann_data.get("bbox")
                if bbox is None:
                    continue

                xyxy = bbox_dict_to_xyxy(bbox)
                gt_annots.append({
                    "bbox": xyxy,
                    "category": cat_canon,
                    "ann_id": ann_id,
                })

        pdf_pages.setdefault(page_idx, []).extend(gt_annots)

print(f"Loaded GT for {len(gt)} PDFs from selected_annotations.json")


ROOT_DIR   : /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon
PDF_DIR    : /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/initial/pdfs-20251115T105755Z-1-001/pdfs
SELECTED   : /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/initial/selected_annotations.json
MODEL_PATH : /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/models/best_yolo11_merged_4datasets_v2.pt
Model classes: {0: 'signature', 1: 'stamp', 2: 'qr'}
Loaded GT for 45 PDFs from selected_annotations.json


In [2]:
# Cell 2: run evaluation (YOLO vs selected_annotations GT)

IOU_THRESH = 0.5   # IoU threshold for a match

# When True, pages of an evaluated PDF that have no ground-truth entry are still
# run through the model and every detection on them counts as a false positive.
# Set to False to evaluate only pages that appear in the ground truth.
EVAL_UNANNOTATED_PAGES = True

# Counters per class
tp = defaultdict(int)
fp = defaultdict(int)
fn = defaultdict(int)

# Optional: overall counters
tp_all = 0
fp_all = 0
fn_all = 0


def _match_boxes(pred_boxes, gt_boxes, iou_thresh):
    """Greedily match predictions to ground truth; return ``(tp, fp, fn)`` counts.

    Predictions are processed in the given order. Each one takes the unmatched
    ground-truth box with the highest IoU and is a true positive when that IoU
    reaches ``iou_thresh``; otherwise it is a false positive. Ground-truth
    boxes left unmatched are false negatives.
    """
    matched_gt_indices = set()
    n_tp = 0
    n_fp = 0
    for pb in pred_boxes:
        best_iou = 0.0
        best_j = -1
        for j, gb in enumerate(gt_boxes):
            if j in matched_gt_indices:
                continue
            iou = compute_iou(pb, gb)
            if iou > best_iou:
                best_iou = iou
                best_j = j
        if best_j >= 0 and best_iou >= iou_thresh:
            matched_gt_indices.add(best_j)
            n_tp += 1
        else:
            n_fp += 1
    return n_tp, n_fp, len(gt_boxes) - len(matched_gt_indices)


def _precision_recall_f1(n_tp, n_fp, n_fn):
    """Return ``(precision, recall, f1)``; each is 0.0 when its denominator is zero."""
    precision = n_tp / (n_tp + n_fp) if n_tp + n_fp > 0 else 0.0
    recall = n_tp / (n_tp + n_fn) if n_tp + n_fn > 0 else 0.0
    if precision + recall > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    return precision, recall, f1


for pdf_name, pages_gt in gt.items():
    pdf_path = PDF_DIR / pdf_name
    if not pdf_path.exists():
        print(f"[WARN] PDF file not found on disk, skipping: {pdf_path}")
        continue

    print(f"\nEvaluating on PDF: {pdf_name}")
    pages = pdf_to_images(pdf_path, zoom=1.0)

    # Ground-truth pages that the PDF does not actually have cannot be evaluated.
    rendered_indices = {idx for (idx, _) in pages}
    for page_idx in pages_gt:
        if page_idx not in rendered_indices:
            print(f"  [WARN] page {page_idx+1} not found in rendered pages, skipping.")

    for page_idx, img in pages:
        gt_list = pages_gt.get(page_idx)
        if gt_list is None:
            if not EVAL_UNANNOTATED_PAGES:
                continue
            gt_list = []  # no ground truth on this page: every detection is an FP

        # YOLO inference – outputs boxes in original image coords
        res = model.predict(
            source=img,
            imgsz=1024,
            conf=0.25,
            iou=0.45,
            verbose=False
        )[0]

        preds_raw = []
        if res.boxes is not None and len(res.boxes) > 0:
            boxes   = res.boxes.xyxy.cpu().numpy()
            cls_ids = res.boxes.cls.cpu().numpy().astype(int)
            confs   = res.boxes.conf.cpu().numpy()

            for box, cid, conf in zip(boxes, cls_ids, confs):
                name = CLASS_NAMES[int(cid)]
                # we only care about eval classes
                if name not in EVAL_CLASSES:
                    continue
                preds_raw.append({
                    "bbox": box.tolist(),   # [x1,y1,x2,y2]
                    "category": name,
                    "conf": float(conf),
                })

        # Most confident predictions claim their ground-truth box first.
        preds_raw.sort(key=lambda p: p["conf"], reverse=True)

        # Per-class matching
        for cls in EVAL_CLASSES:
            gt_boxes = [g["bbox"] for g in gt_list if g["category"] == cls]
            pred_boxes = [p["bbox"] for p in preds_raw if p["category"] == cls]

            n_tp, n_fp, n_fn = _match_boxes(pred_boxes, gt_boxes, IOU_THRESH)

            tp[cls] += n_tp
            fp[cls] += n_fp
            fn[cls] += n_fn
            tp_all += n_tp
            fp_all += n_fp
            fn_all += n_fn

print("\n=== Per-class metrics (IoU >= %.2f) ===" % IOU_THRESH)
for cls in EVAL_CLASSES:
    TP = tp[cls]
    FP = fp[cls]
    FN = fn[cls]
    precision, recall, f1 = _precision_recall_f1(TP, FP, FN)

    print(f"\nClass: {cls}")
    print(f"  TP: {TP}, FP: {FP}, FN: {FN}")
    print(f"  Precision: {precision:.3f}")
    print(f"  Recall   : {recall:.3f}")
    print(f"  F1-score : {f1:.3f}")

print("\n=== Overall metrics (all eval classes combined) ===")
precision_all, recall_all, f1_all = _precision_recall_f1(tp_all, fp_all, fn_all)

print(f"  TP: {tp_all}, FP: {fp_all}, FN: {fn_all}")
print(f"  Precision: {precision_all:.3f}")
print(f"  Recall   : {recall_all:.3f}")
print(f"  F1-score : {f1_all:.3f}")



Evaluating on PDF: локалсмета-.pdf

Evaluating on PDF: Разрешназемлю-41-.pdf

Evaluating on PDF: письмо-11.pdf

Evaluating on PDF: дефект-41-чб.pdf

Evaluating on PDF: отр-1.pdf

Evaluating on PDF: Археология-31-.pdf

Evaluating on PDF: локалсмета-32-.pdf

Evaluating on PDF: Письмо-42.pdf

Evaluating on PDF: ТЗ-.pdf

Evaluating on PDF: отр-32-.pdf

Evaluating on PDF: Археология-2.pdf

Evaluating on PDF: перечень-.pdf

Evaluating on PDF: отр-41-.pdf

Evaluating on PDF: отр-11.pdf

Evaluating on PDF: ТУ-31-.pdf

Evaluating on PDF: Письмо-41-.pdf

Evaluating on PDF: чертеж-.pdf

Evaluating on PDF: дефект-.pdf

Evaluating on PDF: дефект-31-.pdf

Evaluating on PDF: письмо-22.pdf

Evaluating on PDF: археология-41-чб.pdf

Evaluating on PDF: письмо-.pdf

Evaluating on PDF: АПЗ-2.pdf

Evaluating on PDF: Письмо-2.pdf

Evaluating on PDF: АПЗ-31-.pdf

Evaluating on PDF: ТУ-41-чб.pdf

Evaluating on PDF: локалсметчб-.pdf

Evaluating on PDF: дефект-2.pdf

Evaluating on PDF: дозиметрия-31-.pdf

Evalu

## annotated for test

In [1]:
# Cell: Process ALL PDFs in a folder → annotated PDFs + global JSONs

import os
from pathlib import Path
import cv2
import numpy as np
import fitz
from PIL import Image
from ultralytics import YOLO
import json

# ----------------------------
# Paths and model
# ----------------------------
ROOT_DIR = Path("/home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon")
MODEL_PATH = ROOT_DIR.joinpath("models", "best_yolo11_merged_4datasets.pt")

# Every output lives next to the "test" folder that holds the input PDFs.
# Note the "-001" suffix in the dataset folder name.
OUTPUT_FOLDER = ROOT_DIR.joinpath("data", "initial", "test-20251116T042326Z-1-001")
PDF_DIR = OUTPUT_FOLDER / "test"

# Page-level annotated images.
OUTPUT_IMAGE_FOLDER = OUTPUT_FOLDER / "output_images"
# Global JSON files covering all PDFs.
OUTPUT_JSON_FOLDER = OUTPUT_FOLDER / "output_json"
# Final per-document annotated PDFs.
ANNOTATED_PDF_FOLDER = OUTPUT_FOLDER / "annotated_pdfs"

# Make sure each output directory exists (no error if it already does).
for _out_dir in (OUTPUT_IMAGE_FOLDER, OUTPUT_JSON_FOLDER, ANNOTATED_PDF_FOLDER):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Load the detector and keep its class-id -> class-name mapping for the JSON output.
model = YOLO(str(MODEL_PATH))
CLASS_NAMES = model.names
print("Loaded model with classes:", CLASS_NAMES)

# ----------------------------
# Helpers
# ----------------------------
def pdf_to_images(pdf_path: Path, zoom: float = 1.0):
    """Render every page of a PDF to a BGR image (OpenCV channel order).

    Args:
        pdf_path: Path of the PDF file to open.
        zoom: Scale factor applied to both axes when rasterising a page.

    Returns:
        A list of ``(page_index, image)`` tuples in page order, where
        ``page_index`` is 0-based and ``image`` is a 3-channel uint8 array.

    Any exception raised while opening or rendering propagates to the caller;
    the document is closed in either case.
    """
    # The render matrix does not depend on the page, so build it once.
    mat = fitz.Matrix(zoom, zoom)
    images = []
    doc = fitz.open(str(pdf_path))
    try:
        for page_idx in range(len(doc)):
            page = doc.load_page(page_idx)
            pix = page.get_pixmap(matrix=mat)
            # Interpret the raw sample buffer as (rows, cols, channels).
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
            if pix.n == 4:
                # PyMuPDF stores samples as R, G, B, A, so the conversion must
                # swap R and B as well as drop alpha (BGRA2BGR would only drop
                # alpha and leave the colours swapped).
                img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)
            elif pix.n == 1:
                # Single-channel (grayscale) pixmap: expand to three channels.
                img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
            else:
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
            images.append((page_idx, img))
    finally:
        # Release the document even if a page fails to render.
        doc.close()
    return images

def preprocess_image(img_bgr, max_width=2000):
    """Shrink an image whose width exceeds ``max_width``, keeping its aspect ratio.

    Images that are not wider than ``max_width`` are returned unchanged (the
    very same object). Wider images are resized with area interpolation so
    that the new width is ``int(width * max_width / width)``.
    """
    height, width = img_bgr.shape[:2]
    needs_downscale = width > max_width
    if not needs_downscale:
        return img_bgr

    ratio = max_width / width
    target_size = (int(width * ratio), int(height * ratio))  # (width, height) order
    return cv2.resize(img_bgr, target_size, interpolation=cv2.INTER_AREA)

# ----------------------------
# Global JSON containers
# ----------------------------
selected = {}   # pdf_name -> page_k -> {annotations, page_size}
masked   = {}   # pdf_name -> page_k -> {annotations, page_size}


def _annotation_entry(ann_id, category, x, y, width, height, confidence):
    """Build one single-key annotation record: {ann_id: {category, bbox, area, confidence}}."""
    return {
        ann_id: {
            "category": category,
            "bbox": {
                "x": x,
                "y": y,
                "width": width,
                "height": height
            },
            "area": width * height,
            "confidence": confidence,
        }
    }


# ----------------------------
# Main loop over all PDFs
# ----------------------------
pdf_files = sorted(PDF_DIR.glob("*.pdf"))
print(f"Found {len(pdf_files)} PDFs in {PDF_DIR}")

for pdf_path in pdf_files:
    print("=" * 60)
    print("Processing PDF:", pdf_path.name)

    pdf_key = pdf_path.name
    pages = pdf_to_images(pdf_path, zoom=1.0)

    annotated_image_paths = []  # reset for each pdf

    for (page_idx, img_bgr) in pages:
        img_proc = preprocess_image(img_bgr, max_width=2000)
        # predict() returns a list of results; a single image yields one entry.
        results  = model.predict(
            source=img_proc,
            imgsz=1024,
            conf=0.25,
            iou=0.45,
            verbose=False
        )[0]

        page_key  = f"page_{page_idx+1}"
        page_size = {"width": img_bgr.shape[1], "height": img_bgr.shape[0]}

        # Detections are expressed in img_proc pixels, but page_size describes
        # img_bgr. preprocess_image may have downscaled the page, so map the
        # boxes back to the original page space. Both factors are exactly 1.0
        # when no resize happened, leaving the coordinates untouched.
        scale_x = img_bgr.shape[1] / img_proc.shape[1]
        scale_y = img_bgr.shape[0] / img_proc.shape[0]

        sel_ann  = []
        mask_ann = []

        if results.boxes is not None:
            boxes   = results.boxes.xyxy.cpu().numpy()
            cls_ids = results.boxes.cls.cpu().numpy().astype(int)
            confs   = results.boxes.conf.cpu().numpy()

            for i, (box, cls_id, conf) in enumerate(zip(boxes, cls_ids, confs)):
                x1, y1, x2, y2 = box.tolist()
                cls_id = int(cls_id)  # plain int for dict lookup and label text
                x      = float(x1) * scale_x
                y      = float(y1) * scale_y
                width  = float(x2 - x1) * scale_x
                height = float(y2 - y1) * scale_y
                confidence = float(conf)

                # "Selected" record: real class name, ids start at page_idx*1000 + 1.
                sel_ann.append(_annotation_entry(
                    f"annotation_{page_idx*1000 + i + 1}",
                    CLASS_NAMES[cls_id],
                    x, y, width, height, confidence,
                ))

                # "Masked" record: generic label_<n> category, ids offset by 10000.
                mask_ann.append(_annotation_entry(
                    f"annotation_{page_idx*1000 + i + 10000}",
                    f"label_{cls_id+1}",  # simple label mapping
                    x, y, width, height, confidence,
                ))

        # Attach to global JSON structures
        selected.setdefault(pdf_key, {})[page_key] = {
            "annotations": sel_ann,
            "page_size": page_size
        }
        masked.setdefault(pdf_key, {})[page_key] = {
            "annotations": mask_ann,
            "page_size": page_size
        }

        # Save annotated page image
        vis = results.plot()
        out_img_path = OUTPUT_IMAGE_FOLDER / f"{pdf_path.stem}_page{page_idx+1}.jpg"
        # cv2.imwrite reports failure through its return value instead of
        # raising; fail loudly so a stale or missing file is never merged.
        if not cv2.imwrite(str(out_img_path), vis):
            raise OSError(f"Failed to write annotated image: {out_img_path}")
        annotated_image_paths.append(out_img_path)

        print(f"  Page {page_idx+1}: {len(sel_ann)} detections, saved image: {out_img_path}")

    # Merge this PDF's page images into a single annotated PDF
    if annotated_image_paths:
        pdf_save_path = ANNOTATED_PDF_FOLDER / f"{pdf_path.stem}_annotated.pdf"
        page_images = []
        try:
            for img_path in annotated_image_paths:
                page_images.append(Image.open(str(img_path)))
            page_images[0].save(str(pdf_save_path), save_all=True, append_images=page_images[1:])
        finally:
            # Image.open keeps the file open; close every handle once the PDF
            # is written (or if opening/saving failed part-way).
            for page_image in page_images:
                page_image.close()
        print("  Saved annotated PDF:", pdf_save_path)
    else:
        print("  No pages/images to merge for:", pdf_path.name)

# ----------------------------
# Save global JSONs for ALL PDFs
# ----------------------------
# Write both global annotation dictionaries as UTF-8 JSON; non-ASCII PDF names
# are kept readable (ensure_ascii=False) and the files are indented by 2 spaces.
for json_name, payload in (
    ("output_selected_annotations.json", selected),
    ("output_masked_annotations.json", masked),
):
    with open(OUTPUT_JSON_FOLDER / json_name, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)

print("=" * 60)
print("Saved JSON files to:", OUTPUT_JSON_FOLDER)
print("Annotated PDFs saved to:", ANNOTATED_PDF_FOLDER)


Loaded model with classes: {0: 'signature', 1: 'stamp', 2: 'qr'}
Found 13 PDFs in /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/initial/test-20251116T042326Z-1-001/test
Processing PDF: Паспортпроекта-2.pdf
  Page 1: 0 detections, saved image: /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/initial/test-20251116T042326Z-1-001/output_images/Паспортпроекта-2_page1.jpg
  Page 2: 1 detections, saved image: /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/initial/test-20251116T042326Z-1-001/output_images/Паспортпроекта-2_page2.jpg
  Page 3: 0 detections, saved image: /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/initial/test-20251116T042326Z-1-001/output_images/Паспортпроекта-2_page3.jpg
  Page 4: 0 detections, saved image: /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/initial/test-20251116T042326Z-1-001/output_images/Паспортпроекта-2_page4.jpg
  Page 5: 2 detections, saved image: /home/gpuhead-1/Desktop/Makhmud/AISEC_hackathon/data/initial/test-20251116T042326Z