# Bounding-Box Highlight Workbench (PDF + JSON)

This notebook lets you **visualize and validate** bounding boxes for your extracted fields and table cells.

**What it does**
- Scans a root directory where each document lives in its own folder, e.g.:
  - `ROOT/<doc_folder>/<doc_folder>.pdf`
  - `ROOT/<doc_folder>/<doc_folder>.json`
  - (optionally) `<doc_folder>.ocr`, `<doc_folder>.metadata`
- Loads the PDF, renders each page as an image (via PyMuPDF).
- Parses your JSON with fields/tables using either **normalized** `[0..1]` or **pixel** coordinates.
- Draws **polygons** (if any) and **rectangles** for each span.
- Handles **multi-span** fields and **table cells** (with header styling).
- Exports **annotated PNGs** to an output folder (one sub-folder per document). A combined **annotated PDF** is an optional extra; note that the cells shown here only write the PNGs.

> ⚠️ You need **PyMuPDF (`pip install pymupdf`)**, **matplotlib**, **Pillow**, and **tqdm** in this environment (see the commented `%pip install` line in the next cell).

In [ ]:
# %pip install pymupdf matplotlib pillow tqdm

In [ ]:
from dataclasses import dataclass
from typing import List, Dict, Any, Optional, Tuple
import os, json, math
from pathlib import Path

import fitz  # PyMuPDF
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle, Polygon
from PIL import Image

from tqdm import tqdm

# Input root scanned by find_doc_folders(): each document is expected at
# ROOT/<name>/<name>.pdf with its annotations at ROOT/<name>/<name>.json.
ROOT = Path("./docs")  # <-- CHANGE to your directory
# Output root; annotate_doc() writes PNGs into OUT/<name>/.
OUT = Path("./annotated_out")
# Side effect when this cell runs: the output directory is created if missing.
OUT.mkdir(parents=True, exist_ok=True)

# Zoom factor handed to fitz.Matrix when rasterizing pages (default of run_all()).
RENDER_SCALE = 2.0
# NOTE(review): not referenced by any code visible in this notebook.
RECT_ROUND = 2
# Opacity applied to field polygons and rectangles in draw_field().
FIELD_ALPHA = 0.15
# Line width of field polygons and rectangles in draw_field().
FIELD_STROKE_W = 1.2
# Colour used for table outlines, cell borders and header fills in draw_table();
# presumably an RGB triple with components in 0..1.
TABLE_COLOR = (0.0, 0.7, 0.5)
# Opacity of header cells in draw_table(); non-header cells use alpha 0.0.
HEADER_ALPHA = 0.08
# When True and a span carries a "poly", draw_field() draws the polygon in
# addition to the rectangle (the rectangle is always drawn).
POLY_OVER_RECT = True
# NOTE(review): not referenced by any code visible in this notebook; all values
# are None. Looks like a placeholder for per-type field colours - confirm.
FIELD_COLOR_MAP = {"date": None, "amount": None, "id": None, "string": None}

In [ ]:
def ensure_page_geom(pdf_page, scale: float) -> Tuple[int, int]:
    """Return the ``(width, height)`` of ``pdf_page`` rendered at ``scale``.

    The page is rasterized through ``get_pixmap`` with a uniform zoom matrix
    and the dimensions of the resulting pixmap are reported.

    Args:
        pdf_page: Page object exposing ``get_pixmap(matrix=...)``.
        scale: Zoom factor applied to both axes.

    Returns:
        The pixmap's ``width`` and ``height`` as a 2-tuple.
    """
    zoom = fitz.Matrix(scale, scale)
    rendered = pdf_page.get_pixmap(matrix=zoom)
    return (rendered.width, rendered.height)

def load_json(json_path: Path) -> Dict[str, Any]:
    """Read the UTF-8 encoded JSON file at ``json_path`` and return its content.

    Args:
        json_path: Location of the JSON file to parse.

    Returns:
        Whatever ``json`` decodes from the file's text.
    """
    with open(json_path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def to_px_rect(bbox: Dict[str, Any], page_w: float, page_h: float):
    """Convert a bbox dict into an ``(x, y, w, h)`` tuple in page pixels.

    The coordinate space is read from ``"coordSpace"`` (or ``"coord_space"``)
    and defaults to ``"pixels"``. Any value starting with ``"norm"``
    (case-insensitive) means the numbers are fractions of the page size and
    are scaled by ``page_w`` / ``page_h``; otherwise they are returned as-is.

    Args:
        bbox: Mapping with the keys ``"x"``, ``"y"``, ``"w"`` and ``"h"``.
        page_w: Page width used to scale normalized x / w values.
        page_h: Page height used to scale normalized y / h values.

    Returns:
        A 4-tuple ``(x, y, w, h)``.

    Raises:
        KeyError: If any of the four geometry keys is missing.
    """
    space = (bbox.get("coordSpace") or bbox.get("coord_space") or "pixels").lower()
    left, top, width, height = (bbox[key] for key in ("x", "y", "w", "h"))
    if not space.startswith("norm"):
        return left, top, width, height
    return left * page_w, top * page_h, width * page_w, height * page_h

def to_px_poly(bbox: Dict[str, Any], page_w: float, page_h: float):
    """Convert a bbox's optional ``"poly"`` into a list of ``(x, y)`` pixel points.

    The coordinate space is resolved the same way as for rectangles: read from
    ``"coordSpace"`` / ``"coord_space"``, defaulting to ``"pixels"``; values
    starting with ``"norm"`` (case-insensitive) are scaled by the page size.

    Args:
        bbox: Mapping that may hold ``"poly"``, a sequence of ``{"x", "y"}`` dicts.
        page_w: Page width used to scale normalized x values.
        page_h: Page height used to scale normalized y values.

    Returns:
        A list of ``(x, y)`` tuples, or ``None`` when ``"poly"`` is absent or empty.
    """
    vertices = bbox.get("poly")
    if not vertices:
        return None

    space = (bbox.get("coordSpace") or bbox.get("coord_space") or "pixels").lower()
    is_normalized = space.startswith("norm")

    points = []
    for vertex in vertices:
        if is_normalized:
            points.append((vertex["x"] * page_w, vertex["y"] * page_h))
        else:
            points.append((vertex["x"], vertex["y"]))
    return points

def draw_field(ax, span, page_w, page_h, color=None, hovered=False):
    """Overlay one field span on ``ax``.

    The span's bbox is always drawn as a rectangle. If the bbox also carries
    a polygon and ``POLY_OVER_RECT`` is enabled, the polygon is added first,
    so the rectangle ends up on top of it.

    Args:
        ax: Axes-like object; only ``add_patch`` is used.
        span: Mapping with a ``"bbox"`` entry.
        page_w: Rendered page width in pixels.
        page_h: Rendered page height in pixels.
        color: Passed unchanged as both edge and face colour of the patches.
        hovered: Accepted but not used by this function.
    """
    box = span["bbox"]
    left, top, width, height = to_px_rect(box, page_w, page_h)
    outline = to_px_poly(box, page_w, page_h)

    if POLY_OVER_RECT and outline:
        ax.add_patch(
            Polygon(
                outline,
                closed=True,
                fill=True,
                alpha=FIELD_ALPHA,
                edgecolor=color,
                linewidth=FIELD_STROKE_W,
                facecolor=color,
            )
        )

    ax.add_patch(
        Rectangle(
            (left, top),
            width,
            height,
            linewidth=FIELD_STROKE_W,
            edgecolor=color,
            facecolor=color,
            alpha=FIELD_ALPHA,
        )
    )

def draw_table(ax, table, page_w, page_h):
    """Overlay a table outline and its cells on ``ax``.

    The table's own bbox is drawn as an unfilled rectangle. Each cell's
    ``span.bbox`` is drawn as a rectangle as well; header cells
    (``cell["header"]`` truthy) are filled with ``HEADER_ALPHA`` opacity and
    use a dashed border, other cells are unfilled with a solid border.

    A table or cell whose bbox is missing, ``None`` or empty is skipped
    instead of raising, so one incomplete entry does not abort the page.
    A bbox that is present but lacks ``x``/``y``/``w``/``h`` still raises
    ``KeyError`` from ``to_px_rect``.

    Args:
        ax: Axes-like object; only ``add_patch`` is used.
        table: Mapping with optional ``"bbox"`` and ``"cells"`` entries.
        page_w: Rendered page width in pixels.
        page_h: Rendered page height in pixels.
    """
    table_bbox = table.get("bbox") or {}
    if table_bbox:
        x, y, w, h = to_px_rect(table_bbox, page_w, page_h)
        ax.add_patch(
            Rectangle((x, y), w, h, fill=False, edgecolor=TABLE_COLOR, linewidth=1.0)
        )

    # `or []` / `or {}` also cover explicit JSON nulls, not just absent keys.
    for cell in table.get("cells") or []:
        cell_bbox = (cell.get("span") or {}).get("bbox") or {}
        if not cell_bbox:
            # No geometry for this cell: nothing to draw.
            continue
        cx, cy, cw, ch = to_px_rect(cell_bbox, page_w, page_h)
        is_header = bool(cell.get("header", False))
        ax.add_patch(
            Rectangle(
                (cx, cy),
                cw,
                ch,
                fill=is_header,
                facecolor=TABLE_COLOR,
                alpha=HEADER_ALPHA if is_header else 0.0,
                edgecolor=TABLE_COLOR,
                linewidth=1.0,
                linestyle="--" if is_header else "-",
            )
        )

In [ ]:
def find_doc_folders(root: Path):
    """Yield ``(folder, pdf_path, json_path)`` for each document folder in ``root``.

    A direct sub-directory counts as a document folder when it contains both
    ``<folder name>.pdf`` and ``<folder name>.json``. Other entries are
    ignored. Results follow the order of ``root.iterdir()``.

    Args:
        root: Directory whose immediate children are inspected.

    Yields:
        3-tuples of ``Path`` objects: the folder, its PDF and its JSON.
    """
    for entry in root.iterdir():
        if not entry.is_dir():
            continue
        stem = entry.name
        pdf_path = entry / (stem + ".pdf")
        json_path = entry / (stem + ".json")
        if not (pdf_path.exists() and json_path.exists()):
            continue
        yield entry, pdf_path, json_path

def annotate_doc(doc_folder, pdf_path, json_path, out_dir, scale=2.0):
    """Render every page of one document, overlay its boxes, and save PNGs.

    For each page the PDF is rasterized at ``scale``, the field spans and
    tables whose ``"page"`` equals the 1-based page number are drawn on top,
    and the result is written to
    ``out_dir/<folder name>/<folder name>_page_NNN.png``.

    Args:
        doc_folder: Document folder; its ``name`` is used for output naming.
        pdf_path: PDF file to render.
        json_path: JSON file with optional ``"fields"`` (mapping of objects
            holding ``"spans"``) and ``"tables"`` (list) entries.
        out_dir: Root output directory; a per-document sub-folder is created.
        scale: Zoom factor applied to both axes when rasterizing.
    """
    data = load_json(json_path)
    # `or` also covers explicit JSON nulls, for fields as well as tables.
    fields = data.get("fields") or {}
    tables = data.get("tables") or []

    # The context manager releases the PDF handle even if a page fails.
    with fitz.open(pdf_path) as doc:
        doc_out_dir = out_dir / doc_folder.name
        doc_out_dir.mkdir(parents=True, exist_ok=True)

        for i, page in enumerate(doc):
            page_no = i + 1  # "page" values in the JSON are compared 1-based
            pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)
            page_w, page_h = pix.width, pix.height
            fig, ax = plt.subplots(figsize=(page_w/96, page_h/96), dpi=96)
            try:
                ax.imshow(Image.frombytes("RGB", [pix.width, pix.height], pix.samples))
                ax.set_axis_off()

                for fobj in fields.values():
                    for s in fobj.get("spans", []):
                        if int(s.get("page", -1)) == page_no:
                            draw_field(ax, s, page_w, page_h)

                for t in tables:
                    if int(t.get("page", -1)) == page_no:
                        draw_table(ax, t, page_w, page_h)

                out_png = doc_out_dir / f"{doc_folder.name}_page_{page_no:03d}.png"
                fig.savefig(out_png, bbox_inches='tight', pad_inches=0)
            finally:
                # Always free the figure; pyplot otherwise keeps it alive,
                # which adds up quickly in a batch run.
                plt.close(fig)
    print(f"Annotated {doc_folder.name}")

def run_all(root=ROOT, out_dir=OUT, scale=RENDER_SCALE):
    """Annotate every document folder found under ``root``.

    The folder list is collected up front so the progress bar knows the
    total, then each document is passed to ``annotate_doc``.

    Args:
        root: Directory scanned by ``find_doc_folders``.
        out_dir: Root output directory handed to ``annotate_doc``.
        scale: Render zoom factor handed to ``annotate_doc``.
    """
    jobs = list(find_doc_folders(root))
    for job in tqdm(jobs):
        doc_folder, pdf_path, json_path = job
        annotate_doc(doc_folder, pdf_path, json_path, out_dir, scale)
    print('Done.')

In [ ]:
# Run batch: annotate every document found under ROOT and write the PNGs
# under OUT (run_all() is called with its defaults).
run_all()

In [ ]:
def show_single_page(doc_folder: Path, page_index_zero_based=0, scale=2.0):
    """Render one page of a document with its boxes and display it inline.

    The PDF and JSON are looked up as ``<folder>/<folder name>.pdf`` and
    ``<folder>/<folder name>.json``. Field spans and tables whose ``"page"``
    equals the 1-based number of the selected page are drawn on top of the
    rendered image, and the figure is shown with ``plt.show()``.

    Args:
        doc_folder: Document folder containing the PDF and JSON.
        page_index_zero_based: Zero-based page index used to index the
            document. The annotation filter uses the resolved page's own
            number, so a negative index matches the page actually rendered.
        scale: Zoom factor applied to both axes when rasterizing.
    """
    pdf = doc_folder / f"{doc_folder.name}.pdf"
    jsn = doc_folder / f"{doc_folder.name}.json"
    data = load_json(jsn)

    # Only the rasterized image and the page number are needed afterwards, so
    # the PDF handle is released as soon as the page has been rendered.
    with fitz.open(pdf) as doc:
        page = doc[page_index_zero_based]
        page_no = page.number + 1  # "page" values in the JSON are compared 1-based
        pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)
        page_w, page_h = pix.width, pix.height
        page_image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    fig, ax = plt.subplots(figsize=(page_w/96, page_h/96), dpi=96)
    ax.imshow(page_image)
    ax.set_axis_off()
    # `or` also covers explicit JSON nulls, for fields as well as tables.
    for fobj in (data.get("fields") or {}).values():
        for s in fobj.get("spans", []):
            if int(s.get("page", -1)) == page_no:
                draw_field(ax, s, page_w, page_h)
    for t in data.get("tables") or []:
        if int(t.get("page", -1)) == page_no:
            draw_table(ax, t, page_w, page_h)
    plt.show()

# Example:
# show_single_page(Path('./docs/FDS_001'), page_index_zero_based=0)