# 01 — Extract Blocks (Native + OCR)

Extracts page-level native-text, OCR, and merged blocks from a PDF and writes them as per-page JSON artifacts.
**Papermill parameters:** `pdf_path`, `ocr_lang`, `merge_strategy`, `output_dir`, `dpi`.

In [None]:
# --- CONFIG ---
# Every setting is assigned exactly once. Previously `ocr_lang` and `dpi` were repeated
# further down, so editing the first copy had no effect because the later one won.
pdf_path = "input_pdfs/ET1-Adobe Scan 10 Sept 2025.pdf"
output_dir = "outputs/run_001/01_blocks"
ocr_lang = "en"                # one of 'en','hi','te','mr','ta'; any other value falls back to English
merge_strategy = "hybrid"
dpi = 300                      # rasterization dpi for OCR

# --- OCR engines ---
use_easyocr     = True
use_tesseract   = True
use_trocr       = False
use_paddle      = True
use_surya       = True

# Paddle tuning (safe defaults)
paddle_det_model    = "DB"      # detector
paddle_rec_model    = "SVTR_LCNet" # recognizer (good for Latin+Indic)
paddle_use_anglecls = True
paddle_gpu          = False     # True if you have set up CUDA for Paddle

# --- Preprocessing ---
mask_banners    = True         # neutralize big colored headers/footers before OCR
banner_top_pct  = 0.18         # top colored band height fraction
banner_bot_pct  = 0.20         # bottom colored band height fraction

# --- Adaptive OCR knobs ---
FAST_DPI        = 200
FAST_MAX_SIDE   = 1800
HEAVY_DPI       = 300
HEAVY_MAX_SIDE  = 3000

NATIVE_STRONG_CHARS = 200   # if native >= this, skip heavy OCR
FAST_MIN_CHARS      = 400   # if fast OCR < this → escalate
FAST_MIN_MEANCONF   = 0.70
FAST_MIN_LINES      = 10

GRID_RESCUE         = True  # try heavy OCR on sparse cells
GRID_N              = 3     # 3x3
CELL_MIN_CHARS      = 60    # if a cell has < this, try heavy on that cell

# --- Merge / Filtering ---
native_len_gate = 100          # if native chars < this, prefer OCR
min_conf        = 0.50         # drop OCR fragments below this conf (unless no alternative)
line_join_px    = 14           # y-gap threshold (screen px) to join into lines
para_join_px    = 26           # y-gap threshold to join lines into paragraphs
dedup_iou_thr   = 0.50         # bbox IoU threshold to consider same region
dedup_sim_thr   = 0.92         # text similarity threshold (Levenshtein) to dedup

# --- Visualization ---
make_viz_png    = True         # dump quick overlay PNGs (for debugging)



In [None]:
from PIL import Image
import numpy as np
from difflib import SequenceMatcher

def clamp_long_side(pil: Image.Image, max_side:int) -> Image.Image:
    """Downscale ``pil`` so that its longer side is at most ``max_side`` pixels.

    The aspect ratio is preserved (both sides are multiplied by the same factor and
    truncated to int). Images that already fit are returned unchanged, as the same object.
    """
    width, height = pil.size
    longest = max(width, height)
    if longest > max_side:
        factor = max_side / longest
        return pil.resize((int(width * factor), int(height * factor)))
    return pil

def page_metrics(blocks):
    """Summarize a page's blocks as ``(total_chars, mean_confidence, block_count)``.

    Only blocks that carry a "confidence" key contribute to the mean; when none do
    (or when ``blocks`` is empty) the mean is reported as 0.0.
    """
    if not blocks:
        return 0, 0.0, 0

    total_chars = 0
    conf_values = []
    for blk in blocks:
        total_chars += len(blk.get("text", ""))
        if "confidence" in blk:
            conf_values.append(float(blk.get("confidence", 0.0)))

    if conf_values:
        mean_conf = sum(conf_values) / len(conf_values)
    else:
        mean_conf = 0.0
    return total_chars, mean_conf, len(blocks)

def need_escalation(native_chars:int, fast_chars:int, fast_mean_conf:float, fast_lines:int)->bool:
    """Decide whether the fast OCR pass should be followed by a heavy pass.

    Never escalates when the native text layer already has at least
    NATIVE_STRONG_CHARS characters. Otherwise escalates as soon as any fast-pass
    metric (chars, mean confidence, line count) is below its FAST_MIN_* threshold.
    """
    if native_chars >= NATIVE_STRONG_CHARS:
        return False
    if fast_chars < FAST_MIN_CHARS:
        return True
    if fast_mean_conf < FAST_MIN_MEANCONF:
        return True
    return fast_lines < FAST_MIN_LINES

def split_grid(pil: Image.Image, n:int):
    """Cut ``pil`` into an ``n`` x ``n`` grid, row by row.

    Returns a list of ``((x0, y0, x1, y1), cropped_image)`` tuples. Cell size is the
    integer quotient of the image size by ``n``; the last column/row absorbs the
    remainder so the grid always reaches the right and bottom edges.
    """
    width, height = pil.size
    cell_w, cell_h = width // n, height // n
    # Edge coordinates: n regular starts plus the true image border as the final edge.
    x_edges = [col * cell_w for col in range(n)] + [width]
    y_edges = [row * cell_h for row in range(n)] + [height]

    cells = []
    for row in range(n):
        for col in range(n):
            box = (x_edges[col], y_edges[row], x_edges[col + 1], y_edges[row + 1])
            cells.append((box, pil.crop(box)))
    return cells

def blocks_in_cell(blocks, x0,y0,x1,y1):
    """Return the blocks whose bbox overlaps the rectangle (x0, y0, x1, y1).

    Overlap is strict: a block that merely touches the cell along an edge is excluded.
    Input order is preserved and the block dicts are returned as-is (not copied).
    """
    def overlaps(bbox):
        left, top, right, bottom = bbox
        return right > x0 and bottom > y0 and left < x1 and top < y1

    return [blk for blk in blocks if overlaps(blk["bbox"])]

def translate_blocks(blocks, dx, dy, source_suffix=None):
    """Return shallow copies of ``blocks`` with every bbox shifted by (dx, dy).

    When ``source_suffix`` is truthy it is appended to each copy's "source" value
    (a missing source is treated as an empty string). The input dicts are not modified.
    """
    shifted = []
    for blk in blocks:
        left, top, right, bottom = blk["bbox"]
        moved = {**blk, "bbox": [left + dx, top + dy, right + dx, bottom + dy]}
        if source_suffix:
            moved["source"] = f"{blk.get('source','')}{source_suffix}"
        shifted.append(moved)
    return shifted


In [None]:
# Module-level caches for the two PaddleOCR instances. They start empty and are filled
# by get_paddle_fast() / get_paddle_heavy() on first successful initialization.
_paddle_fast = None
_paddle_heavy = None

def build_paddle(langs, heavy=False):
    """Create a PaddleOCR instance using only keyword arguments the installed version accepts.

    The language is the first entry of ``langs`` that is one of en/hi/te/mr/ta
    (default "en"). When the constructor exposes ``det_model`` / ``rec_model``, the
    detector is the server model for ``heavy=True`` and the mobile model otherwise;
    the recognizer is always the mobile model. Raises whatever the paddleocr import
    or constructor raises.
    """
    import inspect
    from paddleocr import PaddleOCR

    supported = {"en","hi","te","mr","ta"}
    paddle_lang = "en"
    for code in langs:
        if code in supported:
            paddle_lang = code
            break

    # Introspect the constructor so unknown kwargs are never passed.
    accepted = inspect.signature(PaddleOCR).parameters
    kw = {"lang": paddle_lang}
    if "use_angle_cls" in accepted:
        kw["use_angle_cls"] = True
    if "use_doc_orientation_classify" in accepted:
        # Newer (3.x) document-preprocessing switches: all turned off.
        kw["use_doc_orientation_classify"] = False
        kw["use_doc_unwarping"] = False
        kw["use_textline_orientation"] = False
    if "det_model" in accepted:
        det_size = "server" if heavy else "mobile"
        kw["det_model"] = f"PP-OCRv5_{det_size}_det"
    if "rec_model" in accepted:
        kw["rec_model"] = f"{paddle_lang}_PP-OCRv5_mobile_rec"

    return PaddleOCR(**kw)

def get_paddle_fast(langs):
    """Return the cached fast PaddleOCR instance, building it on first use.

    Returns None when initialization fails; the failure is printed and not cached,
    so the next call attempts initialization again.
    """
    global _paddle_fast
    if _paddle_fast is not None:
        return _paddle_fast
    try:
        _paddle_fast = build_paddle(langs, heavy=False)
    except Exception as e:
        print("[WARN] Paddle FAST init failed:", e)
        _paddle_fast = None
    return _paddle_fast

def get_paddle_heavy(langs):
    """Return the cached heavy PaddleOCR instance, building it on first use.

    If the heavy build fails, the failure is printed and the fast instance from
    get_paddle_fast() (which may itself be None) is cached and returned instead.
    """
    global _paddle_heavy
    if _paddle_heavy is not None:
        return _paddle_heavy
    try:
        _paddle_heavy = build_paddle(langs, heavy=True)
    except Exception as e:
        print("[WARN] Paddle HEAVY init failed, falling back to FAST:", e)
        _paddle_heavy = get_paddle_fast(langs)
    return _paddle_heavy


In [None]:
# ## 1) Imports & Setup

# %%
import io, os, json, re, math, traceback
import numpy as np
from typing import List, Dict, Any, Tuple
from pathlib import Path

# Core
import fitz  # PyMuPDF
from PIL import Image, ImageDraw
import easyocr
import pytesseract
import warnings
warnings.filterwarnings("ignore", message="'pin_memory' argument")

from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import torch

# Optional TrOCR (handwriting)
# Both handles stay None unless `use_trocr` is set and the checkpoint loads; on any
# load failure TrOCR is switched off for the rest of the run.
trocr_processor = None
trocr_model = None
if use_trocr:
    try:
        trocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
        trocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
        trocr_model.eval()
        print("[INFO] TrOCR initialized.")
    except Exception as e:
        print("[WARN] TrOCR init failed, disabling handwriting OCR:", e)
        trocr_processor = None
        trocr_model = None
        use_trocr = False

# Fuzzy similarity (for dedup)
# Prefer rapidfuzz; if it cannot be imported, define a stdlib stand-in on the same 0-100 scale.
try:
    from rapidfuzz.fuzz import ratio as fuzz_ratio
except Exception:
    def fuzz_ratio(a,b):
        """Similarity of ``a`` and ``b`` as an integer percentage; 0 if the comparison fails."""
        try:
            from difflib import SequenceMatcher
            similarity = SequenceMatcher(None, a, b).ratio()
            return int(100 * similarity)
        except Exception:
            return 0

# Paths
# Turn the configured PDF path and output directory into absolute Path objects and
# create the output directory (with parents) before anything is written to it.
run_root = output_dir
pdf_path = Path(pdf_path).expanduser().resolve()  # rebinds the configured value to a Path
out_dir  = Path(run_root).expanduser().resolve()
out_dir.mkdir(parents=True, exist_ok=True)

print("[INFO] PDF:", pdf_path)
print("[INFO] Out:", out_dir)

# Language map
# Maps a single `ocr_lang` code to the language list used for OCR; every non-English
# entry also includes "en".
LANG_MAP = {
    "en": ["en"],
    "hi": ["hi","en"],
    "te": ["te","en"],
    "mr": ["mr","en"],
    "ta": ["ta","en"]
}
# The lookup is case-insensitive; any code that is not a key above falls back to English only.
langs = LANG_MAP.get(ocr_lang.lower(), ["en"])

# %% [markdown]
# ## 2) Utilities

# %%
def page_to_image(doc, page_index: int, dpi: int=300) -> Image.Image:
    """Render page ``page_index`` of ``doc`` to an RGB PIL image.

    The page is rasterized without an alpha channel using a uniform zoom of ``dpi / 72``.
    """
    scale = dpi / 72
    pixmap = doc[page_index].get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)
    return Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)

def mask_bands(pil: Image.Image, top_pct: float, bot_pct: float) -> Image.Image:
    """Paint the top and bottom bands of a page light grey to suppress banner noise in OCR.

    ``top_pct`` / ``bot_pct`` are fractions of the image height, each clamped to
    [0, 0.45]. Returns the input unchanged when the global ``mask_banners`` switch is
    off; otherwise returns a painted copy and leaves the input untouched.
    """
    if not mask_banners:
        return pil

    width, height = pil.size

    def band_height(fraction):
        return int(height * max(0, min(0.45, fraction)))

    top_h = band_height(top_pct)
    bot_h = band_height(bot_pct)

    masked = pil.copy()
    painter = ImageDraw.Draw(masked)
    fill = (240,240,240)  # fixed light-grey fill (not sampled from the page)
    if top_h > 0:
        painter.rectangle([0, 0, width, top_h], fill=fill)
    if bot_h > 0:
        painter.rectangle([0, height - bot_h, width, height], fill=fill)
    return masked

def blocks_sort_key(b):
    """Sort key for reading order: top edge first, then left edge, each rounded to 0.1."""
    left, top = b["bbox"][0], b["bbox"][1]
    return (round(top, 1), round(left, 1))

def iou(a, b) -> float:
    """Intersection-over-union of two ``(x0, y0, x1, y1)`` boxes.

    Returns 0.0 when the boxes do not overlap with positive area. The union is
    floored at 1e-6 to avoid dividing by zero.
    """
    ax0, ay0, ax1, ay1 = a
    bx0, by0, bx1, by1 = b

    overlap_w = max(0.0, min(ax1, bx1) - max(ax0, bx0))
    overlap_h = max(0.0, min(ay1, by1) - max(ay0, by0))
    inter = overlap_w * overlap_h
    if inter <= 0:
        return 0.0

    union = (ax1 - ax0) * (ay1 - ay0) + (bx1 - bx0) * (by1 - by0) - inter
    return inter / max(1e-6, union)

def section_of(bbox, page_h):
    """Classify a box as "header", "footer" or "body" from its vertical center.

    Centers above 25% of ``page_h`` are headers, centers below 85% are footers,
    everything else is body.
    """
    mid_y = 0.5 * (bbox[1] + bbox[3])
    if mid_y < 0.25 * page_h:
        return "header"
    return "footer" if mid_y > 0.85 * page_h else "body"


# --- Lazy singletons for OCR engines ---
# Filled on first use by get_easyocr() / get_paddleocr() so each engine is loaded once.
_easy_reader = None
_paddle_ocr  = None

def get_easyocr(langs):
    """Return the shared CPU-only EasyOCR reader, creating it for ``langs`` on first call.

    Later calls return the cached reader regardless of the ``langs`` passed.
    """
    global _easy_reader
    if _easy_reader is not None:
        return _easy_reader
    _easy_reader = easyocr.Reader(langs, gpu=False)
    return _easy_reader

def _map_langs_to_paddle(langs):
    """
    Pick the PaddleOCR language tag for a list of language codes.

    Returns the first entry of ``langs`` that is one of 'en','hi','te','mr','ta',
    or "en" when none matches. Only one tag is returned because a PaddleOCR
    instance is created with a single ``lang``.
    """
    supported = {"en","hi","te","mr","ta"}
    return next((code for code in langs if code in supported), "en")
def get_paddleocr(langs):
    """Return the shared PaddleOCR instance, creating it on first use.

    Only constructor keywords that the installed PaddleOCR accepts are passed, so the
    same code works across versions. Returns None (after printing a warning) when
    paddleocr cannot be imported or the constructor fails; failures are not cached.
    """
    global _paddle_ocr
    if _paddle_ocr is not None:
        return _paddle_ocr

    try:
        from paddleocr import PaddleOCR
    except Exception as e:
        print("[WARN] PaddleOCR import failed; disabling:", e)
        return None

    import inspect

    paddle_lang = _map_langs_to_paddle(langs)

    # Introspect the constructor and pass only the switches it knows about.
    accepted = inspect.signature(PaddleOCR).parameters
    init_kwargs = {"lang": paddle_lang}
    if "use_angle_cls" in accepted:
        init_kwargs["use_angle_cls"] = True
    if "use_doc_orientation_classify" in accepted:
        init_kwargs["use_doc_orientation_classify"] = False
        init_kwargs["use_doc_unwarping"] = False
        init_kwargs["use_textline_orientation"] = False
    # Intentionally not passed: show_log / use_gpu / det_algorithm / rec_algorithm

    try:
        _paddle_ocr = PaddleOCR(**init_kwargs)
        print(f"[INFO] PaddleOCR initialized (lang={paddle_lang})")
    except Exception as e:
        print("[WARN] PaddleOCR init failed:", e)
        _paddle_ocr = None
    return _paddle_ocr



#-- surya ---

_surya = None

def get_surya():
    """Return the shared Surya reader, importing and constructing it on first call.

    Import or construction errors propagate to the caller.
    NOTE(review): confirm that the installed surya package exposes ``surya.ocr.Reader``.
    """
    global _surya
    if _surya is not None:
        return _surya
    from surya.ocr import Reader
    _surya = Reader()
    print("[INFO] Surya OCR initialized")
    return _surya

def extract_surya(pil: Image.Image) -> list[dict]:
    """Run Surya on ``pil`` and return its lines as blocks in reading order.

    Returns [] when Surya is disabled or when loading/running it raises (a warning
    is printed). Lines with empty text or confidence below ``min_conf`` are dropped.
    Assumes each result line is a mapping with "text", "confidence" and a 4-number
    "bbox" — TODO confirm against the installed Surya API.
    """
    if not use_surya:
        return []
    try:
        result = get_surya().read(np.array(pil), return_lines=True)
    except Exception as e:
        print("[WARN] Surya failed:", e)
        return []

    kept = []
    for line in result or []:
        text = (line.get("text") or "").strip()
        if not text:
            continue
        score = float(line.get("confidence", 0.0))
        if score < min_conf:
            continue
        x0, y0, x1, y1 = (float(v) for v in line.get("bbox", [0,0,0,0]))
        kept.append({"bbox":[x0,y0,x1,y1], "text":text, "confidence":score, "source":"surya"})
    kept.sort(key=blocks_sort_key)
    return kept
# --- end surya ---

# %% [markdown]
# ## 3) Extractors

# %%
def extract_native(page) -> List[Dict[str,Any]]:
    """Collect the page's embedded text blocks via PyMuPDF ``get_text("blocks")``.

    Entries with fewer than five fields or with blank text are skipped. Each result
    has a float bbox, stripped text, source "native" and confidence 1.0; the list is
    returned in reading order.
    """
    found = []
    for raw in page.get_text("blocks"):
        if len(raw) < 5:
            continue
        x0, y0, x1, y1, txt = raw[:5]
        if not txt:
            continue
        cleaned = str(txt).strip()
        if not cleaned:
            continue
        found.append({
            "bbox": [float(x0), float(y0), float(x1), float(y1)],
            "text": cleaned,
            "source": "native",
            "confidence": 1.0,
        })
    return sorted(found, key=blocks_sort_key)

def extract_easyocr(pil: Image.Image, langs: List[str]) -> List[Dict[str,Any]]:
    """Run EasyOCR in paragraph mode on ``pil`` and return blocks in reading order.

    Each detection's polygon is reduced to its axis-aligned bounding box. A detection
    that carries no confidence value is given 1.0; detections below ``min_conf`` or
    with an unreadable polygon are skipped. Returns [] when EasyOCR is disabled.
    """
    if not easyocr or not use_easyocr:
        return []

    detections = get_easyocr(langs).readtext(np.array(pil), detail=1, paragraph=True)

    blocks = []
    for det in detections:
        if not (isinstance(det, (list, tuple)) and len(det) >= 2):
            continue
        poly, text = det[0], det[1]
        conf = float(det[2]) if len(det) > 2 else 1.0
        try:
            xs = [pt[0] for pt in poly]
            ys = [pt[1] for pt in poly]
            box = [min(xs), min(ys), max(xs), max(ys)]
        except Exception:
            continue
        if conf < min_conf:
            continue
        blocks.append({"bbox":box,"text":(text or "").strip(),"confidence":conf,"source":"easyocr"})
    return sorted(blocks, key=blocks_sort_key)

def _paddle_bbox(points):
    """Axis-aligned [x0, y0, x1, y1] of plain floats enclosing a polygon or a flat 4-number box."""
    pts = np.asarray(points, dtype=float).reshape(-1, 2)
    return [float(pts[:, 0].min()), float(pts[:, 1].min()),
            float(pts[:, 0].max()), float(pts[:, 1].max())]


def _paddle_v3_items(res):
    """Yield (text, score, points) triples from a PaddleOCR 3.x result mapping."""
    inner = res.get("res")
    if isinstance(inner, dict):  # some serializations nest the payload under "res"
        res = inner
    texts = res.get("rec_texts")
    scores = res.get("rec_scores")
    polys = res.get("rec_polys")
    if polys is None:
        polys = res.get("rec_boxes")
    if texts is None or scores is None or polys is None:
        return
    yield from zip(texts, scores, polys)


def _paddle_legacy_items(batch):
    """Yield (text, score, points) triples from 2.x-style items or per-line dicts."""
    for it in batch:
        try:
            if isinstance(it, dict):
                yield it.get("text"), it.get("score", it.get("confidence", 0.0)), it.get("box") or it.get("bbox")
            else:
                poly, rec = it[0], it[1]
                yield rec[0], rec[1], poly
        except Exception:
            continue  # malformed entry: skip it, keep the rest of the page


def extract_paddleocr(ocr, pil: Image.Image) -> list[dict]:
    """Run a PaddleOCR instance on ``pil`` and return its lines as blocks in reading order.

    ``ocr`` is the engine (None yields []). The image is written to a temporary PNG in
    ``out_dir`` and passed by path to ``ocr.predict`` when available, else ``ocr.ocr``;
    the file is removed afterwards. Both result layouts are parsed: the 2.x nested
    ``[[poly, (text, score)], ...]`` lists and the 3.x per-image mappings with parallel
    ``rec_texts`` / ``rec_scores`` / ``rec_polys`` fields. Lines with empty text or a
    score below ``min_conf`` are dropped. Coordinates are converted to plain floats
    so the blocks can be serialized with ``json.dumps``.
    """
    if ocr is None:
        return []
    tmp = out_dir / "_paddle_tmp.png"
    pil.save(tmp)
    try:
        result = ocr.predict(str(tmp)) if hasattr(ocr, "predict") else ocr.ocr(str(tmp))
    except Exception as e:
        print("[WARN] Paddle predict/ocr failed:", e)
        result = None
    finally:
        try:
            tmp.unlink(missing_ok=True)
        except OSError:
            pass  # best-effort cleanup; a leftover temp file is harmless
    if not result:
        return []

    out = []
    batches = result if isinstance(result, list) else [result]
    for batch in batches:
        if batch is None:
            continue
        # 3.x returns one dict-like result per image; iterating it would walk its keys.
        if isinstance(batch, dict) and ("rec_texts" in batch or "res" in batch):
            items = _paddle_v3_items(batch)
        else:
            items = _paddle_legacy_items(batch)
        for raw_txt, raw_conf, points in items:
            try:
                txt = (raw_txt or "").strip()
                conf = float(raw_conf or 0.0)
                if not txt or points is None or conf < min_conf:
                    continue
                out.append({"bbox": _paddle_bbox(points),
                            "text": txt, "confidence": conf, "source": "paddle"})
            except Exception:
                continue  # unparseable line: skip rather than lose the page
    return sorted(out, key=blocks_sort_key)


def extract_tesseract(pil: Image.Image, langs: List[str]) -> List[Dict[str,Any]]:
    """Run Tesseract on the whole image and return at most one page-sized block.

    Language codes are mapped to Tesseract names (unknown codes become "eng") and
    joined with "+". The recognized text is returned as a single block covering the
    full image with a fixed confidence of 0.8, or [] when Tesseract is disabled or
    produces only whitespace.
    """
    if not pytesseract or not use_tesseract:
        return []

    tess_names = {"en":"eng","hi":"hin","te":"tel","mr":"mar","ta":"tam"}
    lang = "+".join(tess_names.get(code, "eng") for code in langs)
    text = pytesseract.image_to_string(pil, lang=lang, config="--oem 3 --psm 6").strip()
    if not text:
        return []

    # No layout information is kept here, so the block spans the whole page.
    width, height = pil.size
    return [{"bbox":[0,0,width,height], "text":text, "confidence":0.8, "source":"tesseract"}]

def extract_trocr(pil: Image.Image) -> List[Dict[str,Any]]:
    """Run TrOCR once over the whole image (slow, coarse handwriting OCR).

    Returns a single page-sized block with a fixed confidence of 0.75, or [] when
    TrOCR is disabled, not loaded, or decodes to whitespace only.
    """
    if not (use_trocr and trocr_model and trocr_processor):
        return []

    import torch
    with torch.no_grad():
        inputs = trocr_processor(images=pil, return_tensors="pt")
        token_ids = trocr_model.generate(inputs.pixel_values)
        decoded = trocr_processor.batch_decode(token_ids, skip_special_tokens=True)
    text = decoded[0].strip()
    if not text:
        return []

    width, height = pil.size
    return [{"bbox":[0,0,width,height], "text":text, "confidence":0.75, "source":"trocr"}]

# %% [markdown]
# ## 4) Post–processing (regroup, dedup, merge)

# %%
def _merge_fragments(items: List[Dict[str,Any]]) -> Dict[str,Any]:
    """Collapse blocks into one: union bbox, joined non-empty text, unique sources, mean confidence."""
    sources = set()
    for it in items:
        # A source may already be a "+"-joined list from an earlier merge; flatten it.
        sources.update(s for s in str(it.get("source", "")).split("+") if s)
    return {
        "bbox": [
            min(it["bbox"][0] for it in items),
            min(it["bbox"][1] for it in items),
            max(it["bbox"][2] for it in items),
            max(it["bbox"][3] for it in items),
        ],
        "text": " ".join(it["text"] for it in items if it["text"]).strip(),
        "source": "+".join(sorted(sources)),
        "confidence": sum(it.get("confidence", 1.0) for it in items) / len(items),
    }


def regroup_lines(blocks: List[Dict[str,Any]], line_gap:int=14, para_gap:int=26)->List[Dict[str,Any]]:
    """Join tiny fragments into lines & paragraphs by Y proximity and reading order.

    Blocks are sorted in reading order. Consecutive blocks whose top edges differ by at
    most ``line_gap`` form a line (fragments ordered left to right). Consecutive lines
    whose top edge is within ``para_gap`` of the previous line's bottom edge form a
    paragraph. Each output block has the union bbox, the space-joined text, the sorted
    unique sources joined with "+", and the mean confidence of its members.

    Returns [] for empty input.
    """
    if not blocks:
        return []
    ordered = sorted(blocks, key=blocks_sort_key)

    # 1) group fragments into rows by top-edge proximity to the previous fragment
    rows = [[ordered[0]]]
    for blk in ordered[1:]:
        anchor = rows[-1][-1]
        if abs(blk["bbox"][1] - anchor["bbox"][1]) <= line_gap:
            rows[-1].append(blk)
        else:
            rows.append([blk])

    # 2) one merged line per row, fragments read left to right
    lines = [_merge_fragments(sorted(row, key=lambda x: x["bbox"][0])) for row in rows]

    # 3) group lines into paragraphs by the gap between a line's top and the previous bottom
    groups = [[lines[0]]]
    for ln in lines[1:]:
        prev = groups[-1][-1]
        if abs(ln["bbox"][1] - prev["bbox"][3]) <= para_gap:
            groups[-1].append(ln)
        else:
            groups.append([ln])

    return [_merge_fragments(group) for group in groups]

def deduplicate(blocks: List[Dict[str,Any]], iou_thr:float=0.45, sim_thr:float=0.90)->List[Dict[str,Any]]:
    """Drop blocks that repeat an already-kept block, returning survivors in reading order.

    Candidates are visited from highest to lowest confidence (shorter text first on
    ties). A candidate is a duplicate when its bbox IoU with a kept block is at least
    ``iou_thr`` and their lowercased texts are at least ``sim_thr`` similar
    (``fuzz_ratio`` / 100). Blocks with blank text are discarded.
    """
    ranked = sorted(blocks, key=lambda x: (-x.get("confidence",1.0), len(x.get("text",""))))

    kept = []
    for cand in ranked:
        text = (cand.get("text","") or "").strip()
        if not text:
            continue
        is_duplicate = any(
            iou(cand["bbox"], other["bbox"]) >= iou_thr
            and fuzz_ratio(text.lower(), other["text"].lower())/100.0 >= sim_thr
            for other in kept
        )
        if not is_duplicate:
            kept.append(cand)
    return sorted(kept, key=blocks_sort_key)

def merge_ensemble(native: List[Dict], ocrs: List[List[Dict]], page_h: int) -> List[Dict]:
    """Combine native and OCR blocks into one deduplicated, section-tagged list.

    All blocks are pooled, regrouped into paragraphs with the configured join gaps,
    deduplicated with the configured IoU/similarity thresholds, and each survivor is
    given a "section" ("header" / "body" / "footer") relative to ``page_h``.
    """
    pool = list(native)
    for engine_blocks in ocrs:
        pool.extend(engine_blocks)

    paragraphs = regroup_lines(pool, line_join_px, para_join_px)
    result = deduplicate(paragraphs, dedup_iou_thr, dedup_sim_thr)

    for blk in result:
        blk["section"] = section_of(blk["bbox"], page_h)
    return result

# %% [markdown]
# ## 5) Main loop

# %%
if not fitz:
    raise RuntimeError("PyMuPDF required for this step.")


def _write_json(path, payload):
    """Write ``payload`` to ``path`` as indented UTF-8 JSON, keeping non-ASCII text verbatim."""
    path.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")


# Per page: native extraction → rasterize/mask → OCR ensemble → merge → JSON (+ optional overlay PNG).
doc = fitz.open(pdf_path)
try:
    meta = {
        "pages": len(doc),
        "dpi": dpi,
        "langs": langs,
        # Record every engine switch, including paddle and surya, for traceability.
        "ensemble": {
            "easyocr": use_easyocr,
            "tesseract": use_tesseract,
            "trocr": use_trocr,
            "paddle": use_paddle,
            "surya": use_surya,
        },
    }
    (out_dir/"metadata.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")

    # Resolve the Paddle engine once; extract_paddleocr() expects (engine, image).
    paddle_engine = get_paddleocr(langs) if use_paddle else None
    # Native bboxes are in PDF points, OCR bboxes in raster pixels; page_to_image()
    # zooms by dpi/72, so this factor brings native boxes into pixel space.
    px_per_pt = dpi / 72

    for i in range(len(doc)):
        page = doc[i]
        stem = f"page_{i+1:03d}"

        # native (saved in original PDF coordinates)
        native = extract_native(page)
        _write_json(out_dir / f"{stem}_native.json", native)

        # rasterize + preprocess (mask_bands is a no-op when mask_banners is off)
        pil = page_to_image(doc, i, dpi=dpi)
        pil_for_ocr = mask_bands(pil, banner_top_pct, banner_bot_pct)

        # Downscaled variants for the adaptive fast/heavy path; not consumed by this loop.
        pil_fast  = clamp_long_side(pil_for_ocr, FAST_MAX_SIDE)
        pil_heavy = clamp_long_side(pil, HEAVY_MAX_SIDE)

        # OCR ensemble
        o_easy = extract_easyocr(pil_for_ocr, langs) if use_easyocr else []
        o_tess = extract_tesseract(pil_for_ocr, langs) if use_tesseract else []
        o_troc = extract_trocr(pil_for_ocr) if use_trocr else []
        o_padl = extract_paddleocr(paddle_engine, pil_for_ocr) if use_paddle else []
        o_sur  = extract_surya(pil_for_ocr) if use_surya else []

        # keep raw outputs for traceability
        _write_json(out_dir / f"{stem}_ocr_easy.json", o_easy)
        _write_json(out_dir / f"{stem}_ocr_tess.json", o_tess)
        if use_trocr:
            _write_json(out_dir / f"{stem}_ocr_trocr.json", o_troc)
        if use_paddle:
            _write_json(out_dir / f"{stem}_ocr_paddle.json", o_padl)
        if use_surya:
            _write_json(out_dir / f"{stem}_ocr_surya.json", o_sur)

        # pick strategy if native weak: a thin native layer is dropped in favor of OCR
        native_chars = sum(len(b.get("text","")) for b in native)
        native_px = [{**b, "bbox": [v * px_per_pt for v in b["bbox"]]} for b in native]
        ocr_heads = [o_easy, o_tess, o_troc, o_padl, o_sur]
        merged = merge_ensemble(native_px if native_chars >= native_len_gate else [],
                                ocr_heads, pil.height)

        # save merged
        _write_json(out_dir / f"{stem}_blocks.json", merged)
        print(
            f"[page {i+1}] "
            f"native={len(native)} "
            f"easy={len(o_easy)} "
            f"tess={len(o_tess)} "
            f"trocr={len(o_troc)} "
            f"paddle={len(o_padl)} "
            f"surya={len(o_sur)} "
            f"→ merged={len(merged)}"
        )

        # optional viz overlay
        if make_viz_png:
            im = pil.copy()
            dr = ImageDraw.Draw(im, "RGBA")
            for b in merged:
                x0,y0,x1,y1 = map(int, b["bbox"])
                src = (b.get("source") or "").lower()
                col = (122, 199, 136, 80)   # green-ish default
                if "easyocr" in src: col=(255,165,0,90)   # orange
                if "tesseract" in src: col=(70,130,180,90) # steelblue
                if "native" in src: col=(147,112,219,90)   # purple
                dr.rectangle([x0,y0,x1,y1], outline=(0,0,0,180), width=2, fill=col)
            im.save(out_dir / f"{stem}_viz.png")
finally:
    # Release the PDF file handle even if a page fails.
    doc.close()

print("✅ Done Extraction complete → :", out_dir)
