
# OCR (MCP-first) Pipeline — Notebook

This notebook uses **MCP-backed OCR servers** (HTTP `/ocr`) for **Tesseract**, **PaddleOCR**, and **Surya**.  
No local OCR engines run inside the kernel — each OCR engine lives in its own process.

**What you get:**
- 3 server scripts (download below) to run OCR backends:
  - `mcp_ocr_tesseract.py` → http://127.0.0.1:8089/ocr
  - `mcp_ocr_paddle.py`    → http://127.0.0.1:8090/ocr
  - `mcp_ocr_surya.py`     → http://127.0.0.1:8091/ocr
- A pipeline that calls these endpoints only (MCP/HTTP), merges blocks, and saves per-page JSON + viz overlays.

> Tip: Each server also mounts an **MCP adapter** at `/mcp` if `fastmcp` is installed.



## 0) Install dependencies (run in a terminal / new cell if needed)

```bash
# base
pip install fastapi uvicorn pillow requests nbformat rapidfuzz

# native text extraction (optional but recommended)
pip install pymupdf

# OCR engines (install what you plan to run as servers)
pip install pytesseract
# macOS: brew install tesseract tesseract-lang  (or similar)

pip install paddleocr  # (or follow PaddlePaddle platform-specific instructions)
pip install surya-ocr  # requires torch; CPU-only:
pip install -U torch --index-url https://download.pytorch.org/whl/cpu

# Optional MCP adapter
pip install fastmcp
```



## 1) (Recommended) Start servers in separate terminals

**Terminal A (Tesseract):**
```bash
python mcp_ocr_tesseract.py  # serves http://127.0.0.1:8089/ocr
```

**Terminal B (Paddle):**
```bash
python mcp_ocr_paddle.py     # serves http://127.0.0.1:8090/ocr
```

**Terminal C (Surya):**
```bash
python mcp_ocr_surya.py      # serves http://127.0.0.1:8091/ocr
```


In [5]:

# 2) Config — endpoints & pipeline options
MCP_ENDPOINTS = {
    "tesseract": "http://127.0.0.1:8089/ocr",  # set None to disable any
    "paddle":    "http://127.0.0.1:8090/ocr",
    "surya":     "http://127.0.0.1:8091/ocr",
}

PDF_PATH    = "input_pdfs/ET1-Adobe Scan 10 Sept 2025.pdf"  # change me
OUTPUT_DIR  = "outputs/run_mcp/01_blocks"
OCR_LANG    = "en"   # 'en','hi','te','mr','ta'

# rasterization
DPI             = 300
MASK_BANNERS    = True
BANNER_TOP_PCT  = 0.18
BANNER_BOT_PCT  = 0.20

# merge/filter
MIN_CONF        = 0.50
LINE_JOIN_PX    = 14
PARA_JOIN_PX    = 26
DEDUP_IOU_THR   = 0.50
DEDUP_SIM_THR   = 0.92
NATIVE_LEN_GATE = 100

# viz
MAKE_VIZ_PNG    = True

from pathlib import Path
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
print("Endpoints:", MCP_ENDPOINTS)


Endpoints: {'tesseract': 'http://127.0.0.1:8089/ocr', 'paddle': 'http://127.0.0.1:8090/ocr', 'surya': 'http://127.0.0.1:8091/ocr'}


In [6]:

# 3) Utilities (native extraction + merge)
import json, io, requests, numpy as np
from PIL import Image, ImageDraw
from pathlib import Path

# Try PyMuPDF; fallback to pdf2image if missing
try:
    import fitz as _fitz
    HAVE_FITZ = True
except Exception:
    HAVE_FITZ = False

def clamp_long_side(pil: Image.Image, max_side:int) -> Image.Image:
    w, h = pil.size
    s = max(w, h)
    if s <= max_side: return pil
    scale = max_side / s
    return pil.resize((int(w*scale), int(h*scale)))

def page_to_image(doc, page_index: int, dpi: int=300) -> Image.Image:
    page = doc[page_index]
    zoom = dpi / 72
    mat  = _fitz.Matrix(zoom, zoom)
    pix  = page.get_pixmap(matrix=mat, alpha=False)
    return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

def mask_bands(pil: Image.Image, top_pct: float, bot_pct: float) -> Image.Image:
    if not MASK_BANNERS: 
        return pil
    w,h   = pil.size
    top_h = int(h * max(0, min(0.45, top_pct)))
    bot_h = int(h * max(0, min(0.45, bot_pct)))
    out   = pil.copy()
    draw  = ImageDraw.Draw(out)
    bg = (240,240,240)
    if top_h>0: draw.rectangle([0,0,w,top_h], fill=bg)
    if bot_h>0: draw.rectangle([0,h-bot_h,w,h], fill=bg)
    return out

def blocks_sort_key(b):
    y0 = round(b["bbox"][1],1); x0 = round(b["bbox"][0],1)
    return (y0, x0)

def iou(a, b) -> float:
    ax0, ay0, ax1, ay1 = a; bx0, by0, bx1, by1 = b
    inter_x0 = max(ax0, bx0); inter_y0 = max(ay0, by0)
    inter_x1 = min(ax1, bx1); inter_y1 = min(ay1, by1)
    iw = max(0.0, inter_x1 - inter_x0); ih = max(0.0, inter_y1 - inter_y0)
    inter = iw * ih
    if inter <= 0: return 0.0
    area_a = (ax1-ax0)*(ay1-ay0); area_b = (bx1-bx0)*(by1-by0)
    return inter / max(1e-6, area_a + area_b - inter)

# fuzzy sim (dedup)
try:
    from rapidfuzz.fuzz import ratio as fuzz_ratio
except Exception:
    from difflib import SequenceMatcher
    def fuzz_ratio(a,b): return int(100*SequenceMatcher(None, a, b).ratio())

def regroup_lines(blocks, line_gap:int=14, para_gap:int=26):
    if not blocks: return []
    bs = sorted(blocks, key=blocks_sort_key)
    rows, cur = [], [bs[0]]
    for b in bs[1:]:
        prev = cur[-1]
        if abs(b["bbox"][1] - prev["bbox"][1]) <= line_gap:
            cur.append(b)
        else:
            rows.append(cur); cur=[b]
    rows.append(cur)

    lines=[]
    for row in rows:
        row = sorted(row, key=lambda x:x["bbox"][0])
        text = " ".join(x["text"] for x in row if x["text"])
        x0 = min(x["bbox"][0] for x in row); y0=min(x["bbox"][1] for x in row)
        x1 = max(x["bbox"][2] for x in row); y1=max(x["bbox"][3] for x in row)
        src= "+".join(sorted(set(x["source"] for x in row)))
        conf=sum(x.get("confidence",1.0) for x in row)/len(row)
        lines.append({"bbox":[x0,y0,x1,y1], "text":text.strip(), "source":src, "confidence":conf})

    paras=[]
    current=[lines[0]]
    for ln in lines[1:]:
        prev=current[-1]
        if abs(ln["bbox"][1]-prev["bbox"][3]) <= para_gap:
            current.append(ln)
        else:
            txt=" ".join(x["text"] for x in current if x["text"])
            x0=min(x["bbox"][0] for x in current); y0=min(x["bbox"][1] for x in current)
            x1=max(x["bbox"][2] for x in current); y1=max(x["bbox"][3] for x in current)
            src="+".join(sorted(set(",".join(x["source"] for x in current).split("+"))))
            conf=sum(x.get("confidence",1.0) for x in current)/len(current)
            paras.append({"bbox":[x0,y0,x1,y1], "text":txt.strip(), "source":src, "confidence":conf})
            current=[ln]
    if current:
        txt=" ".join(x["text"] for x in current)
        x0=min(x["bbox"][0] for x in current); y0=min(x["bbox"][1] for x in current)
        x1=max(x["bbox"][2] for x in current); y1=max(x["bbox"][3] for x in current)
        src="+".join(sorted(set(",".join(x["source"] for x in current).split("+"))))
        conf=sum(x.get("confidence",1.0) for x in current)/len(current)
        paras.append({"bbox":[x0,y0,x1,y1], "text":txt.strip(), "source":src, "confidence":conf})
    return paras

def deduplicate(blocks, iou_thr:float=0.50, sim_thr:float=0.92):
    out=[]
    for b in sorted(blocks, key=lambda x: (-x.get("confidence",1.0), len(x.get("text","")))):
        t = (b.get("text","") or "").strip()
        if not t: continue
        keep=True
        for a in out:
            if iou(b["bbox"], a["bbox"]) >= iou_thr:
                if fuzz_ratio(t.lower(), a["text"].lower())/100.0 >= sim_thr:
                    keep=False; break
        if keep: out.append(b)
    return sorted(out, key=blocks_sort_key)

def merge_ensemble(native, ocrs, page_h:int):
    all_blocks = []; all_blocks.extend(native)
    for s in ocrs: all_blocks.extend(s)
    regrouped = regroup_lines(all_blocks, LINE_JOIN_PX, PARA_JOIN_PX)
    deduped   = deduplicate(regrouped, DEDUP_IOU_THR, DEDUP_SIM_THR)
    return deduped

def extract_native(page):
    out = []
    for b in page.get_text("blocks"):
        if len(b) >= 5:
            x0,y0,x1,y1,txt = b[:5]
            if txt and str(txt).strip():
                out.append({
                    "bbox":[float(x0),float(y0),float(x1),float(y1)],
                    "text":str(txt).strip(),
                    "source":"native",
                    "confidence":1.0
                })
    return sorted(out, key=blocks_sort_key)


In [7]:

# 4) Client to call MCP endpoints and normalize blocks
def call_mcp(endpoint: str, pil_img: Image.Image, lang="en"):
    if not endpoint:
        return []
    try:
        buf = io.BytesIO()
        pil_img.save(buf, format="PNG"); buf.seek(0)
        r = requests.post(endpoint, data={"lang": lang},
                          files={"image": ("page.png", buf.getvalue(), "image/png")},
                          timeout=60)
        r.raise_for_status()
        js = r.json()
    except Exception as e:
        print(f"[WARN] MCP call failed for {endpoint}: {e}")
        return []

    blocks = []
    # Preferred: {"blocks":[{"text","confidence","bbox":[x0,y0,x1,y1]}, ...]}
    if isinstance(js.get("blocks"), list):
        for it in js["blocks"]:
            try:
                txt = (it.get("text") or "").strip()
                conf = float(it.get("confidence", 0.0))
                box = it.get("bbox") or it.get("box")
                if not (txt and box): 
                    continue
                if isinstance(box, (list,tuple)) and len(box)==4:
                    x0,y0,x1,y1 = [float(v) for v in box]
                else:
                    xs = [p[0] for p in box]; ys = [p[1] for p in box]
                    x0,y0,x1,y1 = float(min(xs)), float(min(ys)), float(max(xs)), float(max(ys))
                blocks.append({"bbox":[x0,y0,x1,y1], "text":txt, "confidence":conf, "source":"paddle_mcp"})
            except Exception:
                continue
        return blocks

    # Fallback: {"text":"...", "avg_confidence":...}
    if js.get("text"):
        w,h = pil_img.size
        conf = float(js.get("avg_confidence", 0.0))
        blocks.append({"bbox":[0.0,0.0,float(w),float(h)], "text":js["text"].strip(), "confidence":conf, "source":"paddle_mcp"})
    return blocks


In [8]:

# 5) Run pipeline (MCP only for OCR)
from pathlib import Path
from PIL import Image

pdf_path = Path(PDF_PATH).expanduser().resolve()
out_dir  = Path(OUTPUT_DIR).expanduser().resolve()
out_dir.mkdir(parents=True, exist_ok=True)

if HAVE_FITZ:
    doc = _fitz.open(str(pdf_path))
    total_pages = len(doc)
else:
    try:
        from pdf2image import convert_from_path
    except Exception:
        raise RuntimeError("Neither PyMuPDF nor pdf2image available. Install one of them.")
    pages = convert_from_path(str(pdf_path), dpi=DPI)
    total_pages = len(pages)

meta = {
    "pages": total_pages,
    "dpi": DPI,
    "lang": OCR_LANG,
    "endpoints": MCP_ENDPOINTS,
}
(out_dir/"metadata.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")
print("[INFO] PDF:", pdf_path)
print("[INFO] Out:", out_dir)

for i in range(total_pages):
    # native
    if HAVE_FITZ:
        page = doc[i]
        native = extract_native(page)
        pil    = page_to_image(doc, i, dpi=DPI)
    else:
        native = []
        pil = pages[i]

    pil_base = mask_bands(pil, BANNER_TOP_PCT, BANNER_BOT_PCT) if MASK_BANNERS else pil

    # Call each MCP endpoint configured
    heads = []
    for name, ep in MCP_ENDPOINTS.items():
        if not ep: 
            continue
        bs = call_mcp(ep, pil_base, lang=OCR_LANG)
        # tag source by engine name
        for b in bs: 
            b["source"] = name
            # conf gate
        bs = [b for b in bs if b.get("confidence",0.0) >= MIN_CONF or not b.get("confidence")]
        # dump raw
        Path(out_dir / f"page_{i+1:03d}_ocr_{name}.json").write_text(json.dumps(bs, indent=2, ensure_ascii=False), encoding="utf-8")
        heads.append(bs)

    merged = merge_ensemble(native if (sum(len(b.get('text','')) for b in native) >= NATIVE_LEN_GATE) else [], heads, pil.height)
    Path(out_dir / f"page_{i+1:03d}_blocks.json").write_text(json.dumps(merged, indent=2, ensure_ascii=False), encoding="utf-8")

    print(f"[page {i+1}] native={len(native)} " + " ".join(f"{k}={len(v)}" for k,v in zip(MCP_ENDPOINTS.keys(), heads)) + f" → merged={len(merged)}")

    if MAKE_VIZ_PNG:
        im = pil.copy()
        dr = ImageDraw.Draw(im, "RGBA")
        for b in merged:
            x0,y0,x1,y1 = map(int, b["bbox"])
            src = (b.get("source") or "").lower()
            col = (122,199,136,80)
            if "tesseract" in src: col=(70,130,180,90)
            if "paddle"    in src: col=(34,139,34,90)
            if "surya"     in src: col=(220,20,60,90)
            if "native"    in src: col=(147,112,219,90)
            dr.rectangle([x0,y0,x1,y1], outline=(0,0,0,180), width=2, fill=col)
        im.save(out_dir/f"page_{i+1:03d}_viz.png")

print("✅ Done Extraction →", out_dir)


[INFO] PDF: /Users/balijepalli/Documents/GitHub/entheory-ai/notebooks/input_pdfs/ET1-Adobe Scan 10 Sept 2025.pdf
[INFO] Out: /Users/balijepalli/Documents/GitHub/entheory-ai/notebooks/outputs/run_mcp/01_blocks
[page 1] native=38 tesseract=79 paddle=0 surya=0 → merged=11
[page 2] native=54 tesseract=186 paddle=0 surya=0 → merged=16
[page 3] native=14 tesseract=84 paddle=0 surya=0 → merged=25
[page 4] native=27 tesseract=71 paddle=0 surya=0 → merged=25
✅ Done Extraction → /Users/balijepalli/Documents/GitHub/entheory-ai/notebooks/outputs/run_mcp/01_blocks
