In [1]:
# %% [markdown]
# 01a — Normalize multi-layout structures (fixed)
# Input : outputs/run_001/01_blocks/page_*_blocks.json
# Output: outputs/run_001/01a_normalized/page_*_blocks.norm.json
# What this does:
#   • unify reading order (top→bottom, left→right)
#   • adaptive line merge (scaled by median text height)
#   • tag (not drop) header/footer regions
#   • simple 1/2-column stitching if columns are well-separated
#   • PRESERVE structure: keep per-row fragments as `text_lines`
# QA printed: counts, character coverage, tags, mode

# %%
import json, statistics, re, math
from pathlib import Path
from typing import List, Dict, Any, Tuple

RUN_ROOT = Path("outputs/run_001").resolve()
IN_DIR   = RUN_ROOT / "01_blocks"
OUT_DIR  = RUN_ROOT / "01a_normalized"
OUT_DIR.mkdir(parents=True, exist_ok=True)
assert IN_DIR.exists(), f"Missing {IN_DIR}"

# ---------------- Tunables ----------------
# We *tag* header/footer instead of dropping to avoid accidental losses.
DROP_HEADER = False
DROP_FOOTER = False

# Base merge thresholds (scaled per page by median text height)
BASE_LINE_MERGE_TOL_Y = 0.60   # × median text height
BASE_LINE_MERGE_TOL_X = 0.90   # × median text height

# Header/footer bands (adaptive caps)
HEADER_MAX_PX = 140
FOOTER_MAX_PX = 120

MIN_TEXT_LEN = 2

# ---------- Helpers ----------
def chars(blocks): 
    return sum(len((b.get("text") or "")) for b in blocks)

def page_height(blocks: List[Dict[str,Any]]) -> float:
    ys = [b["bbox"][3] for b in blocks] + [b["bbox"][1] for b in blocks]
    return max(ys) if ys else 842.0

def page_width(blocks: List[Dict[str,Any]]) -> float:
    xs = [b["bbox"][2] for b in blocks]
    return max(xs) if xs else 595.0

def block_h(b): 
    x0,y0,x1,y1 = b["bbox"]; return max(1.0, y1-y0)

def is_native(b): 
    return "native" in (b.get("source") or "").lower()

def detect_handwriting(blocks) -> bool:
    """Heuristic: few native blocks OR very tall/variable text boxes."""
    if not blocks: return False
    n_native = sum(1 for b in blocks if is_native(b))
    ratio_native = n_native / len(blocks)
    hts = [block_h(b) for b in blocks]
    med_h = statistics.median(hts) if hts else 12
    iqr = (statistics.quantiles(hts, n=4)[2] - statistics.quantiles(hts, n=4)[0]) if len(hts) >= 4 else 0
    return (ratio_native < 0.35) or (med_h > 22) or (iqr > 18)

def adaptive_bands(H: float, handwriting: bool) -> Tuple[float,float]:
    """(header_px, footer_px). Smaller bands for handwriting pages."""
    if handwriting:
        return min(0.06*H, 70), min(0.05*H, 60)
    return min(0.10*H, HEADER_MAX_PX), min(0.08*H, FOOTER_MAX_PX)

def safe_text(b) -> str:
    return (b.get("text") or "").strip()

# Soft markers that often start a *new* list item/line even if y-close.
_BULLET_START = re.compile(r"""^\s*(?:[\u2022\u25CF\u25E6\-–—→]|\(?\d+\)|[A-Za-z]\))\s+""")

def should_force_new_row(prev_txt: str, next_txt: str) -> bool:
    """Avoid over-merging lists/bullets."""
    if _BULLET_START.match(next_txt or ""):  # next item looks like a bullet/numbered point
        return True
    # If prev ends with strong stop, keep separate
    if prev_txt and prev_txt.rstrip().endswith((".", ";", ":")):
        return True
    return False

def merge_rows(blocks, tol_y_px, tol_x_px):
    """Greedy row assembly with bullet-aware protection."""
    blocks = sorted(blocks, key=lambda b:(round(b["bbox"][1],1), round(b["bbox"][0],1)))
    rows = []
    for b in blocks:
        t = safe_text(b)
        if not t:
            continue
        x0,y0,x1,y1 = b["bbox"]
        cy = 0.5*(y0+y1)
        placed = False
        for r in rows:
            # same baseline?
            if abs(cy - r["y"]) <= tol_y_px:
                # small horizontal gap?
                close_x = (x0 - r["xmax"]) <= tol_x_px
                if close_x and not should_force_new_row(r["texts"][-1] if r["texts"] else "", t):
                    r["texts"].append(t)
                    r["xmax"] = max(r["xmax"], x1)
                    r["bbox"][0] = min(r["bbox"][0], x0)
                    r["bbox"][1] = min(r["bbox"][1], y0)
                    r["bbox"][2] = max(r["bbox"][2], x1)
                    r["bbox"][3] = max(r["bbox"][3], y1)
                    r["src"].append(b)
                    placed = True
                    break
        if not placed:
            rows.append({
                "y": cy, "xmax": x1, "texts":[t],
                "bbox":[x0,y0,x1,y1], "src":[b]
            })
    return rows

def maybe_two_columns(rows, page_w: float):
    """Simple 2-cluster on x-centres; only if well separated."""
    if len(rows) < 8:
        return [rows]
    xc = [(r["bbox"][0]+r["bbox"][2])/2 for r in rows]
    if not xc: 
        return [rows]
    c1, c2 = min(xc), max(xc)
    for _ in range(8):
        g1, g2 = [], []
        for i,x in enumerate(xc):
            (g1 if abs(x-c1)<=abs(x-c2) else g2).append(i)
        if not g1 or not g2: 
            break
        c1 = sum(xc[i] for i in g1)/len(g1)
        c2 = sum(xc[i] for i in g2)/len(g2)
    if not g1 or not g2:
        return [rows]
    sep = abs(c1-c2)/max(1.0, page_w)
    if sep < 0.12:
        return [rows]
    col1 = [rows[i] for i in g1]
    col2 = [rows[i] for i in g2]
    col1.sort(key=lambda r:(round(r["bbox"][1],1), round(r["bbox"][0],1)))
    col2.sort(key=lambda r:(round(r["bbox"][1],1), round(r["bbox"][0],1)))
    return [col1, col2]

def normalize_page(blocks: List[Dict[str,Any]]) -> Tuple[List[Dict[str,Any]], Dict[str,int]]:
    if not blocks:
        return [], {"tag_header":0,"tag_footer":0,"rows_out":0,"mode":"none"}

    H = page_height(blocks)
    W = page_width(blocks)
    handwriting = detect_handwriting(blocks)

    # Adaptive tolerances from median height
    hts = [block_h(b) for b in blocks]
    med_h = statistics.median(hts) if hts else 12
    tol_y = max(4.0, BASE_LINE_MERGE_TOL_Y * med_h)
    tol_x = max(10.0, BASE_LINE_MERGE_TOL_X * med_h)

    head_px, foot_px = adaptive_bands(H, handwriting)
    y_head, y_foot = head_px, H - foot_px

    tagged_header = tagged_footer = 0
    kept = []
    for b in blocks:
        t = safe_text(b)
        if not t:
            continue
        x0,y0,x1,y1 = b["bbox"]
        in_header = (y1 <= y_head)
        in_footer = (y0 >= y_foot)
        meta = dict(b.get("stage_meta") or {})
        if in_header:
            meta["header_tagged"] = True
            tagged_header += 1
            if DROP_HEADER:
                continue
        if in_footer:
            meta["footer_tagged"] = True
            tagged_footer += 1
            if DROP_FOOTER:
                continue
        nb = dict(b)
        nb["stage_meta"] = meta
        kept.append(nb)

    # Merge on lines with bullet-aware guard
    rows = merge_rows(kept, tol_y, tol_x)

    # Column stitching if clearly 2 columns
    groups = maybe_two_columns(rows, W)
    mode = "1col" if len(groups)==1 else "2col"

    # Emit: keep `text_lines` to preserve structure
    out = []
    for grp in groups:
        for r in grp:
            lines = [t for t in r["texts"] if t.strip()]
            joined = " ".join(lines)
            if len(joined.strip()) < MIN_TEXT_LEN:
                continue
            src0 = r["src"][0]
            nb = {**src0}
            nb["bbox"] = r["bbox"]
            nb["text"] = joined
            nb["text_norm"] = joined
            nb["text_lines"] = lines  # <-- structure preserved
            meta = dict(nb.get("stage_meta") or {})
            meta.update({
                "line_merge": True,
                "two_column_mode": (mode=="2col"),
                "median_h": med_h,
                "tol_y_px": tol_y,
                "tol_x_px": tol_x,
            })
            nb["stage_meta"] = meta
            out.append(nb)

    # Final order
    out.sort(key=lambda b:(round(b["bbox"][1],1), round(b["bbox"][0],1)))
    stats = {"tag_header":tagged_header, "tag_footer":tagged_footer, "rows_out":len(out), "mode":mode}
    return out, stats

# ----------------- Run --------------------
pages = sorted(IN_DIR.glob("page_*_blocks.json"))
assert pages, f"No page_*_blocks.json in {IN_DIR}"

for p in pages:
    data = json.loads(p.read_text(encoding="utf-8"))
    c0 = chars(data)
    out, st = normalize_page(data)
    c1 = chars(out)
    cov = (c1/c0*100) if c0 else 100.0

    outp = OUT_DIR / f"{p.stem}.norm.json"
    outp.write_text(json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8")

    print(f"✓ {p.name}: in={len(data)} out={len(out)} "
          f"chars={c1}/{c0} ({cov:.1f}%) "
          f"tagged(H/F)={st['tag_header']}/{st['tag_footer']} mode={st['mode']}")

print("\n✅ Layout normalization complete →", OUT_DIR)


✓ page_001_blocks.json: in=8 out=6 chars=2155/2153 (100.1%) tagged(H/F)=0/0 mode=1col
✓ page_002_blocks.json: in=12 out=11 chars=4099/4098 (100.0%) tagged(H/F)=0/0 mode=2col
✓ page_003_blocks.json: in=8 out=6 chars=1642/1642 (100.0%) tagged(H/F)=0/0 mode=1col
✓ page_004_blocks.json: in=9 out=8 chars=1901/1900 (100.1%) tagged(H/F)=0/0 mode=2col

✅ Layout normalization complete → /Users/balijepalli/Documents/GitHub/entheory-ai/notebooks/outputs/run_001/01a_normalized
