In [2]:
# %% [markdown]
# # 02a — General segmentation (gap + heading + bullets)
# Input  : outputs/run_001/01a_normalized/page_*_blocks.norm.json  (fallback: 01_blocks)
# Output : outputs/run_001/02a_segmented/page_*_blocks.seg.json
# QA:
#   - segment count, per-segment char totals, coverage % vs normalized

# %%
import json, re
from pathlib import Path
from typing import List, Dict, Any

RUN_ROOT = Path("outputs/run_001").resolve()
IN_DIRS  = [RUN_ROOT/"01a_normalized", RUN_ROOT/"01_blocks"]
OUT_DIR  = RUN_ROOT/"02a_segmented"
OUT_DIR.mkdir(parents=True, exist_ok=True)

def find_pages():
    for d in IN_DIRS:
        ps = sorted(d.glob("page_*_blocks*.json"))
        if ps:
            print("[INFO] Using:", d.name)
            return ps, d
    raise FileNotFoundError("No page_* JSONs in 01a_normalized or 01_blocks")

pages, SRC = find_pages()

HEAD_RX = re.compile(r"^(impression|diagnosis|microscopy|microscopic|gross|clinical details|comment|plan|advice|assessment)\b[:\-]?", re.I)
BULLET_RX = re.compile(r"^(\-|\•|•|●|▪|▶|►|\d+\)|\d+\.)\s+")

GAP_Y = 18      # gap between paragraphs
JOIN_SOFT = True

def is_heading(t: str)->bool:
    s = t.strip()
    if len(s) <= 3: 
        return False
    if HEAD_RX.match(s):
        return True
    # short ALLCAPS phrase
    letters = re.sub(r"[^A-Za-z]", "", s)
    if 3 <= len(letters) <= 24 and letters.isupper():
        return True
    return False

def same_para(prev, cur)->bool:
    # same paragraph if vertical gap small and not a new heading
    if is_heading(cur["text"]): 
        return False
    y_gap = cur["bbox"][1] - prev["bbox"][3]
    return y_gap <= GAP_Y

def segment_page(blocks: List[Dict[str,Any]]):
    # assume already sorted in 01a
    segs, cur = [], None
    for b in blocks:
        txt = (b.get("text") or "").strip()
        if not txt:
            continue

        if is_heading(txt):
            # start new segment
            if cur: 
                segs.append(cur); cur = None
            cur = {"title": txt, "items": [b], "bbox": list(b["bbox"])}
            continue

        if cur is None:
            # first segment (untitled)
            cur = {"title": None, "items": [b], "bbox": list(b["bbox"])}
        else:
            if same_para(cur["items"][-1], b):
                # append
                cur["items"].append(b)
                # expand bbox
                x0,y0,x1,y1 = cur["bbox"]
                bx0,by0,bx1,by1 = b["bbox"]
                cur["bbox"] = [min(x0,bx0), min(y0,by0), max(x1,bx1), max(y1,by1)]
            else:
                segs.append(cur)
                cur = {"title": None, "items": [b], "bbox": list(b["bbox"])}

    if cur: 
        segs.append(cur)

    # compact: join text within segments
    for s in segs:
        lines = [i["text"].strip() for i in s["items"] if i.get("text")]
        if JOIN_SOFT:
            txt = []
            for ln in lines:
                if BULLET_RX.match(ln):
                    txt.append("\n" + ln)
                else:
                    txt.append((" " if txt else "") + ln)
            s["text"] = "".join(txt).strip()
        else:
            s["text"] = "\n".join(lines).strip()

    return segs

def chars(blocks): return sum(len((b.get("text") or "")) for b in blocks)

for p in pages:
    data = json.loads(p.read_text(encoding="utf-8"))
    c0 = chars(data)
    segs = segment_page(data)
    # save per-page
    outp = OUT_DIR / f"{p.stem}.seg.json"
    outp.write_text(json.dumps(segs, ensure_ascii=False, indent=2), encoding="utf-8")

    c1 = sum(len(s["text"]) for s in segs)
    cov = (c1/c0*100) if c0 else 100.0
    print(f"✓ {p.name}: segments={len(segs)} coverage={cov:.1f}%")

print("\n✅ Segmentation complete →", OUT_DIR)


[INFO] Using: 01a_normalized
✓ page_001_blocks.norm.json: segments=3 coverage=100.1%
✓ page_002_blocks.norm.json: segments=6 coverage=100.1%
✓ page_003_blocks.norm.json: segments=4 coverage=100.1%
✓ page_004_blocks.norm.json: segments=4 coverage=100.2%

✅ Segmentation complete → /Users/balijepalli/Documents/GitHub/entheory-ai/notebooks/outputs/run_001/02a_segmented
