In [1]:
!pip install docling

Collecting docling
  Downloading docling-2.70.0-py3-none-any.whl.metadata (11 kB)
Collecting docling-core<3.0.0,>=2.50.1 (from docling-core[chunking]<3.0.0,>=2.50.1->docling)
  Downloading docling_core-2.61.0-py3-none-any.whl.metadata (7.6 kB)
Collecting docling-parse<5.0.0,>=4.7.0 (from docling)
  Downloading docling_parse-4.7.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (10 kB)
Collecting docling-ibm-models<4,>=3.9.1 (from docling)
  Downloading docling_ibm_models-3.11.0-py3-none-any.whl.metadata (7.2 kB)
Collecting filetype<2.0.0,>=1.2.0 (from docling)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting pypdfium2!=4.30.1,<6.0.0,>=4.30.0 (from docling)
  Downloading pypdfium2-5.3.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting rapidocr<4.0.0,>=3.3 (from docling)
  Downloading 

In [6]:
!pip install PyMuPDF
!pip install img2pdf

import fitz  # PyMuPDF
import img2pdf
import os
import shutil
import gc  # Garbage collector

# Config
input_pdf_path = "/content/bnbc6_chap1.pdf"
output_pdf_path = "/content/bnbc6_chap1_flat.pdf"
temp_dir = "/content/temp_flatten_pages"
# Rasterization resolution. Pixmap size grows roughly with DPI squared, so lower this
# value (e.g. 200-300) if the kernel runs out of memory while rendering.
DPI = 400

# 1. Setup temp directory (start empty so stale pages from a previous run are never stitched in)
if os.path.exists(temp_dir):
    shutil.rmtree(temp_dir)
os.makedirs(temp_dir)

try:
    print(f"Phase 1: Rasterizing pages to disk (DPI={DPI})...")

    image_paths = []

    # The context manager closes the document even if rendering a page fails.
    with fitz.open(input_pdf_path) as doc:
        page_count = len(doc)
        for i, page in enumerate(doc):
            # Render page to image
            pix = page.get_pixmap(dpi=DPI)

            # Save to DISK immediately so only one rendered page is held in memory at a time
            image_filename = f"{temp_dir}/page_{i:04d}.jpg"
            pix.save(image_filename)
            image_paths.append(image_filename)

            # Drop the pixmap and force a collection before rendering the next page
            del pix
            gc.collect()

            if (i + 1) % 5 == 0:
                print(f"  Saved page {i + 1}/{page_count}")

    print("Phase 1 Complete. Images saved to disk.")

    # 2. Combine the page images (in page order) into a single image-only PDF
    print("Phase 2: Stitching images back into PDF...")

    with open(output_pdf_path, "wb") as f:
        f.write(img2pdf.convert(image_paths))
finally:
    # 3. Cleanup: remove the temporary page images whether or not the run succeeded
    shutil.rmtree(temp_dir, ignore_errors=True)

print(f"Success! Flattened PDF saved at: {output_pdf_path}")

Phase 1: Rasterizing pages to disk (DPI=400)...
  Saved page 5/37
  Saved page 10/37
  Saved page 15/37
  Saved page 20/37
  Saved page 25/37
  Saved page 30/37
  Saved page 35/37
Phase 1 Complete. Images saved to disk.
Phase 2: Stitching images back into PDF...
Success! Flattened PDF saved at: /content/bnbc6_chap1_flat.pdf


In [7]:
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat

# USE THE FLATTENED FILE
source = "/content/bnbc6_chap1_flat.pdf"

# The flattened PDF is image-only, so OCR is the sole source of text; formula
# enrichment is enabled in addition to the default OCR settings.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_formula_enrichment = True

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)

print("Processing flattened PDF...")
result = converter.convert(source)

# Export and Verify
structured_json = result.document.export_to_dict()

# Spot check of the first OCR'd text item. Inspect it by eye: the first item can be
# OCR noise, so it is only a hint about quality. Guarded so an empty OCR result
# does not abort the cell before the JSON is written.
text_items = structured_json.get("texts") or []
if text_items:
    first_text = text_items[0].get("text", "")
    print("First text item:", first_text)
else:
    first_text = ""
    print("WARNING: conversion produced no text items; check the OCR step.")

# Save for your parser
import json
with open("bnbc6_chap1.json", "w", encoding="utf-8") as f:
    json.dump(structured_json, f, ensure_ascii=False, indent=4)

Processing flattened PDF...


[32m[INFO] 2026-01-30 12:54:52,228 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-30 12:54:52,233 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-30 12:54:52,334 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-30 12:54:52,341 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-30 12:54:52,784 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-30 12:54:52,786 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-30 12:54:52,790 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2026-01-30 12:54:52,791 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages

First text item: ,,,O>


In [8]:
import json
# NOTE(review): this cell re-serialises `structured_json` (a kernel variable created by the
# conversion cell) to the same relative file name, with the same json.dump arguments, that
# the conversion cell already wrote. It is only needed if `structured_json` was changed in
# between; it fails with NameError on a fresh kernel if the conversion cell has not run.
output_filename = "bnbc6_chap1.json"

with open(output_filename, "w", encoding="utf-8") as f:
    json.dump(structured_json, f, ensure_ascii=False, indent=4)

print(f"Success! Your structured data is saved in {output_filename}")

Success! Your structured data is saved in bnbc6_chap1.json


In [4]:
!pip install PyMuPDF


Collecting PyMuPDF
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.7


In [9]:
import json
import re
import unicodedata
from collections import defaultdict
from pathlib import Path
from difflib import SequenceMatcher
import fitz  # PyMuPDF

# ----------------------------
# Config (edit these paths)
# ----------------------------
# Docling export to parse. NOTE(review): the conversion cell writes "bnbc6_chap1.json"
# relative to the working directory; this absolute path assumes that directory is
# /content — confirm when running outside Colab.
INPUT_DOCLING_JSON = "/content/bnbc6_chap1.json"
# The flattened (image-only) PDF produced by the rasterization cell.
INPUT_PDF = "/content/bnbc6_chap1_flat.pdf"

# Output locations; the two *_DIR_NAME values are presumably sub-folders of OUTPUT_DIR
# (their use is not visible in this part of the notebook — TODO confirm).
OUTPUT_DIR = "/content/structured_out"
IMAGES_DIR_NAME = "images"
EQUATIONS_DIR_NAME = "equations"
# NOTE(review): this rebinds the same global name `DPI` that the flattening cell sets to
# 400; after this cell runs, re-running the flattening cell's loop would use 200 unless
# its own config lines are executed again.
DPI = 200
IMAGE_FORMAT = "jpg"

# If Docling formula enrichment produces spaced-out LaTeX (each char separated by spaces),
# enable this heuristic de-spacing fix.
FIX_SPACED_LATEX = True


# ----------------------------
# Helpers: Docling JSON access
# ----------------------------
# Clause heading: a dotted number ("6", "6.2", "6.2.22") followed by whitespace and a title.
# Group 1 is the number, group 2 the title with surrounding whitespace removed.
CLAUSE_RE = re.compile(r"^\s*(\d+(?:\.\d+)*)\s+(.*\S)\s*$")

# List marker patterns (start-of-line)
# "(a) ", "(1) ", "(iv) ": any alphanumeric token in parentheses, followed by whitespace.
PAREN_MARKER_RE = re.compile(r"^\s*\(\s*([A-Za-z0-9]+|[ivxlcdmIVXLCDM]+)\s*\)\s+")
# "a) ", "1. ", "iv) ": a single letter, a number or a roman-numeral run, closed by "." or ")".
# The closing class is [.)] — a "|" inside a character class is a literal pipe, not
# alternation, so it must not appear here (it made "a| text" parse as a list marker).
SIMPLE_MARKER_RE = re.compile(r"^\s*([A-Za-z]|\d+|[ivxlcdmIVXLCDM]+)([.)])\s+")
# A single bullet glyph followed by whitespace.
BULLET_MARKER_RE = re.compile(r"^\s*([•\-*–—])\s+")

# Inline markers inside a single paragraph, e.g. "... requirements: (1) ... (2) ... (3) ..."
# We only treat a parenthesized token as a marker if it's preceded by start/whitespace/:/; and followed by space.
INLINE_PAREN_MARKER_RE = re.compile(r"(?:(?<=^)|(?<=[\s:;]))\(\s*([0-9]+|[A-Za-z]|[ivxlcdmIVXLCDM]+)\s*\)\s+")

# Strong table caption indicator: "Table <no>: <text>"
TABLE_CAPTION_STRONG_RE = re.compile(r"^\s*Table\s+\d+(?:\.\d+)*\s*:\s*\S", re.IGNORECASE)



def normalize_ws(s: str) -> str:
    """Collapse each run of whitespace in ``s`` to one space and trim both ends.

    Falsy input (``None``, ``""``) yields the empty string.
    """
    if not s:
        return ""
    collapsed = re.sub(r"\s+", " ", s)
    return collapsed.strip()


def load_docling_json(path: str) -> dict:
    """Read the UTF-8 encoded JSON file at ``path`` and return the decoded object."""
    with open(path, mode="r", encoding="utf-8") as handle:
        payload = json.load(handle)
    return payload


def resolve_ref(doc: dict, ref: str):
    """
    Turn a Docling JSON pointer into ``(kind, index, object)``.

    Recognised pointers:
      '#/texts/93'
      '#/tables/0'
      '#/pictures/2'
      '#/groups/5'

    Any other string yields ``(None, None, None)``. A recognised pointer whose
    collection or index is missing from ``doc`` raises from the lookup itself
    (KeyError / IndexError).
    """
    parsed = re.match(r"^#/(texts|tables|pictures|groups)/(\d+)$", ref)
    if parsed is None:
        return None, None, None
    kind = parsed.group(1)
    idx = int(parsed.group(2))
    collection = doc[kind]
    return kind, idx, collection[idx]


def extract_caption(doc: dict, captions) -> str:
    """Join the text of every caption ref (e.g. ``[{'$ref': '#/texts/93'}]``) with spaces.

    Entries without a ``$ref``, refs that do not resolve to a ``texts`` item, and
    items whose normalized text is empty are ignored. Returns ``""`` when nothing
    is left or ``captions`` is empty/None.
    """
    pieces = []
    for entry in captions or []:
        target = entry.get("$ref")
        if not target:
            continue
        kind, _, obj = resolve_ref(doc, target)
        if kind != "texts":
            continue
        cleaned = normalize_ws(obj.get("text") or "")
        if cleaned:
            pieces.append(cleaned)
    return " ".join(pieces).strip()


def walk_body_in_reading_order(doc: dict):
    """
    Yield the ``$ref`` string of every non-group child under ``doc["body"]``.

    Children are visited depth-first in listed order: when a ref resolves to a
    group, the group's own children are walked in its place before the group's
    later siblings. Entries that are not dicts, or carry no ``$ref``, are skipped.
    """
    root = doc.get("body") or {}
    exhausted = object()
    # One iterator per nesting level; the innermost open level is last.
    cursors = [iter(root.get("children") or [])]

    while cursors:
        entry = next(cursors[-1], exhausted)
        if entry is exhausted:
            cursors.pop()
            continue
        if not (isinstance(entry, dict) and "$ref" in entry):
            continue

        ref = entry["$ref"]
        kind, _, target = resolve_ref(doc, ref)

        if kind == "groups":
            cursors.append(iter(target.get("children") or []))
            continue

        yield ref


# ----------------------------
# Generic point parsing utilities
# ----------------------------
def leading_indent_spaces(s: str) -> int:
    """Width of the leading indentation of ``s``.

    A space counts as 1 and a tab as 4; counting stops at the first character
    that is neither. Empty or None input gives 0.
    """
    widths = {" ": 1, "\t": 4}
    total = 0
    for ch in s or "":
        step = widths.get(ch)
        if step is None:
            break
        total += step
    return total


def parse_list_marker(raw_text: str, marker_field: str = "", allow_inline_start: bool = True):
    """
    Decide whether ``raw_text`` is a list/point item.

    Returns ``(marker, content, indent_spaces)`` for a point item and
    ``(None, None, None)`` otherwise. ``indent_spaces`` is measured on the raw
    text; ``content`` is whitespace-normalized.

    Sources of the marker, first hit wins:
      1) a non-blank ``marker_field`` supplied by Docling (e.g. '1.' or '*');
         the whole text is then the content
      2) a parenthesized token at the start: '(a) ...', '(1) ...', '(i) ...'
      3) a token closed by '.' or ')' at the start: 'a) ...', '1. ...', 'i) ...'
      4) a bullet character at the start
    Sources 2-4 are only tried when ``allow_inline_start`` is true and the text
    is not blank.
    """
    not_a_point = (None, None, None)
    if raw_text is None:
        return not_a_point

    indent = leading_indent_spaces(raw_text)
    body = raw_text.lstrip("\t ").rstrip()

    explicit = (marker_field or "").strip()
    if explicit:
        return explicit, normalize_ws(body), indent

    if not (allow_inline_start and body):
        return not_a_point

    # (pattern, how to spell the marker from its match), in priority order
    recognizers = (
        (PAREN_MARKER_RE, lambda hit: f"({hit.group(1)})"),
        (SIMPLE_MARKER_RE, lambda hit: f"{hit.group(1)}{hit.group(2)}"),
        (BULLET_MARKER_RE, lambda hit: hit.group(1)),
    )
    for pattern, spell in recognizers:
        hit = pattern.match(body)
        if hit:
            return spell(hit), normalize_ws(body[hit.end():]), indent

    return not_a_point


def split_inline_enumeration(raw_text: str):
    """
    Split a paragraph that carries two or more inline parenthesized markers, e.g.
      "Intro: (1) aaa (2) bbb (3) ccc"
    into
      intro="Intro:" and items=[{"marker":"(1)","text":"aaa"}, ...]

    Item text is whitespace-normalized and stripped of surrounding spaces and
    semicolons; items left empty are dropped. Returns ``(None, None)`` when the
    text is empty, has fewer than two markers, or no item survives.
    """
    if not raw_text:
        return None, None
    body = raw_text.strip()
    hits = list(INLINE_PAREN_MARKER_RE.finditer(body))
    if len(hits) < 2:
        return None, None

    # Each item runs from the end of its marker to the start of the next one
    # (or to the end of the text for the last marker).
    stops = [nxt.start() for nxt in hits[1:]] + [len(body)]
    items = []
    for hit, stop in zip(hits, stops):
        chunk = normalize_ws(body[hit.end():stop]).strip(" ;")
        if chunk:
            items.append({"marker": f"({hit.group(1)})", "text": chunk})

    if not items:
        return None, None
    return body[:hits[0].start()].strip(), items


# ----------------------------
# Repeating header/footer (boilerplate) detection
# ----------------------------
# Canonicalized strings treated as repeating boilerplate. is_noise_text() consults this
# set, and only when it is non-empty. It starts empty — presumably filled from
# detect_repeating_headers_footers() elsewhere in the notebook; TODO confirm.
NOISE_CANONICALS = set()

# Keys in the tuple shape produced by _make_text_key_from_obj(); is_caption_text_obj()
# tests membership against this set. It starts empty — presumably populated while
# table captions are assigned; TODO confirm.
CAPTION_TEXT_KEYS = set()

def _make_text_key_from_obj(obj: dict):
    """Build a comparison key for a text object from its page, canonical text and bbox.

    The key is ``(page_no, canonical_text, l, t, r, b)`` with the bbox edges taken
    from the first provenance entry and rounded to one decimal. Returns ``None``
    when the text is blank, the page number is missing, the bbox is not a dict,
    or any of the four edges is missing.
    """
    raw = normalize_ws(obj.get("text") or "")
    if not raw:
        return None

    first_prov = (obj.get("prov") or [{}])[0]
    page_no = first_prov.get("page_no")
    box = first_prov.get("bbox") or {}
    if page_no is None or not isinstance(box, dict):
        return None

    edges = [box.get(side) for side in ("l", "t", "r", "b")]
    if any(edge is None for edge in edges):
        return None

    return (
        int(page_no),
        canonicalize_repeat_text(raw),
        *(round(float(edge), 1) for edge in edges),
    )

def is_caption_text_obj(obj: dict) -> bool:
    """True when ``obj`` yields a key that is registered in ``CAPTION_TEXT_KEYS``."""
    key = _make_text_key_from_obj(obj)
    if key is None:
        return False
    return key in CAPTION_TEXT_KEYS

def _unicode_digit_mask(s: str) -> str:
    """Replace any Unicode decimal digit (Latin/Bengali/Arabic-Indic/etc.) with '#'."""
    out = []
    for ch in (s or ""):
        out.append("#" if unicodedata.category(ch) == "Nd" else ch)
    return "".join(out)

def canonicalize_repeat_text(s: str) -> str:
    """Canonical form for comparing repeated header/footer strings across pages.

    Steps: normalize whitespace, mask every decimal digit with '#', replace
    punctuation with spaces, lowercase, then collapse each run of masked digits
    (optionally separated by whitespace) to a single '#'.

    The result must not depend on how many digits a number has, so that e.g.
    "Page 5 of 37" and "Page 12 of 37" both become "page # of #".
    """
    s = normalize_ws(s)
    s = _unicode_digit_mask(s)
    # Keep letters/digits/underscore across scripts; drop punctuation
    s = re.sub(r"[^\w\s#]", " ", s, flags=re.UNICODE)
    s = normalize_ws(s).lower()
    # Collapse runs of masked digits. The pattern ends on a '#', so whitespace that
    # follows the run is left alone; consuming it would glue the next word onto the
    # mask for multi-digit numbers only ("## of" -> "#of") and split one footer
    # into two different canonical forms.
    s = re.sub(r"#(?:\s*#)+", "#", s)
    return s

def _percentile(vals, p: float):
    if not vals:
        return None
    vals = sorted(vals)
    k = int(round((p / 100.0) * (len(vals) - 1)))
    k = max(0, min(len(vals) - 1, k))
    return vals[k]

def detect_repeating_headers_footers(doc: dict,
                                     min_page_fraction: float = 0.6,
                                     band_percentile: float = 8.0) -> set:
    """
    Learn boilerplate (headers/footers) without language-specific rules.

    For every page, the ``bbox.t`` values of its text items define two extreme
    bands: at or below the ``band_percentile``-th percentile, and at or above the
    ``(100 - band_percentile)``-th. Text in either band is canonicalized (digits
    masked, punctuation dropped); canonical strings of length 5..160 that occur
    on at least ``max(3, round(min_page_fraction * page_count))`` distinct pages
    are returned as a set.

    Only the first provenance entry of each text item is used; items without
    text, page number or ``bbox.t`` are ignored.
    """
    tops_by_page = defaultdict(list)
    located = []  # (page_no, t, raw_text) for every usable text item

    for obj in doc.get("texts", []):
        raw = (obj.get("text") or "").strip()
        if not raw:
            continue
        first_prov = (obj.get("prov") or [{}])[0]
        page_no = first_prov.get("page_no")
        box = first_prov.get("bbox") or {}
        if page_no is None or not isinstance(box, dict):
            continue
        top = box.get("t")
        if top is None:
            continue
        tops_by_page[page_no].append(top)
        located.append((page_no, top, raw))

    pages = sorted(tops_by_page.keys())
    if not pages:
        return set()

    # page -> (low cut, high cut) of the bbox.t distribution on that page
    bands = {
        page: (
            _percentile(tops_by_page[page], band_percentile),
            _percentile(tops_by_page[page], 100.0 - band_percentile),
        )
        for page in pages
    }

    pages_by_text = defaultdict(set)
    for page_no, top, raw in located:
        low_cut, high_cut = bands.get(page_no, (None, None))
        if low_cut is None or high_cut is None:
            continue
        # only items sitting in one of the two extreme bands are candidates
        if not (top >= high_cut or top <= low_cut):
            continue
        canon = canonicalize_repeat_text(raw)
        if 5 <= len(canon) <= 160:
            pages_by_text[canon].add(page_no)

    threshold = max(3, int(round(min_page_fraction * len(pages))))
    return {canon for canon, seen_on in pages_by_text.items() if len(seen_on) >= threshold}


def is_noise_text(s: str) -> bool:
    """
    Decide whether a text fragment is noise (header, footer, page number, stray
    punctuation) that may sit between list items.

    Noise is any of:
      - None or blank text
      - text whose canonical form is in ``NOISE_CANONICALS`` (checked only when
        that set is non-empty)
      - a string made of exactly 3 to 6 digits (``\\d`` also matches non-Latin
        decimal digits for str patterns)
      - at most 12 characters with fewer than 2 letters
      - at most 25 characters of which less than 35% are alphanumeric
    """
    if s is None:
        return True
    stripped = s.strip()
    if not stripped:
        return True

    if NOISE_CANONICALS:
        if canonicalize_repeat_text(stripped) in NOISE_CANONICALS:
            return True

    if re.fullmatch(r"\d{3,6}", stripped) is not None:
        return True

    size = len(stripped)
    if size <= 12:
        letters = sum(1 for ch in stripped if ch.isalpha())
        if letters < 2:
            return True
    if size <= 25:
        alnum = sum(1 for ch in stripped if ch.isalnum())
        if alnum / max(1, size) < 0.35:
            return True
    return False

def _reading_order_key(block: dict):
    """Sort by page then top-to-bottom then left-to-right using bbox."""
    page_no = block.get("page_no")
    bbox = block.get("bbox") or {}
    l = bbox.get("l") if isinstance(bbox, dict) else None
    t = bbox.get("t") if isinstance(bbox, dict) else None
    origin = (bbox.get("coord_origin") or "TOPLEFT").upper() if isinstance(bbox, dict) else "TOPLEFT"

    page_key = page_no if page_no is not None else 10**9
    y_key = 0 if t is None else ((-t) if origin == "BOTTOMLEFT" else t)
    x_key = l if l is not None else 0
    return (page_key, y_key, x_key)


# ----------------------------
# Smart nesting (indent + marker kind + lookahead)
# ----------------------------
def _is_roman(s: str) -> bool:
    if not s:
        return False
    s = s.strip().lower()
    return bool(re.fullmatch(r"[ivxlcdm]+", s))


def marker_kind(marker: str) -> str:
    """Classify a list marker as "bullet", "num", "alpha", "roman" or "other".

    Surrounding brackets/parentheses and one trailing '.' or ')' are removed
    before classification. A single letter is reported as "alpha" even when it
    could also be read as a roman numeral (e.g. "(i)"), because that test runs
    first.
    """
    if not marker:
        return "other"
    trimmed = marker.strip()

    if trimmed in {"•", "-", "*", "–", "—"}:
        return "bullet"

    # peel: opening bracket, closing bracket, then a trailing '.' or ')'
    core = trimmed
    for pattern in (r"^[\(\[]\s*", r"\s*[\)\]]$", r"[\.)]$"):
        core = re.sub(pattern, "", core)
    core = core.strip()

    if core.isdigit():
        return "num"
    if len(core) == 1 and core.isalpha():
        return "alpha"
    return "roman" if _is_roman(core) else "other"


# Rank of each marker kind returned by marker_kind(). nest_list_items_smart() and
# _will_return_to_kind() compare these ranks: a larger rank is treated as a deeper list
# level. "bullet" and "other" share the deepest rank, and 3 is also the fallback those
# functions use for unknown kinds.
KIND_ORDER = {"num": 0, "alpha": 1, "roman": 2, "bullet": 3, "other": 3}


def _nesting_trigger_text(s: str) -> bool:
    """Non-hardcoded cue: a colon usually introduces a sublist."""
    return bool(s and s.strip().endswith(":"))


def _will_return_to_kind(flat_items, start_idx: int, target_kind: str) -> bool:
    """Lookahead: if we later return to target_kind, items in between are likely a sublist."""
    for j in range(start_idx + 1, len(flat_items)):
        k = marker_kind(flat_items[j].get("marker", ""))
        if k == target_kind:
            return True
        if KIND_ORDER.get(k, 3) < KIND_ORDER.get(target_kind, 3):
            return False
    return False


def _marker_num_value(marker: str):
    """Return integer value for numeric markers like '(8)' or '8.'; else None."""
    if not marker:
        return None
    m = marker.strip()
    core = re.sub(r"^[\(\[]\s*", "", m)
    core = re.sub(r"\s*[\)\]]$", "", core)
    core = re.sub(r"[.)]$", "", core).strip()
    return int(core) if core.isdigit() else None


def nest_list_items_smart(flat_items):
    """
    Turn a flat, ordered run of list items into a tree of
    ``{"marker", "text", "children"}`` nodes and return the top-level nodes.

    For every item after the first, the previous item decides where it goes:
      1) If the indent (``indent_norm`` when present, else ``indent``) differs
         from the previous item's by 5 or more, indentation decides: open
         ancestors whose indent is not smaller than the item's are closed and
         the item hangs under the remaining innermost one.
         Exception: across a page break, a numeric marker that continues the
         previous numeric marker (8 -> 9) is always placed as its sibling.
      2) Otherwise marker kinds decide: a deeper-ranked kind becomes a child of
         the previous item when that item's text ends with ':' or its kind
         shows up again later; the same kind becomes a sibling; a kind of equal
         or shallower rank closes every open ancestor of equal or deeper rank;
         anything else is a sibling.
    """
    roots = []
    # Chain of currently open nodes, outermost first. Each entry:
    # {"indent": float, "kind": str, "order": int, "node": dict, "page_no": int}
    path = []

    for pos, item in enumerate(flat_items):
        node = {"marker": item.get("marker", ""), "text": item.get("text", ""), "children": []}
        # Prefer the per-page normalized indent when the caller provided one.
        indent = float(item.get("indent_norm", item.get("indent") or 0))
        page_no = item.get("page_no", None)

        kind = marker_kind(node["marker"])
        rank = KIND_ORDER.get(kind, 3)

        if path:
            prev = path[-1]
            prev_page = prev.get("page_no", None)

            prev_num = _marker_num_value(prev["node"].get("marker", ""))
            curr_num = _marker_num_value(node.get("marker", ""))
            numeric_continuation = (
                kind == "num"
                and prev.get("kind") == "num"
                and prev_num is not None
                and curr_num is not None
                and curr_num == prev_num + 1
            )

            # bbox units are typically points; smaller shifts are treated as jitter
            indent_matters = abs(indent - prev["indent"]) >= 5.0
            crossed_page = (
                page_no is not None
                and prev_page is not None
                and page_no != prev_page
            )

            if indent_matters and crossed_page and numeric_continuation:
                # Same-level numbering continued on the next page: sibling of prev.
                path.pop()
            elif indent_matters:
                while path and indent <= path[-1]["indent"]:
                    path.pop()
            else:
                prev_kind = prev["kind"]
                prev_rank = prev["order"]
                colon_cue = _nesting_trigger_text(prev["node"].get("text", ""))
                returns_later = _will_return_to_kind(flat_items, pos, prev_kind)

                nests_under_prev = rank > prev_rank and (colon_cue or returns_later)
                if not nests_under_prev:
                    if kind != prev_kind and rank <= prev_rank:
                        # climb out of every level of equal or deeper rank
                        while path and KIND_ORDER.get(path[-1]["kind"], 3) >= rank:
                            path.pop()
                    else:
                        # same kind, or a deeper kind without any nesting cue: sibling
                        path.pop()

        siblings = path[-1]["node"]["children"] if path else roots
        siblings.append(node)
        path.append({"indent": indent, "kind": kind, "order": rank, "node": node, "page_no": page_no})

    return roots


def blocks_to_text_and_lists(blocks):
    """
    Split a sequence of blocks into merged paragraph text and nested lists.

    blocks: list of either
      {"kind":"text","text":...}
      {"kind":"list_item","marker":..., "text":..., "indent":..., "page_no":..., "bbox":...}
    Blocks of any other kind are skipped.

    Returns:
      text: non-noise text blocks joined with newlines
      lists: list of {"items":[nested...]} in reading order, one entry per run of
             consecutive list items (noise text between items does not end a run)

    Side effect: every list item dict gets an "indent_norm" entry, its indent
    minus the smallest indent of the run's items on the same page. This removes
    page-to-page x-offsets before nesting.
    """
    text_parts = []
    lists = []

    i = 0
    while i < len(blocks):
        b = blocks[i]
        kind = b["kind"]
        if kind == "text":
            t = b.get("text", "")
            if t and not is_noise_text(t):
                text_parts.append(t.strip())
            i += 1
            continue

        if kind != "list_item":
            # Unknown block kind: step over it. Without this the run collection
            # below consumes nothing, `i` never advances, and the loop would
            # append empty lists forever.
            i += 1
            continue

        # collect list items, skipping noise text between them
        j = i
        run = []
        while j < len(blocks):
            bb = blocks[j]
            if bb["kind"] == "list_item":
                run.append(bb)
                j += 1
                continue
            if bb["kind"] == "text" and is_noise_text(bb.get("text", "")):
                j += 1
                continue
            break

        # Sort by bbox order if we have provenance (fixes e.g., 9, 11, 10).
        if any((x.get("page_no") is not None and x.get("bbox")) for x in run):
            run = sorted(run, key=_reading_order_key)

        # Per-page indent normalization: some PDFs have different x-offsets per page,
        # so indents are compared relative to the smallest indent seen on that page.
        per_page_min = {}
        for it in run:
            pn = it.get("page_no", None)
            ind = it.get("indent", None)
            if pn is None or ind is None:
                continue
            per_page_min[pn] = min(per_page_min.get(pn, float(ind)), float(ind))

        for it in run:
            pn = it.get("page_no", None)
            ind = it.get("indent", None)
            if pn is None or ind is None or pn not in per_page_min:
                it["indent_norm"] = float(ind or 0)
            else:
                it["indent_norm"] = float(ind) - float(per_page_min[pn])

        nested = nest_list_items_smart(run)
        lists.append({"items": nested})

        # the run starts on a list item, so j > i and the outer loop always advances
        i = j

    text = "\n".join([p for p in text_parts if p]).strip()
    return text, lists

def format_list_items(items, level=0):
    """Render nested list items into plain text (for retrieval display).

    Each item becomes one line "<marker> <text>", indented by two spaces per
    nesting level; children follow their parent one level deeper. Items whose
    marker and text are both empty produce no line. ``items`` may be None.

    Returns the lines joined with newlines ("" when there is nothing to render).
    """
    pad = "  " * level
    lines = []
    for it in items or []:
        mk = it.get("marker", "")
        tx = it.get("text", "")
        # Trim the content first and add the indent afterwards; stripping the
        # finished line would remove the indent again and flatten the nesting.
        body = f"{mk} {tx}".strip()
        if body:
            lines.append(pad + body)
        children = it.get("children")
        if children:
            rendered = format_list_items(children, level + 1)
            if rendered:
                lines.append(rendered)
    return "\n".join(lines)


# ----------------------------
# Tables (robust)
# ----------------------------
def table_rows_robust(table_obj: dict):
    """
    Convert a Docling table object into
    ``{"num_rows": int, "num_cols": int, "rows": list[list[str]]}``.

    The layout is chosen from the first grid cell:
      A) a dict  -> each grid cell's "text" is used (non-dict cells become "")
      B) an int  -> each grid cell is an index into data["table_cells"]
                    (out-of-range or non-int entries become "")
      C) otherwise (no grid, empty first row, other cell type) -> a
         num_rows x num_cols matrix is filled from the row/column span offsets
         of data["table_cells"]; spans outside the matrix are ignored

    ``num_rows``/``num_cols`` come from the table data when present and
    non-zero, else from the grid's dimensions. All cell text is
    whitespace-normalized.
    """
    data = table_obj.get("data") or {}
    grid = data.get("grid") or []
    cells = data.get("table_cells") or []

    n_rows = int(data.get("num_rows") or (len(grid) if grid else 0))
    n_cols = int(data.get("num_cols") or (len(grid[0]) if grid and grid[0] else 0))

    def packed(rows):
        return {"num_rows": n_rows, "num_cols": n_cols, "rows": rows}

    # The first grid cell tells which representation the grid uses.
    probe = grid[0][0] if grid and grid[0] else None

    if isinstance(probe, dict):
        return packed([
            [normalize_ws(entry.get("text", "")) if isinstance(entry, dict) else "" for entry in row]
            for row in grid
        ])

    if isinstance(probe, int):
        def text_at(ref):
            if isinstance(ref, int) and 0 <= ref < len(cells):
                return normalize_ws(cells[ref].get("text", ""))
            return ""

        return packed([[text_at(ref) for ref in row] for row in grid])

    matrix = [[""] * n_cols for _ in range(n_rows)]
    for cell in cells:
        txt = normalize_ws(cell.get("text", ""))
        r0 = cell.get("start_row_offset_idx", 0)
        r1 = cell.get("end_row_offset_idx", r0 + 1)
        c0 = cell.get("start_col_offset_idx", 0)
        c1 = cell.get("end_col_offset_idx", c0 + 1)
        # write the text into every position of the span that lies inside the matrix
        for rr in range(max(r0, 0), min(r1, n_rows)):
            for cc in range(max(c0, 0), min(c1, n_cols)):
                matrix[rr][cc] = txt

    return packed(matrix)


def build_tables_json(doc: dict):
    """
    Build table JSON with:
      1) caption inference (when Docling doesn't attach captions)
      2) multi-page continuation merge (same table split across adjacent pages)
      3) semantic table_id derived from caption number (e.g., "6.2.22")

    Continuation merging compares each new fragment against the LAST fragment
    already merged into the previous table, so tables spanning three or more
    pages are merged into a single record.

    Side effects:
      - prints DEBUG diagnostics to stdout
      - adds the key of every inferred caption text to CAPTION_TEXT_KEYS

    Returns:
      (tables, table_id_map)
        - tables: merged table list
        - table_id_map: maps original table_{i:04d} -> merged/semantic table_id
    """

    # ----------------------------
    # Page geometry stats (coord-system agnostic)
    # ----------------------------
    def page_minmax_y():
        """Per page, the min/max of the bbox "t" values over all text items."""
        bounds = defaultdict(lambda: [None, None])  # page -> [ymin, ymax]
        for t in doc.get("texts", []):
            prov0 = (t.get("prov") or [{}])[0]
            p = prov0.get("page_no")
            bb = prov0.get("bbox") or {}
            y = bb.get("t") if isinstance(bb, dict) else None
            if p is None or y is None:
                continue
            lo, hi = bounds[p]
            bounds[p][0] = y if lo is None else min(lo, y)
            bounds[p][1] = y if hi is None else max(hi, y)
        # Pages with a degenerate (zero-height) range are dropped.
        return {k: (v[0], v[1]) for k, v in bounds.items() if v[0] is not None and v[1] is not None and v[1] > v[0]}

    mm = page_minmax_y()
    print(f"DEBUG build_tables_json: mm pages = {sorted(mm.keys())}")

    def y_norm(page_no, y):
        """Normalise y against the page's text y-range; None if the page is unknown."""
        if page_no not in mm or y is None:
            return None
        lo, hi = mm[page_no]
        return (y - lo) / (hi - lo) if hi > lo else None

    def infer_top_is_high_by_page():
        """
        Decide whether the physical page-top corresponds to high y values (True) or low y values (False).
        Heuristic: whichever extreme band has longer average text is treated as the header side.
        """
        out = {}
        for p in mm.keys():
            hi_band = []
            lo_band = []
            for obj in doc.get("texts", []):
                prov0 = (obj.get("prov") or [{}])[0]
                if prov0.get("page_no") != p:
                    continue
                bb = prov0.get("bbox") or {}
                if not isinstance(bb, dict):
                    continue
                tt = bb.get("t")
                bbv = bb.get("b")
                if tt is None or bbv is None:
                    continue
                ymid = (tt + bbv) / 2.0
                yn = y_norm(p, ymid)
                if yn is None:
                    continue
                txt = normalize_ws(obj.get("text") or "")
                if not txt:
                    continue
                if yn >= 0.90:
                    hi_band.append(len(txt))
                elif yn <= 0.10:
                    lo_band.append(len(txt))

            if hi_band and lo_band:
                out[p] = (sum(hi_band) / len(hi_band)) >= (sum(lo_band) / len(lo_band))
            elif hi_band and not lo_band:
                out[p] = True
            elif lo_band and not hi_band:
                out[p] = False
            else:
                out[p] = True
        return out

    top_is_high = infer_top_is_high_by_page()

    def phys_topness(page_no, y):
        """
        Convert y into a 'physical topness' score in [0,1],
        where 1 means physically near the top of the page.
        """
        yn = y_norm(page_no, y)
        if yn is None:
            return None
        return yn if top_is_high.get(page_no, True) else (1.0 - yn)

    def x_overlap_ratio(a_l, a_r, b_l, b_r):
        """Horizontal intersection divided by the narrower of the two widths."""
        if None in (a_l, a_r, b_l, b_r):
            return 0.0
        inter = max(0.0, min(a_r, b_r) - max(a_l, b_l))
        aw = max(1e-6, a_r - a_l)
        bw = max(1e-6, b_r - b_l)
        return inter / min(aw, bw)

    # ----------------------------
    # Caption parsing + inference
    # ----------------------------
    TABLE_NUM_RE = re.compile(r"(?i)\b(?:table|tbl\.?)\s*([0-9]+(?:\.[0-9]+)*)")

    def parse_table_number(caption: str):
        """Return the dotted table number found in a caption, or None."""
        if not caption:
            return None
        m = TABLE_NUM_RE.search(caption)
        return m.group(1) if m else None

    def sanitize_caption(caption: str, rows) -> str:
        """Normalise a caption and reject one that merely repeats the header row."""
        caption = normalize_ws(caption or "")
        if not caption:
            return ""

        # IMPORTANT: never discard a strong "Table X.Y:" caption even if it matches row[0]
        if TABLE_CAPTION_STRONG_RE.match(caption):
            return caption

        # Otherwise, prevent using column headers as captions
        if rows:
            hdr = normalize_ws(" ".join([c for c in (rows[0] or []) if c]))
            if hdr and canonicalize_repeat_text(caption) == canonicalize_repeat_text(hdr):
                return ""
        return caption

    def infer_caption_from_first_row(rows):
        """
        If Docling swallowed the caption into the first table row, recover it.
        Prefer a single cell that matches "Table <no>: ..." over concatenating the whole row.
        """
        if not rows or not rows[0]:
            return ""

        cells = [normalize_ws(c) for c in rows[0] if normalize_ws(c)]

        # 1) If any single cell is a strong caption, use that
        for c in cells:
            if TABLE_CAPTION_STRONG_RE.match(c):
                return c

        # 2) Otherwise, try the concatenated row
        row0_joined = normalize_ws(" ".join(cells))
        if TABLE_CAPTION_STRONG_RE.match(row0_joined):
            return row0_joined

        return ""

    def infer_table_caption(page_no, table_bbox, rows):
        """
        Find a likely caption line near the table (usually above it),
        using layout + proximity, without hardcoding language.
        """
        if page_no is None or not isinstance(table_bbox, dict) or page_no not in mm:
            print(f"DEBUG: Early return - page_no={page_no}, in_mm={page_no in mm if page_no else 'N/A'}")
            return ""

        t_l = table_bbox.get("l")
        t_r = table_bbox.get("r")
        t_t = table_bbox.get("t")
        t_b = table_bbox.get("b")
        if None in (t_l, t_r, t_t, t_b):
            print(f"DEBUG: Early return - bbox incomplete")
            return ""

        # Determine table physical top boundary (y coordinate)
        table_low, table_high = (t_t, t_b) if t_t < t_b else (t_b, t_t)
        span = mm[page_no][1] - mm[page_no][0]

        # Use coord_origin directly from bbox instead of the band heuristic
        coord_origin = (table_bbox.get("coord_origin") or "").upper()
        # In BOTTOMLEFT, high y = physical top; in TOPLEFT, low y = physical top
        top_high = (coord_origin == "BOTTOMLEFT")

        table_phys_top = table_high if top_high else table_low

        print(f"DEBUG: page={page_no}, table_bbox={table_bbox}")
        print(f"DEBUG: span={span}, coord_origin={coord_origin}, top_high={top_high}, table_phys_top={table_phys_top}")

        header_row_text = ""
        if rows:
            header_row_text = normalize_ws(" ".join([c for c in (rows[0] or []) if c]))

        best = None  # (score, raw, obj)
        # Two passes: a tight search window first, then a wider one if nothing was found.
        for max_gap_frac in (0.08, 0.22):
            max_gap = max_gap_frac * span

            for obj in doc.get("texts", []):
                prov0 = (obj.get("prov") or [{}])[0]
                if prov0.get("page_no") != page_no:
                    continue
                bb = prov0.get("bbox") or {}
                if not isinstance(bb, dict):
                    continue
                l = bb.get("l")
                r = bb.get("r")
                tt = bb.get("t")
                bbv = bb.get("b")
                if None in (l, r, tt, bbv):
                    continue

                raw = normalize_ws(obj.get("text") or "")
                if not raw or is_noise_text(raw):
                    continue
                strong = bool(TABLE_CAPTION_STRONG_RE.match(raw))
                max_len = 260 if strong else 180
                if len(raw) > max_len:
                    continue

                x_overlap = x_overlap_ratio(l, r, t_l, t_r)
                x_min = 0.10 if strong else 0.30
                if x_overlap < x_min:
                    continue

                text_low, text_high = (tt, bbv) if tt < bbv else (bbv, tt)

                # Gap between the caption's physical bottom and the table's physical top.
                # Small positive values mean "caption sits just above the table".
                tol = (0.03 if strong else 0.015) * span
                if top_high:
                    # BOTTOMLEFT: caption's low y should be >= table's high y (caption is above)
                    gap = text_low - table_phys_top
                else:
                    # TOPLEFT: caption's high y should be <= table's low y (caption is above)
                    gap = table_phys_top - text_high

                # Debug: show candidates that mention "table"
                if "table" in raw.lower():
                    print(f"DEBUG CANDIDATE: '{raw[:80]}...'")
                    print(f"  text_bbox: l={l}, t={tt}, r={r}, b={bbv}")
                    print(f"  text_low={text_low}, text_high={text_high}")
                    print(f"  x_overlap={x_overlap:.3f}, gap={gap:.3f}, tol={tol:.3f}, max_gap={max_gap:.3f}")
                    print(f"  gap_check: gap < -tol = {gap < -tol}, gap > max_gap = {gap > max_gap}")

                eff_max_gap = max(max_gap, 0.35 * span) if strong else max_gap
                neg_tol = (0.06 * span) if strong else tol   # allow overlap for "Table X: ..."
                if gap < -neg_tol or gap > eff_max_gap:
                    continue
                gap = max(0.0, gap)

                # Avoid picking the header row itself if it appears as separate text
                if header_row_text and canonicalize_repeat_text(raw) == canonicalize_repeat_text(header_row_text):
                    continue

                # Score: closer is better; mild bonus for digits
                score = - (gap / max(1e-6, span))
                if strong:
                    score += 0.10
                if any(unicodedata.category(ch) == "Nd" for ch in raw):
                    score += 0.02
                if len(raw) <= 90:
                    score += 0.01

                if best is None or score > best[0]:
                    best = (score, raw, obj)

            if best is not None:
                break

        if best:
            # Remember which text item became a caption (recorded in CAPTION_TEXT_KEYS).
            k = _make_text_key_from_obj(best[2])
            if k is not None:
                CAPTION_TEXT_KEYS.add(k)
            return best[1]
        return ""

    # ----------------------------
    # Build raw tables
    # ----------------------------
    raw_tables = []
    for i, tbl in enumerate(doc.get("tables", [])):
        provs = tbl.get("prov") or []
        page_no = provs[0].get("page_no") if provs else None
        bbox = provs[0].get("bbox") if provs else None

        t = table_rows_robust(tbl)
        rows = t.get("rows") or []

        # Caption priority: attached caption -> nearby text -> first table row.
        caption = extract_caption(doc, tbl.get("captions"))
        if not caption:
            caption = infer_table_caption(page_no, bbox or {}, rows)
        if not caption:
            caption = infer_caption_from_first_row(rows)

        caption = sanitize_caption(caption, rows)

        raw_id = f"table_{i:04d}"
        raw_tables.append({
            "table_id": raw_id,  # temporary; will be remapped later
            "raw_table_id": raw_id,
            "page_no": page_no,
            "bbox": bbox,
            "caption": caption,
            "num_rows": t.get("num_rows"),
            "num_cols": t.get("num_cols"),
            "rows": rows,
        })

    # Sort by page then physical-topness (top-to-bottom reading order)
    def tbl_key(t):
        p = t.get("page_no") or 10**9
        bb = t.get("bbox") or {}
        tt = bb.get("t")
        topn = phys_topness(p, tt) if tt is not None else None
        # Higher physical topness first, so reading order top->bottom
        return (p, -(topn if topn is not None else -1))

    raw_tables = sorted(raw_tables, key=tbl_key)

    def header_candidates(table, max_rows=3):
        """
        Return canonicalized header-like strings from the first few rows.
        We use the full row text (not just [0][0]) because Docling often splits headers.
        """
        rows = table.get("rows") or []
        cands = []
        for i in range(min(max_rows, len(rows))):
            row_txt = " ".join([c for c in (rows[i] or []) if normalize_ws(c)])
            row_txt = canonicalize_repeat_text(row_txt)
            if row_txt and len(row_txt) >= 6:
                cands.append(row_txt)
        return cands

    def header_similarity(prev, cur):
        """Best fuzzy ratio between any header-like row of prev and of cur."""
        a = header_candidates(prev)
        b = header_candidates(cur)
        if not a or not b:
            return 0.0
        best = 0.0
        for x in a:
            for y in b:
                best = max(best, SequenceMatcher(None, x, y).ratio())
        return best

    def table_x_overlap(prev, cur):
        pb = prev.get("bbox") or {}
        cb = cur.get("bbox") or {}
        return x_overlap_ratio(pb.get("l"), pb.get("r"), cb.get("l"), cb.get("r"))

    _TABLE_NO_RE = re.compile(r"(?i)\btable\b\.?\s*([0-9]+(?:\.[0-9]+)*)")

    def table_no_from_caption(caption: str):
        """Table number used for continuation matching, or None."""
        if not caption:
            return None
        m = _TABLE_NO_RE.search(caption)
        return m.group(1) if m else None

    # raw_table_id of a merged table -> the fragment most recently merged into it.
    # Page adjacency and geometry must be judged against that last fragment;
    # otherwise a table spanning 3+ pages stops merging after its second page.
    last_fragment = {}

    def should_merge(prev, cur):
        """Decide whether `cur` is a continuation of the merged table `prev`."""
        tail = last_fragment.get(prev.get("raw_table_id"), prev)

        # Basic adjacency (relative to the last page already merged into prev)
        if tail.get("page_no") is None or cur.get("page_no") is None:
            return False
        if cur["page_no"] != tail["page_no"] + 1:
            return False
        if cur.get("num_cols") != prev.get("num_cols"):
            return False

        # Continuation cues from captions/table numbers (strongest)
        prev_cap = (prev.get("caption") or "").strip()
        cur_cap = (cur.get("caption") or "").strip()
        prev_no = table_no_from_caption(prev_cap)
        cur_no = table_no_from_caption(cur_cap)

        # Geometry overlap sanity (prevents accidental merges of unrelated tables)
        xo = table_x_overlap(tail, cur)
        if xo < 0.55:  # tune 0.45–0.70 depending on your docs
            return False

        # If both have numbers and they match -> merge even if headers differ
        if prev_no and cur_no and prev_no == cur_no:
            return True

        # If prev has a number and current has no caption/number -> likely continuation
        if prev_no and not cur_no and not cur_cap:
            return True

        # If prev has any caption and current has none -> likely continuation
        if prev_cap and not cur_cap:
            return True

        # Otherwise, allow fuzzy header match (instead of exact)
        sim = header_similarity(prev, cur)
        if sim >= 0.78:  # tune 0.70–0.85
            return True

        # Final fallback: spatial “bottom of previous fragment / top of cur” rule
        pb = (tail.get("bbox") or {}).get("b")
        ct = (cur.get("bbox") or {}).get("t")
        if pb is None or ct is None:
            return False

        pt = phys_topness(tail["page_no"], pb)
        tt = phys_topness(cur["page_no"], ct)
        if pt is None or tt is None:
            return False

        prev_bottomness = 1.0 - pt
        cur_topness = tt
        thresh = 0.80
        return (prev_bottomness >= thresh) or (cur_topness >= thresh)

    def rows_equal(a, b):
        return canonicalize_repeat_text(" ".join(a or [])) == canonicalize_repeat_text(" ".join(b or []))

    # ----------------------------
    # Merge continuation fragments
    # ----------------------------
    merged = []
    table_id_map = {}

    for cur in raw_tables:
        prev = merged[-1] if merged else None
        if prev is not None and should_merge(prev, cur):
            # Drop duplicated header row if present
            cur_rows = cur.get("rows") or []
            if cur_rows and (prev.get("rows") or []) and rows_equal(cur_rows[0], prev["rows"][0]):
                cur_rows = cur_rows[1:]

            prev["rows"].extend(cur_rows)
            prev["num_rows"] = len(prev["rows"])
            prev.setdefault("page_span", [prev.get("page_no"), prev.get("page_no")])
            prev["page_span"][1] = cur.get("page_no")

            # Keep earliest non-empty caption
            if not prev.get("caption") and cur.get("caption"):
                prev["caption"] = cur["caption"]

            last_fragment[prev["raw_table_id"]] = cur
            table_id_map[cur["raw_table_id"]] = prev["table_id"]
        else:
            merged.append(cur)
            table_id_map[cur["raw_table_id"]] = cur["table_id"]

    # ----------------------------
    # Rename tables to semantic IDs from captions
    # ----------------------------
    final_id_map = {}
    used = set()
    for t in merged:
        old_id = t.get("table_id")
        num = parse_table_number(t.get("caption", ""))
        if num:
            new_id = num
            if new_id in used:
                # Disambiguate by page; add a counter if that is taken too, so
                # two tables can never end up sharing one table_id.
                base = f"{num}_p{t.get('page_no')}"
                new_id = base
                n = 2
                while new_id in used:
                    new_id = f"{base}_{n}"
                    n += 1
            used.add(new_id)
        else:
            new_id = old_id

        t["table_id"] = new_id
        final_id_map[old_id] = new_id

    # Update mapping for raw ids -> final ids
    table_id_map = {raw: final_id_map.get(mid, mid) for raw, mid in table_id_map.items()}

    return merged, table_id_map


def build_figures_json(doc: dict):
    """
    Build figure JSON with:
      1) caption extraction/inference from nearby text
      2) semantic figure_id derived from caption number (e.g., "6.1.1")

    Side effects:
      - adds the key of every inferred caption text to CAPTION_TEXT_KEYS

    Returns:
      (figures, figure_id_map)
        - figures: list of figure records (no file yet)
        - figure_id_map: maps raw figure_{idx:04d} -> final figure_id
    """

    def page_minmax_y():
        """Per page, the min/max of the bbox "t" values over all text items."""
        bounds = defaultdict(lambda: [None, None])  # page -> [ymin, ymax]
        for t in doc.get("texts", []):
            prov0 = (t.get("prov") or [{}])[0]
            p = prov0.get("page_no")
            bb = prov0.get("bbox") or {}
            y = bb.get("t") if isinstance(bb, dict) else None
            if p is None or y is None:
                continue
            lo, hi = bounds[p]
            bounds[p][0] = y if lo is None else min(lo, y)
            bounds[p][1] = y if hi is None else max(hi, y)
        # Pages with a degenerate (zero-height) range are dropped.
        return {k: (v[0], v[1]) for k, v in bounds.items() if v[0] is not None and v[1] is not None and v[1] > v[0]}

    # Only the per-page y-range is needed here: orientation comes from each
    # bbox's coord_origin, so no page-orientation heuristic is computed.
    mm = page_minmax_y()

    def x_overlap_ratio(a_l, a_r, b_l, b_r):
        """Horizontal intersection divided by the narrower of the two widths."""
        if None in (a_l, a_r, b_l, b_r):
            return 0.0
        inter = max(0.0, min(a_r, b_r) - max(a_l, b_l))
        aw = max(1e-6, a_r - a_l)
        bw = max(1e-6, b_r - b_l)
        return inter / min(aw, bw)

    FIGURE_NUM_RE = re.compile(r"(?i)\b(?:figure|fig\.?)\s*([0-9]+(?:\.[0-9]+)*)")

    def parse_figure_number(caption: str):
        """Return the dotted figure number found in a caption, or None."""
        if not caption:
            return None
        m = FIGURE_NUM_RE.search(caption)
        return m.group(1) if m else None

    FIGURE_CAPTION_STRONG_RE = re.compile(
        r"^\s*(?:Figure|Fig\.?)\s+\d+(?:\.\d+)*\s*[:\-–]?\s*\S",
        re.IGNORECASE
    )

    def infer_figure_caption(page_no, fig_bbox):
        """Pick the best nearby text line (above or below the figure) as its caption."""
        if page_no is None or not isinstance(fig_bbox, dict) or page_no not in mm:
            return ""

        f_l = fig_bbox.get("l")
        f_r = fig_bbox.get("r")
        f_t = fig_bbox.get("t")
        f_b = fig_bbox.get("b")
        if None in (f_l, f_r, f_t, f_b):
            return ""

        # Use coord_origin directly (more reliable than heuristics)
        coord_origin = (fig_bbox.get("coord_origin") or "").upper()
        top_high = (coord_origin == "BOTTOMLEFT")  # high y is physical top in BOTTOMLEFT

        fig_low, fig_high = (f_t, f_b) if f_t < f_b else (f_b, f_t)
        span = mm[page_no][1] - mm[page_no][0]

        fig_phys_top = fig_high if top_high else fig_low
        fig_phys_bot = fig_low if top_high else fig_high

        best = None  # (score, raw, obj)

        # Search further because captions can be separated by notes
        for max_gap_frac in (0.25, 0.55):
            max_gap = max_gap_frac * span

            for obj in doc.get("texts", []):
                prov0 = (obj.get("prov") or [{}])[0]
                if prov0.get("page_no") != page_no:
                    continue
                bb = prov0.get("bbox") or {}
                if not isinstance(bb, dict):
                    continue

                l = bb.get("l")
                r = bb.get("r")
                tt = bb.get("t")
                bbv = bb.get("b")
                if None in (l, r, tt, bbv):
                    continue

                raw = normalize_ws(obj.get("text") or "")
                if not raw or is_noise_text(raw):
                    continue

                strong = bool(FIGURE_CAPTION_STRONG_RE.match(raw))
                max_len = 320 if strong else 180
                if len(raw) > max_len:
                    continue

                # X overlap: relax for strong captions
                if x_overlap_ratio(l, r, f_l, f_r) < (0.10 if strong else 0.25):
                    continue

                text_low, text_high = (tt, bbv) if tt < bbv else (bbv, tt)

                # gaps above/below (physical)
                gap_above = (text_low - fig_phys_top) if top_high else (fig_phys_top - text_high)
                gap_below = (fig_phys_bot - text_high) if top_high else (text_low - fig_phys_bot)

                # Accept text either below or above the figure and keep the
                # smaller gap; far gaps are allowed only for a strong "Figure X" line.
                candidates = []
                tol = (0.06 if strong else 0.02) * span
                eff_max_gap = max_gap if not strong else max(max_gap, 0.70 * span)

                if -tol <= gap_below <= eff_max_gap:
                    candidates.append(max(0.0, gap_below))
                if -tol <= gap_above <= eff_max_gap:
                    candidates.append(max(0.0, gap_above))

                if not candidates:
                    continue

                gap = min(candidates)

                score = -(gap / max(1e-6, span))
                if strong:
                    score += 0.15
                if raw.lower().startswith(("figure", "fig.")):
                    score += 0.05

                if best is None or score > best[0]:
                    best = (score, raw, obj)

            if best is not None:
                break

        if best:
            # Remember which text item became a caption (recorded in CAPTION_TEXT_KEYS).
            k = _make_text_key_from_obj(best[2])
            if k is not None:
                CAPTION_TEXT_KEYS.add(k)
            return best[1]

        return ""

    figures = []
    figure_id_map = {}
    used = set()

    for i, pic in enumerate(doc.get("pictures", [])):
        provs = pic.get("prov") or []
        page_no = provs[0].get("page_no") if provs else None
        bbox = provs[0].get("bbox") if provs else None

        caption = extract_caption(doc, pic.get("captions"))
        if not caption:
            caption = infer_figure_caption(page_no, bbox or {})
        caption = normalize_ws(caption)

        raw_id = f"figure_{i:04d}"
        num = parse_figure_number(caption)
        if num:
            new_id = num
            if new_id in used:
                # Disambiguate by page; add a counter if that is taken too, so
                # two figures can never end up sharing one figure_id.
                base = f"{num}_p{page_no}"
                new_id = base
                n = 2
                while new_id in used:
                    new_id = f"{base}_{n}"
                    n += 1
            used.add(new_id)
        else:
            new_id = raw_id

        figure_id_map[raw_id] = new_id
        figures.append({
            "figure_id": new_id,
            "raw_figure_id": raw_id,
            "page_no": page_no,
            "bbox": bbox,
            "caption": caption,
        })

    return figures, figure_id_map


def bbox_to_fitz_rect(bbox: dict, page_height: float):
    """
    Translate a Docling bbox dict into a PyMuPDF ``fitz.Rect``.

    PyMuPDF measures y downward from the top-left corner. A bbox whose
    "coord_origin" is BOTTOMLEFT therefore has its y values flipped against
    ``page_height``; any other origin is used as-is.

    Returns None when ``bbox`` is empty or lacks any of "l", "t", "r", "b".
    """
    if not bbox:
        return None

    left, right = bbox.get("l"), bbox.get("r")
    top, bottom = bbox.get("t"), bbox.get("b")
    if None in (left, top, right, bottom):
        return None

    if (bbox.get("coord_origin") or "").upper() == "BOTTOMLEFT":
        # Flip from bottom-up to top-down coordinates.
        top = page_height - top
        bottom = page_height - bottom

    # Rect wants the smaller y first regardless of how the bbox was stored.
    y_min, y_max = (top, bottom) if top <= bottom else (bottom, top)
    return fitz.Rect(left, y_min, right, y_max)

def extract_figures_from_pdf(figures: list, pdf_path: str, out_dir: str, dpi: int, image_format: str):
    """
    Render and save figure crops given figure records with bbox/page metadata.

    Args:
        figures: records carrying "figure_id", "page_no" and "bbox"
            (and optionally "caption").
        pdf_path: PDF to render from.
        out_dir: directory for the crops (created if missing).
        dpi: render resolution; the zoom factor is dpi / 72.
        image_format: file extension used for the saved crops.

    Returns:
        A list of dicts (figure_id, page_no, bbox, caption, file), one per saved crop.
        Records with missing metadata, an out-of-range page, or a bbox that does
        not intersect the page are skipped.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    zoom = dpi / 72.0
    matrix = fitz.Matrix(zoom, zoom)  # same scale for every crop
    images_meta = []

    pdf = fitz.open(pdf_path)
    try:
        for fig in figures:
            page_no = fig.get("page_no")
            bbox = fig.get("bbox")
            figure_id = fig.get("figure_id")
            if not page_no or not bbox or not figure_id:
                continue

            page_index = page_no - 1  # page_no is treated as 1-based
            if page_index < 0 or page_index >= pdf.page_count:
                continue

            page = pdf.load_page(page_index)
            clip = bbox_to_fitz_rect(bbox, page_height=page.rect.height)
            if clip is None:
                continue
            clip = clip & page.rect
            if clip.is_empty:
                # bbox lies outside the page (or has no area): nothing to render
                continue

            pix = page.get_pixmap(matrix=matrix, clip=clip, alpha=False)

            safe_id = str(figure_id).replace(".", "_")
            fname = f"figure_{safe_id}_p{page_no}.{image_format}"
            fpath = out_dir / fname
            pix.save(str(fpath))

            images_meta.append(
                {
                    "figure_id": figure_id,
                    "page_no": page_no,
                    "bbox": bbox,
                    "caption": fig.get("caption", ""),
                    "file": str(fpath),
                }
            )
    finally:
        # Release the document even if rendering or saving fails.
        pdf.close()

    return images_meta

# ----------------------------
# Equations (FORMULA items)
# ----------------------------
def is_formula_text_item(text_obj: dict) -> bool:
    """True when the item's "label" equals FORMULA, ignoring case and surrounding whitespace."""
    label = text_obj.get("label")
    if not label:
        return False
    return label.strip().upper() == "FORMULA"


def maybe_despace_latex(latex: str) -> str:
    """
    Collapse LaTeX that was emitted with a space between nearly every symbol.

    If the stripped input has at least 8 whitespace-separated tokens and 60% or
    more of them are single characters, all whitespace is removed. Otherwise
    the stripped input is returned unchanged.
    """
    stripped = latex.strip()
    tokens = stripped.split()
    if len(tokens) >= 8:
        one_char = len([tok for tok in tokens if len(tok) == 1])
        if one_char / max(1, len(tokens)) >= 0.6:
            return "".join(tokens)
    return stripped


def extract_formula_latex(text_obj: dict) -> str:
    """
    Return the formula string for a text item.

    Uses the item's "latex" value, falling back to "text" and then to "".
    The value is stripped and passed through normalize_ws; when
    FIX_SPACED_LATEX is truthy it is also run through maybe_despace_latex.
    """
    source = text_obj.get("latex") or text_obj.get("text") or ""
    cleaned = normalize_ws(source.strip())
    if FIX_SPACED_LATEX:
        return maybe_despace_latex(cleaned)
    return cleaned


def extract_equations_from_pdf(doc: dict, pdf_path: str, out_dir: str, dpi: int, image_format: str):
    """
    Render an image crop for every FORMULA text item and collect its metadata.

    Args:
        doc: Docling document dict; its "texts" list is scanned.
        pdf_path: PDF to render from.
        out_dir: directory for the crops (created if missing).
        dpi: render resolution; the zoom factor is dpi / 72.
        image_format: file extension used for the saved crops.

    Returns:
        A list of dicts (equation_id, page_no, bbox, latex, file), one per saved crop.
        equation_id is "eq_{idx:05d}" where idx is the item's index in doc["texts"].
        Items without provenance, with an out-of-range page, or whose bbox does
        not intersect the page are skipped.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    zoom = dpi / 72.0
    matrix = fitz.Matrix(zoom, zoom)  # same scale for every crop
    equations_meta = []

    pdf = fitz.open(pdf_path)
    try:
        for idx, t in enumerate(doc.get("texts", []) or []):
            if not is_formula_text_item(t):
                continue

            provs = t.get("prov") or []
            if not provs:
                continue

            prov = provs[0]
            page_no = prov.get("page_no")
            bbox = prov.get("bbox")
            if not page_no or not bbox:
                continue

            page_index = page_no - 1  # page_no is treated as 1-based
            if page_index < 0 or page_index >= pdf.page_count:
                continue

            page = pdf.load_page(page_index)
            clip = bbox_to_fitz_rect(bbox, page_height=page.rect.height)
            if clip is None:
                continue

            clip = clip & page.rect
            if clip.is_empty:
                # bbox lies outside the page (or has no area): nothing to render
                continue

            pix = page.get_pixmap(matrix=matrix, clip=clip, alpha=False)

            eq_id = f"eq_{idx:05d}"
            fname = f"{eq_id}_p{page_no}.{image_format}"
            fpath = out_dir / fname
            pix.save(str(fpath))

            equations_meta.append(
                {
                    "equation_id": eq_id,
                    "page_no": page_no,
                    "bbox": bbox,
                    "latex": extract_formula_latex(t),
                    "file": str(fpath),
                }
            )
    finally:
        # Release the document even if rendering or saving fails.
        pdf.close()

    return equations_meta


# ----------------------------
# Clause tree
# ----------------------------
def build_clause_tree(doc: dict, table_id_map=None, table_caption_map=None, figure_id_map=None, figure_caption_map=None):
    """
    Build a tree of nodes keyed by clause id (e.g. "6.2.1"), rooted at "ROOT".

    Each node stores:
      - title: heading text (only set for clause ids up to three levels deep)
      - children: child clause ids in order of first appearance
      - text: paragraphs (excluding list items); items that carry a "latex"
        field are inlined here as $$...$$
      - lists: nested list blocks
      - figures: {"figure_id", "caption"} references
      - tables: {"table_id", "caption"} references
      - equations: created empty; this function does not populate it

    When a clause header is seen, the whole ancestor chain is created and
    linked up to ROOT, so a clause such as "6.1.1" stays reachable from ROOT
    even when no "6" or "6.1" header appears in the document.

    Args:
        doc: Docling document dict.
        table_id_map: raw table id -> final table id (optional).
        table_caption_map: final table id -> caption (optional).
        figure_id_map: raw figure id -> final figure id (optional).
        figure_caption_map: final figure id -> caption (optional).

    Returns:
        {"root": "ROOT", "nodes": {clause_id: node_dict, ...}}
    """
    root_id = "ROOT"

    def new_node(cid: str) -> dict:
        # "_blocks" is a scratch list that is folded into text/lists at the end.
        return {
            "id": cid,
            "title": "",
            "children": [],
            "tables": [],
            "figures": [],
            "equations": [],
            "text": "",
            "lists": [],
            "_blocks": [],
        }

    nodes = {root_id: new_node(root_id)}
    current_id = root_id
    table_id_map = table_id_map or {}
    table_caption_map = table_caption_map or {}
    figure_id_map = figure_id_map or {}
    figure_caption_map = figure_caption_map or {}

    def ensure_node(cid: str):
        if cid not in nodes:
            nodes[cid] = new_node(cid)

    def parent_id(cid: str) -> str:
        parts = cid.split(".")
        return root_id if len(parts) <= 1 else ".".join(parts[:-1])

    def add_child(pid: str, cid: str):
        if cid not in nodes[pid]["children"]:
            nodes[pid]["children"].append(cid)

    def link_to_root(cid: str):
        # Walk cid -> parent -> ... -> ROOT, creating and linking every level.
        node_id = cid
        while node_id != root_id:
            ensure_node(node_id)
            pid = parent_id(node_id)
            ensure_node(pid)
            add_child(pid, node_id)
            node_id = pid

    def add_text_block(cid: str, txt: str):
        txt = (txt or "").rstrip()
        if not txt:
            return
        nodes[cid]["_blocks"].append({"kind": "text", "text": txt})

    def add_list_block(cid: str, marker: str, txt: str, indent: float, page_no=None, bbox=None):
        nodes[cid]["_blocks"].append(
            {
                "kind": "list_item",
                "marker": marker,
                "text": txt,
                "indent": indent,
                "page_no": page_no,
                "bbox": bbox,
            }
        )

    for ref in walk_body_in_reading_order(doc):
        kind, idx, obj = resolve_ref(doc, ref)

        if kind == "texts":
            label = (obj.get("label") or "").strip().lower()

            # Prefer the 'latex' field when present and wrap it as display math.
            latex_val = obj.get("latex")
            if latex_val:
                raw = f"$${extract_formula_latex(obj)}$$"
            else:
                raw = obj.get("text") or ""

            # Text already consumed as a table/figure caption is not repeated.
            if is_caption_text_obj(obj):
                continue
            if not raw.strip():
                continue

            provs = obj.get("prov") or []
            prov0 = provs[0] if provs else {}
            page_no = prov0.get("page_no")
            bbox = prov0.get("bbox")
            # Left edge of the bbox doubles as a geometric indent for list nesting.
            geo_indent = float(bbox.get("l")) if isinstance(bbox, dict) and bbox.get("l") is not None else 0.0

            # Clause header detection
            m = CLAUSE_RE.match(raw.strip())
            if m:
                cid = m.group(1)
                rest = m.group(2).strip()

                ensure_node(cid)
                link_to_root(cid)

                # Shallow clauses (<= 3 levels) treat the remainder as a title;
                # deeper ones treat it as body text.
                depth = len(cid.split("."))
                if depth <= 3:
                    if not nodes[cid]["title"]:
                        nodes[cid]["title"] = rest
                else:
                    add_text_block(cid, rest)

                current_id = cid
                continue

            # Regular list and text processing
            marker_field = obj.get("marker") or ""
            mk, content, indent_spaces = parse_list_marker(raw_text=raw, marker_field=marker_field, allow_inline_start=True)

            if label == "list_item" or mk:
                if mk and content:
                    add_list_block(current_id, mk, content, indent=geo_indent if geo_indent else float(indent_spaces), page_no=page_no, bbox=bbox)
                else:
                    add_text_block(current_id, normalize_ws(raw))
            else:
                add_text_block(current_id, normalize_ws(raw))
        elif kind == "tables":
            raw_id = f"table_{idx:04d}"
            merged_id = table_id_map.get(raw_id, raw_id)
            cap = table_caption_map.get(merged_id, "")

            # Avoid duplicates when multiple page-fragments map to the same merged table
            if not (nodes[current_id]["tables"] and nodes[current_id]["tables"][-1].get("table_id") == merged_id):
                nodes[current_id]["tables"].append({"table_id": merged_id, "caption": cap})

        elif kind == "pictures":
            raw_id = f"figure_{idx:04d}"
            fig_id = figure_id_map.get(raw_id, raw_id)
            cap = figure_caption_map.get(fig_id, "") or extract_caption(doc, obj.get("captions"))
            if not nodes[current_id]["figures"] or nodes[current_id]["figures"][-1].get("figure_id") != fig_id:
                nodes[current_id]["figures"].append({"figure_id": fig_id, "caption": cap})

    # Finalize blocks into text + nested lists
    for nid in list(nodes.keys()):
        blocks = nodes[nid].get("_blocks", [])
        text, lists = blocks_to_text_and_lists(blocks)
        nodes[nid]["text"] = text
        nodes[nid]["lists"] = lists
        nodes[nid].pop("_blocks", None)

    return {"root": root_id, "nodes": nodes}


# ----------------------------
# Retrieval helper (optional)
# ----------------------------
def collect_text_recursive(structured: dict, clause_id: str) -> str:
    """Concatenate the text of a clause and of every clause beneath it.

    For the requested node, the pieces are emitted in this order, one per
    line: a heading line ``"<clause_id> <title>"`` (omitted when the id is the
    literal ``"ROOT"`` or the title is empty), the node's ``text``, each list
    rendered through ``format_list_items``, non-empty figure captions,
    non-empty table captions, equations wrapped as ``$$ <latex> $$``, and
    finally the recursively collected text of each entry in ``children``.

    Args:
        structured: Mapping with a ``"nodes"`` key that maps clause ids to
            node dicts (keys read here: ``title``, ``text``, ``lists``,
            ``figures``, ``tables``, ``equations``, ``children``).
        clause_id: Id of the node to start from.

    Returns:
        The joined text with surrounding whitespace stripped, or ``""`` when
        ``clause_id`` is not present in ``structured["nodes"]``.

    Raises:
        KeyError: If ``structured`` has no ``"nodes"`` key.

    A child id that is already being expanded higher up the current path
    (i.e. a cycle in the ``children`` links) contributes nothing, so malformed
    trees terminate instead of raising ``RecursionError``. A node reachable
    through two different parents is still rendered under both.
    """
    nodes = structured["nodes"]
    # Clause ids on the path from the starting node down to the node in hand.
    active_path = set()

    def _collect(cid):
        # Unknown ids and back-references to an ancestor both yield no text.
        if cid not in nodes or cid in active_path:
            return ""
        n = nodes[cid]
        chunks = []

        if cid != "ROOT" and n.get("title"):
            chunks.append(f"{cid} {n['title']}".strip())
        if n.get("text"):
            chunks.append(n["text"])

        for lst in n.get("lists", []):
            rendered = format_list_items(lst.get("items", []))
            if rendered:
                chunks.append(rendered)

        for fig in n.get("figures", []):
            if fig.get("caption"):
                chunks.append(fig["caption"])
        for tbl in n.get("tables", []):
            if tbl.get("caption"):
                chunks.append(tbl["caption"])
        for eq in n.get("equations", []):
            if eq.get("latex"):
                chunks.append(f"$$ {eq['latex']} $$")

        active_path.add(cid)
        try:
            for child in n.get("children", []):
                child_txt = _collect(child)
                if child_txt:
                    chunks.append(child_txt)
        finally:
            # Leave the path clean so sibling subtrees may revisit this id.
            active_path.discard(cid)

        return "\n".join(chunks).strip()

    return _collect(clause_id)


# ----------------------------
# Main
# ----------------------------
def main():
    """Run the whole structuring pipeline and write its JSON outputs.

    Loads the Docling JSON named by ``INPUT_DOCLING_JSON``, builds the table,
    figure and clause structures, calls the figure/equation extraction helpers
    against ``INPUT_PDF``, writes four ``structured_*.json`` files under
    ``OUTPUT_DIR`` and prints the written paths plus a few counts.

    Side effect: rebinds the module-level ``NOISE_CANONICALS``.
    """
    global NOISE_CANONICALS

    output_root = Path(OUTPUT_DIR)
    output_root.mkdir(parents=True, exist_ok=True)
    figure_image_dir = output_root / IMAGES_DIR_NAME
    equation_image_dir = output_root / EQUATIONS_DIR_NAME

    doc = load_docling_json(INPUT_DOCLING_JSON)

    # Detect repeating headers/footers a single time; is_noise_text() relies on
    # this module-level value afterwards.
    NOISE_CANONICALS = detect_repeating_headers_footers(doc)

    # Tables (already merged) come first so that the clause tree can point at
    # merged table ids and their captions.
    tables_struct, table_id_map = build_tables_json(doc)
    table_caption_map = {}
    for table in tables_struct:
        table_caption_map[table["table_id"]] = table.get("caption", "")

    figures_struct, figure_id_map = build_figures_json(doc)
    figure_caption_map = {}
    for figure in figures_struct:
        figure_caption_map[figure["figure_id"]] = figure.get("caption", "")

    clauses_struct = build_clause_tree(
        doc,
        table_id_map=table_id_map,
        table_caption_map=table_caption_map,
        figure_id_map=figure_id_map,
        figure_caption_map=figure_caption_map,
    )

    images_struct = extract_figures_from_pdf(
        figures_struct, INPUT_PDF, str(figure_image_dir), dpi=DPI, image_format=IMAGE_FORMAT
    )
    equations_struct = extract_equations_from_pdf(
        doc, INPUT_PDF, str(equation_image_dir), dpi=DPI, image_format=IMAGE_FORMAT
    )

    # (destination, payload) pairs, in the order they are written and reported.
    outputs = [
        (output_root / "structured_clauses.json", clauses_struct),
        (output_root / "structured_tables.json", tables_struct),
        (output_root / "structured_images.json", images_struct),
        (output_root / "structured_equations.json", equations_struct),
    ]
    for destination, payload in outputs:
        serialized = json.dumps(payload, ensure_ascii=False, indent=2)
        destination.write_text(serialized, encoding="utf-8")

    # quick stats
    nodes = clauses_struct["nodes"]

    def _count_items(items):
        # Each item counts once, plus everything nested under "children".
        return sum(1 + _count_items(entry.get("children", [])) for entry in items)

    list_blocks = 0
    list_items = 0
    for node in nodes.values():
        node_lists = node.get("lists", [])
        list_blocks += len(node_lists)
        for block in node_lists:
            list_items += _count_items(block.get("items", []))

    print("Saved:")
    for destination, _payload in outputs:
        print(" -", destination)

    print("\nCounts:")
    summary = (
        (" clauses:", len(nodes)),
        (" tables:", len(tables_struct)),
        (" figures:", len(images_struct)),
        (" equations:", len(equations_struct)),
        (" list blocks:", list_blocks),
        (" list items:", list_items),
    )
    for label, value in summary:
        print(label, value)


# Entry point: run the pipeline only when this code is executed directly
# (module name is "__main__"), not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

#%%


DEBUG build_tables_json: mm pages = [1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]
DEBUG: Early return - page_no=2, in_mm=False
DEBUG: Early return - page_no=3, in_mm=False
DEBUG: page=4, table_bbox={'l': 129.060791015625, 't': 665.3757400512695, 'r': 484.0224914550781, 'b': 157.935546875, 'coord_origin': 'BOTTOMLEFT'}
DEBUG: span=46.30645243326819, coord_origin=BOTTOMLEFT, top_high=True, table_phys_top=665.3757400512695
Saved:
 - /content/structured_out/structured_clauses.json
 - /content/structured_out/structured_tables.json
 - /content/structured_out/structured_images.json
 - /content/structured_out/structured_equations.json

Counts:
 clauses: 79
 tables: 6
 figures: 1
 equations: 1
 list blocks: 37
 list items: 109
