In [1]:
!pip install docling

Collecting docling
  Downloading docling-2.68.0-py3-none-any.whl.metadata (11 kB)
Collecting docling-core<3.0.0,>=2.50.1 (from docling-core[chunking]<3.0.0,>=2.50.1->docling)
  Downloading docling_core-2.59.0-py3-none-any.whl.metadata (7.7 kB)
Collecting docling-parse<5.0.0,>=4.7.0 (from docling)
  Downloading docling_parse-4.7.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (10 kB)
Collecting docling-ibm-models<4,>=3.9.1 (from docling)
  Downloading docling_ibm_models-3.10.3-py3-none-any.whl.metadata (7.3 kB)
Collecting filetype<2.0.0,>=1.2.0 (from docling)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting pypdfium2!=4.30.1,<5.0.0,>=4.30.0 (from docling)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting rapidocr<4.0.0,>=3.3 (from docling)
  Downloading

In [2]:
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat

# PDF to convert. The recorded run of this cell stopped with FileNotFoundError for this path,
# so the file has to exist at this location before the cell is executed.
source = "/content/bnbc6.pdf"

# Enable formula enrichment so equations become TextItems with label FORMULA and carry LaTeX text
pipeline_options = PdfPipelineOptions()
pipeline_options.do_formula_enrichment = True

# Converter that applies the pipeline options above to PDF inputs.
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)

result = converter.convert(source)

# Full Docling JSON export
# `structured_json` is notebook-level state: the next cell serialises it to disk.
structured_json = result.document.export_to_dict()

# Quick sanity check; indexing texts[0] assumes the export holds at least one text item.
print("Converted. First text item:", structured_json["texts"][0]["text"])


FileNotFoundError: [Errno 2] No such file or directory: '/content/bnbc6.pdf'

In [3]:
import json
# Relative path: the file is created in the kernel's current working directory.
# NOTE(review): the post-processing cell reads "/content/bnbc6.json", which is the same file
# only when the working directory is /content — confirm before running elsewhere.
output_filename = "bnbc6.json"

# ensure_ascii=False keeps non-ASCII characters as-is in the UTF-8 output file.
# `structured_json` must already exist, i.e. the conversion cell has to have succeeded.
with open(output_filename, "w", encoding="utf-8") as f:
    json.dump(structured_json, f, ensure_ascii=False, indent=4)

print(f"Success! Your structured data is saved in {output_filename}")

Success! Your structured data is saved in bnbc6.json


In [3]:
!pip install PyMuPDF


Collecting PyMuPDF
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.7


In [4]:
import json
import re
from pathlib import Path
import fitz  # PyMuPDF

# ----------------------------
# Config (edit these paths)
# ----------------------------
# Docling JSON export to post-process, and the PDF it was produced from
# (the PDF is reopened with PyMuPDF to crop figures and equations).
INPUT_DOCLING_JSON = "/content/bnbc6.json"
INPUT_PDF = "/content/bnbc6.pdf"

# Everything main() writes goes under OUTPUT_DIR; the two *_DIR_NAME values are
# sub-directories of it that receive the cropped images.
OUTPUT_DIR = "/content/structured_out"
IMAGES_DIR_NAME = "images"
EQUATIONS_DIR_NAME = "equations"
# Render resolution for crops; converted to a zoom factor as DPI / 72.
DPI = 200
# Used as the file extension of every saved crop.
IMAGE_FORMAT = "jpg"

# If Docling formula enrichment produces spaced-out LaTeX (each char separated by spaces),
# enable this heuristic de-spacing fix.
FIX_SPACED_LATEX = True


# ----------------------------
# Helpers: Docling JSON access
# ----------------------------
# Clause header: a dotted number ("6", "6.1", "6.1.2"), whitespace, then the rest of the line.
CLAUSE_RE = re.compile(r"^\s*(\d+(?:\.\d+)*)\s+(.*\S)\s*$")

# List marker patterns (start-of-line)
PAREN_MARKER_RE = re.compile(r"^\s*\(\s*([A-Za-z0-9]+|[ivxlcdmIVXLCDM]+)\s*\)\s+")
# The terminator class is exactly "." or ")". Inside [...] a "|" is a literal pipe, not
# alternation, so it must not be listed there: otherwise "a| text" is read as a list marker.
SIMPLE_MARKER_RE = re.compile(r"^\s*([A-Za-z]|\d+|[ivxlcdmIVXLCDM]+)([.)])\s+")
BULLET_MARKER_RE = re.compile(r"^\s*([•\-*–—])\s+")

# Inline markers inside a single paragraph, e.g. "... requirements: (1) ... (2) ... (3) ..."
# We only treat a parenthesized token as a marker if it's preceded by start/whitespace/:/; and followed by space.
INLINE_PAREN_MARKER_RE = re.compile(r"(?:(?<=^)|(?<=[\s:;]))\(\s*([0-9]+|[A-Za-z]|[ivxlcdmIVXLCDM]+)\s*\)\s+")


def normalize_ws(s: str) -> str:
    """Collapse each run of whitespace in ``s`` to one space and trim both ends.

    A falsy ``s`` (``None`` or ``""``) yields ``""``.
    """
    source_text = s if s else ""
    single_spaced = re.sub(r"\s+", " ", source_text)
    return single_spaced.strip()


def load_docling_json(path: str) -> dict:
    """Open ``path`` as UTF-8 text and return the parsed JSON content."""
    with open(path, "r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed


def resolve_ref(doc: dict, ref: str):
    """
    Resolve a Docling JSON pointer against ``doc``.

    ref examples:
      '#/texts/93'
      '#/tables/0'
      '#/pictures/2'
      '#/groups/5'

    Returns:
      (kind, index, obj) where kind is one of 'texts' / 'tables' / 'pictures' / 'groups'
      and obj is doc[kind][index].
      (None, None, None) when the ref cannot be resolved: it is not a string, does not
      have the shape above, or points at a collection or index that ``doc`` does not have.
      Dangling refs are reported the same way as malformed ones so that callers, which
      already test ``kind``, skip them instead of crashing.
    """
    unresolved = (None, None, None)
    if not isinstance(ref, str):
        return unresolved
    m = re.match(r"^#/(texts|tables|pictures|groups)/(\d+)$", ref)
    if not m:
        return unresolved
    kind, idx = m.group(1), int(m.group(2))
    collection = doc.get(kind) or []
    # idx is non-negative (the pattern only admits digits), so only the upper bound matters.
    if idx >= len(collection):
        return unresolved
    return kind, idx, collection[idx]


def extract_caption(doc: dict, captions) -> str:
    """Join the text of every caption reference into one space-separated string.

    ``captions`` is a list of ref dicts such as [{'$ref': '#/texts/93'}]. Entries without
    a '$ref', refs that do not resolve to a text item, and texts that are empty after
    whitespace normalisation are ignored. Returns "" when nothing is left.
    """
    if not captions:
        return ""
    pieces = []
    for entry in captions:
        pointer = entry.get("$ref")
        if pointer:
            kind, _, target = resolve_ref(doc, pointer)
            if kind == "texts":
                cleaned = normalize_ws(target.get("text") or "")
                if cleaned:
                    pieces.append(cleaned)
    return " ".join(pieces).strip()


def walk_body_in_reading_order(doc: dict):
    """
    Yield the '$ref' string of every non-group item under doc["body"], depth first.

    Body children mix texts/tables/pictures with '#/groups/*' entries. A group is not
    yielded itself; its children are visited in place, before the group's later siblings,
    so nothing inside a group is missed. Entries that are not dicts or carry no '$ref'
    are skipped.
    """
    body = doc.get("body") or {}
    # One iterator per children list that is currently being expanded; the innermost
    # group is on top, so exhausting it resumes the enclosing list where it left off.
    cursors = [iter(body.get("children") or [])]

    while cursors:
        try:
            entry = next(cursors[-1])
        except StopIteration:
            cursors.pop()
            continue

        if not isinstance(entry, dict) or "$ref" not in entry:
            continue

        ref = entry["$ref"]
        kind, _, obj = resolve_ref(doc, ref)

        if kind == "groups":
            cursors.append(iter(obj.get("children") or []))
        else:
            yield ref


# ----------------------------
# Generic point parsing utilities
# ----------------------------
def leading_indent_spaces(s: str) -> int:
    """Return the width of the leading indentation of ``s``.

    A space counts as 1 and a tab as 4; counting stops at the first other character.
    Empty or ``None`` input gives 0.
    """
    if not s:
        return 0
    widths = {" ": 1, "\t": 4}
    total = 0
    for ch in s:
        step = widths.get(ch)
        if step is None:
            break
        total += step
    return total


def parse_list_marker(raw_text: str, marker_field: str = "", allow_inline_start: bool = True):
    """
    Detect a list marker on ``raw_text``.

    Returns (marker, content, indent_spaces) when the text looks like a point item,
    otherwise (None, None, None). ``indent_spaces`` is the leading-indent width of the
    untouched ``raw_text``.

    Detection order (first hit wins):
      1) a non-blank ``marker_field`` (e.g. '1.' or '*'); the content is then the whole
         whitespace-normalised text
      2) '(a) ...', '(1) ...', '(i) ...' at the start of the text
      3) 'a) ...', '1. ...', 'i) ...' at the start of the text
      4) a bullet character at the start of the text
    Steps 2-4 are skipped when ``allow_inline_start`` is false or the text is blank.
    """
    no_marker = (None, None, None)
    if raw_text is None:
        return no_marker

    indent = leading_indent_spaces(raw_text)
    stripped = raw_text.lstrip("\t ").rstrip()

    explicit = (marker_field or "").strip()
    if explicit:
        return explicit, normalize_ws(stripped), indent

    if not (allow_inline_start and stripped):
        return no_marker

    # Each recogniser pairs a start-of-text pattern with the way its marker is spelled out.
    recognisers = (
        (PAREN_MARKER_RE, lambda hit: f"({hit.group(1)})"),
        (SIMPLE_MARKER_RE, lambda hit: f"{hit.group(1)}{hit.group(2)}"),
        (BULLET_MARKER_RE, lambda hit: hit.group(1)),
    )
    for pattern, render in recognisers:
        hit = pattern.match(stripped)
        if hit:
            return render(hit), normalize_ws(stripped[hit.end():]), indent

    return no_marker


def split_inline_enumeration(raw_text: str):
    """
    Split a paragraph that carries an inline enumeration, e.g.
      "Intro: (1) aaa (2) bbb (3) ccc"
    into
      intro="Intro:" and items=[{"marker": "(1)", "text": "aaa"}, ...]

    At least two inline markers are required. Item text is whitespace-normalised and
    stripped of surrounding spaces/semicolons; items left empty are dropped.
    Returns (None, None) when the text is empty, has fewer than two markers, or no
    item survives.
    """
    if not raw_text:
        return None, None

    body = raw_text.strip()
    hits = list(INLINE_PAREN_MARKER_RE.finditer(body))
    if len(hits) < 2:
        return None, None

    # An item's text runs from the end of its marker to the start of the next marker
    # (or to the end of the paragraph for the last one).
    stops = [later.start() for later in hits[1:]] + [len(body)]

    items = []
    for hit, stop in zip(hits, stops):
        content = normalize_ws(body[hit.end():stop]).strip(" ;")
        if content:
            items.append({"marker": f"({hit.group(1)})", "text": content})

    if not items:
        return None, None
    return body[:hits[0].start()].strip(), items


def is_noise_text(s: str) -> bool:
    """
    Decide whether ``s`` is layout noise (headers, footers, page numbers) that may sit
    between list items. The rules are generic; no document-specific keywords are used.

    True when the stripped text is:
      - missing or blank,
      - a run of 3 to 6 digits (page numbers like 3235),
      - at most 12 characters long with fewer than two letters, or
      - at most 25 characters long with under 35% alphanumeric characters.
    """
    if s is None:
        return True
    t = s.strip()
    if not t:
        return True

    length = len(t)
    letters = sum(1 for ch in t if ch.isalpha())
    alnum_share = sum(1 for ch in t if ch.isalnum()) / max(1, length)

    return bool(
        re.fullmatch(r"\d{3,6}", t)
        or (length <= 12 and letters < 2)
        or (length <= 25 and alnum_share < 0.35)
    )


# ----------------------------
# Reading-order sort key (for list runs)
# ----------------------------
def _reading_order_key(block: dict):
    """Sort by page then top-to-bottom then left-to-right using bbox."""
    page_no = block.get("page_no")
    bbox = block.get("bbox") or {}
    l = bbox.get("l") if isinstance(bbox, dict) else None
    t = bbox.get("t") if isinstance(bbox, dict) else None
    origin = (bbox.get("coord_origin") or "TOPLEFT").upper() if isinstance(bbox, dict) else "TOPLEFT"

    page_key = page_no if page_no is not None else 10**9
    y_key = 0 if t is None else ((-t) if origin == "BOTTOMLEFT" else t)
    x_key = l if l is not None else 0
    return (page_key, y_key, x_key)


# ----------------------------
# Smart nesting (indent + marker kind + lookahead)
# ----------------------------
def _is_roman(s: str) -> bool:
    if not s:
        return False
    s = s.strip().lower()
    return bool(re.fullmatch(r"[ivxlcdm]+", s))


def marker_kind(marker: str) -> str:
    """Classify a list marker as 'bullet', 'num', 'alpha', 'roman' or 'other'.

    Surrounding brackets/parentheses and a trailing '.' or ')' are removed before the
    remaining core is inspected. The single-letter test runs before the roman test,
    so one-letter markers such as 'i' or 'v' are reported as 'alpha'.
    """
    if not marker:
        return "other"
    token = marker.strip()

    if token in ("•", "-", "*", "–", "—"):
        return "bullet"

    # Peel the opening bracket, then the closing bracket, then a trailing '.' or ')'.
    core = token
    for wrapper in (r"^[\(\[]\s*", r"\s*[\)\]]$"):
        core = re.sub(wrapper, "", core)
    core = re.sub(r"[\.)]$", "", core).strip()

    if core.isdigit():
        return "num"
    if core.isalpha() and len(core) == 1:
        return "alpha"
    return "roman" if _is_roman(core) else "other"


# Rank of each marker kind as used by the nesting helpers: an item whose rank is greater than
# the previous item's rank is a candidate sub-item. "bullet" and "other" share the highest
# rank, and the lookups in this file fall back to 3 for kinds that are not listed.
KIND_ORDER = {"num": 0, "alpha": 1, "roman": 2, "bullet": 3, "other": 3}


def _nesting_trigger_text(s: str) -> bool:
    """Non-hardcoded cue: a colon usually introduces a sublist."""
    return bool(s and s.strip().endswith(":"))


def _will_return_to_kind(flat_items, start_idx: int, target_kind: str) -> bool:
    """Lookahead: if we later return to target_kind, items in between are likely a sublist."""
    for j in range(start_idx + 1, len(flat_items)):
        k = marker_kind(flat_items[j].get("marker", ""))
        if k == target_kind:
            return True
        if KIND_ORDER.get(k, 3) < KIND_ORDER.get(target_kind, 3):
            return False
    return False


def nest_list_items_smart(flat_items):
    """
    Turn a flat run of list items into a tree of
    {"marker", "text", "children"} nodes.

    Placement of each item relative to the previous one uses:
      1) indentation, when it differs from the previous item's by at least 5 units
         (bbox units are typically points);
      2) otherwise the marker kinds: a higher-ranked kind nests under the previous item
         when that item's text ends with ':' or the list later returns to the previous
         item's kind; the same kind is a sibling; a lower-or-equal rank of a different
         kind climbs past every open item of rank >= its own; anything else is a sibling.
    """
    root = []
    # Chain of currently open items, outermost first; the last entry is the previous item.
    # Entry layout: (indent, kind, order, node).
    open_chain = []

    for pos, item in enumerate(flat_items):
        node = {"marker": item.get("marker", ""), "text": item.get("text", ""), "children": []}
        indent = float(item.get("indent") or 0)
        kind = marker_kind(node["marker"])
        order = KIND_ORDER.get(kind, 3)

        if open_chain:
            prev_indent, prev_kind, prev_order, prev_node = open_chain[-1]

            if abs(indent - prev_indent) >= 5.0:
                # Geometry decides: close every open item that is not strictly less indented.
                while open_chain and indent <= open_chain[-1][0]:
                    open_chain.pop()
            else:
                # Indent is uninformative; fall back to marker kinds.
                opens_sublist = order > prev_order and (
                    _nesting_trigger_text(prev_node.get("text", ""))
                    or _will_return_to_kind(flat_items, pos, prev_kind)
                )
                if not opens_sublist:
                    if kind != prev_kind and order <= prev_order:
                        # Climb out of every open level ranked at or below this item.
                        while open_chain and open_chain[-1][2] >= order:
                            open_chain.pop()
                    else:
                        # Same kind, or a deeper kind without a sublist cue: sibling of previous.
                        open_chain.pop()

        # Whatever is left on top of the chain is the parent; an empty chain means top level.
        siblings = open_chain[-1][3]["children"] if open_chain else root
        siblings.append(node)
        open_chain.append((indent, kind, order, node))

    return root


def blocks_to_text_and_lists(blocks):
    """
    Split a clause's block sequence into paragraph text and nested lists.

    blocks: list of either
      {"kind":"text","text":...}
      {"kind":"list_item","marker":..., "text":..., "indent":..., "page_no":..., "bbox":...}

    Returns:
      text: the non-noise text blocks, stripped and joined with newlines
      lists: list of {"items":[nested...]}, one per run of list items, in reading order

    A run starts at a list item and continues over further list items and over noise
    text (which is discarded); any other block ends it.
    """
    paragraphs = []
    lists = []

    pos = 0
    total = len(blocks)
    while pos < total:
        block = blocks[pos]
        if block["kind"] == "text":
            body = block.get("text", "")
            if body and not is_noise_text(body):
                paragraphs.append(body.strip())
            pos += 1
            continue

        # Gather the run of list items that starts here.
        run = []
        scan = pos
        while scan < total:
            candidate = blocks[scan]
            kind = candidate["kind"]
            if kind == "list_item":
                run.append(candidate)
            elif not (kind == "text" and is_noise_text(candidate.get("text", ""))):
                break
            scan += 1

        # Sort by bbox order if we have provenance (fixes e.g., 9, 11, 10).
        if any(entry.get("page_no") is not None and entry.get("bbox") for entry in run):
            run.sort(key=_reading_order_key)

        lists.append({"items": nest_list_items_smart(run)})
        pos = scan

    return "\n".join(p for p in paragraphs if p).strip(), lists


def format_list_items(items, level=0):
    """Render nested list items into plain text (for retrieval display).

    Each item becomes one "<marker> <text>" line, indented by two spaces per nesting
    ``level``; an item's children follow it one level deeper. Items with neither marker
    nor text produce no line of their own, but their children are still rendered.

    The indent is prepended after the line body has been stripped, and nested output is
    joined without re-stripping, so the indentation survives in the result.

    Args:
      items: list of {"marker", "text", "children"} dicts, or None.
      level: nesting depth of ``items``; callers normally leave it at 0.

    Returns:
      The rendered lines joined with newlines ("" when there is nothing to render).
    """
    pad = "  " * level
    lines = []
    for it in items or []:
        body = f"{it.get('marker') or ''} {it.get('text') or ''}".strip()
        if body:
            lines.append(pad + body)
        kids = it.get("children")
        if kids:
            nested = format_list_items(kids, level + 1)
            if nested:
                lines.append(nested)
    return "\n".join(lines)


# ----------------------------
# Tables (robust)
# ----------------------------
def table_rows_robust(table_obj: dict):
    """
    Convert a Docling table object into {"num_rows", "num_cols", "rows"} where rows is a
    list of lists of whitespace-normalised cell strings.

    The layout is chosen from the type of the first grid cell:
      A) dict  -> every grid cell is a dict carrying "text"
      B) int   -> every grid cell is an index into data["table_cells"]
      C) otherwise (missing/empty grid) -> rebuild a num_rows x num_cols matrix from the
         row/column span offsets stored on each entry of table_cells

    num_rows / num_cols come from the data when present, else from the grid's size.
    """
    data = table_obj.get("data") or {}
    grid = data.get("grid") or []
    cells = data.get("table_cells") or []

    num_rows = int(data.get("num_rows") or (len(grid) if grid else 0))
    num_cols = int(data.get("num_cols") or (len(grid[0]) if grid and grid[0] else 0))

    def packaged(rows):
        return {"num_rows": num_rows, "num_cols": num_cols, "rows": rows}

    first_cell = grid[0][0] if grid and grid[0] else None

    if isinstance(first_cell, dict):
        return packaged([
            [normalize_ws(cell.get("text", "")) if isinstance(cell, dict) else "" for cell in row]
            for row in grid
        ])

    if isinstance(first_cell, int):
        def cell_text(ref):
            # Out-of-range or non-integer references become empty cells.
            if isinstance(ref, int) and 0 <= ref < len(cells):
                return normalize_ws(cells[ref].get("text", ""))
            return ""

        return packaged([[cell_text(ref) for ref in row] for row in grid])

    # No usable grid: paint each cell's text over the row/column span it declares.
    matrix = [[""] * num_cols for _ in range(num_rows)]
    for cell in cells:
        text = normalize_ws(cell.get("text", ""))
        row_start = cell.get("start_row_offset_idx", 0)
        row_stop = cell.get("end_row_offset_idx", row_start + 1)
        col_start = cell.get("start_col_offset_idx", 0)
        col_stop = cell.get("end_col_offset_idx", col_start + 1)
        for rr in range(row_start, row_stop):
            for cc in range(col_start, col_stop):
                if 0 <= rr < num_rows and 0 <= cc < num_cols:
                    matrix[rr][cc] = text

    return packaged(matrix)


def build_tables_json(doc: dict) -> list:
    """Describe every entry of doc["tables"] as a JSON-ready record.

    Each record holds a zero-padded "table_id" derived from the table's position, the
    page number and bbox of the first provenance entry (None when there is none), the
    caption text, and the dimensions and rows produced by table_rows_robust.
    """
    records = []
    for position, table in enumerate(doc.get("tables", [])):
        provenance = table.get("prov") or []
        if provenance:
            head = provenance[0]
            page_no, bbox = head.get("page_no"), head.get("bbox")
        else:
            page_no = bbox = None

        caption = extract_caption(doc, table.get("captions"))
        parsed = table_rows_robust(table)

        records.append(
            {
                "table_id": f"table_{position:04d}",
                "page_no": page_no,
                "bbox": bbox,
                "caption": caption,
                "num_rows": parsed["num_rows"],
                "num_cols": parsed["num_cols"],
                "rows": parsed["rows"],
            }
        )
    return records


# ----------------------------
# Images & geometry helpers
# ----------------------------
def bbox_to_fitz_rect(bbox: dict, page_height: float) -> fitz.Rect | None:
    """
    Convert a Docling bbox dict (keys l, t, r, b, coord_origin) into a fitz.Rect.

    Docling bbox has coord_origin, often 'BOTTOMLEFT'; PyMuPDF uses a TOPLEFT origin.
    For a BOTTOMLEFT bbox the vertical coordinates are therefore mirrored against
    ``page_height``; any other origin is taken as already top-left. The corner values
    are sorted so the rectangle always runs from the smaller to the larger coordinate.

    Returns None when any of l, t, r, b is missing.
    """
    left, top, right, bottom = (bbox.get(key) for key in ("l", "t", "r", "b"))
    if any(value is None for value in (left, top, right, bottom)):
        return None

    origin = (bbox.get("coord_origin") or "TOPLEFT").upper()
    if origin == "BOTTOMLEFT":
        vertical = (page_height - top, page_height - bottom)
    else:
        vertical = (top, bottom)

    x0, x1 = sorted((left, right))
    y0, y1 = sorted(vertical)
    return fitz.Rect(x0, y0, x1, y1)


def extract_figures_from_pdf(doc: dict, pdf_path: str, out_dir: str, dpi: int, image_format: str):
    """
    Crop every entry of doc["pictures"] out of the PDF and save it as an image file.

    Args:
      doc: Docling document dict.
      pdf_path: PDF the Docling export was produced from.
      out_dir: directory for the crops (created if missing).
      dpi: render resolution; converted to a zoom factor of dpi / 72.
      image_format: file extension of the saved crops.

    Returns:
      A list of {"figure_id", "page_no", "bbox", "caption", "file"} dicts, one per saved
      crop. Pictures are skipped when they lack provenance, a page number or a bbox,
      point at a page outside the PDF, have an incomplete bbox, or have a bbox that
      does not overlap the page.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    zoom = dpi / 72.0
    images_meta = []

    # The context manager closes the PDF even if rendering or saving raises.
    with fitz.open(pdf_path) as pdf:
        for i, pic in enumerate(doc.get("pictures") or []):
            provs = pic.get("prov") or []
            if not provs:
                continue

            prov = provs[0]
            page_no = prov.get("page_no")
            bbox = prov.get("bbox")
            if not page_no or not bbox:
                continue

            # Docling page numbers start at 1; PyMuPDF page indexes start at 0.
            page_index = page_no - 1
            if page_index < 0 or page_index >= pdf.page_count:
                continue

            page = pdf.load_page(page_index)
            clip = bbox_to_fitz_rect(bbox, page_height=page.rect.height)
            if clip is None:
                continue

            # Keep only the part of the bbox that lies on the page; nothing left means
            # there is nothing to render.
            clip = clip & page.rect
            if clip.is_empty:
                continue

            caption = extract_caption(doc, pic.get("captions"))
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), clip=clip, alpha=False)

            fname = f"figure_{i:04d}_p{page_no}.{image_format}"
            fpath = out_dir / fname
            pix.save(str(fpath))

            images_meta.append(
                {
                    "figure_id": f"figure_{i:04d}",
                    "page_no": page_no,
                    "bbox": bbox,
                    "caption": caption,
                    "file": str(fpath),
                }
            )

    return images_meta


# ----------------------------
# Equations (FORMULA items)
# ----------------------------
def is_formula_text_item(text_obj: dict) -> bool:
    """True when the item's label, ignoring case and surrounding whitespace, is FORMULA."""
    label = text_obj.get("label") or ""
    return label.strip().upper() == "FORMULA"


def maybe_despace_latex(latex: str) -> str:
    """Heuristically remove the spaces from LaTeX whose symbols were emitted one per token.

    The stripped input is returned unchanged when it has fewer than 8
    whitespace-separated tokens or when under 60% of the tokens are single characters.
    Otherwise the tokens are concatenated, with one exception: a space is kept between a
    token that ends in a control word (backslash followed by letters, e.g. ``\\alpha``)
    and a following token that starts with a letter. Without that space the two would
    merge into a different, usually undefined, command (``\\alpha x`` -> ``\\alphax``).

    ``None`` is treated as an empty string.
    """
    s = (latex or "").strip()
    toks = s.split()
    if len(toks) < 8:
        return s

    single = sum(1 for t in toks if len(t) == 1)
    if single / len(toks) < 0.6:
        return s

    pieces = [toks[0]]
    for tok in toks[1:]:
        # pieces[-1] is always the previous token here, never an inserted space.
        if re.search(r"\\[A-Za-z]+$", pieces[-1]) and re.match(r"[A-Za-z]", tok):
            pieces.append(" ")
        pieces.append(tok)
    return "".join(pieces)


def extract_formula_latex(text_obj: dict) -> str:
    """Return the formula source of a text item.

    The "latex" field is preferred, with "text" as the fallback. The value is
    whitespace-normalised, and additionally de-spaced when FIX_SPACED_LATEX is enabled.
    """
    source = text_obj.get("latex") or text_obj.get("text") or ""
    cleaned = normalize_ws(source.strip())
    if FIX_SPACED_LATEX:
        return maybe_despace_latex(cleaned)
    return cleaned


def extract_equations_from_pdf(doc: dict, pdf_path: str, out_dir: str, dpi: int, image_format: str):
    """
    Crop every FORMULA text item out of the PDF and save it as an image file.

    Args:
      doc: Docling document dict.
      pdf_path: PDF the Docling export was produced from.
      out_dir: directory for the crops (created if missing).
      dpi: render resolution; converted to a zoom factor of dpi / 72.
      image_format: file extension of the saved crops.

    Returns:
      A list of {"equation_id", "page_no", "bbox", "latex", "file"} dicts, one per saved
      crop. The equation id is built from the item's index in doc["texts"]. Items are
      skipped when they lack provenance, a page number or a bbox, point at a page
      outside the PDF, have an incomplete bbox, or have a bbox that does not overlap
      the page.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    zoom = dpi / 72.0
    equations_meta = []

    # The context manager closes the PDF even if rendering or saving raises.
    with fitz.open(pdf_path) as pdf:
        for idx, t in enumerate(doc.get("texts", []) or []):
            if not is_formula_text_item(t):
                continue

            provs = t.get("prov") or []
            if not provs:
                continue

            prov = provs[0]
            page_no = prov.get("page_no")
            bbox = prov.get("bbox")
            if not page_no or not bbox:
                continue

            # Docling page numbers start at 1; PyMuPDF page indexes start at 0.
            page_index = page_no - 1
            if page_index < 0 or page_index >= pdf.page_count:
                continue

            page = pdf.load_page(page_index)
            clip = bbox_to_fitz_rect(bbox, page_height=page.rect.height)
            if clip is None:
                continue

            # Keep only the part of the bbox that lies on the page; nothing left means
            # there is nothing to render.
            clip = clip & page.rect
            if clip.is_empty:
                continue

            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), clip=clip, alpha=False)

            eq_id = f"eq_{idx:05d}"
            fname = f"{eq_id}_p{page_no}.{image_format}"
            fpath = out_dir / fname
            pix.save(str(fpath))

            equations_meta.append(
                {
                    "equation_id": eq_id,
                    "page_no": page_no,
                    "bbox": bbox,
                    "latex": extract_formula_latex(t),
                    "file": str(fpath),
                }
            )

    return equations_meta


# ----------------------------
# Clause tree
# ----------------------------
def build_clause_tree(doc: dict):
    """
    Build a tree of clause nodes from a Docling document dict.

    Body items are visited in reading order. A text whose stripped form matches
    CLAUSE_RE opens (or re-enters) the clause with that dotted id; every following item
    is attached to that clause until the next header. Items seen before any header are
    attached to the synthetic "ROOT" node.

    Each node stores:
      - id, title: clause id and, for ids up to three levels deep, the header text
      - children: ids of direct sub-clauses
      - text: paragraphs (excluding list items)
      - lists: nested list blocks
      - equations: equation ids + latex
      - figures: figure ids + captions
      - tables: table ids + captions

    Every clause is linked to its parent all the way up to ROOT; ancestors that never
    appear as headers are created empty, so the whole tree is reachable from ROOT.

    Returns:
      {"root": "ROOT", "nodes": {clause_id: node}}
    """
    root_id = "ROOT"

    def new_node(cid: str) -> dict:
        # "_blocks" is scratch space: it is turned into "text"/"lists" and removed at the end.
        return {
            "id": cid,
            "title": "",
            "children": [],
            "tables": [],
            "figures": [],
            "equations": [],
            "text": "",
            "lists": [],
            "_blocks": [],
        }

    nodes = {root_id: new_node(root_id)}
    current_id = root_id

    def ensure_node(cid: str):
        if cid not in nodes:
            nodes[cid] = new_node(cid)

    def parent_id(cid: str) -> str:
        parts = cid.split(".")
        return root_id if len(parts) <= 1 else ".".join(parts[:-1])

    def add_child(pid: str, cid: str):
        if cid not in nodes[pid]["children"]:
            nodes[pid]["children"].append(cid)

    def link_to_root(cid: str):
        # Create the clause and any missing ancestor, linking each to its parent. Linking
        # only the direct parent would leave e.g. "6.1.2" (seen before "6" and "6.1")
        # hanging in a subtree that ROOT cannot reach.
        ensure_node(cid)
        while cid != root_id:
            pid = parent_id(cid)
            ensure_node(pid)
            add_child(pid, cid)
            cid = pid

    def add_text_block(cid: str, txt: str):
        txt = (txt or "").rstrip()
        if not txt:
            return
        nodes[cid]["_blocks"].append({"kind": "text", "text": txt})

    def add_list_block(cid: str, marker: str, txt: str, indent: float, page_no=None, bbox=None):
        nodes[cid]["_blocks"].append(
            {
                "kind": "list_item",
                "marker": marker,
                "text": txt,
                "indent": indent,
                "page_no": page_no,
                "bbox": bbox,
            }
        )

    for ref in walk_body_in_reading_order(doc):
        kind, idx, obj = resolve_ref(doc, ref)

        if kind == "texts":
            label = (obj.get("label") or "").strip().lower()

            # Equations are handled before anything else:
            #  - a formula that starts with a number (e.g. "0 . 9 D + ...") would otherwise
            #    match CLAUSE_RE and be taken for a clause header;
            #  - a formula with empty text still gets its "[EQ ...]" placeholder instead of
            #    being dropped by the empty-text check below.
            if is_formula_text_item(obj):
                eq_id = f"eq_{idx:05d}"
                latex = extract_formula_latex(obj)
                nodes[current_id]["equations"].append({"equation_id": eq_id, "latex": latex})
                add_text_block(current_id, f"$$ {latex} $$" if latex else f"[EQ {eq_id}]")
                continue

            raw = obj.get("text") or ""
            if not raw.strip():
                continue

            provs = obj.get("prov") or []
            prov0 = provs[0] if provs else {}
            page_no = prov0.get("page_no")
            bbox = prov0.get("bbox")
            # Use bbox.l as a geometry-based indent when available (more reliable than whitespace).
            geo_indent = float(bbox.get("l")) if isinstance(bbox, dict) and bbox.get("l") is not None else 0.0

            # Clause header detection
            m = CLAUSE_RE.match(raw.strip())
            if m:
                cid = m.group(1)
                rest = m.group(2).strip()

                link_to_root(cid)

                depth = len(cid.split("."))

                if depth > 3:
                    # Deeper ids are numbered paragraphs: keep their text as body text.
                    add_text_block(cid, rest)
                elif not nodes[cid]["title"]:
                    nodes[cid]["title"] = rest
                elif rest != nodes[cid]["title"]:
                    # The id was seen before with a different text: keep the new text as
                    # body text instead of discarding it.
                    add_text_block(cid, rest)

                current_id = cid
                continue

            # Inline enumerations inside a paragraph: (1)...(2)...(3)...
            if label != "list_item":
                intro, items = split_inline_enumeration(raw)
                if items:
                    if intro:
                        add_text_block(current_id, normalize_ws(intro))
                    for it2 in items:
                        add_list_block(current_id, it2["marker"], it2["text"], indent=geo_indent, page_no=page_no, bbox=bbox)
                    continue

            # Regular list items
            marker_field = obj.get("marker") or ""
            mk, content, indent_spaces = parse_list_marker(raw_text=raw, marker_field=marker_field, allow_inline_start=True)

            if mk and content:
                # Prefer the geometric indent; fall back to counted leading whitespace.
                add_list_block(current_id, mk, content, indent=geo_indent if geo_indent else float(indent_spaces), page_no=page_no, bbox=bbox)
            else:
                # No usable marker/content pair (also for items labelled list_item): plain text.
                add_text_block(current_id, normalize_ws(raw))

        elif kind == "tables":
            caption = extract_caption(doc, obj.get("captions"))
            nodes[current_id]["tables"].append({"table_id": f"table_{idx:04d}", "caption": caption})

        elif kind == "pictures":
            caption = extract_caption(doc, obj.get("captions"))
            nodes[current_id]["figures"].append({"figure_id": f"figure_{idx:04d}", "caption": caption})

    # Finalize blocks into text + nested lists
    for nid in list(nodes.keys()):
        blocks = nodes[nid].get("_blocks", [])
        text, lists = blocks_to_text_and_lists(blocks)
        nodes[nid]["text"] = text
        nodes[nid]["lists"] = lists
        nodes[nid].pop("_blocks", None)

    return {"root": root_id, "nodes": nodes}


# ----------------------------
# Retrieval helper (optional)
# ----------------------------
def collect_text_recursive(structured: dict, clause_id: str) -> str:
    """Flatten a clause and all of its descendants into one newline-joined string.

    For each node the pieces appear in this order: "<id> <title>" (never for ROOT),
    body text, rendered lists, figure captions, table captions, equations wrapped in
    "$$ ... $$", and finally the text of every child clause. Empty pieces are left out.
    An unknown ``clause_id`` gives "".
    """
    nodes = structured["nodes"]
    if clause_id not in nodes:
        return ""
    node = nodes[clause_id]
    parts = []

    title = node.get("title")
    if clause_id != "ROOT" and title:
        parts.append(f"{clause_id} {title}".strip())

    body = node.get("text")
    if body:
        parts.append(body)

    for lst in node.get("lists", []):
        rendered = format_list_items(lst.get("items", []))
        if rendered:
            parts.append(rendered)

    parts.extend(fig["caption"] for fig in node.get("figures", []) if fig.get("caption"))
    parts.extend(tbl["caption"] for tbl in node.get("tables", []) if tbl.get("caption"))
    parts.extend(f"$$ {eq['latex']} $$" for eq in node.get("equations", []) if eq.get("latex"))

    for child_id in node.get("children", []):
        child_text = collect_text_recursive(structured, child_id)
        if child_text:
            parts.append(child_text)

    return "\n".join(parts).strip()


# ----------------------------
# Main
# ----------------------------
def main():
    """Run the whole post-processing pipeline using the module-level configuration.

    Loads the Docling JSON, builds the clause tree and the table records, crops figures
    and equations from the PDF into sub-directories of OUTPUT_DIR, writes the four
    structured_*.json files into OUTPUT_DIR, and prints the written paths plus counts.
    """
    out_base = Path(OUTPUT_DIR)
    out_base.mkdir(parents=True, exist_ok=True)

    doc = load_docling_json(INPUT_DOCLING_JSON)

    clauses_struct = build_clause_tree(doc)
    tables_struct = build_tables_json(doc)
    images_struct = extract_figures_from_pdf(
        doc, INPUT_PDF, str(out_base / IMAGES_DIR_NAME), dpi=DPI, image_format=IMAGE_FORMAT
    )
    equations_struct = extract_equations_from_pdf(
        doc, INPUT_PDF, str(out_base / EQUATIONS_DIR_NAME), dpi=DPI, image_format=IMAGE_FORMAT
    )

    # (destination, payload) pairs, in the order they are written and reported.
    outputs = [
        (out_base / "structured_clauses.json", clauses_struct),
        (out_base / "structured_tables.json", tables_struct),
        (out_base / "structured_images.json", images_struct),
        (out_base / "structured_equations.json", equations_struct),
    ]
    for target, payload in outputs:
        target.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")

    # quick stats
    nodes = clauses_struct["nodes"]
    list_blocks = sum(len(n.get("lists", [])) for n in nodes.values())

    def count_items(items):
        # Every item counts once, plus everything nested beneath it.
        return sum(1 + count_items(it.get("children", [])) for it in items)

    list_items = sum(
        count_items(lst.get("items", []))
        for n in nodes.values()
        for lst in n.get("lists", [])
    )

    print("Saved:")
    for target, _ in outputs:
        print(" -", target)
    print("\nCounts:")
    print(" clauses:", len(nodes))
    print(" tables:", len(tables_struct))
    print(" figures:", len(images_struct))
    print(" equations:", len(equations_struct))
    print(" list blocks:", list_blocks)
    print(" list items:", list_items)


if __name__ == "__main__":
    main()


FileNotFoundError: [Errno 2] No such file or directory: 'bnbc6.json'