In [1]:
!pip install docling

Collecting docling
  Downloading docling-2.68.0-py3-none-any.whl.metadata (11 kB)
Collecting docling-core<3.0.0,>=2.50.1 (from docling-core[chunking]<3.0.0,>=2.50.1->docling)
  Downloading docling_core-2.59.0-py3-none-any.whl.metadata (7.7 kB)
Collecting docling-parse<5.0.0,>=4.7.0 (from docling)
  Downloading docling_parse-4.7.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (10 kB)
Collecting docling-ibm-models<4,>=3.9.1 (from docling)
  Downloading docling_ibm_models-3.10.3-py3-none-any.whl.metadata (7.3 kB)
Collecting filetype<2.0.0,>=1.2.0 (from docling)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting pypdfium2!=4.30.1,<5.0.0,>=4.30.0 (from docling)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting rapidocr<4.0.0,>=3.3 (from docling)
  Downloading

In [2]:
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat

source = "/content/bnbc6.pdf"

# Enable formula enrichment so equations become TextItems with label FORMULA and carry LaTeX text
pipeline_options = PdfPipelineOptions()
pipeline_options.do_formula_enrichment = True

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)

result = converter.convert(source)

# Full Docling JSON export
structured_json = result.document.export_to_dict()

# The export may contain no text items at all; report that instead of raising
# IndexError after the (slow) conversion has already finished.
exported_texts = structured_json.get("texts") or []
if exported_texts:
    print("Converted. First text item:", exported_texts[0].get("text", ""))
else:
    print("Converted. No text items were found.")


[32m[INFO] 2026-01-15 08:44:36,604 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-15 08:44:36,610 [RapidOCR] device_config.py:50: Using CPU device[0m
[32m[INFO] 2026-01-15 08:44:36,613 [RapidOCR] download_file.py:68: Initiating download: https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.5.0/torch/PP-OCRv4/det/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-15 08:44:38,278 [RapidOCR] download_file.py:82: Download size: 13.83MB[0m
[32m[INFO] 2026-01-15 08:44:38,442 [RapidOCR] download_file.py:95: Successfully saved to: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-15 08:44:38,444 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-15 08:44:38,786 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-15 08:44:38,787 [RapidOCR] device_config.py:50: Using CPU device[0m
[32m[INFO] 2026-01-

Converted. First text item: 3232


In [3]:
import json
output_filename = "bnbc6.json"

# Persist the complete Docling export next to the notebook. ensure_ascii=False writes
# non-ASCII characters as-is instead of \uXXXX escapes.
with open(output_filename, mode="w", encoding="utf-8") as json_file:
    json.dump(structured_json, json_file, ensure_ascii=False, indent=4)

print(f"Success! Your structured data is saved in {output_filename}")

Success! Your structured data is saved in bnbc6.json


In [4]:
!pip install PyMuPDF


Collecting PyMuPDF
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m87.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.7


In [6]:
import json
import re
from pathlib import Path
import fitz  # PyMuPDF

# ----------------------------
# Config (edit these paths)
# ----------------------------
# Docling JSON export to post-process (read by main() via load_docling_json).
INPUT_DOCLING_JSON = "/content/bnbc6.json"
# PDF that figure and equation crops are rendered from.
INPUT_PDF = "/content/bnbc6.pdf"

# Directory that receives the structured_*.json files; created by main() if missing.
OUTPUT_DIR = "/content/structured_out"
# Sub-directory of OUTPUT_DIR for cropped figure images.
IMAGES_DIR_NAME = "images"
# Sub-directory of OUTPUT_DIR for cropped equation images.
EQUATIONS_DIR_NAME = "equations"
# Render resolution for crops; the extractors scale pages by DPI / 72.
DPI = 200
# Used as the file extension of every saved crop.
IMAGE_FORMAT = "jpg"

# If Docling formula enrichment produces spaced-out LaTeX (each char separated by spaces),
# enable this heuristic de-spacing fix.
FIX_SPACED_LATEX = True


# ----------------------------
# Helpers: Docling JSON access
# ----------------------------
# Matches a clause header line: a dotted number ("5", "5.10", "5.10.2.1") followed by
# whitespace and a non-empty remainder. group(1) is the clause id, group(2) the remainder
# with trailing whitespace removed.
# NOTE(review): any line that merely starts with a number and a space (e.g. "25 mm ...")
# also matches, so callers get false positives on such text.
CLAUSE_RE = re.compile(r"^\s*(\d+(?:\.\d+)*)\s+(.*\S)\s*$")


def normalize_ws(s: str) -> str:
    """
    Collapse every run of whitespace in `s` to a single space and trim both ends.

    A falsy `s` (None or "") yields "".
    """
    if not s:
        return ""
    # str.split() without arguments splits on whitespace runs and drops empty edge
    # fields, so re-joining with one space both collapses and trims.
    return " ".join(s.split())


def load_docling_json(path: str) -> dict:
    """Read the UTF-8 encoded JSON file at `path` and return the parsed content."""
    raw_json = Path(path).read_text(encoding="utf-8")
    return json.loads(raw_json)


def resolve_ref(doc: dict, ref: str):
    """
    Resolve a Docling JSON pointer to the object it names.

    ref examples:
      '#/texts/93'
      '#/tables/0'
      '#/pictures/2'
      '#/groups/5'

    Returns:
      (kind, idx, obj) where kind is one of 'texts' / 'tables' / 'pictures' / 'groups',
      idx is the integer index and obj is doc[kind][idx].
      (None, None, None) when `ref` is not a string of the form above, or when the
      collection is missing from `doc` or the index is past its end. Dangling
      references are therefore reported the same way as malformed ones instead of
      raising KeyError / IndexError.
    """
    unresolved = (None, None, None)
    if not isinstance(ref, str):
        return unresolved

    m = re.match(r"^#/(texts|tables|pictures|groups)/(\d+)$", ref)
    if not m:
        return unresolved

    kind, idx = m.group(1), int(m.group(2))
    collection = doc.get(kind) or []
    if idx >= len(collection):
        return unresolved
    return kind, idx, collection[idx]


def extract_caption(doc: dict, captions) -> str:
    """
    Join the text of every caption reference into one space-separated string.

    `captions` is typically a list like [{'$ref': '#/texts/93'}]. Entries without a
    '$ref', references that do not resolve to a 'texts' item, and captions whose
    text is empty after whitespace normalization are ignored. Returns "" when
    nothing usable is found.
    """
    if not captions:
        return ""

    pieces = []
    for entry in captions:
        pointer = entry.get("$ref")
        if not pointer:
            continue
        kind, _, target = resolve_ref(doc, pointer)
        if kind != "texts":
            continue
        caption_text = normalize_ws(target.get("text") or "")
        if caption_text:
            pieces.append(caption_text)

    return " ".join(pieces).strip()


def walk_body_in_reading_order(doc: dict):
    """
    Yield the '$ref' string of every body item in reading order.

    doc['body']['children'] mixes references to texts/tables/pictures with
    references to '#/groups/*'. A group is never yielded itself; its children are
    visited in its place (depth-first), so grouped content is not missed.
    Children that are not dicts or carry no '$ref' key are skipped.
    """
    body = doc.get("body") or {}

    # Stack of child iterators: the top one is the group currently being expanded.
    pending = [iter(body.get("children") or [])]

    while pending:
        for entry in pending[-1]:
            if not isinstance(entry, dict) or "$ref" not in entry:
                continue

            ref = entry["$ref"]
            kind, _, target = resolve_ref(doc, ref)

            if kind == "groups":
                # Descend now; the parent iterator resumes once this group is exhausted.
                pending.append(iter(target.get("children") or []))
                break

            yield ref
        else:
            # Current level fully consumed: return to the enclosing level.
            pending.pop()


# ----------------------------
# Generic nested point/list parsing (handles (a), (1), (i), bullets, etc.)
# ----------------------------
# "(a) ", "(12) ", "( iv ) " ... : a parenthesised alphanumeric token followed by whitespace.
# group(1) is the token without the parentheses. (Roman numerals are plain letters, so the
# alphanumeric class already covers them.)
PAREN_MARKER_RE = re.compile(r"^\s*\(\s*([A-Za-z0-9]+)\s*\)\s+")
# "a) ", "1. ", "iv) " ... : a single letter, a number or a roman numeral, then "." or ")".
# group(1) is the token, group(2) the delimiter. The delimiter class is exactly "." and ")";
# writing it as [\.|\)] would also accept a literal "|" because "|" is not an alternation
# operator inside a character class.
SIMPLE_MARKER_RE = re.compile(r"^\s*([A-Za-z]|\d+|[ivxlcdmIVXLCDM]+)([.)])\s+")
# A leading bullet/dash character followed by whitespace; group(1) is the bullet.
BULLET_MARKER_RE = re.compile(r"^\s*([•\-*–—])\s+")


def leading_indent(s: str) -> int:
    """
    Width of the leading run of spaces/tabs in `s`.

    Each space counts 1 and each tab counts 4 (tabs are rare in PDFs).
    A falsy `s` yields 0.
    """
    if not s:
        return 0
    # The pattern can match the empty string, so a match object is always returned.
    prefix = re.match(r"[ \t]*", s).group(0)
    return prefix.count(" ") + 4 * prefix.count("\t")


def parse_list_marker(raw_text: str, marker_field: str = "", allow_inline: bool = True):
    """
    Decide whether a text item is a list point and split it into marker and content.

    Returns (marker, content, indent) for a point item, otherwise (None, None, None).
    `indent` is the leading-whitespace width of `raw_text`; `content` is
    whitespace-normalized.

    Sources of the marker, in priority order:
      1) `marker_field` supplied by Docling (e.g. '1.' or '*'); the whole text is
         then taken as content because Docling's `text` usually excludes the marker
      2) '(a) ...', '(1) ...', '(i) ...' at the start of the text
      3) 'a) ...', '1. ...', 'i) ...' at the start of the text
      4) a bullet character at the start of the text

    Sources 2-4 are only tried when `allow_inline` is true and the text is not
    blank. Clause headers like '5.10.2.1 ...' are expected to be filtered out by
    the caller before this function is used.
    """
    no_match = (None, None, None)
    if raw_text is None:
        return no_match

    indent = leading_indent(raw_text)
    body = raw_text.lstrip("\t ").rstrip()

    explicit_marker = (marker_field or "").strip()
    if explicit_marker:
        return explicit_marker, normalize_ws(body), indent

    if not (allow_inline and body):
        return no_match

    # Each pattern is paired with the way its match is rendered back into a marker.
    inline_patterns = (
        (PAREN_MARKER_RE, lambda hit: f"({hit.group(1)})"),
        (SIMPLE_MARKER_RE, lambda hit: f"{hit.group(1)}{hit.group(2)}"),
        (BULLET_MARKER_RE, lambda hit: hit.group(1)),
    )
    for pattern, render_marker in inline_patterns:
        hit = pattern.match(body)
        if hit:
            return render_marker(hit), normalize_ws(body[hit.end():]), indent

    return no_match


def _is_roman(s: str) -> bool:
    if not s:
        return False
    s = s.strip().lower()
    return bool(re.fullmatch(r"[ivxlcdm]+", s))


def marker_kind(marker: str) -> str:
    """
    Classify a list marker as "num", "alpha", "roman", "bullet" or "other".

      "(1)", "1.", "2)"      -> "num"
      "(a)", "a)", "(i)"     -> "alpha"   (any single letter)
      "(ii)", "iv."          -> "roman"   (two or more roman-numeral letters)
      "•", "-", "*", "–", "—" -> "bullet"
      anything else / empty  -> "other"

    NOTE(review): because the single-letter test runs before the roman test,
    "(i)", "(v)" and "(x)" are reported as "alpha" while "(ii)" is "roman".
    Telling them apart needs the neighbouring markers, which this function
    does not see.
    """
    if not marker:
        return "other"
    token = marker.strip()

    if BULLET_MARKER_RE.match(token) or token in {"•", "-", "*", "–", "—"}:
        return "bullet"

    # Peel off, in order: an opening bracket, a closing bracket, a trailing "." or ")".
    core = token
    for wrapper in (r"^[\(\[]\s*", r"\s*[\)\]]$", r"[.)]$"):
        core = re.sub(wrapper, "", core)
    core = core.strip()

    if core.isdigit():
        return "num"
    if len(core) == 1 and core.isalpha():
        return "alpha"
    return "roman" if _is_roman(core) else "other"


# Nesting rank per marker kind as returned by marker_kind(): a smaller number is treated
# as shallower (closer to the top level) by the list-nesting heuristics. "bullet" and
# "other" share the deepest rank; lookups elsewhere also default unknown kinds to 3.
KIND_ORDER = {"num": 0, "alpha": 1, "roman": 2, "bullet": 3, "other": 3}


def _nesting_trigger_text(s: str) -> bool:
    """
    Strong cue that a sublist follows. Your example uses ':'.
    """
    if not s:
        return False
    t = s.strip().lower()
    if t.endswith(":"):
        return True
    cues = [
        "as follows:",
        "the following:",
        "the following criteria:",
        "meets all of the following criteria:",
        "all of the following:",
        "based on:",
        "shall include:",
        "shall be based on:",
    ]
    return any(t.endswith(c) for c in cues)


def _will_return_to_kind(flat_items, start_idx: int, target_kind: str) -> bool:
    """
    Lookahead heuristic:
      (6) then (a)(b)(c) then (7)  => (a)(b)(c) belong under (6)
    """
    for j in range(start_idx + 1, len(flat_items)):
        k = marker_kind(flat_items[j].get("marker", ""))
        if k == target_kind:
            return True
        # if we see something shallower than target_kind, stop
        if KIND_ORDER.get(k, 3) < KIND_ORDER.get(target_kind, 3):
            return False
    return False


def nest_list_items_smart(flat_items):
    """
    Turn a flat run of list items into a tree of
    {"marker", "text", "children"} nodes and return the top-level nodes.

    For every item after the first, the parent is chosen as follows:
      1) Indentation, when it differs from the previous item's by at least 2:
         ancestors indented as much as or more than the item are closed and the
         item goes under whatever remains open.
      2) Otherwise marker kinds decide:
         - a deeper kind (higher KIND_ORDER rank) becomes a child of the previous
           item only if the previous text ends with ':' or the previous kind
           reappears later (lookahead); this prevents over-nesting
         - the same kind becomes a sibling of the previous item
         - a shallower or equally ranked different kind closes every open ancestor
           whose rank is greater than or equal to the item's rank
         - anything else becomes a sibling of the previous item
    """
    root = []
    # Currently open ancestors, outermost first. Each entry remembers the node
    # together with the indent / kind / rank it was opened with.
    open_chain = []

    for position, item in enumerate(flat_items):
        node = {"marker": item.get("marker", ""), "text": item.get("text", ""), "children": []}
        indent = int(item.get("indent") or 0)
        kind = marker_kind(node["marker"])
        rank = KIND_ORDER.get(kind, 3)

        if open_chain:
            previous = open_chain[-1]

            if abs(indent - previous["indent"]) >= 2:  # tune if needed
                # 1) Indentation is informative: trust it.
                while open_chain and indent <= open_chain[-1]["indent"]:
                    open_chain.pop()
            else:
                # 2) Marker-based inference.
                colon_cue = _nesting_trigger_text(previous["node"].get("text", ""))
                return_cue = _will_return_to_kind(flat_items, position, previous["kind"])

                if rank > previous["order"] and (colon_cue or return_cue):
                    pass  # stays under the previous item: nothing to close
                elif kind == previous["kind"]:
                    open_chain.pop()
                elif rank <= previous["order"]:
                    while open_chain and KIND_ORDER.get(open_chain[-1]["kind"], 3) >= rank:
                        open_chain.pop()
                else:
                    # Deeper kind without any cue: keep it level with the previous item.
                    open_chain.pop()

        # Attach under the innermost ancestor still open, or at top level.
        siblings = open_chain[-1]["node"]["children"] if open_chain else root
        siblings.append(node)
        open_chain.append({"indent": indent, "kind": kind, "order": rank, "node": node})

    return root

def blocks_to_text_and_lists(blocks):
    """
    Split a node's parsing buffer into plain text and nested lists.

    blocks: list of either
      {"kind":"text","text":...}
      {"kind":"list_item","marker":..., "text":..., "indent":...}

    Returns:
      text: the non-list blocks, each stripped, joined with newlines
      lists: list of {"items":[nested...]} in reading order; every contiguous run
             of list items becomes one entry

    Blocks of any other kind are skipped (they still end the current run of list
    items). Without that, such a block would never advance the scan and the
    function would loop forever. `None` is accepted as an empty buffer.
    """
    text_parts = []
    lists = []
    run = []

    def flush_run():
        # Close the current run of consecutive list items, if there is one.
        if run:
            lists.append({"items": nest_list_items_smart(list(run))})
            run.clear()

    for block in blocks or []:
        kind = block.get("kind")
        if kind == "list_item":
            run.append(block)
            continue

        flush_run()
        if kind == "text":
            paragraph = (block.get("text") or "").strip()
            if paragraph:
                text_parts.append(paragraph)

    flush_run()

    text = "\n".join(text_parts).strip()
    return text, lists


def format_list_items(items, level=0):
    """
    Render nested list items into plain text (for retrieval display).

    Every item becomes one line "<marker> <text>", indented by two spaces per
    nesting level starting at `level`; the lines of its children follow directly
    below it, one level deeper. Items whose marker and text are both empty
    contribute no line of their own, but their children are still rendered.

    The indentation is applied after the line itself has been trimmed, so the
    nesting stays visible in the output.

    Returns the lines joined with newlines, or "" when there is nothing to render.
    """
    lines = []
    pad = "  " * level
    for it in items or []:
        mk = it.get("marker", "")
        tx = it.get("text", "")
        label = f"{mk} {tx}".strip()
        if label:
            lines.append(pad + label)

        children = it.get("children")
        if children:
            rendered_children = format_list_items(children, level + 1)
            if rendered_children:
                lines.append(rendered_children)
    return "\n".join(lines)


# ----------------------------
# Tables (FIXED)
# ----------------------------
def table_rows_robust(table_obj: dict):
    """
    Convert a Docling table object into {"num_rows", "num_cols", "rows"} where
    rows is a list of lists of whitespace-normalized cell texts.

    The layout is chosen from the first grid cell:
      A) data["grid"][r][c] is a dict with "text"
      B) data["grid"][r][c] is an int index into data["table_cells"]
      C) anything else, including a missing/empty grid -> an empty
         num_rows x num_cols matrix is filled from the span metadata of
         data["table_cells"]

    num_rows / num_cols come from the data when present, otherwise from the grid
    dimensions (0 without a grid). In A and B a cell of the wrong type, or an
    index outside table_cells, yields "".
    """
    payload = table_obj.get("data") or {}
    grid = payload.get("grid") or []
    cells = payload.get("table_cells") or []

    row_count = int(payload.get("num_rows") or (len(grid) if grid else 0))
    col_count = int(payload.get("num_cols") or (len(grid[0]) if grid and grid[0] else 0))

    def packed(rows):
        return {"num_rows": row_count, "num_cols": col_count, "rows": rows}

    # The first cell decides which representation the grid uses.
    first_cell = grid[0][0] if grid and grid[0] else None

    # Case A: grid holds dict cells directly
    if isinstance(first_cell, dict):
        return packed(
            [
                [normalize_ws(entry.get("text", "")) if isinstance(entry, dict) else "" for entry in line]
                for line in grid
            ]
        )

    # Case B: grid holds ints referencing table_cells
    if isinstance(first_cell, int):
        return packed(
            [
                [
                    normalize_ws(cells[ref].get("text", ""))
                    if isinstance(ref, int) and 0 <= ref < len(cells)
                    else ""
                    for ref in line
                ]
                for line in grid
            ]
        )

    # Case C: reconstruct from table_cells span metadata
    matrix = [[""] * col_count for _ in range(row_count)]
    for cell in cells:
        cell_text = normalize_ws(cell.get("text", ""))
        top = cell.get("start_row_offset_idx", 0)
        bottom = cell.get("end_row_offset_idx", top + 1)
        left = cell.get("start_col_offset_idx", 0)
        right = cell.get("end_col_offset_idx", left + 1)
        # A spanning cell repeats its text in every position it covers.
        for rr in range(top, bottom):
            if not 0 <= rr < row_count:
                continue
            for cc in range(left, right):
                if 0 <= cc < col_count:
                    matrix[rr][cc] = cell_text

    return packed(matrix)


def build_tables_json(doc: dict) -> list:
    """
    Build one record per entry of doc["tables"].

    Each record carries a positional id ("table_0000", ...), the page number and
    bbox of the table's first provenance entry (None when it has none), the
    caption text, and the row data produced by table_rows_robust().
    """
    records = []
    for position, table in enumerate(doc.get("tables", [])):
        provenance = table.get("prov") or []
        # Only the first provenance entry is used for the location.
        location = provenance[0] if provenance else {}
        page_no = location.get("page_no")
        bbox = location.get("bbox")
        caption = extract_caption(doc, table.get("captions"))
        content = table_rows_robust(table)

        records.append(
            {
                "table_id": f"table_{position:04d}",
                "page_no": page_no,
                "bbox": bbox,
                "caption": caption,
                "num_rows": content["num_rows"],
                "num_cols": content["num_cols"],
                "rows": content["rows"],
            }
        )
    return records


# ----------------------------
# Images & geometry helpers
# ----------------------------
def bbox_to_fitz_rect(bbox: dict, page_height: float) -> fitz.Rect | None:
    """
    Convert a Docling bbox dict (keys l, t, r, b, coord_origin) into a fitz.Rect.

    Docling bboxes carry a coord_origin, often 'BOTTOMLEFT', while PyMuPDF uses a
    TOPLEFT origin. For 'BOTTOMLEFT' the y values are flipped against
    `page_height`; any other (or missing) origin is used as-is. The corners are
    ordered so that x0 <= x1 and y0 <= y1.

    Returns None when any of the four edges is missing.
    """
    left, top, right, bottom = (bbox.get(edge) for edge in ("l", "t", "r", "b"))
    if None in (left, top, right, bottom):
        return None

    origin = (bbox.get("coord_origin") or "TOPLEFT").upper()
    if origin == "BOTTOMLEFT":
        # Convert y coords from bottom-origin to top-origin
        top, bottom = page_height - top, page_height - bottom

    xs = sorted([left, right])
    ys = sorted([top, bottom])
    return fitz.Rect(xs[0], ys[0], xs[1], ys[1])


def extract_figures_from_pdf(doc: dict, pdf_path: str, out_dir: str, dpi: int, image_format: str):
    """
    Crop every Docling picture out of the PDF and save it as an image file.

    Args:
      doc: Docling JSON export; doc["pictures"] is read.
      pdf_path: PDF the pictures were detected in.
      out_dir: directory for the image files (created if missing).
      dpi: render resolution; pages are scaled by dpi / 72.
      image_format: file extension used for the saved images.

    Returns:
      A list of {"figure_id", "page_no", "bbox", "caption", "file"} dicts, one per
      saved image. figure_id is "figure_NNNN" with NNNN the index in doc["pictures"].

    A picture is skipped when it has no provenance, no page number or bbox, a page
    number outside the PDF, an incomplete bbox, or a bbox that does not overlap
    the page (an empty clip cannot be rendered to a useful image). Only the first
    provenance entry is used.

    The PDF is closed even if rendering or saving raises.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    images_meta = []
    zoom = dpi / 72.0  # scale factor relative to a 72 dpi render

    pdf = fitz.open(pdf_path)
    try:
        for i, pic in enumerate(doc.get("pictures") or []):
            provs = pic.get("prov") or []
            if not provs:
                continue

            prov = provs[0]  # usually only one
            page_no = prov.get("page_no")
            bbox = prov.get("bbox")
            if not page_no or not bbox:
                continue

            page_index = page_no - 1  # page_no is 1-based here
            if page_index < 0 or page_index >= pdf.page_count:
                continue

            page = pdf.load_page(page_index)
            page_rect = page.rect

            clip = bbox_to_fitz_rect(bbox, page_height=page_rect.height)
            if clip is None:
                continue

            clip = clip & page_rect  # clamp to page
            if clip.is_empty:
                continue

            caption = extract_caption(doc, pic.get("captions"))
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), clip=clip, alpha=False)

            fname = f"figure_{i:04d}_p{page_no}.{image_format}"
            fpath = out_dir / fname
            pix.save(str(fpath))

            images_meta.append(
                {
                    "figure_id": f"figure_{i:04d}",
                    "page_no": page_no,
                    "bbox": bbox,
                    "caption": caption,
                    "file": str(fpath),
                }
            )
    finally:
        pdf.close()

    return images_meta


# ----------------------------
# Equations (FORMULA items)
# ----------------------------
def is_formula_text_item(text_obj: dict) -> bool:
    """True when the item's label, trimmed and compared case-insensitively, is "FORMULA"."""
    label = text_obj.get("label")
    if not label:
        return False
    return label.strip().upper() == "FORMULA"


def maybe_despace_latex(latex: str) -> str:
    """
    Heuristic fix for LaTeX in which (almost) every character is separated by spaces.

    The string is split on whitespace. With fewer than 8 tokens, or when fewer
    than 60% of the tokens are single characters, the trimmed input is returned
    unchanged. Otherwise the tokens are joined without spaces, except that one
    space is kept between a control word (a backslash followed by letters, e.g.
    "\\alpha") and a following token that starts with a letter. Dropping that
    space would fuse the two into a different, usually undefined command
    ("\\alpha x" -> "\\alphax").
    """
    s = latex.strip()
    toks = s.split()
    if len(toks) < 8:
        return s

    single = sum(1 for t in toks if len(t) == 1)
    if single / len(toks) < 0.6:
        return s

    pieces = []
    for tok in toks:
        # pieces[-1] is always the previous token: a separator is only ever
        # appended immediately before the token that follows it.
        if pieces and tok[0].isalpha() and re.search(r"\\[A-Za-z]+$", pieces[-1]):
            pieces.append(" ")
        pieces.append(tok)
    return "".join(pieces)


def extract_formula_latex(text_obj: dict) -> str:
    """
    Return the LaTeX of a formula item with whitespace normalized.

    The "latex" field is preferred when it is non-empty (some versions may store
    LaTeX there); otherwise "text" is used, and "" when both are missing. When
    FIX_SPACED_LATEX is enabled the result additionally goes through
    maybe_despace_latex().
    """
    source_text = text_obj.get("latex") or text_obj.get("text") or ""
    latex = normalize_ws(source_text.strip())
    return maybe_despace_latex(latex) if FIX_SPACED_LATEX else latex


def extract_equations_from_pdf(doc: dict, pdf_path: str, out_dir: str, dpi: int, image_format: str):
    """
    Crops each FORMULA text item from the PDF using its provenance bbox and stores it as an image.
    Also returns a list of equation metadata including LaTeX.

    Args:
      doc: Docling JSON export; doc["texts"] is scanned for FORMULA items.
      pdf_path: PDF the items were detected in.
      out_dir: directory for the image files (created if missing).
      dpi: render resolution; pages are scaled by dpi / 72.
      image_format: file extension used for the saved images.

    Returns:
      A list of {"equation_id", "page_no", "bbox", "latex", "file"} dicts, one per
      saved crop. equation_id is "eq_NNNNN" with NNNNN the index in doc["texts"].

    An item is skipped when it has no provenance, no page number or bbox, a page
    number outside the PDF, an incomplete bbox, or a bbox that does not overlap
    the page (an empty clip cannot be rendered to a useful image). Only the first
    provenance entry is used.

    The PDF is closed even if rendering or saving raises.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    equations_meta = []
    zoom = dpi / 72.0  # scale factor relative to a 72 dpi render

    pdf = fitz.open(pdf_path)
    try:
        for idx, t in enumerate(doc.get("texts") or []):
            if not is_formula_text_item(t):
                continue

            provs = t.get("prov") or []
            if not provs:
                continue

            prov = provs[0]
            page_no = prov.get("page_no")
            bbox = prov.get("bbox")
            if not page_no or not bbox:
                continue

            page_index = page_no - 1  # page_no is 1-based here
            if page_index < 0 or page_index >= pdf.page_count:
                continue

            page = pdf.load_page(page_index)
            page_rect = page.rect

            clip = bbox_to_fitz_rect(bbox, page_height=page_rect.height)
            if clip is None:
                continue

            clip = clip & page_rect  # clamp to page
            if clip.is_empty:
                continue

            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), clip=clip, alpha=False)

            eq_id = f"eq_{idx:05d}"
            fname = f"{eq_id}_p{page_no}.{image_format}"
            fpath = out_dir / fname
            pix.save(str(fpath))

            equations_meta.append(
                {
                    "equation_id": eq_id,
                    "page_no": page_no,
                    "bbox": bbox,
                    "latex": extract_formula_latex(t),
                    "file": str(fpath),
                }
            )
    finally:
        pdf.close()

    return equations_meta


# ----------------------------
# Clause tree (now includes equations + generic nested points)
# ----------------------------
def build_clause_tree(doc: dict):
    """
    Builds nodes keyed by clause id (e.g., '5.5', '5.5.1', '5.5.1.1').

    The body is walked in reading order. A text item that matches CLAUSE_RE opens
    (or re-opens) the clause with that id; everything that follows is attached to
    it until the next clause header. Content before the first header goes to "ROOT".

    - Clause headers: for ids with up to 3 components the rest of the line becomes
      the node title (first occurrence wins); for deeper ids it is stored as body text.
    - Figures: stored as captions only (plus id).
    - Tables: referenced by id + caption.
    - Equations: referenced by id + LaTeX (extracted via do_formula_enrichment).
      The LaTeX is also placed inline in the text as "$$ ... $$"; a formula without
      LaTeX leaves an "[EQ <id>]" placeholder instead.
    - Nested points/lists: parsed from Docling list items + inline markers into `lists` with children.
      Markers can be (a), (1), (i), bullets, '1.', 'a)', etc.

    Formula items are handled before clause-header detection and before the
    empty-text check: LaTeX that starts with a number (e.g. "0.85 f_c") would
    otherwise be taken for a clause header, and a formula without any text would
    be dropped silently instead of getting its placeholder.

    Every clause is linked to its parent, and missing ancestors are created and
    linked all the way up to "ROOT", so each node is reachable from the root even
    when a sub-clause appears before its parent's own header.

    Returns {"root": "ROOT", "nodes": {id: node}} where each node has the keys
    id, title, children (ids), tables, figures, equations, text and lists.
    """
    root_id = "ROOT"

    def new_node(nid: str) -> dict:
        return {
            "id": nid,
            "title": "",
            "children": [],
            "tables": [],
            "figures": [],
            "equations": [],
            # final outputs
            "text": "",
            "lists": [],
            # internal parsing buffer
            "_blocks": [],
        }

    nodes = {root_id: new_node(root_id)}
    current_id = root_id

    def parent_id(cid: str) -> str:
        parts = cid.split(".")
        return root_id if len(parts) <= 1 else ".".join(parts[:-1])

    def ensure_linked(cid: str):
        # Create cid and every missing ancestor, linking each one to its parent.
        while cid != root_id:
            if cid not in nodes:
                nodes[cid] = new_node(cid)
            pid = parent_id(cid)
            if pid not in nodes:
                nodes[pid] = new_node(pid)
            if cid not in nodes[pid]["children"]:
                nodes[pid]["children"].append(cid)
            cid = pid

    def add_text_block(cid: str, txt: str):
        txt = txt.rstrip()
        if not txt:
            return
        nodes[cid]["_blocks"].append({"kind": "text", "text": txt})

    def add_list_block(cid: str, marker: str, txt: str, indent: int):
        nodes[cid]["_blocks"].append({"kind": "list_item", "marker": marker, "text": txt, "indent": indent})

    for ref in walk_body_in_reading_order(doc):
        kind, idx, obj = resolve_ref(doc, ref)

        if kind == "texts":
            # Handle equations (FORMULA items) first; see the docstring for why.
            if is_formula_text_item(obj):
                eq_id = f"eq_{idx:05d}"
                latex = extract_formula_latex(obj)
                nodes[current_id]["equations"].append({"equation_id": eq_id, "latex": latex})
                if latex:
                    add_text_block(current_id, f"$$ {latex} $$")
                else:
                    add_text_block(current_id, f"[EQ {eq_id}]")
                continue

            label = (obj.get("label") or "").strip().lower()
            raw = obj.get("text") or ""
            if not raw.strip():
                continue

            # Clause header detection should happen BEFORE list parsing
            m = CLAUSE_RE.match(raw.strip())
            if m:
                cid = m.group(1)
                rest = m.group(2).strip()

                ensure_linked(cid)

                depth = len(cid.split("."))
                if depth <= 3:
                    if not nodes[cid]["title"]:
                        nodes[cid]["title"] = rest
                    # headings don't go into body text blocks
                else:
                    add_text_block(cid, rest)

                current_id = cid
                continue

            # Handle list items (Docling label=list_item OR inline markers)
            marker_field = obj.get("marker") or ""
            mk, content, ind = parse_list_marker(raw_text=raw, marker_field=marker_field, allow_inline=True)

            # If Docling says it's a list_item OR we successfully detect a marker, treat as point
            if label == "list_item" or mk:
                # Some list_items are not really enumerated; only store as list if marker detected
                if mk and content:
                    add_list_block(current_id, mk, content, ind)
                else:
                    # No marker detected: treat as plain text
                    add_text_block(current_id, normalize_ws(raw))
            else:
                # Regular paragraph/text
                add_text_block(current_id, normalize_ws(raw))

        elif kind == "tables":
            caption = extract_caption(doc, obj.get("captions"))
            nodes[current_id]["tables"].append({"table_id": f"table_{idx:04d}", "caption": caption})

        elif kind == "pictures":
            caption = extract_caption(doc, obj.get("captions"))
            nodes[current_id]["figures"].append({"figure_id": f"figure_{idx:04d}", "caption": caption})

    # Finalize: turn blocks into `text` + `lists`
    for node in nodes.values():
        text, lists = blocks_to_text_and_lists(node.get("_blocks", []))
        node["text"] = text
        node["lists"] = lists
        node.pop("_blocks", None)

    return {"root": root_id, "nodes": nodes}


# ----------------------------
# Optional: retrieval helper
# ----------------------------
def collect_text_recursive(structured: dict, clause_id: str) -> str:
    """
    Returns clause text + list items + descendants' text.

    Args:
      structured: {"nodes": {id: node}} as produced by build_clause_tree().
      clause_id: id of the node to start from.

    The pieces are emitted in this order, one per line: "<id> <title>" (not for
    ROOT, and only when a title exists), the node text, the rendered lists, figure
    captions, table captions, equations as "$$ latex $$", and finally the same for
    every child in order.

    An equation whose "$$ latex $$" form already occurs in the node text is not
    emitted a second time, so formulas placed inline in the text are not duplicated.

    Returns "" for an unknown clause id.
    """
    nodes = structured["nodes"]
    if clause_id not in nodes:
        return ""

    n = nodes[clause_id]
    chunks = []

    if clause_id != "ROOT" and n.get("title"):
        chunks.append(f"{clause_id} {n['title']}".strip())

    body_text = n.get("text") or ""
    if body_text:
        chunks.append(body_text)

    # include nested points
    for lst in n.get("lists", []):
        rendered = format_list_items(lst.get("items", []))
        if rendered:
            chunks.append(rendered)

    # include captions / latex so outputs stay informative without embedding images/tables
    for fig in n.get("figures", []):
        if fig.get("caption"):
            chunks.append(fig["caption"])
    for tbl in n.get("tables", []):
        if tbl.get("caption"):
            chunks.append(tbl["caption"])
    for eq in n.get("equations", []):
        latex = eq.get("latex")
        if not latex:
            continue
        rendered_eq = f"$$ {latex} $$"
        if rendered_eq not in body_text:
            chunks.append(rendered_eq)

    for child in n.get("children", []):
        child_txt = collect_text_recursive(structured, child)
        if child_txt:
            chunks.append(child_txt)

    return "\n".join(chunks).strip()


# ----------------------------
# Main
# ----------------------------
def main():
    """
    Run the whole post-processing pipeline on the configured inputs.

    Reads INPUT_DOCLING_JSON, builds the clause tree and the table rows, crops
    figures and equations out of INPUT_PDF, writes the four structured_*.json
    files into OUTPUT_DIR and prints a short summary.
    """
    out_base = Path(OUTPUT_DIR)
    out_base.mkdir(parents=True, exist_ok=True)
    images_dir = out_base / IMAGES_DIR_NAME
    eq_dir = out_base / EQUATIONS_DIR_NAME

    doc = load_docling_json(INPUT_DOCLING_JSON)

    # 1) Clause tree (with figure captions + equation latex refs + nested points)
    clauses_struct = build_clause_tree(doc)

    # 2) Tables as rows
    tables_struct = build_tables_json(doc)

    # Arguments shared by both cropping steps.
    crop_args = dict(doc=doc, pdf_path=INPUT_PDF, dpi=DPI, image_format=IMAGE_FORMAT)

    # 3) Figures as JPG + metadata
    images_struct = extract_figures_from_pdf(out_dir=str(images_dir), **crop_args)

    # 4) Equations as cropped JPG + LaTeX metadata (requires do_formula_enrichment=True during conversion)
    equations_struct = extract_equations_from_pdf(out_dir=str(eq_dir), **crop_args)

    # Save outputs
    outputs = [
        (out_base / "structured_clauses.json", clauses_struct),
        (out_base / "structured_tables.json", tables_struct),
        (out_base / "structured_images.json", images_struct),
        (out_base / "structured_equations.json", equations_struct),
    ]
    for target, payload in outputs:
        target.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")

    # Quick sanity check
    nodes = clauses_struct["nodes"]
    list_block_total = 0
    for clause in nodes.values():
        list_block_total += len(clause.get("lists", []))

    print("Saved:")
    for target, _ in outputs:
        print(" -", target)
    print("Images saved under:", images_dir)
    print("Equations saved under:", eq_dir)
    print("\nCounts:")
    summary = [
        (" clauses:", len(nodes)),
        (" tables:", len(tables_struct)),
        (" figures:", len(images_struct)),
        (" equations:", len(equations_struct)),
        (" list blocks:", list_block_total),
    ]
    for caption, amount in summary:
        print(caption, amount)


if __name__ == "__main__":
    main()


Saved:
 - /content/structured_out/structured_clauses.json
 - /content/structured_out/structured_tables.json
 - /content/structured_out/structured_images.json
 - /content/structured_out/structured_equations.json
Images saved under: /content/structured_out/images
Equations saved under: /content/structured_out/equations

Counts:
 clauses: 15
 tables: 5
 figures: 0
 equations: 13
 list blocks: 4
