diff --git a/.gitignore b/.gitignore
index 3133382..ccb93a7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,6 +19,11 @@ build/
data/cache/
*.sqlite
+# Docling eval — keep FINDINGS.md and sources/, ignore generated run artifacts
+data/docling_eval/*
+!data/docling_eval/FINDINGS.md
+!data/docling_eval/sources/
+
# OS
.DS_Store
Thumbs.db
diff --git a/README.md b/README.md
index 9f6a2df..6376e50 100644
--- a/README.md
+++ b/README.md
@@ -208,6 +208,21 @@ html_parser.py
pdf_parser.py
text_cleaner.py
+Note on PDF table extraction (Docling refiner):
+
+The extraction stage uses an in-tree PDF parser (PyMuPDF + pdfplumber) as the
+default and a Docling TableFormer post-pass to refine table sections when an
+in-tree result looks broken or when the source URL is on a curated allowlist
+of publishers whose tables are known to be hard (CDC MMWR, certain WHO
+situation reports).
+
+The first PDF that triggers the refiner downloads the Docling layout and
+TableFormer models (~40 MB) to the HuggingFace cache (`~/.cache/huggingface/`)
+and holds them in memory (~1.5 GB) for the lifetime of the process. The
+feature is controlled by `ExtractionConfig.enable_docling_refiner` (enabled
+by default); when it is disabled, no Docling imports occur and behaviour
+matches the pre-refiner pipeline exactly.
+
---
## Insight Stage
diff --git a/bioscancast/extraction/chunking.py b/bioscancast/extraction/chunking.py
index ff02a14..0d5c14f 100644
--- a/bioscancast/extraction/chunking.py
+++ b/bioscancast/extraction/chunking.py
@@ -50,6 +50,7 @@ def normalize_chunks(
page_number=chunk.page_number,
table_data=None,
token_count=part_tokens,
+ extractor=chunk.extractor,
)
)
diff --git a/bioscancast/extraction/config.py b/bioscancast/extraction/config.py
index b80e1a5..19b595e 100644
--- a/bioscancast/extraction/config.py
+++ b/bioscancast/extraction/config.py
@@ -1,6 +1,7 @@
from __future__ import annotations
-from dataclasses import dataclass
+from dataclasses import dataclass, field
+from typing import List
@dataclass
@@ -10,4 +11,27 @@ class ExtractionConfig:
pdf_max_pages: int = 100
chunk_target_tokens: int = 800
chunk_max_tokens: int = 1500
+ user_agent: str = (
+ "BioScanCast/0.1 (+https://github.com/algorithmicgovernance/BioScanCast)"
+ )
+
+ # ---- Docling table refiner ----
+ enable_docling_refiner: bool = True
+ """Toggle the Docling post-pass that refines PDF table sections.
+
+ When False, no Docling imports occur and behaviour is identical to the
+ pre-refiner pipeline.
+ """
+
+ docling_source_allowlist: List[str] = field(
+ default_factory=lambda: [
+ "cdc.gov/mmwr/",
+ "cdn.who.int/media/docs/default-source/_sage-",
+ "cdn.who.int/media/docs/default-source/documents/emergencies/situation-reports/",
+ ]
+ )
+ """Source URL substrings known to contain hard tables. Match triggers Docling unconditionally."""
+
+ docling_sparse_cell_threshold: float = 0.5
+ """Non-empty-cell ratio below which a table is flagged as suspect and triggers Docling."""
impersonate: str = "chrome"
diff --git a/bioscancast/extraction/docling_refiner.py b/bioscancast/extraction/docling_refiner.py
new file mode 100644
index 0000000..edda060
--- /dev/null
+++ b/bioscancast/extraction/docling_refiner.py
@@ -0,0 +1,364 @@
+"""Docling-based table refiner.
+
+Optional post-pass over `ParsedContent` produced by `PdfParser`. When triggered
+(URL allowlist hit or a heuristic flag on a "broken" in-tree table), runs
+Docling's TableFormer on the original PDF bytes and replaces the in-tree table
+sections with Docling's rendering.
+
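+Docling and its transitive deps (`transformers`, `torch`, ...) are intentionally
+*lazy-imported* — they are first imported inside `_build_converter()`, which
+runs the first time a refiner actually has to convert a PDF. Importing this
+module or merely instantiating `DoclingTableRefiner` does not touch them.
+When the feature flag is off, no Docling import ever happens.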
+"""
+
+from __future__ import annotations
+
+import io
+import logging
+from dataclasses import replace
+from typing import Any, FrozenSet, List, Optional, Sequence, Tuple
+
+from .config import ExtractionConfig
+from .parsers.base import ParsedContent, SectionContent
+
+logger = logging.getLogger(__name__)
+
+
+class DoclingTableRefiner:
+    """Post-pass that swaps Docling-rendered tables into a `ParsedContent`.
+
+    The converter is built at most once per instance and then reused
+    (Docling models cost ~10-30s and ~1.5 GB RAM to load), so the pipeline
+    should keep a single instance per process.
+    """
+
+    def __init__(
+        self,
+        config: ExtractionConfig,
+        *,
+        converter: Optional[Any] = None,
+    ) -> None:
+        # A pre-built converter may be injected (tests); otherwise it is
+        # created lazily by `_get_converter` on first use.
+        self._converter = converter
+        self._config = config
+
+    # ---------- public API ----------
+
+    def refine(
+        self,
+        parsed: ParsedContent,
+        *,
+        source_url: str,
+        content: bytes,
+    ) -> ParsedContent:
+        """Run the Docling table pass over `parsed` when a trigger fires.
+
+        Triggers, checked in this order:
+          1. `source_url` contains an entry of the configured allowlist.
+          2. At least one in-tree table is flagged by the broken-table
+             heuristic.
+
+        When neither fires, or when `parsed` is partial with reason
+        "requires_ocr" (Docling runs without OCR and cannot help there),
+        `parsed` is returned as-is and the converter is never invoked.
+        """
+        needs_ocr = parsed.is_partial and parsed.partial_reason == "requires_ocr"
+        if needs_ocr:
+            logger.debug("docling refiner skipped: requires_ocr")
+            return parsed
+
+        # The heuristic runs even for allowlisted URLs: the merge step uses
+        # the flagged indices to decide which unmatched in-tree tables to
+        # drop.
+        flagged = _broken_table_reasons(
+            parsed, threshold=self._config.docling_sparse_cell_threshold
+        )
+        broken_indices = frozenset(index for index, _reason in flagged)
+
+        allowlisted = _should_refine_by_url(
+            source_url, self._config.docling_source_allowlist
+        )
+        if not allowlisted and not flagged:
+            logger.debug(
+                "docling refiner skipped: no trigger matched for %s", source_url
+            )
+            return parsed
+
+        if allowlisted:
+            logger.info(
+                "docling refiner triggered: source-allowlist hit for %s", source_url
+            )
+        else:
+            for _index, reason in flagged:
+                logger.info("docling refiner triggered: %s", reason)
+
+        return self._do_refine(parsed, content, broken_indices=broken_indices)
+
+    # ---------- internals ----------
+
+    def _do_refine(
+        self,
+        parsed: ParsedContent,
+        content: bytes,
+        *,
+        broken_indices: FrozenSet[int] = frozenset(),
+    ) -> ParsedContent:
+        """Convert `content` with Docling and merge its tables into `parsed`.
+
+        Any failure (converter unavailable, conversion error, result without
+        a `document`) is logged as a warning and `parsed` is returned
+        unchanged.
+        """
+        try:
+            converter = self._get_converter()
+        except Exception as exc:  # pragma: no cover - construction failures
+            logger.warning("docling converter unavailable: %s", exc)
+            return parsed
+
+        try:
+            result = converter.convert(content)
+        except Exception as exc:
+            logger.warning("docling conversion failed: %s", exc)
+            return parsed
+
+        docling_doc = getattr(result, "document", None)
+        if docling_doc is not None:
+            return _merge_docling_tables_into_parsed(
+                parsed, docling_doc, broken_indices=broken_indices
+            )
+
+        logger.warning("docling result has no document; leaving parsed unchanged")
+        return parsed
+
+    def _get_converter(self) -> Any:
+        """Return the cached converter, building it on first call."""
+        if self._converter is None:
+            self._converter = _build_converter()
+        return self._converter
+
+
+# ---------- helpers ----------
+
+
+def _should_refine_by_url(source_url: str, allowlist: Sequence[str]) -> bool:
+    """Return True when `source_url` contains any pattern from `allowlist`.
+
+    Matching is a case-insensitive substring test. An empty `source_url`
+    never matches. Empty patterns are ignored: "" is a substring of every
+    string, so a stray blank allowlist entry would otherwise send every PDF
+    through Docling.
+    """
+    if not source_url:
+        return False
+    lowered = source_url.lower()
+    return any(pattern and pattern.lower() in lowered for pattern in allowlist)
+
+
+def _broken_table_reasons(
+    parsed: ParsedContent, *, threshold: float
+) -> List[Tuple[int, str]]:
+    """Return `(section_index, reason)` for every table section that looks broken.
+
+    Only sections with `chunk_type == "table"`, at least 3 rows and a widest
+    row of at least 2 cells are inspected. Such a table is flagged when:
+      - its non-empty-cell ratio is below `threshold` (sparse), or else
+      - more than half of its rows hold exactly one non-empty cell
+        (over-segmentation).
+
+    Each table is reported at most once; the sparse check takes precedence.
+    `section_index` is the position of the section in `parsed.sections`.
+    A cell counts as non-empty when it is truthy and its string form is not
+    whitespace-only.
+    """
+    flagged: List[Tuple[int, str]] = []
+    for i, section in enumerate(parsed.sections):
+        if section.chunk_type != "table" or not section.table_rows:
+            continue
+        rows = section.table_rows
+        if len(rows) < 3:
+            continue
+        max_cols = max((len(r) for r in rows), default=0)
+        if max_cols < 2:
+            continue
+
+        # max_cols >= 2 guarantees total_cells > 0, so the division is safe.
+        total_cells = sum(len(r) for r in rows)
+        # Count filled cells once per row; both heuristics reuse the counts.
+        filled_per_row = [
+            sum(1 for cell in row if cell and str(cell).strip()) for row in rows
+        ]
+        ratio = sum(filled_per_row) / total_cells
+
+        page_label = section.page_number if section.page_number is not None else "?"
+
+        if ratio < threshold:
+            # `ratio` is the share of NON-empty cells, so label it as such.
+            flagged.append(
+                (
+                    i,
+                    f"suspect table on page {page_label} "
+                    f"(non-empty-cell ratio {ratio:.2f})",
+                )
+            )
+            continue
+
+        single_cell_rows = sum(1 for filled in filled_per_row if filled == 1)
+        if single_cell_rows > len(rows) / 2:
+            flagged.append(
+                (
+                    i,
+                    f"suspect table on page {page_label} "
+                    f"(over-segmented: {single_cell_rows}/{len(rows)} rows have a single cell)",
+                )
+            )
+    return flagged
+
+
+def _merge_docling_tables_into_parsed(
+    parsed: ParsedContent,
+    docling_doc: Any,
+    *,
+    broken_indices: FrozenSet[int] = frozenset(),
+) -> ParsedContent:
+    """Return a copy of `parsed` whose table sections use Docling's tables.
+
+    Strategy:
+      1. Page-based matching: each in-tree table section with a page number
+         consumes the next Docling table on the same page (in order of
+         appearance). If that table yields rows, the section's `table_rows`
+         are replaced and its `extractor` becomes "docling".
+      2. An in-tree table that got no replacement is dropped when its index
+         is in `broken_indices`; otherwise it is kept unchanged.
+      3. Docling tables that replaced nothing are inserted as new table
+         sections after the last section whose page number is <= theirs.
+
+    `broken_indices` refers to indices into `parsed.sections` as it was
+    when the heuristic ran (i.e. the original section list).
+
+    `parsed` itself is not modified: the result is built with
+    `dataclasses.replace`, matching the "original or a copy" contract
+    documented on `DoclingTableRefiner.refine`.
+    """
+    # Group Docling tables by the page of their first provenance entry;
+    # tables without provenance or page number cannot be placed and are skipped.
+    docling_tables_by_page: dict[int, list] = {}
+    for table in getattr(docling_doc, "tables", []) or []:
+        prov = getattr(table, "prov", None) or []
+        if not prov:
+            continue
+        page_no = getattr(prov[0], "page_no", None)
+        if page_no is None:
+            continue
+        docling_tables_by_page.setdefault(page_no, []).append(table)
+
+    cursor: dict[int, int] = {}  # page -> index of next unconsumed Docling table
+    matched_docling: set[int] = set()  # id() of Docling tables used as replacements
+    new_sections: List[SectionContent] = []
+
+    for i, section in enumerate(parsed.sections):
+        if section.chunk_type != "table" or section.page_number is None:
+            new_sections.append(section)
+            continue
+
+        page = section.page_number
+        idx = cursor.get(page, 0)
+        candidates = docling_tables_by_page.get(page, [])
+        if idx < len(candidates):
+            docling_table = candidates[idx]
+            # The candidate is consumed even if it turns out to be empty.
+            cursor[page] = idx + 1
+            new_rows = _docling_table_to_rows(docling_table)
+            if new_rows:
+                matched_docling.add(id(docling_table))
+                new_sections.append(
+                    replace(
+                        section,
+                        table_rows=new_rows,
+                        extractor="docling",
+                    )
+                )
+                continue
+            # Fall through: Docling table empty -> treat as no match.
+
+        # No Docling replacement for this in-tree table.
+        if i in broken_indices:
+            # The heuristic flagged this in-tree table as suspect; drop it
+            # rather than leave noise in the output.
+            continue
+        new_sections.append(section)
+
+    # Collect Docling tables that replaced nothing, ordered by page
+    # (the sort is stable, so same-page tables keep their order).
+    leftover: List[Tuple[int, Any]] = []
+    for page_no, tables in docling_tables_by_page.items():
+        for table in tables:
+            if id(table) not in matched_docling:
+                leftover.append((page_no, table))
+    leftover.sort(key=lambda pair: pair[0])
+
+    for page_no, table in leftover:
+        rows = _docling_table_to_rows(table)
+        if not rows:
+            continue
+        # Insert right after the LAST section located on this page or earlier.
+        insert_at = 0
+        for j, existing in enumerate(new_sections):
+            if (
+                existing.page_number is not None
+                and existing.page_number <= page_no
+            ):
+                insert_at = j + 1
+        new_sections.insert(
+            insert_at,
+            SectionContent(
+                section_path=None,
+                page_number=page_no,
+                text="",
+                chunk_type="table",
+                table_rows=rows,
+                extractor="docling",
+            ),
+        )
+
+    # Leave the caller's object untouched; hand back a shallow copy.
+    return replace(parsed, sections=new_sections)
+
+
+def _docling_table_to_rows(table: Any) -> List[List[str]]:
+    """Flatten a Docling `TableItem` into a row-major grid of stripped strings.
+
+    Reads `table.data.table_cells` directly (no pandas). The grid is
+    `num_rows x num_cols` as declared on `table.data`; when either is missing
+    or non-positive, the shape is inferred from the largest
+    `end_row_offset_idx` / `end_col_offset_idx` among the cells. Each cell's
+    text is written at its `start_row_offset_idx`/`start_col_offset_idx`;
+    cells outside the grid are ignored and unfilled positions stay "".
+
+    Returns [] when there is no data, no cells, or no usable shape.
+    """
+
+    def _int_attr(obj: Any, name: str) -> int:
+        # Missing attributes and None both count as 0.
+        return int(getattr(obj, name, 0) or 0)
+
+    data = getattr(table, "data", None)
+    if data is None:
+        return []
+    cells = list(getattr(data, "table_cells", []) or [])
+    if not cells:
+        return []
+
+    height = _int_attr(data, "num_rows")
+    width = _int_attr(data, "num_cols")
+    if height <= 0 or width <= 0:
+        # Declared shape unusable: derive both dimensions from cell offsets.
+        height = max(_int_attr(cell, "end_row_offset_idx") for cell in cells)
+        width = max(_int_attr(cell, "end_col_offset_idx") for cell in cells)
+        if height <= 0 or width <= 0:
+            return []
+
+    grid: List[List[str]] = [[""] * width for _ in range(height)]
+    for cell in cells:
+        row_at = _int_attr(cell, "start_row_offset_idx")
+        col_at = _int_attr(cell, "start_col_offset_idx")
+        text = (getattr(cell, "text", "") or "").strip()
+        if 0 <= row_at < height and 0 <= col_at < width:
+            grid[row_at][col_at] = text
+    return grid
+
+
+def _build_converter() -> Any:
+    """Build a bytes-in adapter around Docling's `DocumentConverter`.
+
+    All Docling imports live inside this function, so nothing from Docling
+    is imported until a converter is actually needed. The returned object
+    exposes a single `convert(content: bytes)` method, which wraps the bytes
+    in a `DocumentStream` and delegates to the real converter. Callers (and
+    test doubles) therefore never deal with Docling-specific stream types.
+
+    The PDF pipeline is configured with OCR off, table-structure
+    recognition on, and TableFormer in FAST mode.
+    """
+    from docling.datamodel.base_models import DocumentStream, InputFormat
+    from docling.datamodel.pipeline_options import (
+        PdfPipelineOptions,
+        TableFormerMode,
+    )
+    from docling.document_converter import DocumentConverter, PdfFormatOption
+
+    pdf_options = PdfPipelineOptions(do_ocr=False, do_table_structure=True)
+    pdf_options.table_structure_options.mode = TableFormerMode.FAST
+
+    pdf_format = PdfFormatOption(pipeline_options=pdf_options)
+    docling_converter = DocumentConverter(
+        format_options={InputFormat.PDF: pdf_format},
+    )
+
+    class _BytesConverter:
+        """Adapter: raw PDF bytes -> Docling conversion result."""
+
+        def convert(self, content: bytes):
+            pdf_stream = DocumentStream(
+                name="document.pdf", stream=io.BytesIO(content)
+            )
+            return docling_converter.convert(pdf_stream)
+
+    return _BytesConverter()
diff --git a/bioscancast/extraction/parsers/base.py b/bioscancast/extraction/parsers/base.py
index 393d283..95b176c 100644
--- a/bioscancast/extraction/parsers/base.py
+++ b/bioscancast/extraction/parsers/base.py
@@ -24,6 +24,9 @@ class SectionContent:
table_rows: Optional[List[List[str]]] = None
"""Row-major table data when chunk_type is 'table'."""
+ extractor: Optional[str] = None
+ """Which backend produced this section ('pymupdf', 'pdfplumber', 'docling', ...)."""
+
@dataclass
class ParsedContent:
diff --git a/bioscancast/extraction/parsers/pdf_parser.py b/bioscancast/extraction/parsers/pdf_parser.py
index 0ae263a..180b66c 100644
--- a/bioscancast/extraction/parsers/pdf_parser.py
+++ b/bioscancast/extraction/parsers/pdf_parser.py
@@ -67,12 +67,15 @@ def parse(self, content: bytes, *, source_url: str) -> ParsedContent:
# Extract tables with PyMuPDF
tables_on_page = self._extract_tables_pymupdf(page)
+ table_extractor = "pymupdf"
# If PyMuPDF found no tables, try pdfplumber as fallback
if not tables_on_page and self._page_looks_tabular(page):
tables_on_page = self._extract_tables_pdfplumber(
content, page_num
)
+ if tables_on_page:
+ table_extractor = "pdfplumber"
for table_rows in tables_on_page:
sections.append(
@@ -82,6 +85,7 @@ def parse(self, content: bytes, *, source_url: str) -> ParsedContent:
text="",
chunk_type="table",
table_rows=table_rows,
+ extractor=table_extractor,
)
)
@@ -129,6 +133,7 @@ def parse(self, content: bytes, *, source_url: str) -> ParsedContent:
page_number=page_number,
text=combined,
chunk_type="prose",
+ extractor="pymupdf",
)
)
current_text_parts = []
@@ -147,6 +152,7 @@ def parse(self, content: bytes, *, source_url: str) -> ParsedContent:
page_number=page_number,
text=combined,
chunk_type="prose",
+ extractor="pymupdf",
)
)
diff --git a/bioscancast/extraction/pipeline.py b/bioscancast/extraction/pipeline.py
index 76e9aed..0ae2d99 100644
--- a/bioscancast/extraction/pipeline.py
+++ b/bioscancast/extraction/pipeline.py
@@ -24,6 +24,8 @@ class ExtractionPipeline:
def __init__(self, *, config: ExtractionConfig | None = None) -> None:
self._config = config or ExtractionConfig()
self._parsers = get_parsers(pdf_max_pages=self._config.pdf_max_pages)
+ # Lazily constructed on first PDF that reaches the refiner step.
+ self._docling_refiner = None
def run(self, filtered_docs: List[FilteredDocument]) -> List[Document]:
"""Process documents in order of extraction_priority.
@@ -97,8 +99,28 @@ def extract_one(self, filtered_doc: FilteredDocument) -> Document:
fetch_result=fetch_result,
)
- # Step 4: Convert ParsedContent → Document with chunks
+ # Step 3b: Docling table refiner (PDFs only, feature-flagged)
document_type = self._detect_document_type(content_type)
+ if (
+ self._config.enable_docling_refiner
+ and document_type == "pdf"
+ ):
+ refiner = self._get_docling_refiner()
+ if refiner is not None:
+ try:
+ parsed = refiner.refine(
+ parsed,
+ source_url=filtered_doc.url,
+ content=fetch_result.content_bytes,
+ )
+ except Exception as exc:
+ logger.warning(
+ "Docling refiner failed for %s: %s",
+ filtered_doc.url,
+ exc,
+ )
+
+ # Step 4: Convert ParsedContent → Document with chunks
chunks = self._build_chunks(parsed, doc_id)
# Step 5: Normalize chunks
@@ -149,6 +171,23 @@ def extract_one(self, filtered_doc: FilteredDocument) -> Document:
extracted_dates=extracted_dates,
)
+    def _get_docling_refiner(self):
+        """Return the pipeline's Docling refiner, creating it on first use.
+
+        The refiner module is imported here rather than at module level. If
+        the import or the construction raises, a warning is logged and None
+        is returned, so the pipeline carries on with the in-tree parser
+        output. A failed attempt is not remembered: the next call retries.
+        """
+        refiner = self._docling_refiner
+        if refiner is None:
+            try:
+                from .docling_refiner import DoclingTableRefiner
+
+                refiner = DoclingTableRefiner(self._config)
+            except Exception as exc:
+                logger.warning(
+                    "Docling refiner unavailable, continuing without: %s", exc
+                )
+                refiner = None
+            self._docling_refiner = refiner
+        return refiner
+
+
def _make_failed_document(
self,
fdoc: FilteredDocument,
@@ -191,6 +230,7 @@ def _build_chunks(
page_number=section.page_number,
table_data=section.table_rows,
token_count=approx_token_count(section.text),
+ extractor=section.extractor,
)
)
return chunks
diff --git a/bioscancast/schemas/document.py b/bioscancast/schemas/document.py
index b521bbc..d415995 100644
--- a/bioscancast/schemas/document.py
+++ b/bioscancast/schemas/document.py
@@ -38,6 +38,9 @@ class DocumentChunk:
token_count: Optional[int] = None
"""Approximate token count (tokeniser-dependent)."""
+ extractor: Optional[str] = None
+ """Backend that produced this chunk ('pymupdf', 'pdfplumber', 'docling', 'trafilatura', ...)."""
+
@dataclass
class Document:
diff --git a/bioscancast/tests/test_extraction_docling_refiner.py b/bioscancast/tests/test_extraction_docling_refiner.py
new file mode 100644
index 0000000..842c2db
--- /dev/null
+++ b/bioscancast/tests/test_extraction_docling_refiner.py
@@ -0,0 +1,613 @@
+"""Tests for bioscancast.extraction.docling_refiner.
+
+Docling is heavyweight (~1.5 GB RAM and ~10-30 s model load on construction).
+Every test in this module uses a fake converter injected into the refiner,
+so no real Docling model is ever loaded.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import List, Optional
+from unittest.mock import MagicMock
+
+import pytest
+
+from bioscancast.extraction.config import ExtractionConfig
+from bioscancast.extraction.docling_refiner import (
+ DoclingTableRefiner,
+ _broken_table_reasons,
+ _docling_table_to_rows,
+ _merge_docling_tables_into_parsed,
+ _should_refine_by_url,
+)
+from bioscancast.extraction.parsers.base import ParsedContent, SectionContent
+
+
+# ---------------------------------------------------------------------------
+# Stubs that mimic the bits of the Docling object model we touch
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class StubProv:
+    """Stand-in for a Docling provenance entry; the refiner reads only `page_no`."""
+
+    page_no: int
+
+
+@dataclass
+class StubTableCell:
+    """Stand-in for a Docling table cell: grid offsets plus the cell text."""
+
+    start_row_offset_idx: int
+    end_row_offset_idx: int
+    start_col_offset_idx: int
+    end_col_offset_idx: int
+    text: str
+
+
+@dataclass
+class StubTableData:
+    """Stand-in for the `data` payload of a Docling table (shape + cells)."""
+
+    num_rows: int
+    num_cols: int
+    table_cells: List[StubTableCell]
+
+
+@dataclass
+class StubTable:
+    """Stand-in for a Docling table item: cell data and provenance list."""
+
+    data: StubTableData
+    prov: List[StubProv]
+
+
+@dataclass
+class StubDoclingDocument:
+    """Stand-in for a Docling document; exposes only the `tables` list."""
+
+    tables: List[StubTable] = field(default_factory=list)
+
+
+def _make_stub_table(
+    rows: List[List[str]], *, page_no: int
+) -> StubTable:
+    """Build a `StubTable` on page `page_no` with one 1x1 cell per value in `rows`.
+
+    `num_rows` is the number of rows and `num_cols` the widest row's length
+    (0 for an empty `rows`).
+    """
+    cells = [
+        StubTableCell(
+            start_row_offset_idx=row_idx,
+            end_row_offset_idx=row_idx + 1,
+            start_col_offset_idx=col_idx,
+            end_col_offset_idx=col_idx + 1,
+            text=value,
+        )
+        for row_idx, row in enumerate(rows)
+        for col_idx, value in enumerate(row)
+    ]
+    widest = max((len(row) for row in rows), default=0)
+    data = StubTableData(num_rows=len(rows), num_cols=widest, table_cells=cells)
+    return StubTable(data=data, prov=[StubProv(page_no=page_no)])
+
+
+def _section(
+    chunk_type: str,
+    *,
+    page_number: Optional[int] = None,
+    table_rows: Optional[List[List[str]]] = None,
+    text: str = "",
+    extractor: Optional[str] = None,
+) -> SectionContent:
+    """Shorthand for a `SectionContent` with no section path."""
+    attrs = {
+        "section_path": None,
+        "page_number": page_number,
+        "text": text,
+        "chunk_type": chunk_type,
+        "table_rows": table_rows,
+        "extractor": extractor,
+    }
+    return SectionContent(**attrs)
+
+
+# ---------------------------------------------------------------------------
+# _should_refine_by_url
+# ---------------------------------------------------------------------------
+
+
+class TestShouldRefineByUrl:
+    """URL allowlist matching used as the unconditional Docling trigger."""
+
+    MMWR_ONLY = ["cdc.gov/mmwr/"]
+
+    def test_match_in_allowlist(self):
+        url = "https://www.cdc.gov/mmwr/volumes/75/wr/mm7509a1.htm"
+        assert _should_refine_by_url(url, self.MMWR_ONLY)
+
+    def test_case_insensitive(self):
+        url = "https://WWW.CDC.GOV/MMWR/foo.pdf"
+        assert _should_refine_by_url(url, self.MMWR_ONLY)
+
+    def test_no_match(self):
+        url = "https://reuters.com/world/article"
+        assert not _should_refine_by_url(url, self.MMWR_ONLY)
+
+    def test_empty_url(self):
+        assert not _should_refine_by_url("", self.MMWR_ONLY)
+
+    def test_empty_allowlist(self):
+        assert not _should_refine_by_url("https://cdc.gov/mmwr/x", [])
+
+
+# ---------------------------------------------------------------------------
+# _broken_table_reasons
+# ---------------------------------------------------------------------------
+
+
+class TestBrokenTableReasons:
+    """Heuristics that decide whether an in-tree table looks broken."""
+
+    @staticmethod
+    def _flag(section: SectionContent):
+        # Run the heuristic over a document holding just this one section.
+        parsed = ParsedContent(raw_text="", sections=[section])
+        return _broken_table_reasons(parsed, threshold=0.5)
+
+    def test_healthy_table_passes(self):
+        healthy = _section(
+            "table",
+            page_number=1,
+            table_rows=[
+                ["Country", "Cases"],
+                ["Sudan", "100"],
+                ["DRC", "250"],
+                ["Nigeria", "75"],
+            ],
+        )
+        assert self._flag(healthy) == []
+
+    def test_sparse_table_flagged(self):
+        # 6 rows x 13 columns with a single filled cell.
+        rows = [["A"] + [""] * 12] + [[""] * 13 for _ in range(5)]
+        flagged = self._flag(_section("table", page_number=4, table_rows=rows))
+        assert len(flagged) == 1
+        idx, reason = flagged[0]
+        assert idx == 0
+        assert "page 4" in reason
+        assert "empty-cell ratio" in reason
+
+    def test_over_segmented_flagged(self):
+        # Most rows carry only one non-empty cell, which looks like
+        # per-column over-segmentation.
+        rows = [["Header", "value"]] + [[label, ""] for label in "xyzwv"]
+        flagged = self._flag(_section("table", page_number=2, table_rows=rows))
+        assert len(flagged) == 1
+        idx, reason = flagged[0]
+        assert idx == 0
+        assert "over-segmented" in reason
+
+    def test_skips_non_table_sections(self):
+        prose = _section("prose", page_number=1, text="hello world")
+        assert self._flag(prose) == []
+
+    def test_skips_too_small_tables(self):
+        tiny = _section(
+            "table",
+            page_number=1,
+            table_rows=[["", ""], ["", ""]],
+        )
+        assert self._flag(tiny) == []
+
+
+# ---------------------------------------------------------------------------
+# _docling_table_to_rows
+# ---------------------------------------------------------------------------
+
+
+class TestDoclingTableToRows:
+    """Conversion of a Docling table object into plain row-major strings."""
+
+    def test_simple_grid(self):
+        expected = [["Country", "Cases"], ["Sudan", "100"], ["DRC", "250"]]
+        stub = _make_stub_table(
+            [["Country", "Cases"], ["Sudan", "100"], ["DRC", "250"]],
+            page_no=1,
+        )
+        assert _docling_table_to_rows(stub) == expected
+
+    def test_missing_data_returns_empty(self):
+        no_data = type("_NoData", (), {"data": None})()
+        assert _docling_table_to_rows(no_data) == []
+
+
+# ---------------------------------------------------------------------------
+# _merge_docling_tables_into_parsed
+# ---------------------------------------------------------------------------
+
+
+class TestMergeDoclingTables:
+    """Merging Docling tables into the in-tree section list."""
+
+    @staticmethod
+    def _doc(*sections: SectionContent) -> ParsedContent:
+        return ParsedContent(raw_text="", sections=list(sections))
+
+    @staticmethod
+    def _tables(result: ParsedContent) -> List[SectionContent]:
+        return [s for s in result.sections if s.chunk_type == "table"]
+
+    def test_replaces_in_tree_table_by_page(self):
+        docling_rows = [["State", "Count"], ["NM", "9"], ["TX", "11"]]
+        parsed = self._doc(
+            _section(
+                "table",
+                page_number=4,
+                table_rows=[["", ""], ["", ""], ["", ""]],
+                extractor="pymupdf",
+            )
+        )
+        docling_doc = StubDoclingDocument(
+            tables=[
+                _make_stub_table(
+                    [["State", "Count"], ["NM", "9"], ["TX", "11"]], page_no=4
+                )
+            ]
+        )
+
+        result = _merge_docling_tables_into_parsed(parsed, docling_doc)
+
+        assert len(result.sections) == 1
+        section = result.sections[0]
+        assert section.chunk_type == "table"
+        assert section.extractor == "docling"
+        assert section.table_rows == docling_rows
+
+    def test_leaves_table_with_no_matching_page(self):
+        parsed = self._doc(
+            _section(
+                "table", page_number=4, table_rows=[["A", "B"]], extractor="pymupdf"
+            )
+        )
+        # Docling's only table sits on a different page.
+        docling_doc = StubDoclingDocument(
+            tables=[_make_stub_table([["X", "Y"]], page_no=7)]
+        )
+
+        result = _merge_docling_tables_into_parsed(parsed, docling_doc)
+
+        first = result.sections[0]
+        assert first.extractor == "pymupdf"
+        assert first.table_rows == [["A", "B"]]
+
+    def test_multiple_tables_on_same_page_matched_in_order(self):
+        parsed = self._doc(
+            _section(
+                "table", page_number=2, table_rows=[["?", "?"]], extractor="pymupdf"
+            ),
+            _section(
+                "prose", page_number=2, text="some prose between", extractor="pymupdf"
+            ),
+            _section(
+                "table", page_number=2, table_rows=[["?", "?"]], extractor="pymupdf"
+            ),
+        )
+        docling_doc = StubDoclingDocument(
+            tables=[
+                _make_stub_table([["first", "1"]], page_no=2),
+                _make_stub_table([["second", "2"]], page_no=2),
+            ]
+        )
+
+        result = _merge_docling_tables_into_parsed(parsed, docling_doc)
+
+        tables = self._tables(result)
+        assert tables[0].table_rows == [["first", "1"]]
+        assert tables[0].extractor == "docling"
+        assert tables[1].table_rows == [["second", "2"]]
+        assert tables[1].extractor == "docling"
+        # The prose chunk in between is preserved.
+        assert any(s.chunk_type == "prose" for s in result.sections)
+
+    def test_leaves_prose_sections_alone(self):
+        parsed = self._doc(
+            _section("prose", page_number=1, text="hello", extractor="pymupdf")
+        )
+        docling_doc = StubDoclingDocument(
+            tables=[_make_stub_table([["X", "Y"]], page_no=1)]
+        )
+
+        result = _merge_docling_tables_into_parsed(parsed, docling_doc)
+
+        # Prose untouched; the Docling table is inserted as a new section.
+        prose = [s for s in result.sections if s.chunk_type == "prose"]
+        tables = self._tables(result)
+        assert len(prose) == 1
+        assert prose[0].extractor == "pymupdf"
+        assert prose[0].text == "hello"
+        assert len(tables) == 1
+        assert tables[0].extractor == "docling"
+
+    def test_drops_unmatched_broken_intree_table(self):
+        # MMWR-style: the in-tree parser's spurious table is on page 4 while
+        # Docling's real table is on page 3, so page matching fails. With
+        # broken_indices={0} the in-tree section is dropped and Docling's
+        # table is inserted.
+        parsed = self._doc(
+            _section(
+                "table",
+                page_number=4,
+                table_rows=[["", ""], ["", ""], ["", ""]],
+                extractor="pymupdf",
+            )
+        )
+        docling_doc = StubDoclingDocument(
+            tables=[
+                _make_stub_table(
+                    [["Characteristic", "No. (%)"], ["Total", "99"], ["Sex", ""]],
+                    page_no=3,
+                )
+            ]
+        )
+
+        result = _merge_docling_tables_into_parsed(
+            parsed, docling_doc, broken_indices=frozenset([0])
+        )
+
+        tables = self._tables(result)
+        assert len(tables) == 1
+        only = tables[0]
+        assert only.extractor == "docling"
+        assert only.page_number == 3
+        assert only.table_rows[0] == ["Characteristic", "No. (%)"]
+
+    def test_keeps_unmatched_clean_intree_table(self):
+        # An in-tree table that neither matched Docling nor was flagged as
+        # broken is kept (Docling missed a legitimate table).
+        parsed = self._doc(
+            _section(
+                "table",
+                page_number=4,
+                table_rows=[["A", "B"], ["1", "2"]],
+                extractor="pymupdf",
+            )
+        )
+        docling_doc = StubDoclingDocument(
+            tables=[_make_stub_table([["X", "Y"]], page_no=3)]
+        )
+
+        result = _merge_docling_tables_into_parsed(
+            parsed, docling_doc, broken_indices=frozenset()
+        )
+
+        tables = self._tables(result)
+        # Original in-tree table preserved, plus the inserted Docling table.
+        assert len(tables) == 2
+        assert sorted(t.extractor for t in tables) == ["docling", "pymupdf"]
+
+    def test_unmatched_docling_inserted_in_page_order(self):
+        parsed = self._doc(
+            _section("prose", page_number=1, text="page1", extractor="pymupdf"),
+            _section("prose", page_number=5, text="page5", extractor="pymupdf"),
+        )
+        docling_doc = StubDoclingDocument(
+            tables=[_make_stub_table([["X", "Y"]], page_no=3)]
+        )
+
+        result = _merge_docling_tables_into_parsed(parsed, docling_doc)
+
+        # The inserted Docling table sits between the page-1 and page-5 prose.
+        pages = [s.page_number for s in result.sections]
+        assert pages == [1, 3, 5]
+        assert result.sections[1].extractor == "docling"
+
+
+# ---------------------------------------------------------------------------
+# DoclingTableRefiner.refine() end-to-end with a fake converter
+# ---------------------------------------------------------------------------
+
+
+class _FakeResult:
+    """Minimal conversion result: exposes the `document` attribute the refiner reads."""
+
+    def __init__(self, document):
+        self.document = document
+
+
+class _FakeConverter:
+    """Converter double that returns a canned document and counts `convert` calls."""
+
+    def __init__(self, document):
+        self._document = document
+        self.convert_calls = 0
+
+    def convert(self, _stream):
+        # The input is ignored; every call yields the same canned document.
+        self.convert_calls += 1
+        return _FakeResult(self._document)
+
+
+class TestDoclingTableRefinerEndToEnd:
+    """`DoclingTableRefiner.refine` driven through an injected fake converter."""
+
+    MMWR_URL = "https://www.cdc.gov/mmwr/volumes/75/wr/mm7509a1.htm"
+    PDF_BYTES = b"%PDF-fake-bytes"
+
+    def _config(self) -> ExtractionConfig:
+        return ExtractionConfig(
+            enable_docling_refiner=True,
+            docling_source_allowlist=["cdc.gov/mmwr/"],
+            docling_sparse_cell_threshold=0.5,
+        )
+
+    def _refine(self, parsed, converter, url):
+        # Build a refiner around `converter` and run it once.
+        refiner = DoclingTableRefiner(self._config(), converter=converter)
+        return refiner.refine(parsed, source_url=url, content=self.PDF_BYTES)
+
+    def test_triggers_on_allowlist(self):
+        parsed = ParsedContent(
+            raw_text="",
+            sections=[
+                _section(
+                    "table",
+                    page_number=1,
+                    table_rows=[["Country", "Cases"], ["Sudan", "5"]],
+                    extractor="pymupdf",
+                ),
+            ],
+        )
+        converter = _FakeConverter(
+            StubDoclingDocument(
+                tables=[_make_stub_table([["NM", "9"], ["TX", "11"]], page_no=1)],
+            )
+        )
+
+        out = self._refine(parsed, converter, self.MMWR_URL)
+
+        assert converter.convert_calls == 1
+        assert out.sections[0].extractor == "docling"
+        assert out.sections[0].table_rows == [["NM", "9"], ["TX", "11"]]
+
+    def test_triggers_on_heuristic(self):
+        # A sparse table fires the heuristic even without an allowlist match.
+        sparse_rows = [["A", "", "", ""], ["", "", "", ""], ["", "", "", ""]]
+        parsed = ParsedContent(
+            raw_text="",
+            sections=[
+                _section(
+                    "table",
+                    page_number=3,
+                    table_rows=sparse_rows,
+                    extractor="pymupdf",
+                ),
+            ],
+        )
+        converter = _FakeConverter(
+            StubDoclingDocument(
+                tables=[_make_stub_table([["Region", "n"], ["X", "1"]], page_no=3)],
+            )
+        )
+
+        out = self._refine(parsed, converter, "https://example.org/random.pdf")
+
+        assert converter.convert_calls == 1
+        assert out.sections[0].extractor == "docling"
+
+    def test_no_trigger_leaves_parsed_unchanged(self):
+        parsed = ParsedContent(
+            raw_text="",
+            sections=[
+                _section(
+                    "table",
+                    page_number=1,
+                    table_rows=[
+                        ["Country", "Cases"],
+                        ["Sudan", "5"],
+                        ["DRC", "100"],
+                    ],
+                    extractor="pymupdf",
+                ),
+            ],
+        )
+        converter = _FakeConverter(StubDoclingDocument())
+
+        out = self._refine(
+            parsed, converter, "https://reuters.com/world/africa/article"
+        )
+
+        assert converter.convert_calls == 0
+        assert out.sections[0].extractor == "pymupdf"
+
+    def test_short_circuits_on_requires_ocr(self):
+        parsed = ParsedContent(
+            raw_text="",
+            sections=[],
+            is_partial=True,
+            partial_reason="requires_ocr",
+        )
+        converter = _FakeConverter(StubDoclingDocument())
+
+        out = self._refine(parsed, converter, self.MMWR_URL)
+
+        assert converter.convert_calls == 0
+        assert out is parsed
+
+    def test_converter_failure_falls_back_to_parsed(self):
+        parsed = ParsedContent(
+            raw_text="",
+            sections=[
+                _section(
+                    "table",
+                    page_number=1,
+                    table_rows=[["A", "B"]],
+                    extractor="pymupdf",
+                ),
+            ],
+        )
+        converter = MagicMock()
+        converter.convert.side_effect = RuntimeError("boom")
+
+        out = self._refine(parsed, converter, self.MMWR_URL)
+
+        assert out.sections[0].extractor == "pymupdf"
+
+
+# ---------------------------------------------------------------------------
+# Pipeline integration: the feature flag keeps the refiner from being built
+# ---------------------------------------------------------------------------
+
+
+def test_disabling_flag_skips_docling_construction(monkeypatch):
+    """With enable_docling_refiner=False, building the pipeline must not
+    construct a `DoclingTableRefiner`.
+
+    The tripwire records constructor calls instead of raising: the pipeline
+    wraps refiner construction in `except Exception`, which would swallow an
+    `AssertionError` raised from a patched `__init__`. It is installed
+    before the pipeline is created so that eager construction is caught.
+
+    NOTE: this does not drive `extract_one`; covering the per-document path
+    needs fetch/parse fixtures that this module does not provide.
+    """
+    from bioscancast.extraction.pipeline import ExtractionPipeline
+
+    constructed = []
+
+    def _record(self, *_a, **_kw):
+        constructed.append(self)
+
+    monkeypatch.setattr(
+        "bioscancast.extraction.docling_refiner.DoclingTableRefiner.__init__",
+        _record,
+    )
+
+    pipeline = ExtractionPipeline(
+        config=ExtractionConfig(enable_docling_refiner=False)
+    )
+
+    assert pipeline._config.enable_docling_refiner is False
+    # No refiner was built eagerly, and none is cached on the pipeline.
+    assert constructed == []
+    assert pipeline._docling_refiner is None
diff --git a/data/docling_eval/FINDINGS.md b/data/docling_eval/FINDINGS.md
new file mode 100644
index 0000000..734b014
--- /dev/null
+++ b/data/docling_eval/FINDINGS.md
@@ -0,0 +1,192 @@
+# Docling Evaluation — Biosecurity Sources
+
+Ran `scripts/eval_docling.py` against 8 real biosecurity sources (5 PDFs + 3 HTML). Full per-source metrics are in [`run_log.json`](run_log.json); this file summarises what the outputs look like and where Docling struggles for our use case (ingesting WHO/CDC/ECDC/Africa-CDC outbreak documents).
+
+Environment: Docling 2.90.0 + docling-core 2.74.0 in a fresh `.venv-docling` (Python 3.13, Windows, CPU-only). OCR disabled (`do_ocr=False`) and `TableFormerMode.FAST` — with OCR on, the first source alone took >11 minutes and still hadn't finished, so the reported timings are the "fast-path" numbers.
+
+## Summary
+
+| Source | Category | Pages | Tables | Chunks | Elapsed | Status |
+| --- | --- | ---:| ---:| ---:| ---:| --- |
+| WHO Mpox Sitrep #64 | PDF (WHO sitrep) | 15 | 1 | 38 | **274.6 s** (slow) | ok |
+| WHO Cholera Epi Update #34 | PDF (WHO sitrep) | 8 | 1 | 17 | 197.7 s | ok |
+| CDC MMWR — NM Measles (mm7509a1) | PDF (MMWR) | 5 | 1 | 20 | 110.5 s | ok |
+| ECDC CDTR Week 16 | PDF (ECDC) | 12 | 4 (all empty) | 28 | **324.6 s** (slow) | ok |
+| Africa CDC Weekly (April 2026) | PDF (Africa CDC) | 15 | 2 (all empty) | **0** | **523.4 s** (slow) | ok† |
+| Reuters — healthcare/pharma landing | HTML | — | — | — | 13.3 s | **error** (401) |
+| CIDRAP — Utah measles | HTML | 0 | 0 | 16 | 13.8 s | ok |
+| ProMED recent-posts listing | HTML | 0 | 1 | 17 | 13.8 s | ok |
+
+† Africa CDC returned 0 chunks — the PDF is fully image-based and yielded no extractable text with OCR off.
+
+7/8 succeeded, 3/8 breached the 240 s "slow" threshold, and 1 hard failure (Reuters, bot-protected). Total wall-clock for the 8 sources was ~25 minutes; first-run model download added ~40 MB and ~60 s on top.
+
+## What's in the Markdown
+
+### Tables — row/column structure and readable case counts
+
+| Source | Tables in doc | `num_rows × num_cols` | Readable from MD? |
+| --- | ---:| --- | --- |
+| WHO mpox sitrep | 1 | 9×4 | **Yes** — country / cases / deaths / reporting-countries readable: e.g. "Madagascar \| 368 \| 1 \| -". |
+| WHO cholera update | 1 | 21×8 | **Yes** — full cholera-by-region table (country, cases, deaths, CFR, cases-per-100k, monthly % change). "Democratic Republic of the Congo \| 6 543 \| 148 \| 2.3 \| 5 \| 39 \| 66". |
+| CDC MMWR | 1 | 17×2 | **Yes** — demographic/characteristic table rendered. |
+| ECDC CDTR | 4 detected | **all 0×0** | **No** — TableFormer flagged the table regions but returned empty cells. Case numbers that sit inside the tables are missing from the markdown; in the body text, inline counts ("Italy (63), Spain (36), France (16)") do come through. |
+| Africa CDC | 2 detected | **all 0×0** | **No** — image-only PDF, see below. |
+| CIDRAP | 0 | — | n/a (article doesn't have tables). |
+| ProMED listing | 1 | 157×2 | **Yes** — table of recent post titles by date renders cleanly. |
+
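+Takeaway: Docling produces clean Markdown tables **when the PDF has native text tables** (the three WHO/CDC reports do). It silently degrades to empty cells when the tables are embedded as images or rely on OCR, and ECDC's CDTR layout falls into that bucket. For BioScanCast, this means case-count tables from WHO/CDC/MMWR are usable as-is, but ECDC/Africa-CDC tables will need either OCR-on fallback or an external data pipe. (The OCR cost evaluation below refines this: ECDC's four 0×0 detections turned out to be layout false positives on chart/figure regions rather than image-embedded tables, and OCR proved too slow to be a practical fallback on this hardware.)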
+
+### Section headings
+
+Heading counts after conversion: mpox 40, cholera 15, MMWR 21, ECDC 30, CIDRAP 15, ProMED 7, Africa CDC 0. Order is preserved in all text-extractable sources:
+
+- WHO mpox: "## Highlights" → "## Epidemiological update" → "## Global monkeypox virus (MPXV) distribution" → "## Update on mpox outbreak transmission dynamics by virus clade" → "## Clade Ia MPXV" → "## Clade Ib MPXV" → … (matches the PDF's hierarchy).
+- CDC MMWR: "## Abstract" → "## Introduction" → "## Investigation and Outcomes" → "## Notification of Confirmed Measles Cases in Texas" → "## Characteristics of Outbreak-Related Measles Cases" → "## Public Health Response" → "## Discussion" → "## Limitations" → "## Implications for Public Health Practice".
+- ECDC CDTR: "## This week's topics" → "## Executive summary" → per-disease sections in order.
+
+Chunk `meta.headings` is populated, so the chunk exposes the full heading path (e.g. `['Measles Outbreak - New Mexico, 2025']`, `['Highlights']`). One caveat: the very first chunk of each doc has `headings=None` because it precedes the first `##` marker.
+
+### Reading order on multi-column PDFs
+
+MMWR is classic 2-column journal layout. The output reads correctly: paragraphs in column 1 flow into column 2 without interleaving, footnote markers (`*`, `†`, `§`) stay attached, and footnote bodies are placed near their markers. One quirk: the "INSIDE" sidebar (which lives in column 2 of page 1) gets spliced between body paragraphs rather than being lifted out — annoying for reading but not a correctness issue.
+
+### HTML: nav / ads / footers stripping
+
+Docling's HTML pipeline does **not** strip boilerplate.
+
+- CIDRAP article: lines 1–47 are the site nav (Topics & Projects, Podcasts, About, Search, …), line 48 is the actual article H1, the article body runs ~lines 48–77, and the remaining ~260 lines are other articles, "Choose newsletters" CTAs, and footer. The first chunk's heading is `['Main navigation']`, and chunk 15's headings include `['Tetanus still occurs among all ages in US, mainly in undervaccinated', 'Choose newsletters']` — so **unwanted content is definitely in the chunk stream**. For BioScanCast, HTML news articles will need a `trafilatura`-style pre-pass (we already use it in the existing extraction stage) before handing text to Docling, or a post-pass to filter chunks whose heading path contains "navigation" / "newsletters" / etc.
+- ProMED recent-posts: by coincidence the listing page is mostly a `<table>` of recent posts, which Docling preserves as a clean 157-row Markdown table. Good for headlines (MEASLES — ROMANIA, AVIAN INFLUENZA — INDIA (19), …) but not actual post bodies — those live at permalinks we didn't probe.
+
+### JavaScript-rendered sources
+
+- Reuters (`https://www.reuters.com/business/healthcare-pharmaceuticals/`): **fails** with `HTTPError: 401 Client Error: HTTP Forbidden` in 13 s. Docling uses a default `requests`/`httpx` fetch that doesn't pass a browser-like user-agent, and Reuters' Cloudflare front rejects it. Any Reuters or AP-equivalent source will need an out-of-band fetch (Playwright, explicit UA, or a news API).
+- ProMED listing page: the latest-posts table does render into the initial HTML, so Docling captured it fine. The individual post bodies behind each permalink are likely JS-rendered and would need a different approach.
+- CIDRAP: fully server-rendered, docling converted without issue.
+
+### Publication dates from metadata
+
+`pub_date` came back `None` for every source. Docling exposes `DoclingDocument.origin` but its only fields are `filename`, `mimetype`, `binary_hash` — no publication / creation date. Dates exist in the body text ("published 26 March 2026", "Week 16, 11–17 April 2026") but have to be extracted with a regex / LLM pass, not from document metadata. For BioScanCast, assume Docling won't give us a publication date; we need a separate parser over the first page.
+
+### Failures, timeouts, >4-minute runs
+
+- **Failure**: Reuters 401. Expected for any Cloudflare-fronted news site.
+- **>4 min (slow, i.e. over the 240 s threshold)**: WHO mpox 274.6 s, ECDC CDTR 324.6 s, Africa CDC 523.4 s. Average PDF ran at ~26 s/page (1,430.8 s over 55 pages) with OCR off and TableFormer FAST on CPU; the mpox + ECDC PDFs are layout-dense (figures + tables + multi-column), and the Africa CDC PDF is pure images so the pipeline still runs layout detection on every page.
+- **No hangs, no timeouts** — just slow.
+- First-run model cost: ~40 MB of downloads the first time (RapidOCR det/rec models — still downloaded even with `do_ocr=False`, but not used; layout-heron, tableformer). Once cached, subsequent runs skip the download.
+
+### Africa CDC failure mode (0 chunks)
+
+The markdown for `africa_cdc_weekly_apr2026.md` is 266 bytes — 15 lines, each `<!-- image -->`. The PDF has 15 pages but Docling extracted zero text because it's published as a scanned/rasterised document rather than native-text PDF. OCR would be needed to recover anything; see the OCR cost section below for why that's not viable on this hardware.
+
+## OCR cost evaluation (ECDC CDTR week 16)
+
+Follow-up run via `scripts/eval_docling_ocr_cost.py`, using `convert(page_range=...)` to time individual pages and project full-doc cost. Results in [data/docling_eval/ocr/per_page_cost.json](ocr/per_page_cost.json):
+
+| Mode | Mean per page | Projected 12-page doc | Extra bytes vs OCR-off baseline |
+| --- | ---:| ---:| --- |
+| `do_ocr=False` (baseline) | 22.5 s | ~4.5 min | — (3214 B on p5, 3721 B on p10) |
+| `do_ocr=True`, bitmap-only (default) | 132.6 s | ~26.5 min | **+57 B on p1, +0 B on p5/p10** |
+| `do_ocr=True`, `force_full_page_ocr=True` | 1055.8 s on p5 alone | ~3.5 hours | **less** content (2753 B vs 3214 B) — OCR overwrote the clean text layer |
+
+The earlier full-doc OCR-on run was killed at 42 min before ECDC even finished — that was `force_full_page_ocr=True`. Even the saner default (~26.5 min projected) returns essentially nothing for ECDC because the "4 tables detected but 0×0" in the OCR-off run are layout-detection **false positives on chart/figure regions**, not real tables. The case counts ECDC actually publishes ("Italy (63), Spain (36), France (16) and Poland (five)") are already in the text-flow prose that OCR-off captures. Africa CDC was skipped — the full-page OCR projection (≈3.5 hours for the 12-page ECDC doc, so ≈4.4 hours for Africa CDC's 15 pages) is unworkable on CPU.
+
+Practical conclusion: **don't enable Docling OCR on this hardware**. Use OCR-off everywhere. For genuinely scanned PDFs like Africa CDC, route to a different ingestion path (external OCR service, GPU host, or simply skip).
+
+## Recommendations for the BioScanCast pipeline
+
+1. **Keep OCR off everywhere** (`do_ocr=False`). The OCR cost evaluation above showed bitmap-only OCR adds ~110 s/page of CPU work and recovers near-zero content on ECDC; full-page OCR is worse. For scanned-only PDFs (Africa CDC), OCR is the only path but the wall-clock makes it infeasible on CPU — handle out-of-band.
+2. **HTML pre-filter**. Keep the existing `trafilatura` main-content extraction in the pipeline; hand Docling the cleaned article HTML rather than raw URLs, or drop Docling for HTML entirely and use the current HTML path. Nav/footer chunks from Docling's HTML pipeline are not useful.
+3. **Reuters/AP**: Docling's default fetcher can't bypass Cloudflare (401). Feed it pre-fetched HTML from a UA-spoofing fetcher (the `curl` test in [data/docling_eval/sources/](sources/) showed that path works for CIDRAP), or skip news HTML sources in the Docling path.
+4. **Publication date**: plan a separate extractor; Docling doesn't expose it. Tier as: HTML `<meta>` tags / JSON-LD via trafilatura → PDF `/CreationDate` via PyMuPDF (noisy) → regex over the first chunk's body text.
+5. **Budget wall-clock**: expect roughly 2–9 minutes/PDF on CPU even with OCR off; mpox sitrep was 4.5 min, ECDC 5.4 min, Africa CDC 8.7 min (and useless without OCR). A cron-driven BioScanCast scan that touches 10+ PDFs will want a worker pool or a GPU host; don't put this behind a synchronous API call.
+6. **Tables**: the WHO/MMWR tables we care about (country/case/death matrices) come through cleanly as Markdown — downstream code can parse them with a simple Markdown-table reader. ECDC's "tables" are charts/figures and need to be read from the surrounding prose instead.
+
+## Head-to-head: Docling vs. in-tree `PdfParser`
+
+Run via `scripts/eval_intree_pdf.py` against the same 5 local PDFs in `data/docling_eval/sources/`. In-tree stack: PyMuPDF + pdfplumber-fallback + font-size heading heuristic + `