algorithmicgovernance · rapsoj · May 18, 2026 · May 12, 2026 · May 18, 2026
diff --git a/.gitignore b/.gitignore
@@ -19,6 +19,11 @@ build/
 data/cache/
 *.sqlite
 
+# Docling eval — keep FINDINGS.md and sources/, ignore generated run artifacts
+data/docling_eval/*
+!data/docling_eval/FINDINGS.md
+!data/docling_eval/sources/
+
 # OS
 .DS_Store
 Thumbs.db
diff --git a/README.md b/README.md
@@ -208,6 +208,21 @@ html_parser.py
 pdf_parser.py
 text_cleaner.py
 
+Note on PDF table extraction (Docling refiner):
+
+By default, the extraction stage parses PDFs in-tree (PyMuPDF + pdfplumber). A
+Docling TableFormer post-pass then refines table sections when the in-tree result
+looks broken (e.g. non-empty-cell ratio below `docling_sparse_cell_threshold`)
+or when the source URL matches `docling_source_allowlist`, a curated list of
+publishers with hard-to-extract tables (CDC MMWR, certain WHO situation reports).
+
+The first time a PDF triggers the refiner, the Docling layout and TableFormer
+models (~40 MB) are downloaded to the Hugging Face cache
+(`~/.cache/huggingface/`) and then held in memory (~1.5 GB) for the lifetime of
+the process. The feature is controlled by
+`ExtractionConfig.enable_docling_refiner` (default `True`); when it is `False`,
+no Docling imports occur and behaviour matches the pre-refiner pipeline exactly.
+
 ---
 
 ## Insight Stage

diff --git a/bioscancast/extraction/chunking.py b/bioscancast/extraction/chunking.py
@@ -50,6 +50,7 @@ def normalize_chunks(
                     page_number=chunk.page_number,
                     table_data=None,
                     token_count=part_tokens,
+                    extractor=chunk.extractor,
                 )
             )
 

diff --git a/bioscancast/extraction/config.py b/bioscancast/extraction/config.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
-from dataclasses import dataclass
+from dataclasses import dataclass, field
+from typing import List
 
 
 @dataclass
@@ -10,4 +11,27 @@ class ExtractionConfig:
     pdf_max_pages: int = 100
     chunk_target_tokens: int = 800
     chunk_max_tokens: int = 1500
+    user_agent: str = (
+        "BioScanCast/0.1 (+https://github.com/algorithmicgovernance/BioScanCast)"
+    )
+
+    # ---- Docling table refiner ----
+    enable_docling_refiner: bool = True
+    """Toggle the Docling post-pass that refines PDF table sections.
+
+    When False, no Docling imports occur and behaviour is identical to the
+    pre-refiner pipeline.
+    """
+
+    docling_source_allowlist: List[str] = field(
+        default_factory=lambda: [
+            "cdc.gov/mmwr/",
+            "cdn.who.int/media/docs/default-source/_sage-",
+            "cdn.who.int/media/docs/default-source/documents/emergencies/situation-reports/",
+        ]
+    )
+    """Source URL substrings known to contain hard tables. Match triggers Docling unconditionally."""
+
+    docling_sparse_cell_threshold: float = 0.5
+    """Non-empty-cell ratio below which a table is flagged as suspect and triggers Docling."""
     impersonate: str = "chrome"