algorithmicgovernance · rapsoj · May 28, 2026 · May 20, 2026 · May 20, 2026 · May 20, 2026
diff --git a/README.md b/README.md
@@ -366,6 +366,42 @@ human_comparison.py
 
 Used to compare model forecasts against human forecasts.
 
+## Historical-replay mode (benchmarking against human forecasters)
+
+When benchmarking the pipeline against human forecasters on past questions,
+the model must not be allowed to see sources that didn't exist (or contained
+different content) at the time the human forecasted. Historical-replay mode
+enforces this by reading a single per-question field, `ForecastQuestion.as_of_date`:
+
+- When `as_of_date` is `None` (default), the pipeline behaves exactly as in
+  live mode. No code paths change.
+- When `as_of_date` is set, the search backend receives `end_date=as_of_date`,
+  the cache key incorporates the cutoff, post-retrieval filtering drops any
+  result dated after the cutoff (and any undated result whose date cannot be
+  cheaply recovered), dashboard URLs are rewritten to the closest Wayback
+  snapshot at or before the cutoff (or suppressed if none exists), and the
+  extraction stage fetches from Wayback. A fallback to a live fetch is never
+  silent: it is logged at INFO and recorded in `Document.fetch_strategy`.
+
+The LLM "historical roleplay" prompt is *not* automatically enabled by
+`as_of_date`; it lives behind a separate `historical_roleplay=True` flag on
+`SearchStagePipeline` because its effect on query quality is harder to
+predict. Turn it on for the benchmark and off for production.
+
+What this mode does NOT fix: the LLMs themselves were trained on data that
+postdates many of our benchmark questions. Retrieval fairness ≠ model
+fairness. The `retrieval_free_baseline_forecast` metric in
+`bioscancast/stages/eval_stage/contamination.py` reports how well the LLM
+forecasts with no evidence at all; a small gap between that baseline and
+the full pipeline is itself evidence of training-data leakage and must be
+reported alongside the headline Brier/log scores.
+
+`filter_caught_contamination_rate` is also exposed by the same module. It
+is a **lower bound** on contamination — it only counts post-cutoff results
+whose `published_date` is known. Undated results and results whose content
+changed post-cutoff are invisible to it. Reports MUST surface this caveat;
+the metric's docstring repeats it for the same reason.
+
 ---
 
 # Datasets

diff --git a/bioscancast/extraction/fetcher.py b/bioscancast/extraction/fetcher.py
@@ -7,6 +7,8 @@
 
 from curl_cffi import requests as curl_requests
 
+from bioscancast.stages.search_stage.wayback import closest_snapshot_before
+
 from .config import ExtractionConfig
 
 logger = logging.getLogger(__name__)
@@ -25,6 +27,8 @@ class FetchResult:
     content_bytes: Optional[bytes]
     fetched_at: datetime
     error: Optional[str]
+    fetch_strategy: str = "live"
+    snapshot_timestamp: Optional[datetime] = None
 
 
 def _sniff_content_type(content: bytes) -> Optional[str]:
@@ -51,22 +55,72 @@ def fetch(
     url: str,
     *,
     config: ExtractionConfig | None = None,
+    as_of_date: Optional[datetime] = None,
 ) -> FetchResult:
     """Fetch a URL and return the result. Never raises on network errors.
 
     Uses curl_cffi with a browser TLS fingerprint (configurable via
     ExtractionConfig.impersonate) to avoid Cloudflare/JA3-based blocks that
     reject httpx and requests. The impersonation profile sets a matching
     User-Agent automatically.
+
+    Historical-replay mode: when ``as_of_date`` is set the function first
+    asks Wayback for the closest capture at-or-before that date and fetches
+    the raw snapshot bytes via the ``id_`` modifier. The returned FetchResult
+    carries ``fetch_strategy="wayback"`` and ``snapshot_timestamp`` set to
+    the capture time. If no snapshot exists, or the Wayback fetch errors or
+    returns no content, we fall back to a live fetch and tag the result
+    ``fetch_strategy="wayback_fallback_to_live"`` so audit reports can see
+    the leak. The fallback is logged at INFO — never silent.
     """
+    if as_of_date is not None:
+        snapshot = closest_snapshot_before(url, as_of_date)
+        if snapshot is not None:
+            snapshot_dt, snapshot_url = snapshot
+            wb_result = _fetch_via_curl(
+                target_url=snapshot_url,
+                reported_url=url,
+                config=config,
+            )
+            if wb_result.error is None and wb_result.content_bytes is not None:
+                wb_result.fetch_strategy = "wayback"
+                wb_result.snapshot_timestamp = snapshot_dt
+                return wb_result
+            logger.info(
+                "Wayback fetch failed for %s (snapshot %s, error=%s); "
+                "falling back to live",
+                url, snapshot_dt.isoformat(), wb_result.error,
+            )
+        else:
+            logger.info(
+                "No Wayback snapshot for %s at-or-before %s; falling back to live",
+                url, as_of_date.isoformat(),
+            )
+        live_result = _fetch_via_curl(target_url=url, reported_url=url, config=config)
+        live_result.fetch_strategy = "wayback_fallback_to_live"
+        return live_result
+
+    return _fetch_via_curl(target_url=url, reported_url=url, config=config)
+
+
+def _fetch_via_curl(
+    *,
+    target_url: str,
+    reported_url: str,
+    config: ExtractionConfig | None,
+) -> FetchResult:
+    """Issue the actual HTTP GET. ``target_url`` is what we hit (may be a
+    Wayback ``id_`` URL); ``reported_url`` is recorded in ``FetchResult.url``
+    so consumers see the publisher URL. ``final_url`` is the response URL
+    (may be archive.org), or ``reported_url`` if the request raises."""
     cfg = config or ExtractionConfig()
     fetched_at = datetime.now(timezone.utc)
 
     try:
         # curl_cffi's streaming Response is not a context manager in the
         # installed version, so we close it explicitly in a finally block.
         response = curl_requests.get(
-            url,
+            target_url,
             stream=True,
             timeout=cfg.fetch_timeout_seconds,
             impersonate=cfg.impersonate,
@@ -76,7 +130,7 @@ def fetch(
             content_length = response.headers.get("content-length")
             if content_length and int(content_length) > cfg.fetch_max_bytes:
                 return FetchResult(
-                    url=url,
+                    url=reported_url,
                     final_url=str(response.url),
                     status_code=response.status_code,
                     content_type=_normalize_content_type(
@@ -95,7 +149,7 @@ def fetch(
                 total += len(chunk)
                 if total > cfg.fetch_max_bytes:
                     return FetchResult(
-                        url=url,
+                        url=reported_url,
                         final_url=str(response.url),
                         status_code=response.status_code,
                         content_type=_normalize_content_type(
@@ -118,7 +172,7 @@ def fetch(
                 raw_ct = _sniff_content_type(content_bytes) or raw_ct
 
             return FetchResult(
-                url=url,
+                url=reported_url,
                 final_url=str(response.url),
                 status_code=response.status_code,
                 content_type=raw_ct,
@@ -130,10 +184,10 @@ def fetch(
             response.close()
 
     except Exception as exc:
-        logger.warning("Fetch failed for %s: %s", url, exc)
+        logger.warning("Fetch failed for %s: %s", target_url, exc)
         return FetchResult(
-            url=url,
-            final_url=url,
+            url=reported_url,
+            final_url=reported_url,
             status_code=None,
             content_type=None,
             content_bytes=None,