algorithmicgovernance · rapsoj · May 18, 2026 · May 12, 2026 · May 12, 2026 · May 17, 2026
diff --git a/bioscancast/filtering/llm_filter.py b/bioscancast/filtering/llm_filter.py
@@ -1,14 +1,11 @@
 from __future__ import annotations
 
 import json
-from typing import Dict, List, Protocol
-
-from .models import FilterDecision, ForecastQuestion, SearchResult
+from typing import Dict, List
 
+from bioscancast.llm.client import LLMClient
 
-class LLMClient(Protocol):
-    def generate_json(self, prompt: str) -> dict:
-        ...
+from .models import FilterDecision, ForecastQuestion, SearchResult
 
 
 def build_filter_prompt(
@@ -20,7 +17,8 @@ def build_filter_prompt(
             "You are filtering search results for a biosecurity forecasting pipeline. "
             "Keep only candidates likely to contain relevant factual evidence for forecasting. "
             "Prefer official, primary, recent, and event-specific sources. "
-            "Reject low-information, generic, duplicated, or weakly relevant pages."
+            "Reject low-information, generic, duplicated, or weakly relevant pages. "
+            "Return your response as JSON matching the output_schema below."
         ),
         "question": {
             "id": question.id,

diff --git a/bioscancast/filtering/pipeline.py b/bioscancast/filtering/pipeline.py
@@ -2,10 +2,12 @@
 
 from typing import List, Optional
 
+from bioscancast.llm.client import LLMClient
+
 from .config import FILTER_CONFIG
 from .deduplication import deduplicate_filtered_documents
 from .heuristics import heuristic_filter
-from .llm_filter import LLMClient, llm_filter_candidates
+from .llm_filter import llm_filter_candidates
 from .models import FilterDecision, FilteredDocument, ForecastQuestion, SearchResult
 from .postprocess import assign_extraction_hints, build_filtered_documents, cap_per_domain_and_type
 from .reranker import rerank_borderline_candidates, split_for_llm_review

diff --git a/bioscancast/insight/config.py b/bioscancast/insight/config.py
@@ -16,6 +16,7 @@
     "embedding_model": "text-embedding-3-small",
     "max_input_tokens_per_run": 500_000,
     "max_chunks_per_document": 12,
+    "extraction_max_output_tokens": 4096,
 }
 
 
@@ -32,6 +33,10 @@ class InsightConfig:
     embedding_model: str = "text-embedding-3-small"
     max_input_tokens_per_run: int = 500_000
     max_chunks_per_document: int = 12
+    extraction_max_output_tokens: int = 4096
+    """Per-call cap on LLM output tokens for chunk extraction. The default
+    1024 ceiling in LLMClient.generate_json truncates dense pages (e.g. the
+    ECDC CDTR) mid-JSON; 4096 leaves comfortable headroom."""
 
     @classmethod
     def from_dict(cls, d: dict) -> InsightConfig:

diff --git a/bioscancast/insight/extraction/chunk_extractor.py b/bioscancast/insight/extraction/chunk_extractor.py
@@ -109,6 +109,7 @@ def extract_facts_from_chunk(
     llm_client: LLMClient,
     *,
     model: str,
+    max_tokens: int = 4096,
 ) -> tuple[list[InsightRecord], LLMResponse]:
     """Extract structured facts from a single chunk via LLM.
 
@@ -118,6 +119,8 @@ def extract_facts_from_chunk(
         question: Forecast question for context.
         llm_client: LLM client (fake or real).
         model: Model identifier for extraction.
+        max_tokens: Per-call output-token cap. Raise this if the model is
+            truncating dense pages mid-JSON.
 
     Returns:
         Tuple of (list of InsightRecords, LLMResponse for budget tracking).
@@ -130,6 +133,7 @@ def extract_facts_from_chunk(
         user=user,
         schema=schema,
         model=model,
+        max_tokens=max_tokens,
     )
 
     facts_raw = response.content.get("facts", [])

diff --git a/bioscancast/insight/extraction/prompts.py b/bioscancast/insight/extraction/prompts.py
@@ -115,7 +115,18 @@ def build_extraction_prompt(
                     "summary": {"type": ["string", "null"]},
                     "quote": {"type": "string"},
                 },
-                "required": ["event_type", "confidence", "quote"],
+                "required": [
+                    "event_type",
+                    "confidence",
+                    "location",
+                    "pathogen",
+                    "metric_name",
+                    "metric_value",
+                    "metric_unit",
+                    "event_date",
+                    "summary",
+                    "quote",
+                ],
                 "additionalProperties": False,
             },
         },

diff --git a/bioscancast/insight/pipeline.py b/bioscancast/insight/pipeline.py
@@ -120,6 +120,7 @@ def run(
                     question,
                     self._llm,
                     model=config.cheap_model,
+                    max_tokens=config.extraction_max_output_tokens,
                 )
                 budget.record(response)
                 all_records.extend(records)

diff --git a/bioscancast/insight/retrieval/embeddings.py b/bioscancast/insight/retrieval/embeddings.py
@@ -20,6 +20,23 @@
     from bioscancast.llm.base import LLMClient
 
 
+def _embeddable_text(chunk: DocumentChunk) -> str:
+    """Build a non-blank text representation of a chunk for embedding.
+
+    Table chunks have empty ``.text`` because their content lives in
+    ``.table_data``; rendering the rows inline lets the embedding model
+    see the actual values. A heading, if present, is prepended. Blank
+    results become a placeholder: the embeddings API rejects empty strings.
+    """
+    text = chunk.text or ""
+    if not text.strip() and chunk.table_data:
+        text = "\n".join(" | ".join(map(str, row)) for row in chunk.table_data)
+    if chunk.heading:
+        text = f"{chunk.heading} {text}".strip()
+    # Whitespace-only output is as useless to the embedder as "", so swap it.
+    return text if text.strip() else "[empty chunk]"
+
+
 def embed_chunks(
     chunks: list[DocumentChunk],
     llm_client: LLMClient,
@@ -47,9 +64,7 @@ def embed_chunks(
 
     for i, chunk in enumerate(chunks):
         if chunk.chunk_id not in cache:
-            text = chunk.text
-            if chunk.heading:
-                text = chunk.heading + " " + text
+            text = _embeddable_text(chunk)
             texts_to_embed.append(text)
             indices_to_embed.append(i)
 

diff --git a/bioscancast/llm/openai_client.py b/bioscancast/llm/openai_client.py
@@ -7,11 +7,14 @@
 from __future__ import annotations
 
 import json
+import logging
 import os
 from typing import Optional
 
 from .base import LLMResponse
 
+logger = logging.getLogger(__name__)
+
 
 class OpenAILLMClient:
     """Production LLM client using the OpenAI API.
@@ -63,8 +66,25 @@ def generate_json(
             seed=self._seed,
         )
         raw_text = response.choices[0].message.content or "{}"
-        content = json.loads(raw_text)
         usage = response.usage
+        try:
+            content = json.loads(raw_text)
+        except json.JSONDecodeError as exc:
+            # Malformed JSON from the model — typically caused by hitting
+            # max_tokens mid-string. Don't raise: the HTTP call succeeded
+            # and we already paid for the tokens, so preserve the budget
+            # numbers and let the caller handle an empty content dict.
+            logger.warning(
+                "generate_json: dropping unparseable model response "
+                "(model=%s, output_tokens=%s, finish_reason=%s, err=%s). "
+                "Raw text head: %r",
+                response.model,
+                usage.completion_tokens if usage else None,
+                response.choices[0].finish_reason,
+                exc,
+                raw_text[:200],
+            )
+            content = {}
         return LLMResponse(
             content=content,
             input_tokens=usage.prompt_tokens if usage else 0,