diff --git a/bioscancast/filtering/llm_filter.py b/bioscancast/filtering/llm_filter.py index 057c649..2bcaa70 100644 --- a/bioscancast/filtering/llm_filter.py +++ b/bioscancast/filtering/llm_filter.py @@ -1,14 +1,11 @@ from __future__ import annotations import json -from typing import Dict, List, Protocol - -from .models import FilterDecision, ForecastQuestion, SearchResult +from typing import Dict, List +from bioscancast.llm.client import LLMClient -class LLMClient(Protocol): - def generate_json(self, prompt: str) -> dict: - ... +from .models import FilterDecision, ForecastQuestion, SearchResult def build_filter_prompt( @@ -20,7 +17,8 @@ def build_filter_prompt( "You are filtering search results for a biosecurity forecasting pipeline. " "Keep only candidates likely to contain relevant factual evidence for forecasting. " "Prefer official, primary, recent, and event-specific sources. " - "Reject low-information, generic, duplicated, or weakly relevant pages." + "Reject low-information, generic, duplicated, or weakly relevant pages. " + "Return your response as JSON matching the output_schema below." ), "question": { "id": question.id, diff --git a/bioscancast/filtering/pipeline.py b/bioscancast/filtering/pipeline.py index 7f1f1f7..23443d9 100644 --- a/bioscancast/filtering/pipeline.py +++ b/bioscancast/filtering/pipeline.py @@ -2,10 +2,12 @@ from typing import List, Optional +from bioscancast.llm.client import LLMClient + from .config import FILTER_CONFIG from .deduplication import deduplicate_filtered_documents from .heuristics import heuristic_filter -from .llm_filter import LLMClient, llm_filter_candidates +from .llm_filter import llm_filter_candidates from .models import FilterDecision, FilteredDocument, ForecastQuestion, SearchResult from .postprocess import assign_extraction_hints, build_filtered_documents, cap_per_domain_and_type from .reranker import rerank_borderline_candidates, split_for_llm_review diff --git a/bioscancast/insight/config.py b/bioscancast/insight/config.py index 37be3f4..05cc4af 100644 --- a/bioscancast/insight/config.py +++ b/bioscancast/insight/config.py @@ -16,6 +16,7 @@ "embedding_model": "text-embedding-3-small", "max_input_tokens_per_run": 500_000, "max_chunks_per_document": 12, + "extraction_max_output_tokens": 4096, } @@ -32,6 +33,10 @@ class InsightConfig: embedding_model: str = "text-embedding-3-small" max_input_tokens_per_run: int = 500_000 max_chunks_per_document: int = 12 + extraction_max_output_tokens: int = 4096 + """Per-call cap on LLM output tokens for chunk extraction. The default + 1024 ceiling in LLMClient.generate_json truncates dense pages (e.g. the + ECDC CDTR) mid-JSON; 4096 leaves comfortable headroom.""" @classmethod def from_dict(cls, d: dict) -> InsightConfig: diff --git a/bioscancast/insight/extraction/chunk_extractor.py b/bioscancast/insight/extraction/chunk_extractor.py index 2f82a0c..2a4666b 100644 --- a/bioscancast/insight/extraction/chunk_extractor.py +++ b/bioscancast/insight/extraction/chunk_extractor.py @@ -109,6 +109,7 @@ def extract_facts_from_chunk( llm_client: LLMClient, *, model: str, + max_tokens: int = 4096, ) -> tuple[list[InsightRecord], LLMResponse]: """Extract structured facts from a single chunk via LLM. @@ -118,6 +119,8 @@ def extract_facts_from_chunk( question: Forecast question for context. llm_client: LLM client (fake or real). model: Model identifier for extraction. + max_tokens: Per-call output-token cap. Raise this if the model is + truncating dense pages mid-JSON. Returns: Tuple of (list of InsightRecords, LLMResponse for budget tracking). @@ -130,6 +133,7 @@ def extract_facts_from_chunk( user=user, schema=schema, model=model, + max_tokens=max_tokens, ) facts_raw = response.content.get("facts", []) diff --git a/bioscancast/insight/extraction/prompts.py b/bioscancast/insight/extraction/prompts.py index fbd3a50..b3bdfd8 100644 --- a/bioscancast/insight/extraction/prompts.py +++ b/bioscancast/insight/extraction/prompts.py @@ -115,7 +115,18 @@ def build_extraction_prompt( "summary": {"type": ["string", "null"]}, "quote": {"type": "string"}, }, - "required": ["event_type", "confidence", "quote"], + "required": [ + "event_type", + "confidence", + "location", + "pathogen", + "metric_name", + "metric_value", + "metric_unit", + "event_date", + "summary", + "quote", + ], "additionalProperties": False, }, }, diff --git a/bioscancast/insight/pipeline.py b/bioscancast/insight/pipeline.py index 57896c7..4557672 100644 --- a/bioscancast/insight/pipeline.py +++ b/bioscancast/insight/pipeline.py @@ -120,6 +120,7 @@ def run( question, self._llm, model=config.cheap_model, + max_tokens=config.extraction_max_output_tokens, ) budget.record(response) all_records.extend(records) diff --git a/bioscancast/insight/retrieval/embeddings.py b/bioscancast/insight/retrieval/embeddings.py index 6eae70d..adaf4c5 100644 --- a/bioscancast/insight/retrieval/embeddings.py +++ b/bioscancast/insight/retrieval/embeddings.py @@ -20,6 +20,23 @@ from bioscancast.llm.base import LLMClient +def _embeddable_text(chunk: DocumentChunk) -> str: + """Build a non-empty text representation of a chunk for embedding. + + Table chunks have empty ``.text`` because their content lives in + ``.table_data``; rendering the rows inline lets the embedding model + see the actual values. Falls back to the heading or a placeholder + if both are empty, since the embeddings API rejects empty strings. + """ + text = chunk.text or "" + if not text.strip() and chunk.table_data: + rows = [" | ".join(str(cell) for cell in row) for row in chunk.table_data] + text = "\n".join(rows) + if chunk.heading: + text = f"{chunk.heading} {text}".strip() + return text or (chunk.heading or "[empty chunk]") + + def embed_chunks( chunks: list[DocumentChunk], llm_client: LLMClient, @@ -47,9 +64,7 @@ def embed_chunks( for i, chunk in enumerate(chunks): if chunk.chunk_id not in cache: - text = chunk.text - if chunk.heading: - text = chunk.heading + " " + text + text = _embeddable_text(chunk) texts_to_embed.append(text) indices_to_embed.append(i) diff --git a/bioscancast/llm/openai_client.py b/bioscancast/llm/openai_client.py index 7dc4f04..6cc5ee9 100644 --- a/bioscancast/llm/openai_client.py +++ b/bioscancast/llm/openai_client.py @@ -7,11 +7,14 @@ from __future__ import annotations import json +import logging import os from typing import Optional from .base import LLMResponse +logger = logging.getLogger(__name__) + class OpenAILLMClient: """Production LLM client using the OpenAI API. @@ -63,8 +66,25 @@ def generate_json( seed=self._seed, ) raw_text = response.choices[0].message.content or "{}" - content = json.loads(raw_text) usage = response.usage + try: + content = json.loads(raw_text) + except json.JSONDecodeError as exc: + # Malformed JSON from the model — typically caused by hitting + # max_tokens mid-string. Don't raise: the HTTP call succeeded + # and we already paid for the tokens, so preserve the budget + # numbers and let the caller handle an empty content dict. + logger.warning( + "generate_json: dropping unparseable model response " + "(model=%s, output_tokens=%s, finish_reason=%s, err=%s). " + "Raw text head: %r", + response.model, + usage.completion_tokens if usage else None, + response.choices[0].finish_reason, + exc, + raw_text[:200], + ) + content = {} return LLMResponse( content=content, input_tokens=usage.prompt_tokens if usage else 0,