Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 5 additions & 7 deletions bioscancast/filtering/llm_filter.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
from __future__ import annotations

import json
from typing import Dict, List, Protocol

from .models import FilterDecision, ForecastQuestion, SearchResult
from typing import Dict, List

from bioscancast.llm.client import LLMClient

class LLMClient(Protocol):
def generate_json(self, prompt: str) -> dict:
...
from .models import FilterDecision, ForecastQuestion, SearchResult


def build_filter_prompt(
Expand All @@ -20,7 +17,8 @@ def build_filter_prompt(
"You are filtering search results for a biosecurity forecasting pipeline. "
"Keep only candidates likely to contain relevant factual evidence for forecasting. "
"Prefer official, primary, recent, and event-specific sources. "
"Reject low-information, generic, duplicated, or weakly relevant pages."
"Reject low-information, generic, duplicated, or weakly relevant pages. "
"Return your response as JSON matching the output_schema below."
),
"question": {
"id": question.id,
Expand Down
4 changes: 3 additions & 1 deletion bioscancast/filtering/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@

from typing import List, Optional

from bioscancast.llm.client import LLMClient

from .config import FILTER_CONFIG
from .deduplication import deduplicate_filtered_documents
from .heuristics import heuristic_filter
from .llm_filter import LLMClient, llm_filter_candidates
from .llm_filter import llm_filter_candidates
from .models import FilterDecision, FilteredDocument, ForecastQuestion, SearchResult
from .postprocess import assign_extraction_hints, build_filtered_documents, cap_per_domain_and_type
from .reranker import rerank_borderline_candidates, split_for_llm_review
Expand Down
5 changes: 5 additions & 0 deletions bioscancast/insight/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
"embedding_model": "text-embedding-3-small",
"max_input_tokens_per_run": 500_000,
"max_chunks_per_document": 12,
"extraction_max_output_tokens": 4096,
}


Expand All @@ -32,6 +33,10 @@ class InsightConfig:
embedding_model: str = "text-embedding-3-small"
max_input_tokens_per_run: int = 500_000
max_chunks_per_document: int = 12
extraction_max_output_tokens: int = 4096
"""Per-call cap on LLM output tokens for chunk extraction. The default
1024 ceiling in LLMClient.generate_json truncates dense pages (e.g. the
ECDC CDTR) mid-JSON; 4096 leaves comfortable headroom."""

@classmethod
def from_dict(cls, d: dict) -> InsightConfig:
Expand Down
4 changes: 4 additions & 0 deletions bioscancast/insight/extraction/chunk_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ def extract_facts_from_chunk(
llm_client: LLMClient,
*,
model: str,
max_tokens: int = 4096,
) -> tuple[list[InsightRecord], LLMResponse]:
"""Extract structured facts from a single chunk via LLM.

Expand All @@ -118,6 +119,8 @@ def extract_facts_from_chunk(
question: Forecast question for context.
llm_client: LLM client (fake or real).
model: Model identifier for extraction.
max_tokens: Per-call output-token cap. Raise this if the model is
truncating dense pages mid-JSON.

Returns:
Tuple of (list of InsightRecords, LLMResponse for budget tracking).
Expand All @@ -130,6 +133,7 @@ def extract_facts_from_chunk(
user=user,
schema=schema,
model=model,
max_tokens=max_tokens,
)

facts_raw = response.content.get("facts", [])
Expand Down
13 changes: 12 additions & 1 deletion bioscancast/insight/extraction/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,18 @@ def build_extraction_prompt(
"summary": {"type": ["string", "null"]},
"quote": {"type": "string"},
},
"required": ["event_type", "confidence", "quote"],
"required": [
"event_type",
"confidence",
"location",
"pathogen",
"metric_name",
"metric_value",
"metric_unit",
"event_date",
"summary",
"quote",
],
"additionalProperties": False,
},
},
Expand Down
1 change: 1 addition & 0 deletions bioscancast/insight/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ def run(
question,
self._llm,
model=config.cheap_model,
max_tokens=config.extraction_max_output_tokens,
)
budget.record(response)
all_records.extend(records)
Expand Down
21 changes: 18 additions & 3 deletions bioscancast/insight/retrieval/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,23 @@
from bioscancast.llm.base import LLMClient


def _embeddable_text(chunk: DocumentChunk) -> str:
    """Build a non-blank text representation of a chunk for embedding.

    Table chunks have empty ``.text`` because their content lives in
    ``.table_data``; rendering the rows inline (cells joined by ``" | "``,
    rows joined by newlines) lets the embedding model see the actual
    values. The heading, when present, is prepended to the body.

    Args:
        chunk: Chunk exposing ``.text``, ``.table_data`` and ``.heading``.

    Returns:
        The heading-prefixed text, or the placeholder ``"[empty chunk]"``
        when nothing but whitespace would otherwise be produced.
    """
    text = chunk.text or ""

    # Only fall back to the table when there is no real prose; a chunk with
    # both keeps its prose unchanged.
    if not text.strip() and chunk.table_data:
        text = "\n".join(
            " | ".join(str(cell) for cell in row) for row in chunk.table_data
        )

    if chunk.heading:
        text = f"{chunk.heading} {text}".strip()

    # A plain truthiness test would let whitespace-only strings through
    # (e.g. ``"   "`` or a table of empty rows rendering to ``"\n"``). Such
    # input carries no content, and the embeddings API rejects empty
    # strings, so substitute the placeholder instead.
    if text.strip():
        return text
    return "[empty chunk]"


def embed_chunks(
chunks: list[DocumentChunk],
llm_client: LLMClient,
Expand Down Expand Up @@ -47,9 +64,7 @@ def embed_chunks(

for i, chunk in enumerate(chunks):
if chunk.chunk_id not in cache:
text = chunk.text
if chunk.heading:
text = chunk.heading + " " + text
text = _embeddable_text(chunk)
texts_to_embed.append(text)
indices_to_embed.append(i)

Expand Down
22 changes: 21 additions & 1 deletion bioscancast/llm/openai_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,14 @@
from __future__ import annotations

import json
import logging
import os
from typing import Optional

from .base import LLMResponse

logger = logging.getLogger(__name__)


class OpenAILLMClient:
"""Production LLM client using the OpenAI API.
Expand Down Expand Up @@ -63,8 +66,25 @@ def generate_json(
seed=self._seed,
)
raw_text = response.choices[0].message.content or "{}"
content = json.loads(raw_text)
usage = response.usage
try:
content = json.loads(raw_text)
except json.JSONDecodeError as exc:
# Malformed JSON from the model — typically caused by hitting
# max_tokens mid-string. Don't raise: the HTTP call succeeded
# and we already paid for the tokens, so preserve the budget
# numbers and let the caller handle an empty content dict.
logger.warning(
"generate_json: dropping unparseable model response "
"(model=%s, output_tokens=%s, finish_reason=%s, err=%s). "
"Raw text head: %r",
response.model,
usage.completion_tokens if usage else None,
response.choices[0].finish_reason,
exc,
raw_text[:200],
)
content = {}
return LLMResponse(
content=content,
input_tokens=usage.prompt_tokens if usage else 0,
Expand Down