In [1]:
# Mock Setup - Hidden in rendered documentation
# This cell is tagged with "hide-cell" in notebook metadata

import sys
import tempfile
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch

# Add karenina to path.
# NOTE(review): this is a machine-specific absolute path; adjust it for your checkout.
KARENINA_SRC = "/Users/carli/Projects/karenina-monorepo/karenina/src"
# Drop any earlier copies first so re-running this cell keeps the source tree
# at the front of sys.path without accumulating duplicate entries.
sys.path[:] = [entry for entry in sys.path if entry != KARENINA_SRC]
sys.path.insert(0, KARENINA_SRC)

# Temporary directory for file operations
TEMP_DIR = Path(tempfile.mkdtemp(prefix="karenina_docs_"))


# Mock LLM response generator
class MockLLMResponse:
    """Stand-in for a LangChain-style message that carries its text in ``content``."""

    def __init__(self, content: str = "BCL2"):
        # Stored verbatim; read back through ``.content`` or ``str()``.
        self.content = content

    def __str__(self):
        text = self.content
        return text


class MockStructuredOutput:
    """Canned parsed-template result used in place of a real structured LLM output.

    Six attributes always exist and fall back to fixed demo values; any other
    keyword argument becomes an attribute as well, unless an attribute or
    method of that name is already present on the instance.
    """

    def __init__(self, **kwargs):
        # Built per call so every instance gets its own ``diseases`` list.
        fallbacks = {
            "count": 46,
            "target": "BCL2",
            "subunits": 4,
            "diseases": ["asthma", "bronchitis", "pneumonia"],
            "mentions_bcl2_protein": True,
            "mentions_apoptosis_regulation": False,
        }
        for name, fallback in fallbacks.items():
            setattr(self, name, kwargs.get(name, fallback))

        # Extra keywords are attached only when the name is still free.
        for name, value in kwargs.items():
            if hasattr(self, name):
                continue
            setattr(self, name, value)

    def dict(self):
        """Return the instance attributes whose names do not start with ``_``."""
        public = {}
        for name, value in vars(self).items():
            if name.startswith("_"):
                continue
            public[name] = value
        return public

    def model_dump(self):
        """Return the same mapping as :meth:`dict`."""
        exported = self.dict()
        return exported


def create_mock_chat_model(default_response: str = "BCL2"):
    """Create a mock chat model that returns predictable responses.

    Args:
        default_response: Text placed in the ``content`` of the message
            returned by ``invoke`` and ``ainvoke``.

    Returns:
        A ``MagicMock`` whose ``invoke`` returns a ``MockLLMResponse`` and whose
        ``ainvoke`` is awaitable and resolves to one. ``with_structured_output``
        returns a second mock whose ``invoke``/``ainvoke`` yield a default
        ``MockStructuredOutput``; ``bind_tools`` returns the model itself.
    """
    mock = MagicMock()
    mock.invoke.return_value = MockLLMResponse(default_response)
    # ainvoke must be awaitable: a plain MagicMock return value cannot be
    # awaited, so ``await model.ainvoke(...)`` would raise TypeError.
    mock.ainvoke = AsyncMock(return_value=MockLLMResponse(default_response))

    structured_mock = MagicMock()
    structured_mock.invoke.return_value = MockStructuredOutput()
    structured_mock.ainvoke = AsyncMock(return_value=MockStructuredOutput())
    mock.with_structured_output.return_value = structured_mock

    # Binding tools hands back the same mock so chained calls keep working.
    mock.bind_tools.return_value = mock
    return mock


def create_mock_benchmark():
    """Build the demo ``Benchmark`` used in documentation examples.

    Returns:
        Whatever ``Benchmark.create`` returns for the fixed demo metadata below.
    """
    # Imported here rather than at module level, as in the original helper.
    from karenina import Benchmark

    return Benchmark.create(
        name="Demo Benchmark",
        description="Mock benchmark for documentation",
        version="1.0.0",
        creator="Documentation",
    )


# Patch all LLM providers before any imports
def _mock_model_factory(*args, **kwargs):
    """Return a fresh mock chat model, ignoring whatever arguments were passed.

    Accepts positional as well as keyword arguments so the patched
    constructors and factory work regardless of how callers invoke them.
    """
    return create_mock_chat_model()


# Each entry replaces one provider entry point with a mock that calls the factory above.
_llm_patches = [
    patch("langchain_openai.ChatOpenAI", side_effect=_mock_model_factory),
    patch("langchain_anthropic.ChatAnthropic", side_effect=_mock_model_factory),
    patch("langchain_google_genai.ChatGoogleGenerativeAI", side_effect=_mock_model_factory),
    patch(
        "karenina.infrastructure.llm.interface.init_chat_model_unified",
        side_effect=_mock_model_factory,
    ),
]

# Activate the patches; they stay active until _cleanup() stops them.
for p in _llm_patches:
    p.start()


# Helper to replace file paths in examples
def temp_path(filename: str) -> Path:
    """Return ``filename`` located inside the notebook's temporary directory."""
    return TEMP_DIR.joinpath(filename)


# Cleanup on kernel shutdown
import atexit
import shutil


def _cleanup():
    """Stop every LLM patch and delete the temporary directory.

    Intended to run at interpreter exit, so it is best-effort: a failure to
    stop one patch must not prevent the remaining patches or the directory
    removal from being handled.
    """
    for p in _llm_patches:
        try:
            p.stop()
        except Exception:
            # Deliberately ignored (e.g. the patch was never started). Unlike a
            # bare ``except:``, this lets KeyboardInterrupt/SystemExit through.
            pass
    shutil.rmtree(TEMP_DIR, ignore_errors=True)


# Undo the patches and remove the temp directory when the interpreter exits.
atexit.register(_cleanup)

# Setup banner, emitted as a single write of three lines.
print(
    "\n".join(
        [
            "✓ Mock setup complete",
            f"✓ Temp directory: {TEMP_DIR}",
            "✓ Karenina package loaded from: /Users/carli/Projects/karenina-monorepo/karenina/src",
        ]
    )
)

✓ Mock setup complete
✓ Temp directory: /var/folders/34/129m5tdd04vf10ptyj12w6f80000gp/T/karenina_docs_opgi7rlv
✓ Karenina package loaded from: /Users/carli/Projects/karenina-monorepo/karenina/src


# Answer Templates

Templates define how to evaluate LLM responses programmatically. This guide covers what templates are, why they're useful, and how to create them automatically or manually.

**Quick Navigation:**

- [What Are Templates?](#what-are-templates) - Core concepts and structure
- [Why Use Templates?](#why-use-templates) - Benefits and use cases
- [Automatic Template Generation](#automatic-template-generation-recommended) - Recommended LLM-based approach
- [Manual Template Creation](#manual-template-creation-advanced) - Advanced custom templates
- [When to Use Which Approach](#when-to-use-which-approach) - Decision guide for automatic vs manual
- [Complete Example](#complete-example) - End-to-end workflow

## What Are Templates?

**Answer templates** are Pydantic classes that specify:

- **What information to extract** from free-text LLM responses
- **How to verify correctness** by comparing extracted data against expected answers
- **The structure of expected answers** (e.g., a drug name, a number, a list of items)

Templates enable **LLM-as-a-judge evaluation**: The answering model generates free text, and the judge model extracts structured data from that text using the template schema. The template then programmatically verifies correctness.

## Why Use Templates?

Templates provide several key benefits:

1. **Flexible Input**: Answering models can respond naturally without strict formatting constraints
2. **Structured Evaluation**: Judge models extract specific fields, making evaluation deterministic
3. **Programmatic Verification**: The `verify()` method implements custom logic for checking correctness
4. **Reusable Patterns**: Templates can be generated automatically for common question types
5. **Transparent Logic**: Evaluation criteria are explicit and inspectable

## Automatic Template Generation (Recommended)

**The recommended approach** is to let Karenina automatically generate templates using an LLM. This is fast, consistent, and works well for most question types.

### Basic Generation

In [2]:
from karenina import Benchmark
from karenina.schemas import ModelConfig

# Build the benchmark, then register each question with its reference answer
benchmark = Benchmark.create(name="Genomics Knowledge Benchmark")

genomics_qa_pairs = (
    ("How many chromosomes are in a human somatic cell?", "46"),
    ("What is the approved drug target of Venetoclax?", "BCL2"),
    ("How many protein subunits does hemoglobin A have?", "4"),
)
for question_text, reference_answer in genomics_qa_pairs:
    benchmark.add_question(question=question_text, raw_answer=reference_answer)

# Generate templates for every question in one call
# Note: Using individual parameters instead of model_config
print("Generating templates...")
results = benchmark.generate_all_templates(
    model="gpt-4.1-mini",
    model_provider="openai",
    temperature=0.1,
    interface="langchain",
)

print(f"✓ Generated {len(results)} templates successfully")

Generating templates...
✓ Generated 3 templates successfully


**What happens:**

1. Karenina sends each question + answer to the LLM
2. The LLM generates a Pydantic class tailored to that specific question
3. The template is automatically validated and associated with the question
4. Questions are marked as "finished" and ready for verification

### Generated Template Example

For the question "What is the approved drug target of Venetoclax?" with answer "BCL2", the LLM might generate:

```python
class Answer(BaseAnswer):
    target: str = Field(description="The protein target mentioned in the response")

    def model_post_init(self, __context):
        self.correct = {"target": "BCL2"}

    def verify(self) -> bool:
        return self.target.strip().upper() == self.correct["target"].upper()
```

This template:

- Extracts the `target` field from free-text responses
- Compares it case-insensitively against "BCL2"
- Returns `True` if they match, `False` otherwise

### How Automatic Template Generation Works

Understanding the template generation process helps you troubleshoot issues and make informed decisions about when to use automatic vs manual templates.

**High-Level Process:**

When you call `generate_all_templates()`, Karenina performs a **three-phase structured generation** for each question:

**Phase 1: Ground Truth Extraction**

The LLM analyzes the question-answer pair and generates a JSON specification defining the attributes needed for verification.

Example for "What is the approved drug target of Venetoclax?" (answer: "BCL2"):

```json
{
  "attributes": [
    {
      "name": "mentions_bcl2_protein",
      "type": "bool",
      "ground_truth": true
    },
    {
      "name": "mentions_apoptosis_regulation",
      "type": "bool",
      "ground_truth": false
    }
  ]
}
```

**Important design principle:** The system strongly **suggests boolean-based evaluation** rather than free-text string matching. Text-based assessment is typically converted to boolean checks for concept presence.

**Phase 2: Field Description Generation**

Using the ground truth specification, the LLM generates clear instructions for each attribute that will guide the judge model during response parsing.

Example output:

```json
{
  "field_descriptions": {
    "mentions_bcl2_protein": "Answer with true if the response mentions BCL2 or semantically related terms; otherwise answer false.",
    "mentions_apoptosis_regulation": "Answer with true if the response discusses apoptosis regulation mechanisms; otherwise answer false."
  }
}
```

These descriptions emphasize **semantic equivalence** over exact string matching.

**Phase 3: Code Generation**

Karenina programmatically builds the Pydantic class using the structured outputs from Phases 1 and 2. The generated code includes:

- Field definitions with judge instructions from Phase 2
- The `model_post_init()` method with ground truth values from Phase 1
- The `verify()` method with type-appropriate comparison logic
- The `verify_granular()` method for partial credit (multi-attribute templates only)

**Validation and Storage**

After generation, Karenina validates the Python code and stores it with the question. If validation fails, the system retries with error context (up to 3 attempts total).

**What Makes This Approach Effective:**

- **Structured Outputs**: JSON schema validation ensures consistent, parseable results from the LLM
- **Semantic Evaluation**: Boolean attributes capture concept presence, making verification robust to paraphrasing
- **Type Safety**: Enforced constraints prevent ambiguous evaluation strategies
- **Retry Logic**: Failed validations trigger automatic regeneration with error context
- **Partial Credit**: Multi-attribute templates support granular scoring automatically

**Why Boolean Attributes?**

The system strongly prefers boolean attributes over string extraction because:

- **Flexibility**: Judges check if concepts are present, not exact phrases
- **Deterministic**: `true`/`false` comparisons are unambiguous
- **Robust**: Handles paraphrasing, synonyms, and variations naturally
- **Avoids pitfalls**: No need for case normalization, fuzzy matching, or string similarity thresholds

**Trade-off: Speed vs. Rigor**

The current approach **may expose ground truth to the judge model** through field descriptions. For example, asking "Answer with true if the response mentions BCL2" reveals that BCL2 is the expected answer. The judge becomes aware of what's "correct" rather than acting as a pure information extractor.

**Alternative approach (more rigorous but requires manual curation):**

- Have the judge extract information **without knowing the correct answer**
- Field descriptions would be neutral (e.g., "Extract the protein target mentioned")
- All verification logic stays in the `verify()` method
- Judge models act as pure parsers, not evaluators

**Current approach (faster but less rigorous):**

- Field descriptions include hints about correctness
- Allows automated template generation with minimal manual curation
- Judge models perform some evaluation during extraction
- Faster to deploy at scale

This is a **design trade-off**: a more rigorous benchmark requires more manual template curation, while the current automated approach prioritizes speed and scalability at the cost of some methodological purity.

If you need the more rigorous approach, see [Manual Template Creation](#manual-template-creation-advanced) for how to write templates with neutral field descriptions.

**When Generation Might Fail:**

Template generation works well for most questions, but may struggle with:

- **Highly ambiguous questions** where even the ground truth is unclear
- **Complex compositional logic** requiring interdependent attribute checks
- **Domain-specific tolerance requirements** (e.g., "within 10% is acceptable")
- **Unusual answer formats** that don't fit the structured attribute model

In these cases, you can fall back to [manual template creation](#manual-template-creation-advanced).

## Manual Template Creation (Advanced)

For full control over evaluation logic, you can write templates manually. This is useful for complex verification requirements or custom validation rules.

### Basic Template Structure

Templates inherit from `BaseAnswer` and must include these **three required components**:

**1. Field Definitions**

Fields specify what data to extract. Each field should have a clear description that guides the judge LLM:

In [3]:
from pydantic import Field

# The declarations below are written at module level purely for illustration;
# in a real template they appear inside the body of the Answer class.

# String fields
target: str = Field(description="The protein target mentioned in the response")

# Integer/Float fields
count: int = Field(description="The number of items mentioned")
# ge/le constrain the accepted value to the closed range [0.0, 1.0]
score: float = Field(description="Accuracy score 0.0-1.0", ge=0.0, le=1.0)

# Boolean fields (recommended for rigorous evaluation)
mentions_bcl2: bool = Field(description="Extract whether BCL2 protein is mentioned")

# List fields
proteins: list[str] = Field(description="List of proteins mentioned")

print("Field type examples shown above")

Field type examples shown above


**2. `model_post_init(self, __context)` Method** (required)

- **Purpose**: Initialize the ground truth values after Pydantic constructs the model
- **Returns**: `None` (no return value)
- **Usage**: Set `self.correct` dictionary with expected values

```python
def model_post_init(self, __context):
    self.correct = {"count": 46}
```

**3. `verify(self) -> bool` Method** (required)

- **Purpose**: Determine if the extracted answer is correct
- **Returns**: `bool` - `True` if correct, `False` if incorrect
- **Usage**: Compare extracted field values against `self.correct`

```python
def verify(self) -> bool:
    return self.count == self.correct["count"]
```

**Complete Example:**

```python
class Answer(BaseAnswer):
    count: int = Field(description="The number of chromosomes mentioned in the response")

    def model_post_init(self, __context):
        self.correct = {"count": 46}

    def verify(self) -> bool:
        return self.count == self.correct["count"]
```

### Optional Method: Granular Scoring

**`verify_granular(self) -> float`** (optional)

- **Purpose**: Calculate partial credit for multi-attribute templates
- **Returns**: `float` between 0.0 and 1.0 representing the fraction of correct attributes
- **Usage**: Count matching attributes and return the fraction that match (e.g., 1 of 2 correct gives `0.5`)
- **Note**: Automatically generated for multi-attribute templates; rarely needed for manual templates

In [4]:
# Example verify_granular implementation
def verify_granular(self) -> float:
    """Return the share of checked fields whose value equals the ground truth.

    ``field1`` and ``field2`` on the instance are compared with the entries of
    the same name in ``self.correct``; the number of matches is divided by the
    number of fields checked.
    """
    checked_fields = ("field1", "field2")
    matches = sum(1 for name in checked_fields if getattr(self, name) == self.correct[name])
    return matches / len(checked_fields)


print("verify_granular example shown above")

verify_granular example shown above


### Adding Manual Templates to Questions

You can add templates in three ways:

**Option 1: Pass template as a string (recommended for notebooks)**

In [5]:
# Note: In notebooks, use string-based templates instead of class definitions
# because inspect.getsource() cannot extract source from notebook cells
# For file-based code, you can pass Answer classes directly

# Template source kept as a plain string: one extracted field (`target`),
# ground truth "BCL2", and a case-insensitive comparison in verify().
template_code = """class Answer(BaseAnswer):
    target: str = Field(description="The protein target mentioned")

    def model_post_init(self, __context):
        self.correct = {"target": "BCL2"}

    def verify(self) -> bool:
        return self.target.strip().upper() == self.correct["target"].upper()
"""

# Attach the template while creating the question, and flag it as finished.
benchmark2 = Benchmark.create(name="Manual Template Example")
benchmark2.add_question(
    question="What is the approved drug target of Venetoclax?",
    raw_answer="BCL2",
    answer_template=template_code,
    finished=True,
)

print("✓ Question added with manual template (string-based for notebook compatibility)")

✓ Question added with manual template (string-based for notebook compatibility)


**Note about notebooks:**

In Jupyter notebooks, classes defined in cells cannot have their source code automatically extracted by `inspect.getsource()`. For notebook development, use string-based templates (as shown in Option 1 above). For production code in `.py` files, you can pass Answer classes directly.

**For file-based code:**

- Classes defined in `.py` files: `inspect.getsource()` captures the source code automatically
- For exec-created classes: Set `YourClassName._source_code` manually

**For notebooks:**

- Always use string-based templates with `answer_template=template_code`
- The system will validate and store the template code directly

**Option 2: Pass template code as a string**

In [6]:
# Template source as a string: extracts `target` and compares it with "BCL2"
# after stripping whitespace and upper-casing both sides.
template_code = """class Answer(BaseAnswer):
    target: str = Field(description="The protein target mentioned")

    def model_post_init(self, __context):
        self.correct = {"target": "BCL2"}

    def verify(self) -> bool:
        return self.target.strip().upper() == self.correct["target"].upper()
"""

# The template string is supplied together with the question itself.
benchmark3 = Benchmark.create(name="String Template Example")
benchmark3.add_question(
    question="What is the approved drug target of Venetoclax?",
    raw_answer="BCL2",
    answer_template=template_code,
    finished=True,  # Mark as ready for verification
)

print("✓ Question added with string template")

✓ Question added with string template


**Option 3: Add template to existing question**

In [7]:
# Create the question without a template; keep the value add_question returns
# so the template can be attached to that question afterwards.
benchmark4 = Benchmark.create(name="Add Template Later Example")
question_id = benchmark4.add_question(question="How many protein subunits does hemoglobin A have?", raw_answer="4")

# Later, add the template using add_answer_template
# The template extracts an integer `count` and checks it equals 4.
template_code = """class Answer(BaseAnswer):
    count: int = Field(description="The number of subunits mentioned")

    def model_post_init(self, __context):
        self.correct = {"count": 4}

    def verify(self) -> bool:
        return self.count == self.correct["count"]
"""

benchmark4.add_answer_template(question_id, template_code)

print("✓ Template added to existing question")

✓ Template added to existing question


### Complex Template Example

For more sophisticated evaluation, you can include multiple fields and custom logic:

In [8]:
# Multi-field template: verify() lower-cases and strips the extracted disease
# names, counts how many of the three expected inflammatory diseases appear,
# and passes when at least two are present.
# NOTE(review): verify() never reads `inflammatory_count` or the
# "non_inflammatory" ground-truth list, so listing non-inflammatory diseases
# does not cause a failure — confirm this is intended.
template_code = """class Answer(BaseAnswer):
    diseases: list[str] = Field(description="List of diseases mentioned in the response")
    inflammatory_count: int = Field(description="Number of inflammatory diseases identified")

    def model_post_init(self, __context):
        self.correct = {
            "inflammatory_diseases": ["asthma", "bronchitis", "pneumonia"],
            "non_inflammatory": ["emphysema", "pulmonary fibrosis"]
        }

    def verify(self) -> bool:
        # Check if the correct inflammatory diseases are identified
        identified = [d.lower().strip() for d in self.diseases]
        correct_identified = sum(1 for d in self.correct["inflammatory_diseases"]
                                if d in identified)

        # At least 2 out of 3 correct inflammatory diseases
        return correct_identified >= 2
"""

# Register the question with the template attached and flagged as finished.
benchmark5 = Benchmark.create(name="Complex Template Example")
benchmark5.add_question(
    question="Which of the following are inflammatory lung diseases: asthma, bronchitis, pneumonia, emphysema, pulmonary fibrosis?",
    raw_answer="asthma, bronchitis, pneumonia",
    answer_template=template_code,
    finished=True,
)

print("✓ Complex template example created")

✓ Complex template example created


## When to Use Which Approach

### Use Automatic Generation When:

- You have many questions to process
- Questions follow standard patterns (factual recall, numerical answers, multiple choice)
- You're prototyping or testing quickly

### Use Manual Creation When:

- You need very specific verification logic
- You want to implement tolerance ranges or fuzzy matching
- You're creating reusable template libraries
- Automatic generation doesn't produce the desired structure

## Complete Example

Here's a complete workflow showing automatic template generation:

In [9]:
from karenina import Benchmark

# ModelConfig is imported here as well so this example runs on its own,
# without relying on an import made by an earlier cell.
from karenina.schemas import ModelConfig, VerificationConfig

# 1. Create benchmark and add questions
benchmark = Benchmark.create(
    name="Genomics Knowledge Benchmark", description="Testing LLM knowledge of genomics", version="1.0.0"
)

# Add questions as (question text, reference answer) pairs
questions = [
    ("How many chromosomes are in a human somatic cell?", "46"),
    ("What is the approved drug target of Venetoclax?", "BCL2"),
    ("How many protein subunits does hemoglobin A have?", "4"),
]

for q, a in questions:
    benchmark.add_question(question=q, raw_answer=a, author={"name": "Bio Curator"})

print(f"✓ Added {len(questions)} questions to benchmark")

# 2. Generate templates automatically
print("Generating templates...")
results = benchmark.generate_all_templates(
    model="gpt-4.1-mini", model_provider="openai", temperature=0.1, interface="langchain"
)
print(f"✓ Generated {len(results)} templates")

# 3. Templates are now ready - proceed to verification
# The same model settings are used for both the answering and the parsing role.
config = VerificationConfig(
    answering_models=[
        ModelConfig(
            id="gpt-4.1-mini",
            model_provider="openai",
            model_name="gpt-4.1-mini",
            temperature=0.1,
            interface="langchain",
        )
    ],
    parsing_models=[
        ModelConfig(
            id="gpt-4.1-mini",
            model_provider="openai",
            model_name="gpt-4.1-mini",
            temperature=0.1,
            interface="langchain",
        )
    ],
)

# Note: In actual usage, this would call real LLM APIs
# For documentation purposes, we show the pattern
print("Configuration ready for verification")
print(f"  Answering models: {[m.id for m in config.answering_models]}")
print(f"  Parsing models: {[m.id for m in config.parsing_models]}")

# 4. Save benchmark into the notebook's temporary directory
save_path = temp_path("genomics_benchmark.jsonld")
benchmark.save(save_path)
print(f"✓ Benchmark saved to {save_path.name}")

✓ Added 3 questions to benchmark
Generating templates...
✓ Generated 3 templates
Configuration ready for verification
  Answering models: ['gpt-4.1-mini']
  Parsing models: ['gpt-4.1-mini']
✓ Benchmark saved to genomics_benchmark.jsonld


## Next Steps

Once you have templates set up for your questions, you can:

- [Create rubrics](rubrics.md) for qualitative assessment criteria
- [Run verification](verification.md) to evaluate LLM responses
- [Analyze results](verification.md#accessing-verification-results) to assess model performance
- [Save your benchmark](saving-loading.md) using checkpoints or a database

## Related Documentation

- [Adding Questions](adding-questions.md) - Populate your benchmark with questions
- [Rubrics](rubrics.md) - Assess qualitative aspects beyond factual correctness
- [Verification](verification.md) - Run evaluations with multiple models
- [Quick Start](../quickstart.md) - End-to-end workflow example