In [None]:
# Mock Setup - Hidden in rendered documentation
# This cell is tagged with "hide-cell" in notebook metadata
import copy
import hashlib
import json
import sys
import tempfile
from datetime import datetime
from pathlib import Path
from unittest.mock import MagicMock, patch

# Add karenina to path
# NOTE(review): this is an absolute path on one developer's machine. Where the
# directory does not exist the entry is ignored by the import system and the
# karenina imports below depend on an installed package instead - confirm.
sys.path.insert(0, "/Users/carli/Projects/karenina-monorepo/karenina/src")

# Temporary directory for file operations
# mkdtemp creates the directory immediately; _cleanup (registered with atexit
# at the end of this cell) removes it again.
TEMP_DIR = Path(tempfile.mkdtemp(prefix="karenina_docs_"))

# Import after path is set
from karenina.schemas.workflow.template_results import TemplateResults
from karenina.schemas.workflow.verification.result import VerificationResult
from karenina.schemas.workflow.verification.result_components import (
    VerificationResultMetadata,
    VerificationResultRubric,
    VerificationResultTemplate,
)
from karenina.schemas.workflow.verification_result_set import VerificationResultSet


# Mock LLM response generator
class MockLLMResponse:
    """Stand-in for the message object a mocked chat model returns.

    Attributes:
        content: The response text.
        response_metadata: Fixed metadata dict reporting 50 total tokens.
    """

    def __init__(self, content: str = "46 chromosomes"):
        token_usage = {"total_tokens": 50}
        self.content = content
        self.response_metadata = {"token_usage": token_usage}

    def __str__(self):
        """Render the message as its bare text content."""
        text = self.content
        return text


class MockStructuredOutput:
    """Canned structured-output object exposing ``dict()`` and ``model_dump()``.

    Four fields (``count``, ``target``, ``subunits``, ``diseases``) always
    exist, with defaults that keyword arguments may override. Any other
    keyword argument becomes an attribute too, unless an attribute of that
    name already resolves on the instance (for example the ``dict`` method).
    """

    def __init__(self, **kwargs):
        # Built on every call so each instance owns a fresh ``diseases`` list.
        fallbacks = {
            "count": 46,
            "target": "BCL2",
            "subunits": 4,
            "diseases": ["asthma", "bronchitis", "pneumonia"],
        }
        for field, fallback in fallbacks.items():
            setattr(self, field, kwargs.get(field, fallback))

        # Extra keyword arguments are attached only when the name is free.
        for field, value in kwargs.items():
            if hasattr(self, field):
                continue
            setattr(self, field, value)

    def dict(self):
        """Return the instance attributes whose names do not start with ``_``."""
        public = {}
        for name, value in vars(self).items():
            if name.startswith("_"):
                continue
            public[name] = value
        return public

    def model_dump(self):
        """Alias of :meth:`dict`."""
        return self.dict()


def create_mock_chat_model():
    """Create a mock chat model that returns predictable responses.

    The returned ``MagicMock`` behaves as follows:

    * ``invoke(...)`` returns a ``MockLLMResponse`` with "46 chromosomes".
    * ``ainvoke(...)`` is awaitable and resolves to the same kind of response.
    * ``with_structured_output(...)`` returns a second mock whose ``invoke`` /
      ``ainvoke`` yield a default ``MockStructuredOutput``.
    * ``bind_tools(...)`` returns the model itself.

    Returns:
        MagicMock: the configured mock chat model.
    """
    # Local import: the cell-level import only brings in MagicMock and patch.
    from unittest.mock import AsyncMock

    mock = MagicMock()
    mock.invoke.return_value = MockLLMResponse("46 chromosomes")
    # A plain MagicMock child would hand back the response object directly,
    # and ``await model.ainvoke(...)`` would then raise TypeError. AsyncMock
    # makes the call return an awaitable that resolves to the response.
    mock.ainvoke = AsyncMock(return_value=MockLLMResponse("46 chromosomes"))

    structured_mock = MagicMock()
    structured_mock.invoke.return_value = MockStructuredOutput()
    structured_mock.ainvoke = AsyncMock(return_value=MockStructuredOutput())

    mock.with_structured_output.return_value = structured_mock
    mock.bind_tools.return_value = mock
    return mock


def compute_result_id(question_id: str, answering_model: str, parsing_model: str, timestamp: str) -> str:
    """Derive a deterministic 16-hex-character ID from the run's identifying fields.

    The fields are serialised to JSON with sorted keys and ASCII-only output,
    hashed with SHA-256, and the first 16 characters of the hex digest are
    returned. ``answering_mcp_servers`` and ``replicate`` are always included
    as ``[]`` and ``None``.
    """
    payload = dict(
        question_id=question_id,
        answering_model=answering_model,
        parsing_model=parsing_model,
        timestamp=timestamp,
        answering_mcp_servers=[],
        replicate=None,
    )
    # sort_keys makes the serialisation independent of insertion order.
    canonical = json.dumps(payload, sort_keys=True, ensure_ascii=True)
    digest = hashlib.sha256(canonical.encode("utf-8")).hexdigest()
    return digest[:16]


def create_mock_verification_result(
    question_id: str,
    question_text: str,
    answer: str,
    answering_model: str = "gpt-4.1-mini",
    parsing_model: str = "gpt-4.1-mini",
    passed: bool = True,
):
    """Assemble a canned ``VerificationResult`` for one question.

    The result is stamped with the current local time. Its template section
    reports ``passed`` as the verification outcome, with the parsed and
    ground-truth payloads both set to ``answer``. Its rubric section always
    carries the same two trait scores.

    Args:
        question_id: Identifier of the question; also hashed into ``template_id``.
        question_text: Text stored on the result metadata.
        answer: Value used for the raw answer and both parsed payloads.
        answering_model: Model name recorded as the answering model.
        parsing_model: Model name recorded as the parsing model.
        passed: Value stored as ``verify_result``.

    Returns:
        A ``VerificationResult`` built from metadata, template and rubric parts.
    """
    created_at = datetime.now().isoformat()
    # An MD5 hex digest is exactly 32 characters long, so no slicing is needed.
    question_hash = hashlib.md5(str(question_id).encode()).hexdigest()

    # Template section: fixed token usage, abstention checked but not detected.
    token_usage = {
        "answer_generation": {"total_tokens": 50},
        "parsing": {"total_tokens": 30},
        "total": {"total_tokens": 80},
    }
    template_part = VerificationResultTemplate(
        raw_llm_response=f"The answer is {answer}.",
        parsed_llm_response={"value": answer},
        parsed_gt_response={"value": answer},
        verify_result=passed,
        template_verification_performed=True,
        usage_metadata=token_usage,
        abstention_check_performed=True,
        abstention_detected=False,
    )

    # Rubric section: one score trait and one boolean trait.
    trait_scores = {"Conciseness": 4, "Clarity": True}
    rubric_part = VerificationResultRubric(
        rubric_evaluation_performed=True,
        llm_trait_scores=trait_scores,
    )

    # Metadata section, including the deterministic result ID.
    result_id = compute_result_id(question_id, answering_model, parsing_model, created_at)
    metadata_part = VerificationResultMetadata(
        question_id=question_id,
        template_id=question_hash,
        completed_without_errors=True,
        question_text=question_text,
        raw_answer=answer,
        answering_model=answering_model,
        parsing_model=parsing_model,
        execution_time=1.5,
        timestamp=created_at,
        result_id=result_id,
    )

    return VerificationResult(metadata=metadata_part, template=template_part, rubric=rubric_part)


# Store original run_verification
# Filled in further down in this cell, just before Benchmark.run_verification
# is replaced by mock_run_verification.
_original_run_verification = None


def mock_run_verification(self, config, question_ids=None, progress_callback=None):
    """Mock run_verification that returns realistic results.

    Builds one canned result per finished question, optionally restricted to
    ``question_ids``. The answer is chosen by keyword match on the question
    text and falls back to the question's own ``raw_answer``. When ``config``
    lists more than one answering model, every result is duplicated once per
    model with the model ID and result ID rewritten.

    With a single answering model the results keep the default model names of
    ``create_mock_verification_result``; the names in ``config`` are not read.

    Args:
        self: The benchmark instance; must provide ``get_finished_questions``.
        config: Verification config; only ``answering_models`` is inspected.
        question_ids: Optional container of question IDs to keep.
        progress_callback: Unused by the mock itself; forwarded to the real
            implementation when the call is delegated.

    Returns:
        A ``VerificationResultSet``, or whatever the original
        ``run_verification`` returns when there are no finished questions
        and an original implementation has been stored.
    """
    # Get all finished questions
    finished = self.get_finished_questions(ids_only=False)

    # Filter by question_ids if provided
    if question_ids is not None:
        finished = [q for q in finished if q["id"] in question_ids]

    if not finished:
        if _original_run_verification:
            # Forward the caller's selection instead of dropping it, so that
            # delegating does not widen the run to every question. Only
            # arguments that were actually supplied are passed on.
            passthrough = {}
            if question_ids is not None:
                passthrough["question_ids"] = question_ids
            if progress_callback is not None:
                passthrough["progress_callback"] = progress_callback
            return _original_run_verification(self, config, **passthrough)
        return VerificationResultSet(results=[], template_results=TemplateResults(results=[]))

    results = []

    # Map question keywords to expected answers
    mock_data = [
        {"keywords": ["chromosomes"], "answer": "46", "passed": True},
        {"keywords": ["venetoclax", "bcl2"], "answer": "BCL2", "passed": True},
        {"keywords": ["hemoglobin", "subunits"], "answer": "4", "passed": True},
        {"keywords": ["inflammatory", "lung"], "answer": "asthma, bronchitis, pneumonia", "passed": True},
    ]

    for question in finished:
        q_id = question["id"]
        q_text = question["question"]
        q_text_lower = q_text.lower()
        # Default when no keyword matches: echo the stored raw answer as a pass.
        mock_ans = question.get("raw_answer", "")
        passed = True
        for data in mock_data:
            # First matching entry wins.
            if any(kw in q_text_lower for kw in data["keywords"]):
                passed = data["passed"]
                mock_ans = data["answer"]
                break
        results.append(
            create_mock_verification_result(question_id=q_id, question_text=q_text, answer=mock_ans, passed=passed)
        )

    # Handle multiple models in config
    if hasattr(config, "answering_models") and len(config.answering_models) > 1:
        model_results = []
        for model_config in config.answering_models:
            for result in results:
                # Deep copy so each model gets an independent result object.
                result_copy = copy.deepcopy(result)
                result_copy.metadata.answering_model = model_config.id
                # The result ID depends on the answering model, so recompute it.
                result_copy.metadata.result_id = compute_result_id(
                    result.metadata.question_id,
                    model_config.id,
                    result.metadata.parsing_model,
                    result.metadata.timestamp,
                )
                model_results.append(result_copy)
        results = model_results

    template_results = TemplateResults(results=results)
    return VerificationResultSet(
        results=results,
        template_results=template_results,
        rubric_results=None,
    )


# Patch all LLM providers before any imports
# NOTE(review): the karenina.schemas imports near the top of this cell run
# before these patches are started; only the Benchmark import below happens
# afterwards.
# Each patch swaps its target for a mock whose side_effect builds a fresh mock
# chat model per call. The lambdas accept keyword arguments only, so calling a
# patched target with positional arguments raises TypeError.
_llm_patches = [
    patch("langchain_openai.ChatOpenAI", side_effect=lambda **kwargs: create_mock_chat_model()),
    patch("langchain_anthropic.ChatAnthropic", side_effect=lambda **kwargs: create_mock_chat_model()),
    patch("langchain_google_genai.ChatGoogleGenerativeAI", side_effect=lambda **kwargs: create_mock_chat_model()),
    patch(
        "karenina.infrastructure.llm.interface.init_chat_model_unified",
        side_effect=lambda **kwargs: create_mock_chat_model(),
    ),
]
# start() resolves each target by importing its module, so all four provider
# packages must be importable here.
for p in _llm_patches:
    p.start()

# Patch Benchmark.run_verification
from karenina.benchmark import Benchmark

# Keep the real method so the mock can delegate to it and _cleanup can restore it.
_original_run_verification = Benchmark.run_verification
Benchmark.run_verification = mock_run_verification


def temp_path(filename: str) -> Path:
    """Return ``filename`` joined onto the notebook's temporary directory."""
    return TEMP_DIR.joinpath(filename)


# Cleanup
import atexit
import shutil


def _cleanup():
    """Undo the mock setup.

    Restores the original ``Benchmark.run_verification``, stops every LLM
    patch and deletes the temporary directory. Teardown is best-effort: a
    patch that fails to stop does not prevent the remaining steps.
    """
    Benchmark.run_verification = _original_run_verification
    for patcher in _llm_patches:
        try:
            patcher.stop()
        except Exception:
            # Deliberately ignored so one failing patch cannot block the rest
            # of the teardown. Unlike a bare ``except``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            pass
    shutil.rmtree(TEMP_DIR, ignore_errors=True)


# Run _cleanup when the interpreter exits.
atexit.register(_cleanup)

# Status output for whoever executes the hidden setup cell.
print("✓ Mock setup complete")
print(f"✓ Temp directory: {TEMP_DIR}")
# NOTE(review): the path below is a fixed string, not derived from the imported
# package, so it may not match where karenina was actually loaded from.
print("✓ Karenina package loaded from: /Users/carli/Projects/karenina-monorepo/karenina/src")
print("✓ Mock verification results enabled - examples will show realistic output")

# Running Verification

This guide covers how to configure and execute verification to evaluate LLM responses using your benchmark questions, templates, and rubrics.

!!! tip "Command-Line Interface Available"
    Prefer working from the terminal? Karenina provides a comprehensive CLI for running verifications without writing Python code. See **[CLI Verification](cli-verification.md)** for details on command-line usage, presets, and automation.

**Quick Navigation:**

- [Understanding Verification](#understanding-verification) - Core concepts and workflow
- [Basic Configuration](#basic-verification-configuration) - Setting up VerificationConfig
- [Running Verification](#running-verification) - Execute verification and select questions
- [Multi-Model Support](#multi-model-support) - Test multiple models simultaneously
- [Replication](#replication-for-statistical-analysis) - Statistical significance through repeated runs
- [Evaluation Modes](#evaluation-modes) - Template-only, rubric-only, or combined
- [Advanced Options](#advanced-configuration-options) - Abstention, deep judgment, system prompts
- [LLM Interfaces](#using-different-llm-interfaces) - LangChain, OpenRouter, local models, manual
- [Results](#accessing-verification-results) - Access and analyze verification data
- [Automatic Database Storage](#automatic-database-storage) - Auto-save results to database
- [Progress Tracking](#progress-tracking) - Monitor real-time verification progress
- [Answer Caching](#answer-caching) - Automatic efficiency optimization
- [Complete Example](#complete-example) - End-to-end verification workflow

---

## Understanding Verification

Verification in Karenina evaluates LLM responses through a structured workflow:

1. **Question Selection**: Choose which questions to evaluate
2. **Answer Generation**: LLMs generate responses to questions
3. **Answer Parsing**: Judge LLMs extract structured data using templates
4. **Template Verification**: Check if extracted data matches expected answers
5. **Rubric Evaluation**: Assess qualitative traits (if enabled)
6. **Result Aggregation**: Collect metrics and scores for analysis

This two-model approach (answering model + judge model) allows natural language responses while maintaining structured evaluation.

### Verification Workflow

```
Questions → Answering Models → Raw Responses → Judge Models → Parsed Data → Verification
                                                                                    ↓
                                                                              Results with
                                                                              Scores & Metrics
```

**Key Concepts:**

- **Answering Models**: Generate responses to questions (can be any LLM)
- **Parsing Models** (Judges): Extract structured data from responses using templates
- **Templates**: Pydantic classes defining expected answer structure
- **Rubrics**: Qualitative evaluation criteria (optional)
- **Replication**: Run same question multiple times for statistical significance

---

## Basic Verification Configuration

### Configure Verification

Use `VerificationConfig` to specify how verification runs:

In [None]:
from karenina.schemas import ModelConfig, VerificationConfig

# Configure verification
# One answering model and one parsing (judge) model, template-only evaluation,
# one run per question. Later cells reuse this `config` object.
config = VerificationConfig(
    # Models that generate answers
    answering_models=[
        ModelConfig(
            id="gpt-4.1-mini",
            model_provider="openai",
            model_name="gpt-4.1-mini",
            temperature=0.7,
            interface="langchain",
        )
    ],
    # Models that parse/judge answers
    parsing_models=[
        ModelConfig(
            id="gpt-4.1-mini-judge",
            model_provider="openai",
            model_name="gpt-4.1-mini",
            temperature=0.0,  # Deterministic parsing
            interface="langchain",
        )
    ],
    # Evaluation settings
    evaluation_mode="template_only",  # or "template_and_rubric", "rubric_only"
    rubric_enabled=False,
    replicate_count=1,
)

# Echo the settings back from the config object itself.
print("VerificationConfig created with:")
print(f"  - {len(config.answering_models)} answering model(s)")
print(f"  - {len(config.parsing_models)} parsing model(s)")
print(f"  - Evaluation mode: {config.evaluation_mode}")
print(f"  - Rubric enabled: {config.rubric_enabled}")
print(f"  - Replicate count: {config.replicate_count}")

For a comprehensive guide to `ModelConfig` including all parameters, interfaces, providers, and the `extra_kwargs` feature, see the **[Model Configuration Guide](model-configuration.md)**.

---

## Running Verification

### Basic Verification

Once you have templates and optionally rubrics, run verification:

In [None]:
# First, create a simple benchmark with questions for demonstration
from karenina import Benchmark

benchmark = Benchmark.create(
    name="Verification Demo", description="Demo benchmark for verification examples", version="1.0.0"
)

# Add some questions
# Each entry is a (question text, expected raw answer) pair.
questions = [
    ("How many chromosomes are in a human somatic cell?", "46"),
    ("What is the approved drug target of Venetoclax?", "BCL2"),
    ("How many protein subunits does hemoglobin A have?", "4"),
]

# Collect whatever add_question returns for each question
# (presumably the new question's ID - confirm against the Benchmark API).
question_ids = []
for q, a in questions:
    qid = benchmark.add_question(question=q, raw_answer=a)
    question_ids.append(qid)

# Generate templates (using mock setup)
benchmark.generate_all_templates(model="gpt-4.1-mini", model_provider="openai", temperature=0.1, interface="langchain")

print(f"Created benchmark with {len(question_ids)} questions")

# Now run verification
# `config` is the VerificationConfig built in the previous code cell.
results = benchmark.run_verification(config)

print(f"Verification complete: {len(results.results)} results generated")

### Verify Specific Questions

Verify only a subset of questions by providing question IDs:

In [None]:
# Get question IDs for specific category (e.g., chromosome-related)
# Selection is a case-insensitive substring match on the question text.
genomics_question_ids = [
    qid for qid in benchmark.get_question_ids() if "chromosome" in benchmark.get_question(qid)["question"].lower()
]

print(f"Found {len(genomics_question_ids)} chromosome-related questions")

# Run verification on subset
results_subset = benchmark.run_verification(config=config, question_ids=genomics_question_ids)

print(f"Verified {len(results_subset.results)} genomics questions")

---

## Multi-Model Support

Karenina supports testing multiple models simultaneously and using different models for answering vs judging.

### Test Multiple Answering Models

Compare performance across different LLMs:

In [None]:
# Multiple answering models to compare
config_multi = VerificationConfig(
    answering_models=[
        ModelConfig(
            id="gpt-4.1-mini",
            model_provider="openai",
            model_name="gpt-4.1-mini",
            temperature=0.7,
            interface="langchain",
        ),
        ModelConfig(
            id="claude-sonnet",
            model_provider="anthropic",
            model_name="claude-sonnet-4.5",
            temperature=0.7,
            interface="langchain",
        ),
        ModelConfig(
            id="gemini-pro",
            model_provider="google",
            model_name="gemini-2.5-flash",
            temperature=0.7,
            interface="langchain",
        ),
    ],
    # Single judge model for consistent evaluation
    parsing_models=[
        ModelConfig(
            id="gpt-4.1-mini-judge",
            model_provider="openai",
            model_name="gpt-4.1-mini",
            temperature=0.0,
            interface="langchain",
        )
    ],
)

# This will generate 3 results per question (one per answering model)
results_multi = benchmark.run_verification(config_multi)

print(f"Multi-model verification complete: {len(results_multi.results)} results")
print("Results per answering model:")
from collections import defaultdict

# Bucket results by the answering model recorded on each result's metadata.
results_by_model = defaultdict(list)
for r in results_multi.results:
    results_by_model[r.metadata.answering_model].append(r)
for model, model_results in results_by_model.items():
    print(f"  - {model}: {len(model_results)} results")

### Different Answering and Judge Models

Use a more capable model for judging:

In [None]:
# Fast model for generating answers, more capable model for judging
# This cell only builds the config; it does not run verification.
config_judge = VerificationConfig(
    answering_models=[
        ModelConfig(
            id="gpt-4.1-mini",
            model_provider="openai",
            model_name="gpt-4.1-mini",
            temperature=0.7,
            interface="langchain",
        )
    ],
    # More capable model for judging
    parsing_models=[
        ModelConfig(
            id="gpt-4.1-large-judge",
            model_provider="openai",
            model_name="gpt-4.1",
            temperature=0.0,
            interface="langchain",
        )
    ],
)

print("Config created: fast answering model + capable judge model")

### Multiple Judge Models

Compare how different judges evaluate the same answers:

In [None]:
# Single answering model, multiple judges for comparison
# This cell only builds the config; it does not run verification.
config_multi_judge = VerificationConfig(
    answering_models=[
        ModelConfig(
            id="gpt-4.1-mini",
            model_provider="openai",
            model_name="gpt-4.1-mini",
            temperature=0.7,
            interface="langchain",
        )
    ],
    # Multiple judges for comparison
    parsing_models=[
        ModelConfig(
            id="gpt-judge", model_provider="openai", model_name="gpt-4.1-mini", temperature=0.0, interface="langchain"
        ),
        ModelConfig(
            id="claude-judge",
            model_provider="anthropic",
            model_name="claude-sonnet-4.5",
            temperature=0.0,
            interface="langchain",
        ),
    ],
)

print("Config created: single answering model + multiple judges")
print("Answer caching will ensure the same answer is evaluated by all judges")

---

## Replication for Statistical Analysis

Run the same question multiple times to assess model consistency and compute statistical metrics.

### Configure Replication

In [None]:
# Same models as the basic config, but with five replicates per question.
# This cell only builds the config; it does not run verification.
config_replicate = VerificationConfig(
    answering_models=[
        ModelConfig(
            id="gpt-4.1-mini",
            model_provider="openai",
            model_name="gpt-4.1-mini",
            temperature=0.7,
            interface="langchain",
        )
    ],
    parsing_models=[
        ModelConfig(
            id="gpt-4.1-mini-judge",
            model_provider="openai",
            model_name="gpt-4.1-mini",
            temperature=0.0,
            interface="langchain",
        )
    ],
    replicate_count=5,  # Run each question 5 times
)

print(f"Replication configured: each question will run {config_replicate.replicate_count} times")
print("This enables statistical analysis of model consistency")

### Analyze Replication Results

**Recommended: Use DataFrames** (see [DataFrame Quick Reference](dataframe-quick-reference.md)):

In [None]:
# Get DataFrame and group by question
# Note: Use get_template_results() for template verification data
# NOTE(review): `results` comes from the basic run earlier in this notebook
# (config with replicate_count=1), not from a run with config_replicate.
template_results = results.get_template_results()
df = template_results.to_dataframe()

# Calculate pass rate per question
# Nothing is printed if either column is missing from the DataFrame.
if "question_id" in df.columns and "field_match" in df.columns:
    pass_rates = df.groupby("question_id")["field_match"].mean()
    print("Pass Rates by Question:")
    for qid, rate in pass_rates.items():
        q = benchmark.get_question(qid)
        # Question text is truncated to 50 characters for display.
        print(f"  {q['question'][:50]}...: {rate:.1%}")

**Alternative: Group raw results manually:**

In [None]:
from collections import defaultdict

# Group results by question
# NOTE(review): this cell reads `result.question_id` and `r.verify_result`
# directly on the result, while other cells use nested access
# (`r.metadata.answering_model`, `first_result.rubric...`). Confirm that
# VerificationResult exposes these flat attributes.
results_by_question = defaultdict(list)
for result in results.results:
    results_by_question[result.question_id].append(result)

# Compute pass rate for each question
for question_id, question_results in results_by_question.items():
    question = benchmark.get_question(question_id)
    pass_count = sum(1 for r in question_results if r.verify_result)
    # Every group holds at least one result, so total is never zero here.
    total = len(question_results)
    pass_rate = pass_count / total

    print(f"{question['question'][:50]}...")
    print(f"  Pass Rate: {pass_rate:.1%} ({pass_count}/{total})")

    # Check consistency
    if pass_rate == 1.0:
        print("  ✓ Consistent: Always correct")
    elif pass_rate == 0.0:
        print("  ✗ Consistent: Always incorrect")
    else:
        print(f"  ⚠ Inconsistent: {pass_rate:.1%} accuracy")
    print()

**Use Cases:**

- **Model Reliability**: Assess how consistently a model answers correctly
- **Statistical Significance**: Run k replicates for robust metrics
- **Temperature Effects**: Compare variance at different temperature settings

---

## Evaluation Modes

Karenina supports three evaluation modes that control what gets evaluated during verification.

### Mode 1: template_only (Default)

Evaluate responses against templates only. Fast and focused on factual correctness.

In [None]:
# Template-only evaluation: rubric evaluation is switched off.
config_template_only = VerificationConfig(
    answering_models=[
        ModelConfig(
            id="gpt-4.1-mini",
            model_provider="openai",
            model_name="gpt-4.1-mini",
            temperature=0.7,
            interface="langchain",
        )
    ],
    parsing_models=[
        ModelConfig(
            id="gpt-4.1-mini-judge",
            model_provider="openai",
            model_name="gpt-4.1-mini",
            temperature=0.0,
            interface="langchain",
        )
    ],
    evaluation_mode="template_only",
    rubric_enabled=False,  # Must be False
)

results_template = benchmark.run_verification(config_template_only)

# The lines below are fixed strings describing the expected fields; they are
# not read from results_template.
print("Template-only verification results:")
print("  - template_verification_performed = True")
print("  - verify_result = True/False (template pass/fail)")
print("  - rubric_evaluation_performed = False")

**When to use:**

- Testing template parsing accuracy
- Fastest verification (no rubric overhead)
- Focus on structured output correctness

### Mode 2: template_and_rubric

Evaluate both template correctness AND rubric criteria. Comprehensive evaluation.

In [None]:
from karenina.schemas import LLMRubricTrait, Rubric

# Add a global rubric first
# Two LLM-judged traits: a scored trait and a boolean trait.
global_rubric = Rubric(
    llm_traits=[
        LLMRubricTrait(
            name="Conciseness", description="Rate how concise the answer is on a scale of 1-5", kind="score"
        ),
        LLMRubricTrait(name="Clarity", description="Is the answer clear and easy to understand?", kind="boolean"),
    ]
)
benchmark.set_global_rubric(global_rubric)

config_both = VerificationConfig(
    answering_models=[
        ModelConfig(
            id="gpt-4.1-mini",
            model_provider="openai",
            model_name="gpt-4.1-mini",
            temperature=0.7,
            interface="langchain",
        )
    ],
    parsing_models=[
        ModelConfig(
            id="gpt-4.1-mini-judge",
            model_provider="openai",
            model_name="gpt-4.1-mini",
            temperature=0.0,
            interface="langchain",
        )
    ],
    evaluation_mode="template_and_rubric",
    rubric_enabled=True,  # Must be True
)

results_both = benchmark.run_verification(config_both)

# The next four lines are fixed strings, not values read from results_both.
print("Template and rubric verification results:")
print("  - template_verification_performed = True")
print("  - verify_result = True/False (template pass/fail)")
print("  - rubric_evaluation_performed = True")

# Show rubric scores if available
if results_both.results:
    first_result = results_both.results[0]
    if first_result.rubric and first_result.rubric.llm_trait_scores:
        print(f"\nSample rubric scores: {first_result.rubric.llm_trait_scores}")

**When to use:**

- Production benchmarking with full metrics
- Evaluate both correctness (template) and quality (rubric)
- Comprehensive model assessment

### Mode 3: rubric_only

Evaluate rubric criteria only, skip template verification. Useful for qualitative assessment.

In [None]:
# Rubric-only evaluation: template verification is skipped.
config_rubric_only = VerificationConfig(
    answering_models=[
        ModelConfig(
            id="gpt-4.1-mini",
            model_provider="openai",
            model_name="gpt-4.1-mini",
            temperature=0.7,
            interface="langchain",
        )
    ],
    parsing_models=[
        ModelConfig(
            id="gpt-4.1-mini-judge",
            model_provider="openai",
            model_name="gpt-4.1-mini",
            temperature=0.0,
            interface="langchain",
        )
    ],
    evaluation_mode="rubric_only",
    rubric_enabled=True,  # Must be True
)

results_rubric = benchmark.run_verification(config_rubric_only)

# The lines below are fixed strings describing the expected fields; they are
# not read from results_rubric.
# NOTE(review): under the notebook's mock setup, results are built with
# template_verification_performed=True, so the actual objects will not match
# these statements - confirm this is acceptable for the rendered docs.
print("Rubric-only verification results:")
print("  - template_verification_performed = False")
print("  - verify_result = None")
print("  - rubric_evaluation_performed = True")

**When to use:**

- Qualitative evaluation without structured output requirements
- Rubric development and tuning
- Open-ended response evaluation
- Focus on content quality over format

---

## Advanced Configuration Options

### Enable Abstention Detection

Detect when models refuse to answer questions:

In [None]:
# Basic single-model config with abstention checking switched on.
# This cell only builds the config; it does not run verification.
config_abstention = VerificationConfig(
    answering_models=[
        ModelConfig(
            id="gpt-4.1-mini",
            model_provider="openai",
            model_name="gpt-4.1-mini",
            temperature=0.7,
            interface="langchain",
        )
    ],
    parsing_models=[
        ModelConfig(
            id="gpt-4.1-mini-judge",
            model_provider="openai",
            model_name="gpt-4.1-mini",
            temperature=0.0,
            interface="langchain",
        )
    ],
    abstention_check_enabled=True,  # Detect refusals
)

print("Abstention detection enabled")
print("This will detect when models refuse to answer questions")

### Enable Deep Judgment

Extract detailed feedback with verbatim excerpts and reasoning:

In [None]:
# Basic single-model config with deep judgment switched on.
# This cell only builds the config; it does not run verification.
config_deep = VerificationConfig(
    answering_models=[
        ModelConfig(
            id="gpt-4.1-mini",
            model_provider="openai",
            model_name="gpt-4.1-mini",
            temperature=0.7,
            interface="langchain",
        )
    ],
    parsing_models=[
        ModelConfig(
            id="gpt-4.1-mini-judge",
            model_provider="openai",
            model_name="gpt-4.1-mini",
            temperature=0.0,
            interface="langchain",
        )
    ],
    deep_judgment_enabled=True,
    # Presumably caps the excerpts kept per template attribute and sets the
    # similarity needed for an excerpt to count as a match - see the Deep
    # Judgment documentation for the exact semantics.
    deep_judgment_max_excerpts_per_attribute=3,
    deep_judgment_fuzzy_match_threshold=0.80,
)

print("Deep judgment enabled")
print("This will extract detailed feedback with verbatim excerpts")

See [Deep Judgment documentation](../advanced/deep-judgment.md) for a comprehensive guide.

### Add System Prompts

Customize model behavior with system prompts:

In [None]:
# Answering model with domain expertise
answering_model = ModelConfig(
    id="gpt-genomics",
    model_provider="openai",
    model_name="gpt-4.1-mini",
    temperature=0.7,
    interface="langchain",
    system_prompt="You are an expert in genomics and molecular biology. Answer concisely with precise scientific terminology.",
)

# Judge model with strict evaluation
judge_model = ModelConfig(
    id="gpt-judge-strict",
    model_provider="openai",
    model_name="gpt-4.1-mini",
    temperature=0.0,
    interface="langchain",
    system_prompt="You are a strict evaluator. Parse responses carefully and extract only explicitly stated information.",
)

# Combine both models into one config; verification is not run in this cell.
config_prompts = VerificationConfig(answering_models=[answering_model], parsing_models=[judge_model])

print("System prompts configured:")
print("  - Answering: Expert genomics terminology")
print("  - Judge: Strict evaluation")

### Configure Temperature

Control randomness and creativity:

In [None]:
# High temperature: More creative, less consistent
creative_model = ModelConfig(
    id="gpt-creative",
    model_provider="openai",
    model_name="gpt-4.1-mini",
    temperature=0.9,  # More randomness
    interface="langchain",
)

# Zero temperature: Deterministic, consistent
deterministic_model = ModelConfig(
    id="gpt-deterministic",
    model_provider="openai",
    model_name="gpt-4.1-mini",
    temperature=0.0,  # No randomness
    interface="langchain",
)

# Both models are only constructed here; neither is used to run verification.
print("Temperature examples configured")
print("  - Creative: 0.9 (more random)")
print("  - Deterministic: 0.0 (no randomness)")

**Temperature Guidelines:**

- **0.0**: Deterministic, always returns same answer (best for factual questions)
- **0.3-0.5**: Slight variation, mostly consistent (good balance)
- **0.7-0.9**: Creative, more diverse responses (good for open-ended questions)
- **1.0+**: Very random, unpredictable (rarely useful for benchmarking)

---

## Using Different LLM Interfaces

Karenina supports four interface types for connecting to LLM providers:

1. **`langchain`** - Default interface for major cloud providers (OpenAI, Anthropic, Google)
2. **`openrouter`** - Unified access to 200+ models through OpenRouter API
3. **`openai_endpoint`** - Custom OpenAI-compatible endpoints (Ollama, vLLM, local models)
4. **`manual`** - Pre-recorded traces for testing without API calls

**Quick Example:**

In [None]:
# Cloud provider via LangChain
cloud_model = ModelConfig(
    id="gpt-4.1-mini", model_provider="openai", model_name="gpt-4.1-mini", interface="langchain", temperature=0.0
)

# Local model via custom endpoint
# Example configuration (not executed):
# local_model = ModelConfig(
#     id="llama-local",
#     model_name="llama3.1:70b",
#     interface="openai_endpoint",
#     endpoint_base_url="http://localhost:11434/v1",
#     endpoint_api_key="ollama",
#     temperature=0.0
# )

# Only cloud_model exists at runtime; the local example is described by a
# fixed string because its configuration above is commented out.
print("Model interface examples:")
print(f"  - Cloud: {cloud_model.interface} ({cloud_model.model_provider})")
print("  - Local: openai_endpoint (Ollama example shown in comments)")

For comprehensive documentation on all four interfaces, including:

- Detailed configuration examples
- Environment variable setup
- Provider-specific options
- The `extra_kwargs` feature for advanced configuration
- MCP tool integration
- System prompts and custom parameters

See the **[Model Configuration Guide](model-configuration.md)**.

---

## Accessing Verification Results

!!! tip "Recommended: Use DataFrames for Result Analysis"
    For easier and more flexible result analysis, we recommend using the **DataFrame-first approach**:

    - **[Analyzing Results with DataFrames](analyzing-results-dataframes.md)** - Comprehensive guide with 40+ examples
    - **[DataFrame Quick Reference](dataframe-quick-reference.md)** - Cheat sheet for common operations

    The DataFrame approach provides pandas-based analysis with:

    - Standard pandas operations (groupby, filter, pivot)
    - Helper methods for common aggregations
    - Easy export to CSV, Excel, JSON
    - Integration with visualization libraries

    **Quick example:**

    ```python
    # After running verification
    result_set = benchmark.run_verification(config)

    # Convert to DataFrame for analysis
    template_results = result_set.get_template_results()
    df = template_results.to_dataframe()

    # Analyze with pandas
    pass_rates = df.groupby('question_id')['field_match'].mean()
    ```

    The sections below show how to access raw VerificationResult objects if you need them.

### Result Structure

The `run_verification()` method returns a `VerificationResultSet` object that provides multiple ways to access results:

In [None]:
# Run verification
# Uses the basic `config` defined in the first configuration cell.
result_set = benchmark.run_verification(config)

# Method 1: Use DataFrame API (RECOMMENDED)
template_results = result_set.get_template_results()
df = template_results.to_dataframe()
print(f"DataFrame shape: {df.shape}")

# Method 2: Access typed result wrappers
rubric_results = result_set.get_rubrics_results()  # For rubric data
judgment_results = result_set.get_judgment_results()  # For deep judgment data

print(f"Template results: {len(template_results.results)}")
# These two lines only report whether each wrapper is non-None.
print(f"Rubric results available: {rubric_results is not None}")
print(f"Judgment results available: {judgment_results is not None}")

### Access Raw Results

For detailed access to individual result properties:

In [None]:
# Method 3: iterate the raw VerificationResult list (only the first entry is printed)
for result in result_set.results:
    # Which question this is, and which models answered and parsed it
    print(f"Question ID: {result.question_id}")
    print(f"Answering Model: {result.answering_model}")
    print(f"Parsing Model: {result.parsing_model}")

    # Unparsed model output
    print(f"Raw Answer: {result.raw_llm_response}")

    # Template verification outcome and the parsed form of the answer
    print(f"Template Passed: {result.verify_result}")
    print(f"Parsed Response: {result.parsed_llm_response}")

    # Rubric scores are shown only when rubric evaluation ran and produced a non-empty result
    rubric_scores = result.verify_rubric if result.rubric_evaluation_performed else None
    if rubric_scores:
        print(f"Rubric Scores: {rubric_scores}")

    # Abstention flag (if enabled)
    if result.abstention_detected:
        print("Abstention: Model refused to answer")

    print("-" * 50)
    break  # stop after the first result

### Filter Results

**Recommended: Use DataFrames for filtering** (see [DataFrame Quick Reference](dataframe-quick-reference.md#common-filters)):

In [None]:
# Flatten the template results into a DataFrame
df = result_set.get_template_results().to_dataframe()

# Boolean-mask filtering; each filter is skipped when its column is absent
if "field_match" in df.columns:
    is_passing = df["field_match"].eq(True)
    passing = df.loc[is_passing]
    print(f"Passing results: {len(passing)}")

if "answering_model" in df.columns:
    from_gpt = df["answering_model"].eq("gpt-4.1-mini")
    gpt_results = df.loc[from_gpt]
    print(f"GPT results: {len(gpt_results)}")

**Alternative: Filter raw result list:**

In [None]:
all_results = result_set.results

# Keep only the results whose template verification is truthy
passing_results = [res for res in all_results if res.verify_result]
print(f"Passing: {len(passing_results)}/{len(all_results)}")

# Keep only the results produced by one answering model
target_model = "gpt-4.1-mini"
gpt_results = [res for res in all_results if res.answering_model == target_model]
print(f"GPT results: {len(gpt_results)}")

### Compute Aggregate Metrics

**Recommended: Use DataFrame helper methods** (see [Analyzing Results with DataFrames](analyzing-results-dataframes.md)):

In [None]:
# Template metrics via the aggregation helper
template_results = result_set.get_template_results()
pass_rates = template_results.aggregate_pass_rate(by="question_id")
print(f"Pass Rates: {pass_rates}")

# Or use pandas directly
df = template_results.to_dataframe()
if "completed_without_errors" in df.columns and "field_match" in df.columns:
    # Score only the runs that finished without errors
    successful = df[df["completed_without_errors"] == True]

    # Drop missing outcomes up front: mean() over an empty or all-missing selection
    # is NaN, which would otherwise be printed as "nan%"
    scored = successful["field_match"].dropna()
    if scored.empty:
        print("Overall Accuracy: n/a (no completed runs with a field_match value)")
    else:
        overall_accuracy = scored.mean()
        print(f"Overall Accuracy: {overall_accuracy:.1%}")

---

## Automatic Database Storage

Karenina can automatically save verification results to a database as they are generated. This is especially useful for production deployments and long-running verification jobs.

### Configure Automatic Storage

In [None]:
import tempfile
from pathlib import Path

# Reuse the scratch directory if an earlier cell already defined it; otherwise create one
if "TEMP_DIR" not in globals():
    TEMP_DIR = Path(tempfile.mkdtemp(prefix="karenina_docs_"))


def temp_path(filename: str) -> Path:
    """Return the path of *filename* inside the ``TEMP_DIR`` scratch directory."""
    return TEMP_DIR.joinpath(filename)


from karenina.storage import DBConfig

# Point the storage layer at a SQLite file inside the scratch directory
db_file = temp_path("benchmarks.db")
db_config = DBConfig(
    storage_url=f"sqlite:///{db_file}",
    auto_create=True,  # Create tables if they don't exist
)

print(f"Database configured: {db_file}")
print("Results will be automatically saved during verification")

### Configure Verification with Database

In [None]:
# Answering and parsing roles use the same OpenAI model; only the config ids differ
db_answering_model = ModelConfig(
    id="gpt-4.1-mini",
    model_provider="openai",
    model_name="gpt-4.1-mini",
    interface="langchain",
)
db_parsing_model = ModelConfig(
    id="gpt-4.1-mini-judge",
    model_provider="openai",
    model_name="gpt-4.1-mini",
    interface="langchain",
)

config_db = VerificationConfig(
    answering_models=[db_answering_model],
    parsing_models=[db_parsing_model],
    evaluation_mode="template_and_rubric",
    rubric_enabled=True,
    db_config=db_config,  # Enable automatic database storage
)

print("Verification configured with automatic database storage")

### How It Works

1. When `db_config` is set in `VerificationConfig`, verification results are automatically saved to the specified database after completion
2. The `AUTOSAVE_DATABASE` environment variable controls this behavior (defaults to `"true"`)
3. Results are saved with metadata including run name, timestamp, and configuration details
4. This happens transparently without requiring manual `save_to_db()` calls

### Benefits

- **No data loss**: Results are persisted immediately after verification completes
- **Automatic**: No need to remember to call `save_to_db()` after verification
- **Production-ready**: Ideal for automated pipelines and long-running jobs
- **Queryable**: Results are immediately available for database queries and analytics

### Disabling Auto-Save

To disable automatic database storage temporarily:

In [None]:
# Method 1: pass db_config=None
# Settings shared by the answering and parsing model configs
shared_model_settings = {
    "model_provider": "openai",
    "model_name": "gpt-4.1-mini",
    "interface": "langchain",
}
config_no_save = VerificationConfig(
    answering_models=[ModelConfig(id="gpt-4.1-mini", **shared_model_settings)],
    parsing_models=[ModelConfig(id="gpt-4.1-mini-judge", **shared_model_settings)],
    db_config=None,  # No automatic database storage
)

# Method 2: set the environment variable in the shell instead
# export AUTOSAVE_DATABASE="false"

print("Auto-save disabled examples shown above")

For detailed information about database storage options, see [Automatic Database Storage During Verification](saving-loading.md#automatic-database-storage-during-verification) and [Configuration](../configuration.md#database-configuration).

---

## Progress Tracking

### Real-Time Progress Callback

Monitor verification progress with a callback function:

In [None]:
def progress_callback(progress: float, message: str):
    """Print the completion fraction as a one-decimal percentage, followed by the status message."""
    percent = format(progress, ".1%")
    print(f"Progress: {percent} - {message}")


# Start a run and route its progress updates through progress_callback
print("Starting verification with progress tracking...")
results_progress = benchmark.run_verification(progress_callback=progress_callback, config=config)
print()
print("Verification complete!")

---

## Answer Caching

Karenina automatically caches answer generation to improve efficiency when multiple judge models evaluate the same answering model response.

### How Answer Caching Works

**Without Caching:**

```
1 question × 1 answering model × 3 judge models = 3 answer generations
- Generate answer for Judge 1 → Parse with Judge 1
- Generate answer for Judge 2 → Parse with Judge 2
- Generate answer for Judge 3 → Parse with Judge 3
Result: Same answer generated 3 times (wasteful, potentially inconsistent)
```

**With Caching (Automatic):**

```
1 question × 1 answering model × 3 judge models = 1 answer generation
- Generate answer ONCE
- Parse with Judge 1 (using cached answer)
- Parse with Judge 2 (using cached answer)
- Parse with Judge 3 (using cached answer)
Result: Same answer reused 3 times (efficient, guaranteed consistent)
```

### Benefits

1. **Efficiency**: Reduces LLM API calls and costs (generate once, evaluate many times)
2. **Correctness**: Ensures all judges evaluate the exact same answer (important for fair comparison)
3. **Speed**: Faster verification by avoiding redundant answer generation

### Cache Behavior

The answer cache is:

- **Automatic**: No configuration required, works transparently
- **Thread-Safe**: Safe for parallel execution
- **Per-Question**: Cache key includes question ID, answering model ID, and replicate number
- **Replicate-Aware**: Each replicate gets independent answer generation

**Cache Key Format:** `{question_id}_{answering_model_id}_{replicate}`

### Caching with Replication

Each replicate run generates its own answer independently:

In [None]:
# One answering model, three judges, two replicates
config_cache = VerificationConfig(
    answering_models=[
        ModelConfig(id="gpt-4.1-mini", model_provider="openai", model_name="gpt-4.1-mini", interface="langchain")
    ],
    parsing_models=[  # 3 judges: gpt-judge-1 .. gpt-judge-3
        ModelConfig(
            id=f"gpt-judge-{judge_number}",
            model_provider="openai",
            model_name="gpt-4.1-mini",
            interface="langchain",
        )
        for judge_number in range(1, 4)
    ],
    replicate_count=2,  # 2 replicates
)

# Emit the whole explanation with a single print; the empty entry yields the blank line
cache_summary = [
    "Answer caching with replication:",
    "  Total combinations: 1 question × 1 answering model × 3 judges × 2 replicates = 6 results",
    "  Answer generations: 1 question × 1 answering model × 2 replicates = 2 generations",
    "  Cache hits: 6 results - 2 generations = 4 cache reuses",
    "",
    "Result:",
    "  - Replicate 0: Answer generated once, reused by all 3 judges",
    "  - Replicate 1: New answer generated once, reused by all 3 judges",
]
print("\n".join(cache_summary))

---

## Complete Example

Here's a complete end-to-end example demonstrating multi-model verification with replication:

In [None]:
from collections import defaultdict

from karenina import Benchmark
from karenina.schemas import LLMRubricTrait, ModelConfig, Rubric, VerificationConfig

# Start from an empty benchmark dedicated to this example
benchmark_complete = Benchmark.create(
    name="Genomics Verification Demo",
    description="Complete verification workflow example",
    version="1.0.0",
)

# (question, reference answer) pairs to register
demo_questions = [
    ("How many chromosomes are in a human somatic cell?", "46"),
    ("What is the approved drug target of Venetoclax?", "BCL2"),
    ("How many protein subunits does hemoglobin A have?", "4"),
]

# Register every pair, keeping the value add_question returns for each one
demo_qids = [
    benchmark_complete.add_question(question=question_text, raw_answer=reference_answer)
    for question_text, reference_answer in demo_questions
]

# Generate answer templates for all questions
benchmark_complete.generate_all_templates(
    model="gpt-4.1-mini",
    model_provider="openai",
    temperature=0.1,
    interface="langchain",
)

# Attach a single-trait rubric to the whole benchmark
conciseness_trait = LLMRubricTrait(name="Conciseness", description="Rate conciseness 1-5", kind="score")
demo_rubric = Rubric(llm_traits=[conciseness_trait])
benchmark_complete.set_global_rubric(demo_rubric)

print("Benchmark setup complete")
print(f"  Questions: {len(demo_qids)}")
print("  Templates: Generated")
print("  Rubric: Global rubric with 1 trait")

In [None]:
# Configure three answering models for comparison.
# All three share the same temperature and system prompt, so only the model differs.
answering_models = [
    ModelConfig(
        id="gpt-4.1-mini",
        model_provider="openai",
        model_name="gpt-4.1-mini",
        temperature=0.7,
        interface="langchain",
        system_prompt="You are a genomics expert. Answer concisely.",
    ),
    ModelConfig(
        id="claude-sonnet",
        model_provider="anthropic",
        # Anthropic API model ids are hyphenated ("claude-sonnet-4-5"); the dotted
        # form "claude-sonnet-4.5" is a display name, not an id the API accepts.
        model_name="claude-sonnet-4-5",
        temperature=0.7,
        interface="langchain",
        system_prompt="You are a genomics expert. Answer concisely.",
    ),
    ModelConfig(
        id="gemini-flash",
        model_provider="google",
        model_name="gemini-2.5-flash",
        temperature=0.7,
        interface="langchain",
        system_prompt="You are a genomics expert. Answer concisely.",
    ),
]

# Configure a single judge model so every answer is parsed by the same evaluator
judge_model = ModelConfig(
    id="gpt-judge",
    model_provider="openai",
    model_name="gpt-4.1-mini",
    temperature=0.0,  # Deterministic parsing
    interface="langchain",
    system_prompt="You are a strict evaluator. Parse carefully.",
)

# Configure verification with replication
config_complete = VerificationConfig(
    answering_models=answering_models,
    parsing_models=[judge_model],
    evaluation_mode="template_and_rubric",
    rubric_enabled=True,
    replicate_count=3,  # Run each combination 3 times
    abstention_check_enabled=True,
)

# Expected result count = questions × answering models × 1 judge × replicates
expected_total = len(demo_qids) * len(answering_models) * config_complete.replicate_count

print("Verification configured:")
print(f"  Answering models: {len(answering_models)}")
print("  Judge models: 1")
print(f"  Replicates: {config_complete.replicate_count}")
print(
    f"  Total results expected: {len(demo_qids)} × {len(answering_models)} × 1 × {config_complete.replicate_count} = {expected_total}"
)

In [None]:
# Progress callback
def show_progress(progress: float, message: str):
    """Print the completion fraction as a bracketed whole-number percentage, then the message."""
    percent = format(progress, ".0%")
    print(f"[{percent}] {message}")


# Launch the multi-model run, reporting progress through show_progress
print("Starting verification...", end="\n\n")
results_complete = benchmark_complete.run_verification(progress_callback=show_progress, config=config_complete)

result_count = len(results_complete.results)
print(f"\nVerification complete: {result_count} results generated")

In [None]:
# Bucket every result under the answering model that produced it
results_by_model = defaultdict(list)
for res in results_complete.results:
    results_by_model[res.answering_model].append(res)

print("\n=== Results by Answering Model ===")
for model_id, model_results in results_by_model.items():
    # A bucket only exists once a result has been appended, so total is never zero
    total = len(model_results)
    passed = len([res for res in model_results if res.verify_result])

    print(f"\n{model_id}:")
    print(f"  Template Accuracy: {passed / total:.1%} ({passed}/{total})")

    # Gather each rubric trait's scores across this model's results
    rubric_scores = defaultdict(list)
    for res in model_results:
        if not (res.rubric_evaluation_performed and res.verify_rubric):
            continue
        for trait, score in res.verify_rubric.items():
            rubric_scores[trait].append(score)

    if rubric_scores:
        print("  Rubric Averages:")
        for trait, scores in rubric_scores.items():
            print(f"    {trait}: {sum(scores) / len(scores):.2f}")

    # Share of this model's results flagged as abstentions
    abstentions = len([res for res in model_results if res.abstention_detected])
    print(f"  Abstention Rate: {abstentions / total:.1%}")

In [None]:
import tempfile
from pathlib import Path

# Reuse the scratch directory if an earlier cell already defined it; otherwise create one
if "TEMP_DIR" not in globals():
    TEMP_DIR = Path(tempfile.mkdtemp(prefix="karenina_docs_"))


def temp_path(filename: str) -> Path:
    """Return the path of *filename* inside the ``TEMP_DIR`` scratch directory."""
    return TEMP_DIR.joinpath(filename)


from karenina.storage import DBConfig

# Create database configuration
# NOTE(review): this repeats the DBConfig setup from the "Configure Automatic Storage"
# section. In this example the verification run has already been executed with
# `config_complete`, which was built without `db_config`, and nothing after this cell
# uses `db_config` - confirm whether a save/export step was intended here instead.
db_config = DBConfig(
    storage_url=f"sqlite:///{temp_path('benchmarks.db')}",
    auto_create=True,  # Create tables if they don't exist
)

print(f"Database configured: {temp_path('benchmarks.db')}")
print("Results will be automatically saved during verification")

---

## Next Steps

After running verification:

- [Analyze Results](saving-loading.md#exporting-verification-results) - Export to CSV/JSON for deeper analysis
- [Save Benchmark](saving-loading.md) - Persist results to database or checkpoint
- [Advanced Features](../advanced/deep-judgment.md) - Use deep-judgment for detailed feedback
- [Few-Shot Prompting](../advanced/few-shot.md) - Guide responses with examples

---

## Related Documentation

- [Model Configuration](model-configuration.md) - Comprehensive guide to ModelConfig parameters and extra_kwargs
- [Defining Benchmarks](defining-benchmark.md) - Creating and configuring benchmarks
- [Templates](templates.md) - Structured answer evaluation
- [Rubrics](rubrics.md) - Qualitative assessment criteria
- [Saving & Loading](saving-loading.md) - Checkpoints, database, and export
- [Deep Judgment](../advanced/deep-judgment.md) - Extract detailed feedback with excerpts
- [Abstention Detection](../advanced/abstention-detection.md) - Handle model refusals
- [Few-Shot Prompting](../advanced/few-shot.md) - Guide responses with examples