In [1]:
# Mock Setup - Hidden in rendered documentation
# This cell is tagged with "hide-cell" in notebook metadata

import tempfile
import sys
import os
import hashlib
import json
from pathlib import Path
from unittest.mock import Mock, MagicMock, patch, PropertyMock
from typing import Any, Dict, List
from datetime import datetime

# Add karenina to path
# NOTE(review): this is an absolute path on one developer's machine. On any
# other checkout the entry points at nothing, and the imports below resolve
# to whatever karenina is installed in the environment (or fail).
sys.path.insert(0, "/Users/carli/Projects/karenina-monorepo/karenina/src")

# Temporary directory for file operations
# mkdtemp() creates the directory right away; the atexit handler registered
# at the end of this cell deletes it again.
TEMP_DIR = Path(tempfile.mkdtemp(prefix="karenina_docs_"))

# Import after path is set
from karenina.schemas.workflow.verification.result import VerificationResult
from karenina.schemas.workflow.verification.result_components import (
    VerificationResultMetadata,
    VerificationResultTemplate,
    VerificationResultRubric,
)
from karenina.schemas.workflow.verification_result_set import VerificationResultSet
from karenina.schemas.workflow.template_results import TemplateResults

# Mock LLM response generator
class MockLLMResponse:
    """Stand-in for a LangChain-style chat message.

    The reply text lives in ``content``; ``response_metadata`` carries a
    fixed token-usage record. ``str()`` of the object yields the reply text.
    """

    def __init__(self, content: str = "BCL2"):
        fixed_usage = {"total_tokens": 50}
        self.content = content
        self.response_metadata = {"token_usage": fixed_usage}

    def __str__(self):
        # Returned as-is: callers are expected to pass a str for content.
        return self.content

class MockStructuredOutput:
    """Fake structured-output object with a few canned fields.

    ``count``, ``target``, ``subunits`` and ``diseases`` always exist, taking
    their value from the keyword arguments when given and from built-in
    defaults otherwise. Any other keyword becomes an attribute too, unless
    the name is already taken on the instance (including method names).
    """

    def __init__(self, **kwargs):
        # Built per call so every instance gets its own default list.
        canned_fields = (
            ('count', 46),
            ('target', 'BCL2'),
            ('subunits', 4),
            ('diseases', ['asthma', 'bronchitis', 'pneumonia']),
        )
        for field_name, fallback in canned_fields:
            setattr(self, field_name, kwargs.get(field_name, fallback))

        for field_name, value in kwargs.items():
            if hasattr(self, field_name):
                continue
            setattr(self, field_name, value)

    def dict(self):
        """Return the instance attributes whose names do not start with '_'."""
        public = {}
        for key, value in vars(self).items():
            if key.startswith('_'):
                continue
            public[key] = value
        return public

    def model_dump(self):
        """Alias of :meth:`dict`."""
        return self.dict()

def create_mock_chat_model():
    """Create a mock chat model that returns predictable responses.

    Returns:
        A ``MagicMock`` where
        - ``invoke(...)`` returns a ``MockLLMResponse("46 chromosomes")``,
        - ``await ainvoke(...)`` resolves to the same kind of response,
        - ``with_structured_output(...)`` returns a second mock whose
          ``invoke`` / awaited ``ainvoke`` yield a default
          ``MockStructuredOutput``,
        - ``bind_tools(...)`` returns the mock itself.
    """
    # Imported locally: the cell-level import only brings in
    # Mock, MagicMock, patch and PropertyMock.
    from unittest.mock import AsyncMock

    mock = MagicMock()
    mock.invoke.return_value = MockLLMResponse("46 chromosomes")
    # A plain MagicMock child returns its value directly, which cannot be
    # awaited; AsyncMock makes `await mock.ainvoke(...)` work.
    mock.ainvoke = AsyncMock(return_value=MockLLMResponse("46 chromosomes"))

    structured_mock = MagicMock()
    structured_mock.invoke.return_value = MockStructuredOutput()
    structured_mock.ainvoke = AsyncMock(return_value=MockStructuredOutput())

    mock.with_structured_output.return_value = structured_mock
    mock.bind_tools.return_value = mock
    return mock

def compute_result_id(question_id: str, answering_model: str, parsing_model: str, timestamp: str) -> str:
    """Derive a deterministic 16-character result id.

    The four arguments, together with an empty ``answering_mcp_servers`` list
    and a ``None`` replicate, are serialized to key-sorted ASCII JSON; the id
    is the first 16 hex characters of that string's SHA-256 digest.
    """
    payload = json.dumps(
        {
            "question_id": question_id,
            "answering_model": answering_model,
            "parsing_model": parsing_model,
            "timestamp": timestamp,
            "answering_mcp_servers": [],
            "replicate": None,
        },
        sort_keys=True,  # key order in the literal above is irrelevant
        ensure_ascii=True,
    )
    digest = hashlib.sha256(payload.encode("utf-8")).hexdigest()
    return digest[:16]

def create_mock_verification_result(question_id: str, question_text: str, answer: str, passed: bool = True):
    """Assemble a canned ``VerificationResult`` for one question.

    Args:
        question_id: Stored in the metadata and hashed into both the template
            id and the result id.
        question_text: Question wording stored in the metadata.
        answer: Used as the raw answer, as the parsed LLM value, as the parsed
            ground-truth value, and inside the fake raw LLM response.
        passed: Reported as the template section's ``verify_result``.

    Returns:
        A ``VerificationResult`` with metadata, template and rubric sections
        filled in with fixed values; no verification is actually performed.
    """
    model_name = "gpt-4.1-mini"
    timestamp = datetime.now().isoformat()
    # An MD5 hex digest is already 32 characters, so the slice keeps all of it.
    template_id = hashlib.md5(str(question_id).encode()).hexdigest()[:32]

    # Template section: fixed token counts, abstention checked but not detected
    token_usage = {
        "answer_generation": {"total_tokens": 50},
        "parsing": {"total_tokens": 30},
        "total": {"total_tokens": 80}
    }
    template_section = VerificationResultTemplate(
        raw_llm_response=f"The answer is {answer}.",
        parsed_llm_response={"value": answer},
        parsed_gt_response={"value": answer},
        verify_result=passed,
        template_verification_performed=True,
        usage_metadata=token_usage,
        abstention_check_performed=True,
        abstention_detected=False,
    )

    # Rubric section: one numeric trait and one boolean trait
    trait_scores = {
        "Conciseness": 4,
        "Clarity": True,
    }
    rubric_section = VerificationResultRubric(
        rubric_evaluation_performed=True,
        llm_trait_scores=trait_scores,
    )

    # Metadata section: the same model name plays the answering and parsing role
    result_id = compute_result_id(question_id, model_name, model_name, timestamp)
    metadata_section = VerificationResultMetadata(
        question_id=question_id,
        template_id=template_id,
        completed_without_errors=True,
        question_text=question_text,
        raw_answer=answer,
        answering_model=model_name,
        parsing_model=model_name,
        execution_time=1.5,
        timestamp=timestamp,
        result_id=result_id,
    )

    return VerificationResult(
        metadata=metadata_section,
        template=template_section,
        rubric=rubric_section,
    )

# Store original run_verification
# Stays None until the patching code further down in this cell assigns the
# real Benchmark.run_verification to it.
_original_run_verification = None

def mock_run_verification(self, config):
    """Replacement for ``Benchmark.run_verification`` that fabricates results.

    Each finished question gets one mock result. If the question text contains
    one of the known keywords, the matching canned answer is used; otherwise
    the question's own ``raw_answer`` is echoed back and marked as passed.

    When the benchmark has no finished questions, the call is forwarded to the
    saved original method if there is one, else an empty result set is
    returned.
    """
    finished = self.get_finished_questions(ids_only=False)

    if len(finished) == 0:
        if _original_run_verification:
            return _original_run_verification(self, config)
        return VerificationResultSet(results=[], template_results=TemplateResults(results=[]))

    # (keywords, canned answer, passed) - first matching row wins
    canned_rows = (
        (("chromosomes",), "46", True),
        (("venetoclax", "bcl2"), "BCL2", True),
        (("hemoglobin", "subunits"), "4", True),
        (("inflammatory", "lung"), "asthma, bronchitis, pneumonia", True),
    )

    results = []
    for entry in finished:
        q_id = entry['id']
        q_text = entry['question']
        fallback_answer = entry.get('raw_answer', '')

        lowered = q_text.lower()
        hit = next(
            (
                (canned_answer, canned_passed)
                for keywords, canned_answer, canned_passed in canned_rows
                if any(kw in lowered for kw in keywords)
            ),
            None,
        )
        if hit is None:
            mock_ans, passed = fallback_answer, True
        else:
            mock_ans, passed = hit

        results.append(create_mock_verification_result(
            question_id=q_id,
            question_text=q_text,
            answer=mock_ans,
            passed=passed
        ))

    return VerificationResultSet(
        results=results,
        template_results=TemplateResults(results=results),
        rubric_results=None,
    )

# Patch the LLM entry points so the examples never contact a real provider.
# NOTE(review): the karenina.schemas imports above have already executed, so
# any module that bound one of these names with `from ... import` at import
# time may still hold the unpatched object - confirm the patches take effect.
#
# The factories accept positional as well as keyword arguments, so a call
# such as ChatOpenAI("model-name") gets a mock instead of a TypeError.
_llm_patches = [
    patch('langchain_openai.ChatOpenAI', side_effect=lambda *args, **kwargs: create_mock_chat_model()),
    patch('langchain_anthropic.ChatAnthropic', side_effect=lambda *args, **kwargs: create_mock_chat_model()),
    patch('langchain_google_genai.ChatGoogleGenerativeAI', side_effect=lambda *args, **kwargs: create_mock_chat_model()),
    patch('karenina.infrastructure.llm.interface.init_chat_model_unified', side_effect=lambda *args, **kwargs: create_mock_chat_model()),
]

# start() (rather than a `with` block) keeps the patches active for the rest
# of the session; _cleanup() stops them at interpreter exit.
for p in _llm_patches:
    p.start()

# Patch Benchmark.run_verification
# The real method is saved first so that mock_run_verification can fall back
# to it and _cleanup() can restore it.
from karenina.benchmark import Benchmark
_original_run_verification = Benchmark.run_verification
# Assigned on the class, so every Benchmark instance picks up the mock.
Benchmark.run_verification = mock_run_verification

def temp_path(filename: str) -> Path:
    """Return the path of *filename* inside this session's temp directory."""
    return TEMP_DIR.joinpath(filename)

# Cleanup
import atexit
import shutil

def _cleanup():
    """Undo the monkey-patches and delete the temp directory.

    Registered with ``atexit`` below. Restores the saved
    ``Benchmark.run_verification``, stops every LLM patch, then removes
    ``TEMP_DIR`` (errors during removal are ignored).
    """
    Benchmark.run_verification = _original_run_verification
    for p in _llm_patches:
        try:
            p.stop()
        except Exception:
            # Best-effort: one patch failing to stop must not keep the others
            # (or the temp-dir removal) from running. `Exception` instead of a
            # bare `except` so KeyboardInterrupt/SystemExit still propagate.
            pass
    shutil.rmtree(TEMP_DIR, ignore_errors=True)


atexit.register(_cleanup)

# Status banner: one line per item, emitted by a single print call.
# Only the temp-directory line interpolates a value.
print(
    "✓ Mock setup complete",
    f"✓ Temp directory: {TEMP_DIR}",
    "✓ Karenina package loaded from: /Users/carli/Projects/karenina-monorepo/karenina/src",
    "✓ Mock verification results enabled - examples will show realistic output",
    sep="\n",
)

✓ Mock setup complete
✓ Temp directory: /var/folders/34/129m5tdd04vf10ptyj12w6f80000gp/T/karenina_docs_7gzctznk
✓ Karenina package loaded from: /Users/carli/Projects/karenina-monorepo/karenina/src
✓ Mock verification results enabled - examples will show realistic output


# Quick Start

Get started with Karenina in just a few minutes! This guide walks you through creating your first benchmark, adding questions, generating templates, and running verification.

---

## Prerequisites

Before you begin, make sure you have:

1. **Installed Karenina**:
   ```bash
   pip install karenina
   ```

2. **Set up API keys** (only for the LLM providers you plan to use):
   ```bash
   # For OpenAI
   export OPENAI_API_KEY="your-api-key-here"

   # For Google Gemini
   export GOOGLE_API_KEY="your-api-key-here"

   # For Anthropic Claude
   export ANTHROPIC_API_KEY="your-api-key-here"
   ```

3. **Python 3.9+** installed

---

## Complete Workflow Example

This example demonstrates the full Karenina workflow: creating a benchmark, adding questions, generating templates, creating a rubric, running verification, and exporting results.

### Step 1: Create a Benchmark

In [2]:
from karenina import Benchmark

# Create a new benchmark
# The name given here is what `benchmark.name` returns below; it is also the
# value passed as `benchmark_name` when loading from the database in Step 7.
benchmark = Benchmark.create(
    name="Genomics Knowledge Benchmark",
    description="Testing LLM knowledge of genomics and molecular biology",
    version="1.0.0",
    creator="Your Name"
)

print(f"Created benchmark: {benchmark.name}")

Created benchmark: Genomics Knowledge Benchmark


---

### Step 2: Add Questions

You can add questions manually or extract them from files (Excel, CSV, TSV).

**Option A: Add questions manually**

In [3]:
# Three question/answer pairs, each credited to the same curator
questions = [
    {
        "question": "How many chromosomes are in a human somatic cell?",
        "answer": "46",
        "author": {"name": "Bio Curator", "email": "curator@example.com"}
    },
    {
        "question": "What is the approved drug target of Venetoclax?",
        "answer": "BCL2",
        "author": {"name": "Bio Curator", "email": "curator@example.com"}
    },
    {
        "question": "How many protein subunits does hemoglobin A have?",
        "answer": "4",
        "author": {"name": "Bio Curator", "email": "curator@example.com"}
    }
]

# add_question() returns the ID of the question it just added; the IDs are
# kept in insertion order so later steps can look the questions up again.
question_ids = [
    benchmark.add_question(
        question=entry["question"],
        raw_answer=entry["answer"],
        author=entry["author"],
    )
    for entry in questions
]

print(f"Added {len(question_ids)} questions")

Added 3 questions


**Option B: Extract from a file**

In [4]:
# The import is live even though the call below is commented out, so this
# cell still fails if the import path is wrong.
from karenina.domain.questions.extractor import extract_questions_from_file

# Note: This is an example of the API.
# In this notebook, we'll skip actual file extraction since we don't have a file.
# In practice, you would use:
#
# questions = extract_questions_from_file(
#     file_path="questions.xlsx",
#     question_column="Question",
#     answer_column="Answer",
#     author_name_column="Author",  # Optional
#     keywords_columns=[{"column": "Keywords", "separator": ","}]  # Optional
# )
#
# for q in questions:
#     benchmark.add_question(**q)
#
# NOTE(review): the loop above assumes each extracted item is a dict whose
# keys are valid add_question() keyword arguments - confirm against the
# extractor's return type.

print("File extraction API shown above (not executed in this demo)")

File extraction API shown above (not executed in this demo)


---

### Step 3: Generate Answer Templates

Answer templates define how to extract and verify information from LLM responses. Karenina can generate these automatically using an LLM.

**Automatic template generation (recommended):**

In [5]:
# Generate templates for all questions
# Note: This method takes individual parameters (not a ModelConfig object)
# `results` here holds the template-generation output; Step 5 reuses the same
# variable name for the verification results.
print("Generating templates...")
results = benchmark.generate_all_templates(
    model="gpt-4.1-mini",
    model_provider="openai",
    temperature=0.1,
    interface="langchain",
    force_regenerate=False  # Skip questions that already have templates
)

print(f"Generated templates for {len(results)} questions")

Generating templates...
Generated templates for 3 questions


**Manual template creation (for advanced users):**

If you prefer full control, you can write templates manually:

In [None]:
# Manual template example
# The template is passed as source text. `BaseAnswer` and `Field` are not
# imported in this cell - presumably karenina supplies them when it evaluates
# the template; confirm.
template_code = '''class Answer(BaseAnswer):
    mentions_bcl2_as_target: bool = Field(
        description="True if the response identifies BCL2 (or BCL-2, B-cell lymphoma 2) as the drug target of Venetoclax"
    )

    def model_post_init(self, __context):
        self.correct = {"mentions_bcl2_as_target": True}

    def verify(self) -> bool:
        return self.mentions_bcl2_as_target == self.correct["mentions_bcl2_as_target"]
'''

# Add a question that already carries its template.
# NOTE: this calls add_question() with the same text as the Venetoclax
# question added in Step 2, so it creates a second entry rather than attaching
# the template to the existing one (the "-1" suffix on the question ID in the
# Step 6 output appears to be that second entry).
benchmark.add_question(
    question="What is the approved drug target of Venetoclax?",
    raw_answer="BCL2",
    answer_template=template_code,
    finished=True  # Mark as ready for verification
)

print("Manual template example created")

Manual template example created


---

### Step 4: Create a Rubric (Optional)

Rubrics assess qualitative aspects of answers. Karenina supports two types of rubrics:

- **Global Rubrics**: Applied to ALL questions (great for general quality assessment)
- **Question-Specific Rubrics**: Applied to ONE specific question (great for domain-specific validation)

#### Global Rubric (LLM-based traits)

These traits evaluate general answer quality across all questions:

In [7]:
from karenina.schemas import LLMRubricTrait, Rubric

# Create a global rubric with LLM-based traits
# These will be evaluated for EVERY question in the benchmark
# Each trait's `description` is free text; `kind` selects the result type.
global_rubric = Rubric(
    llm_traits=[
        LLMRubricTrait(
            name="Conciseness",
            description="Rate how concise the answer is on a scale of 1-5, where 1 is very verbose and 5 is extremely concise.",
            kind="score"  # Returns a score from 1-5
        ),
        LLMRubricTrait(
            name="Clarity",
            description="Is the answer clear and easy to understand?",
            kind="boolean"  # Returns pass/fail (use "boolean", not "binary")
        )
    ]
)

# Set the global rubric
benchmark.set_global_rubric(global_rubric)

print(f"Created global rubric with {len(global_rubric.llm_traits)} traits")
print("This rubric will be evaluated for ALL questions")

Created global rubric with 2 traits
This rubric will be evaluated for ALL questions


#### Question-Specific Rubric (Regex-based trait)

This trait validates that the answer contains specific content:

In [8]:
from karenina.schemas import RegexTrait

# Find the drug target question ID
# Only the IDs collected in Step 2 are searched; the `[0]` raises IndexError
# if none of those questions mentions "Venetoclax".
drug_target_qid = [qid for qid in question_ids
                   if "Venetoclax" in benchmark.get_question(qid)['question']][0]

# Create a regex trait specific to the drug target question
# The answer must contain "BCL2" (case-insensitive)
# Note that the word-boundary pattern does not match the hyphenated "BCL-2".
regex_trait = RegexTrait(
    name="BCL2 Mention",
    description="Answer must mention BCL2",
    pattern=r"\bBCL2\b",  # Matches the exact word "BCL2" with word boundaries
    case_sensitive=False,
    invert_result=False  # Don't invert the result (match = pass)
)

# Add ONLY to the drug target question (not global!)
# Note: Use add_question_rubric_trait for single traits
benchmark.add_question_rubric_trait(question_id=drug_target_qid, trait=regex_trait)

print(f"Created question-specific rubric for question: {drug_target_qid}")
print("This rubric will ONLY be evaluated for the Venetoclax question")
print("It checks that the answer contains 'BCL2'")

Created question-specific rubric for question: urn:uuid:question-what-is-the-approved-drug-target-of-venetoclax-2a9de717
This rubric will ONLY be evaluated for the Venetoclax question
It checks that the answer contains 'BCL2'


#### Question-Specific Rubric (Metric-based trait)

For questions requiring classification accuracy (e.g., identifying disease types):

In [9]:
from karenina.schemas import MetricRubricTrait

# Add a classification-style question so there is something to attach a
# metric trait to; its ID is kept for the add_question_rubric_trait call below.
disease_qid = benchmark.add_question(
    question="Which of the following are inflammatory lung diseases: asthma, bronchitis, pneumonia, emphysema, pulmonary fibrosis?",
    raw_answer="asthma, bronchitis, pneumonia",
    author={"name": "Bio Curator", "email": "curator@example.com"}
)

# Create a metric trait to evaluate classification accuracy
# Note: Use tn_instructions for items that should NOT be in the answer
# The three tp items are the same ones listed in raw_answer above; the two tn
# items are the remaining options from the question text.
metric_trait = MetricRubricTrait(
    name="Inflammatory Disease Identification",
    description="Evaluate accuracy of identifying inflammatory lung diseases",
    metrics=["precision", "recall", "f1"],
    tp_instructions=[
        "asthma",       # Should be identified (inflammatory)
        "bronchitis",   # Should be identified (inflammatory)
        "pneumonia"     # Should be identified (inflammatory)
    ],
    tn_instructions=[  # Use tn_instructions, not fp_instructions
        "emphysema",            # Should NOT be identified (obstructive, not inflammatory)
        "pulmonary fibrosis"    # Should NOT be identified (restrictive, not inflammatory)
    ],
    repeated_extraction=True  # Remove duplicate mentions
)

# Add ONLY to the disease classification question
# Note: Use add_question_rubric_trait for single traits
benchmark.add_question_rubric_trait(question_id=disease_qid, trait=metric_trait)

print(f"Created metric-based rubric for question: {disease_qid}")
print("This will compute precision, recall, and F1 score for this specific question")

Created metric-based rubric for question: urn:uuid:question-which-of-the-following-are-inflammatory-lung-disea-309b7d5b
This will compute precision, recall, and F1 score for this specific question


**Key Distinction:**

- **Global rubrics** (clarity, conciseness): Assessed for every question → generic quality metrics
- **Question-specific rubrics** (BCL2 mention, disease classification): Assessed for one question → domain-specific validation

---

### Step 5: Run Verification

Configure models and run verification to evaluate LLM responses against your templates and rubrics.

> **Tip: CLI Alternative**  
> You can also run verification from the command line without writing Python code: `karenina verify checkpoint.jsonld --preset config.json`. See [CLI Verification](using-karenina/cli-verification.md) for details.

In [10]:
from karenina.schemas import VerificationConfig, ModelConfig

# Configure verification
config = VerificationConfig(
    # Models that generate answers (can use multiple for comparison)
    answering_models=[
        ModelConfig(
            id="gpt-4.1-mini",
            model_provider="openai",
            model_name="gpt-4.1-mini",
            temperature=0.7,
            interface="langchain",
            system_prompt="You are a knowledgeable assistant. Answer accurately and concisely."
        )
    ],
    # Models that parse/judge answers (usually more capable models)
    # This example reuses the answering model, with temperature 0.0.
    parsing_models=[
        ModelConfig(
            id="gpt-4.1-mini",
            model_provider="openai",
            model_name="gpt-4.1-mini",
            temperature=0.0,
            interface="langchain",
            system_prompt="You are an expert judge. Parse and evaluate responses carefully."
        )
    ],
    # Enable rubric evaluation with proper evaluation_mode
    evaluation_mode="template_and_rubric",  # Run both template and rubric evaluation
    rubric_enabled=True,  # Enable rubric evaluation (required when evaluation_mode=template_and_rubric)
    replicate_count=1,    # Number of times to run each question (use >1 for statistical analysis)
    deep_judgment_enabled=False,  # Enable for detailed feedback with excerpts
    abstention_check_enabled=True  # Detect when models refuse to answer
)

# Run verification
# `results` is rebound here: from this point on it is the verification result
# set, not the template-generation output from Step 3.
print("Running verification...")
results = benchmark.run_verification(config)

# NOTE(review): len(results) presumably counts result entries, which equals
# the number of questions only with one answering/parsing model pair and
# replicate_count=1 - confirm.
print(f"Verification complete! Processed {len(results)} questions")

Running verification...
Verification complete! Processed 1 questions


**Using different interfaces:**

In [11]:
# Examples of different ModelConfig options:
# (All three are commented out; this cell only executes the print below.)
#
# # OpenRouter
# ModelConfig(
#     id="sonnet-4.5",
#     model_provider="openrouter",
#     model_name="anthropic/claude-sonnet-4.5",
#     interface="openrouter"
# )
#
# # OpenAI-compatible endpoint (e.g., Ollama)
# # Unlike the other examples, no model_provider is given here; the server is
# # identified by endpoint_base_url instead.
# ModelConfig(
#     id="glm46",
#     model_name="glm-4.6",
#     interface="openai_endpoint",
#     endpoint_api_key="your-api-key",
#     endpoint_base_url="http://localhost:11434/v1"
# )
#
# # Manual traces (for testing/debugging without API calls)
# ModelConfig(
#     id="manual",
#     model_provider="manual",
#     model_name="manual",
#     interface="manual"
# )

print("Model configuration examples shown above (not executed in this demo)")

Model configuration examples shown above (not executed in this demo)


---

### Step 6: Access and Analyze Results

After verification, you can analyze results using DataFrames (recommended) or access raw result objects.

#### Option 1: Analyze with DataFrames (Recommended)

In [12]:
# Get DataFrame for easy analysis
template_results = results.get_template_results()  # Use get_template_results(), not get_templates()
df = template_results.to_dataframe()

# Calculate pass rates
# The column check keeps the cell from raising KeyError when the DataFrame
# lacks these columns; in that case nothing is printed.
if 'question_id' in df.columns and 'field_match' in df.columns:
    pass_rates = df.groupby('question_id')['field_match'].mean()
    print("Pass Rates by Question:")
    print(pass_rates)

# If rubrics are enabled
rubric_results = results.get_rubrics_results()  # Use get_rubrics_results(), not get_rubrics()
rubric_df = rubric_results.to_dataframe(trait_type="llm")
if not rubric_df.empty:
    print("\nRubric Scores:")
    # Boolean and numeric traits are averaged alike, so a boolean trait that
    # passed shows up as 1.0.
    print(rubric_df.groupby('trait_name')['trait_score'].mean())

Pass Rates by Question:
question_id
urn:uuid:question-what-is-the-approved-drug-target-of-venetoclax-2a9de717-1    1.0
Name: field_match, dtype: float64

Rubric Scores:
trait_name
Clarity        1.0
Conciseness    4.0
Name: trait_score, dtype: object


See [Analyzing Results with DataFrames](using-karenina/analyzing-results-dataframes.md) for comprehensive examples.

#### Option 2: Access Raw Results

In [13]:
# Iterate through results
for result in results.results:
    # The trailing "..." is always printed here, even for short questions
    print(f"\nQuestion: {result.question_text[:50]}...")
    print(f"Verification: {'✓ PASS' if result.verify_result else '✗ FAIL'}")

    # Show at most 100 characters of the answer and add an ellipsis only when
    # something was actually cut off (a response of exactly 100 characters is
    # printed whole, without "...").
    answer_text = result.raw_llm_response
    preview = answer_text if len(answer_text) <= 100 else answer_text[:100] + '...'
    print(f"Model Answer: {preview}")

    # Access rubric scores (if rubric enabled)
    # Note: Access through result.rubric not result.rubric_scores
    if result.rubric and result.rubric.llm_trait_scores:
        print("Rubric Scores:")
        for trait_name, score in result.rubric.llm_trait_scores.items():
            print(f"  - {trait_name}: {score}")

    # Check for abstention (if enabled)
    # Note: Access through result.template.abstention_reasoning
    if result.abstention_detected:
        reasoning = result.template.abstention_reasoning if result.template else None
        print(f"⚠ Model abstained: {reasoning}")


Question: What is the approved drug target of Venetoclax?...
Verification: ✓ PASS
Model Answer: The answer is BCL2.
Rubric Scores:
  - Conciseness: 4
  - Clarity: True


**Calculate aggregate metrics:**

In [14]:
# Using DataFrames (recommended)
df = results.get_template_results().to_dataframe()  # Use get_template_results()
if df.empty:
    # A run can return no results at all; skip the aggregation instead of
    # failing on missing rows/columns.
    print("\nNo template results to summarize")
else:
    successful = df[df['completed_without_errors'] == True]
    pass_rate = successful['field_match'].mean() * 100
    # NOTE(review): pass_rate is computed over error-free rows only, while
    # total counts every result_index - the "(passed/total)" figure is an
    # estimate when some results errored.
    total = len(df.drop_duplicates(subset=['result_index']))
    passed = (total * pass_rate / 100)

    print(f"\n{'='*50}")
    print(f"Overall Pass Rate: {pass_rate:.1f}% ({passed:.0f}/{total})")
    print(f"{'='*50}")

# Or using raw results
total = len(results.results)
passed = sum(1 for r in results.results if r.verify_result)
# Guard the division: with zero results this would raise ZeroDivisionError
pass_rate_raw = (passed / total) * 100 if total else 0.0

print(f"\nPass rate from raw results: {pass_rate_raw:.1f}%")


Overall Pass Rate: 100.0% (1/1)

Pass rate from raw results: 100.0%


---

### Step 7: Save and Export

Save your benchmark as a checkpoint or export results for analysis.

**Save checkpoint (preserves full benchmark state):**

In [15]:
from pathlib import Path

# Save benchmark with all questions, templates, and results
# temp_path() is a helper from this notebook's hidden setup cell that returns
# a path inside a throwaway temp directory. In your own code, pass your own
# location instead, e.g. Path("genomics_benchmark.jsonld").
checkpoint_path = temp_path("genomics_benchmark.jsonld")
benchmark.save(checkpoint_path)
print(f"Saved checkpoint to {checkpoint_path}")

# Load later
loaded_benchmark = Benchmark.load(checkpoint_path)
print(f"Loaded benchmark: {loaded_benchmark.name}")

Saved checkpoint to /var/folders/34/129m5tdd04vf10ptyj12w6f80000gp/T/karenina_docs_7gzctznk/genomics_benchmark.jsonld
Loaded benchmark: Genomics Knowledge Benchmark


**Export verification results to CSV/JSON:**

In [16]:
# Export to CSV for spreadsheet analysis
# temp_path() comes from this notebook's hidden setup cell and points into a
# throwaway temp directory; use your own file path in real code.
benchmark.export_verification_results_to_file(
    file_path=temp_path("results.csv"),
    format="csv"
)

# Export to JSON for programmatic analysis
benchmark.export_verification_results_to_file(
    file_path=temp_path("results.json"),
    format="json"
)

print("Exported verification results to results.csv and results.json")

Exported verification results to results.csv and results.json


**Save to database:**

In [17]:
# Save benchmark to SQLite database (with checkpoint file)
# temp_path() comes from this notebook's hidden setup cell; `checkpoint_path`
# is the file written in the "Save checkpoint" cell above.
db_path = temp_path("benchmarks.db")
# db_path is absolute here, so the resulting URL has four slashes
# (sqlite:////...). NOTE(review): the format looks like a SQLAlchemy-style
# database URL - confirm.
benchmark.save_to_db(
    storage=f"sqlite:///{db_path}",
    checkpoint_path=checkpoint_path
)

# Load from database later
# The lookup is by benchmark name, i.e. the name given to Benchmark.create().
loaded = Benchmark.load_from_db(
    benchmark_name="Genomics Knowledge Benchmark",
    storage=f"sqlite:///{db_path}"
)

print(f"Loaded from database: {loaded.name}")

Loaded from database: Genomics Knowledge Benchmark


---

## Complete Example Script

Here's the entire workflow in one script with both global and question-specific rubrics:

In [18]:
# Complete workflow example
# Uses a second benchmark (`benchmark2`) and *2-suffixed variables, but it
# rebinds `questions`, `global_rubric`, `config` and `passed` from the
# earlier cells.
from karenina import Benchmark
from karenina.schemas import (
    VerificationConfig, ModelConfig, LLMRubricTrait,
    RegexTrait, MetricRubricTrait, Rubric
)
from pathlib import Path

# 1. Create benchmark
benchmark2 = Benchmark.create(
    name="Genomics Quiz",
    description="Basic genomics knowledge test",
    version="1.0.0",
    creator="Your Name"
)

# 2. Add questions
# (question, answer) tuples this time, rather than the dicts used in Step 2
questions = [
    ("How many chromosomes are in a human somatic cell?", "46"),
    ("What is the approved drug target of Venetoclax?", "BCL2"),
    ("How many protein subunits does hemoglobin A have?", "4")
]

question_ids2 = []
for q, a in questions:
    qid = benchmark2.add_question(question=q, raw_answer=a, author={"name": "Bio Curator"})
    question_ids2.append(qid)

# Add a classification question for metric trait demonstration
disease_qid2 = benchmark2.add_question(
    question="Which of the following are inflammatory lung diseases: asthma, bronchitis, pneumonia, emphysema, pulmonary fibrosis?",
    raw_answer="asthma, bronchitis, pneumonia",
    author={"name": "Bio Curator"}
)

# 3. Generate templates
# Note: generate_all_templates() takes individual parameters, not ModelConfig
benchmark2.generate_all_templates(
    model="gpt-4.1-mini",
    model_provider="openai",
    temperature=0.1,
    interface="langchain"
)

# 4. Create global rubric (applies to ALL questions)
# Note: Create a Rubric object and use set_global_rubric
global_rubric = Rubric(
    llm_traits=[
        LLMRubricTrait(
            name="Conciseness",
            description="Rate conciseness 1-5",
            kind="score"
        ),
        LLMRubricTrait(
            name="Clarity",
            description="Is the answer clear?",
            kind="boolean"  # Use "boolean" not "binary"
        )
    ]
)
benchmark2.set_global_rubric(global_rubric)

# 5. Add question-specific rubrics
# Regex trait for Venetoclax question
# The `[0]` raises IndexError if no question text contains "Venetoclax".
drug_target_qid2 = [qid for qid in question_ids2 if "Venetoclax" in benchmark2.get_question(qid)['question']][0]
benchmark2.add_question_rubric_trait(
    question_id=drug_target_qid2,
    trait=RegexTrait(
        name="BCL2 Mention",
        description="Answer must mention BCL2",
        pattern=r"\bBCL2\b",
        case_sensitive=False,
        invert_result=False  # Use invert_result not invert
    )
)

# Metric trait for disease classification question
benchmark2.add_question_rubric_trait(
    question_id=disease_qid2,
    trait=MetricRubricTrait(
        name="Inflammatory Disease ID",
        description="Evaluate disease classification accuracy",
        metrics=["precision", "recall", "f1"],
        tp_instructions=["asthma", "bronchitis", "pneumonia"],
        tn_instructions=["emphysema", "pulmonary fibrosis"],  # Use tn_instructions not fp_instructions
        repeated_extraction=True
    )
)

# 6. Run verification
# Note: VerificationConfig DOES use ModelConfig objects
# The same ModelConfig instance serves as both answering and parsing model.
model_config = ModelConfig(
    id="gpt-4.1-mini",
    model_provider="openai",
    model_name="gpt-4.1-mini",
    temperature=0.1,
    interface="langchain"
)

config = VerificationConfig(
    answering_models=[model_config],
    parsing_models=[model_config],
    evaluation_mode="template_and_rubric",  # Required when rubric_enabled=True
    rubric_enabled=True
)
results2 = benchmark2.run_verification(config)

# 7. Analyze results
# Note: results2 is a VerificationResultSet, not a dict - use .results to iterate
# The length check avoids a division by zero when nothing was verified.
# NOTE(review): in the recorded run below this branch reports no results,
# presumably because no question in benchmark2 counted as finished - confirm.
if len(results2.results) > 0:
    passed = sum(1 for r in results2.results if r.verify_result)
    print(f"Pass Rate: {(passed/len(results2.results)*100):.1f}%")
else:
    print("No results returned (this may happen in demo/mock mode)")

# 8. Save and export
# temp_path() comes from this notebook's hidden setup cell. This writes to
# the same temp "results.csv" as the export cell in Step 7 and overwrites it.
checkpoint_path2 = temp_path("genomics_quiz.jsonld")
benchmark2.save(checkpoint_path2)
benchmark2.export_verification_results_to_file(
    file_path=temp_path("results.csv"),
    format="csv"
)

print("Done! Check results.csv for detailed results.")

No results returned (this may happen in demo/mock mode)
Done! Check results.csv for detailed results.


---

## Next Steps

Now that you've completed your first benchmark, explore these guides:

### Core Usage
- [Defining Benchmarks](using-karenina/defining-benchmark.md) - Benchmark creation, metadata, and organization
- [Adding Questions](using-karenina/adding-questions.md) - File extraction, metadata mapping, and management
- [Templates](using-karenina/templates.md) - Creating and customizing answer templates
- [Rubrics](using-karenina/rubrics.md) - Evaluation criteria and trait types
- [Verification](using-karenina/verification.md) - Configuration, replication, and result analysis
- [Saving & Loading](using-karenina/saving-loading.md) - Checkpoints, database persistence, and export

### Advanced Features
- [Deep-Judgment](advanced/deep-judgment.md) - Extract detailed feedback with excerpts and reasoning
- [Few-Shot Prompting](advanced/few-shot.md) - Guide responses with examples
- [Abstention Detection](advanced/abstention-detection.md) - Handle model refusals
- [Embedding Check](advanced/embedding-check.md) - Semantic similarity fallback
- [Presets](advanced/presets.md) - Save and reuse verification configurations
- [System Integration](advanced/integration.md) - Server and GUI integration

### Reference
- [Features Overview](features.md) - Complete feature catalog
- [Configuration](configuration.md) - Environment variables and defaults
- [API Reference](api-reference.md) - Complete API documentation
- [Troubleshooting](troubleshooting.md) - Common issues and solutions

---

## Tips for Success

1. **Start simple**: Begin with a few questions and manual templates to understand the workflow
2. **Use template generation**: Let Karenina generate templates automatically to save time
3. **Iterate on templates**: Review and refine generated templates for better accuracy
4. **Leverage rubrics**: Add rubrics to assess answer quality beyond correctness
5. **Run replications**: Use `replicate_count > 1` for statistical analysis of model consistency
6. **Save checkpoints**: Regularly save your benchmark to avoid losing work
7. **Export results**: Use CSV export for easy analysis in spreadsheet tools

---

## Common Questions

**Q: Do I need to write templates manually?**  
A: No! Karenina can generate templates automatically using LLMs. Manual creation is only needed for complex custom logic.

**Q: Can I use local models?**  
A: Yes! Use the `openai_endpoint` interface with Ollama, vLLM, or any OpenAI-compatible server.

**Q: How do I compare multiple models?**  
A: Add multiple models to `answering_models` in your verification config. Karenina will test all of them.

**Q: What's the difference between templates and rubrics?**  
A: Templates verify **factual correctness** (e.g., "Is the answer 'BCL2'?"), while rubrics assess **qualitative traits** (e.g., "Is the answer concise?").

**Q: Can I test without making API calls?**  
A: Yes! Use the `manual` interface with pre-recorded traces for testing and debugging without costs.