In [None]:
# Mock Setup - Hidden in rendered documentation
import json
import sys
import tempfile
from pathlib import Path
from unittest.mock import MagicMock, patch

# Add karenina to path
# NOTE(review): absolute, machine-specific checkout location. On any other
# machine this entry presumably points at a missing directory, in which case
# `import karenina` depends on the package being installed separately.
sys.path.insert(0, "/Users/carli/Projects/karenina-monorepo/karenina/src")

# Temporary directory for file operations
# mkdtemp() creates the directory right away; the atexit-registered _cleanup()
# further down in this cell removes it again when the interpreter exits.
TEMP_DIR = Path(tempfile.mkdtemp(prefix="karenina_docs_"))


# Mock LLM response generator
class MockLLMResponse:
    """Stand-in for the message object a LangChain chat model returns.

    Carries the text in ``content`` and a ``response_metadata`` dict holding a
    fixed token-usage entry. ``str()`` on the object yields ``content``.
    """

    def __init__(self, content: str = "Mock response") -> None:
        self.content = content
        # Constant usage figure: every mock reply reports 50 total tokens.
        usage = dict(total_tokens=50)
        self.response_metadata = dict(token_usage=usage)

    def __str__(self) -> str:
        # The string form of a response is simply its text.
        return self.content


class MockStructuredOutput:
    """Fake structured-output result that can stand in for any template.

    ``count`` and ``target`` are always present (falling back to ``46`` and
    ``"BCL2"``). Every other keyword argument becomes an instance attribute
    unless an attribute of that name already exists on the object.
    """

    def __init__(self, **kwargs):
        # The two fields that always exist, paired with their fallback values.
        for field, fallback in (("count", 46), ("target", "BCL2")):
            setattr(self, field, kwargs.get(field, fallback))

        # Copy the remaining keyword arguments without overriding anything
        # already reachable on the instance (this also skips names that
        # collide with methods such as ``dict`` or ``model_dump``).
        for field, value in kwargs.items():
            if hasattr(self, field):
                continue
            setattr(self, field, value)

    def dict(self):
        """Return the instance attributes whose names do not start with ``_``."""
        public = {}
        for key, value in vars(self).items():
            if key.startswith("_"):
                continue
            public[key] = value
        return public

    def model_dump(self):
        """Return the same mapping as :meth:`dict`."""
        return self.dict()


def create_mock_chat_model():
    """Create a mock chat model that returns predictable responses.

    Returns:
        A ``MagicMock`` configured so that:

        * ``invoke(...)`` returns a ``MockLLMResponse`` with the text
          ``"46 chromosomes"``;
        * ``ainvoke(...)`` returns an awaitable resolving to the same kind of
          response;
        * ``with_structured_output(...)`` returns a second mock whose
          ``invoke``/``ainvoke`` yield a default ``MockStructuredOutput``;
        * ``bind_tools(...)`` returns the model itself.
    """
    # Local import keeps this block self-contained; AsyncMock lives in the
    # same stdlib module as MagicMock (Python 3.8+).
    from unittest.mock import AsyncMock

    mock = MagicMock()
    mock.invoke.return_value = MockLLMResponse("46 chromosomes")
    # A plain MagicMock attribute returns its value synchronously, so
    # `await model.ainvoke(...)` would raise TypeError. AsyncMock makes the
    # call return a coroutine that resolves to the configured value.
    mock.ainvoke = AsyncMock(return_value=MockLLMResponse("46 chromosomes"))

    structured_mock = MagicMock()
    structured_mock.invoke.return_value = MockStructuredOutput()
    structured_mock.ainvoke = AsyncMock(return_value=MockStructuredOutput())

    mock.with_structured_output.return_value = structured_mock
    # Tool binding hands back the same mock, so chained calls keep working.
    mock.bind_tools.return_value = mock
    return mock


# Patch all LLM providers before any imports
# Dotted paths of every constructor/factory that could create a real chat model.
_LLM_PATCH_TARGETS = (
    "langchain_openai.ChatOpenAI",
    "langchain_anthropic.ChatAnthropic",
    "langchain_google_genai.ChatGoogleGenerativeAI",
    "karenina.infrastructure.llm.interface.init_chat_model_unified",
)

# One patcher per target; each call to a patched name (keyword arguments only)
# builds a fresh mock chat model instead of a real client.
_llm_patches = [
    patch(target, side_effect=lambda **kwargs: create_mock_chat_model())
    for target in _LLM_PATCH_TARGETS
]

# Activate the patchers; they stay active until _cleanup() stops them.
for p in _llm_patches:
    p.start()


def temp_path(filename: str) -> Path:
    """Return the path of *filename* inside the notebook's temp directory.

    Only the path is built; nothing is created on disk.
    """
    return TEMP_DIR.joinpath(filename)


# Cleanup
import atexit
import shutil


def _cleanup():
    """Undo the LLM patches and delete the temp directory at interpreter exit.

    Cleanup is best-effort: a patcher that fails to stop is skipped so the
    remaining patchers are still stopped, and the temp directory is removed
    even if the loop is interrupted.
    """
    try:
        for patcher in _llm_patches:
            try:
                patcher.stop()
            except Exception:
                # Deliberately ignored: one failing patcher must not prevent
                # the others from being stopped. Unlike a bare `except:`,
                # KeyboardInterrupt and SystemExit are not swallowed.
                pass
    finally:
        # Always remove the scratch directory, whatever happened above.
        shutil.rmtree(TEMP_DIR, ignore_errors=True)


# Run _cleanup() when the interpreter shuts down.
atexit.register(_cleanup)

# Status banner: one message per line.
print(
    "Mock setup complete",
    f"Temp directory: {TEMP_DIR}",
    "Karenina package loaded from: /Users/carli/Projects/karenina-monorepo/karenina/src",
    sep="\n",
)

# Defining a Benchmark

The `Benchmark` class is the core component of Karenina. This page explains what it is, how to create benchmarks, what metadata can be associated with them, and how to persist them.

## The Benchmark Class

The `Benchmark` class is the central orchestrator for all benchmarking activities in Karenina. It:

- **Manages collections of questions** and their associated templates
- **Coordinates verification workflows** using LLM-as-a-judge patterns
- **Handles serialization and persistence** through JSON-LD checkpoints
- **Provides a unified interface** for benchmark creation, execution, and analysis

Think of a benchmark as a structured container that brings together questions, evaluation templates, and execution configuration into a cohesive evaluation framework.

## How to Create a Benchmark

### Basic Creation

Create a benchmark using the `Benchmark.create()` method:

In [None]:
from karenina import Benchmark

# Create a basic benchmark: only the required `name` is supplied
benchmark = Benchmark.create(name="Genomics Knowledge Benchmark")

# description, version and creator were not passed to create(), so the last
# three lines show whatever values the Benchmark falls back to
print(f"Created: {benchmark.name}")
print(f"Description: {benchmark.description}")
print(f"Version: {benchmark.version}")
print(f"Creator: {benchmark.creator}")

### Creation with Metadata

You can attach rich metadata to provide context and organization:

In [None]:
from karenina import Benchmark

# Supply every metadata field explicitly. This rebinds `benchmark`, which the
# "Accessing Metadata" example further down reads from.
benchmark = Benchmark.create(
    name="Genomics Knowledge Benchmark",
    description="Testing LLM knowledge of genomics and molecular biology",
    version="1.0.0",
    creator="Dr. Jane Smith",
)

# Read the metadata back from the instance
print(f"Benchmark: {benchmark.name}")
print(f"Description: {benchmark.description}")
print(f"Version: {benchmark.version}")
print(f"Creator: {benchmark.creator}")

**Key Parameters:**

- **`name`** (required): Unique identifier for the benchmark
- **`description`**: Human-readable explanation of the benchmark's purpose
- **`version`**: Version string for tracking benchmark evolution (e.g., "1.0.0")
- **`creator`**: Name or organization that created the benchmark

## Benchmark Metadata Attributes

### Standard Metadata

The following standard attributes can be set when creating a benchmark:

| Attribute | Type | Description |
|-----------|------|-------------|
| `name` | `str` | Unique identifier for the benchmark (required) |
| `description` | `str` | Human-readable description of the benchmark's purpose |
| `version` | `str` | Version string for tracking benchmark evolution |
| `creator` | `str` | Creator or maintainer of the benchmark |

### Accessing Metadata

In [None]:
# Access standard attributes
# (`benchmark` is the instance built in the "Creation with Metadata" example)
print(benchmark.name)  # "Genomics Knowledge Benchmark"
print(benchmark.description)  # "Testing LLM knowledge of..."
print(benchmark.version)  # "1.0.0"
print(benchmark.creator)  # "Dr. Jane Smith"
# No add_question() call has been made on this instance so far, so this
# presumably reports 0 — TODO confirm
print(f"Questions: {benchmark.question_count}")  # Number of questions

## Benchmark Organization Patterns

### Domain-Specific Benchmarks

Organize benchmarks by domain to facilitate comparison and reuse:

In [None]:
# Two independent Benchmark objects, one per subject area; each carries its
# own name, description and creator.

# Molecular biology benchmark
molecular_bio_benchmark = Benchmark.create(
    name="Molecular Biology Fundamentals",
    description="Tests understanding of core molecular biology concepts",
    version="1.0.0",
    creator="Biology Education Team",
)

# Pharmacology benchmark
pharmacology_benchmark = Benchmark.create(
    name="Drug Mechanisms and Targets",
    description="Evaluates knowledge of drug targets and mechanisms of action",
    version="1.0.0",
    creator="Pharmacology Research Group",
)

# Confirm both objects were created
print(f"Created: {molecular_bio_benchmark.name}")
print(f"Created: {pharmacology_benchmark.name}")

### Multi-Version Benchmarks

Track benchmark evolution by versioning:

In [None]:
# Both objects share the same name and creator; only `version` and
# `description` differ between them.

# Version 1.0: Basic genomics questions
genomics_v1 = Benchmark.create(
    name="Genomics Knowledge Benchmark",
    version="1.0.0",
    description="Basic genomics questions covering chromosomes and DNA structure",
    creator="Dr. Jane Smith",
)

# Version 2.0: Expanded with advanced topics
genomics_v2 = Benchmark.create(
    name="Genomics Knowledge Benchmark",
    version="2.0.0",
    description="Expanded genomics benchmark including epigenetics and gene regulation",
    creator="Dr. Jane Smith",
)

# The printed labels are shorthand; the stored versions are "1.0.0" and "2.0.0"
print(f"v1.0: {genomics_v1.name} - {genomics_v1.description}")
print(f"v2.0: {genomics_v2.name} - {genomics_v2.description}")

## Database Persistence

Karenina provides SQLite database storage for persistent benchmark management.

### Save to Database

Save your benchmark to a database with an optional checkpoint file:

In [None]:
from pathlib import Path  # NOTE(review): not referenced in this cell; paths come from temp_path()

# Create a test benchmark
benchmark = Benchmark.create(
    name="Test Genomics Benchmark",
    description="For demonstrating database persistence",
    version="1.0.0",
    creator="Documentation Example",
)

# Add a sample question
# The return value is kept as question_id but is not used again in this example
question_id = benchmark.add_question(
    question="How many chromosomes are in a human somatic cell?", raw_answer="46", finished=True
)

# Save to SQLite database (with checkpoint file)
# temp_path() comes from the hidden setup cell and resolves names inside TEMP_DIR
db_path = temp_path("benchmarks.db")
checkpoint_path = temp_path("genomics_benchmark.jsonld")

# db_path is interpolated into the connection URL after the "sqlite:///" prefix;
# the "Load from Database" example below reuses the same db_path variable
benchmark.save_to_db(storage=f"sqlite:///{db_path}", checkpoint_path=checkpoint_path)

print(f"Saved benchmark to database: {db_path}")
print(f"Checkpoint file: {checkpoint_path}")
print(f"Checkpoint exists: {checkpoint_path.exists()}")

**Parameters:**

- **`storage`**: Database connection string (e.g., `"sqlite:///benchmarks.db"`)
- **`checkpoint_path`** (optional): Path to save a checkpoint file alongside the database entry

**What gets stored:**

- Benchmark metadata (name, description, version)
- All questions with their metadata
- Answer templates
- Rubrics (global and question-specific)
- Verification results (if available)

### Load from Database

Load a previously saved benchmark by name:

In [None]:
from karenina import Benchmark

# Load from database
# benchmark_name repeats the name given to Benchmark.create() in the save
# example, and db_path is the variable defined there
loaded_benchmark = Benchmark.load_from_db(benchmark_name="Test Genomics Benchmark", storage=f"sqlite:///{db_path}")

# Show what came back from the database
print(f"Loaded: {loaded_benchmark.name}")
print(f"Description: {loaded_benchmark.description}")
print(f"Questions: {loaded_benchmark.question_count}")

**Parameters:**

- **`benchmark_name`**: Exact name of the benchmark to load
- **`storage`**: Database connection string

### Database Use Cases

**Version Control:**
Store multiple versions of the same benchmark with different version strings:

In [None]:
# Example: Save v1.0 and v2.0
# Two in-memory benchmarks with the same name but different version strings
benchmark_v1 = Benchmark.create(name="My Benchmark", version="1.0.0", description="First version")

benchmark_v2 = Benchmark.create(name="My Benchmark", version="2.0.0", description="Updated version")

# Both can be saved to the same database
print("v1.0:", benchmark_v1.name, "-", benchmark_v1.version)
print("v2.0:", benchmark_v2.name, "-", benchmark_v2.version)

# In practice (left commented out so this example does not write a database
# file into the working directory):
# benchmark_v1.save_to_db(storage="sqlite:///benchmarks.db")
# benchmark_v2.save_to_db(storage="sqlite:///benchmarks.db")

**Shared Storage:**
Multiple team members can access the same database to collaborate on benchmarks.

**Automatic Verification Persistence:**
When you run verification, results are automatically saved to the database if you provide a `storage` parameter in your `VerificationConfig`.

## Checkpoint Files

Checkpoints are JSON-LD files that contain the complete state of a benchmark. Unlike database storage, checkpoints are portable files that can be easily shared, version-controlled, and inspected.

### Save Checkpoint

Save your benchmark to a JSON-LD checkpoint file:

In [None]:
from pathlib import Path  # NOTE(review): not referenced in this cell; the path comes from temp_path()

# Create a benchmark with a question
benchmark = Benchmark.create(
    name="Checkpoint Example Benchmark", description="Demonstrating checkpoint save/load", version="1.0.0"
)

benchmark.add_question(question="How many chromosomes are in a human somatic cell?", raw_answer="46", finished=True)

# Save checkpoint
# NOTE(review): an earlier version of this comment announced "two equivalent
# methods", but only save() is demonstrated here.
# This is the same file name the database example used for its checkpoint, so
# that file is presumably replaced by this save.
checkpoint_path = temp_path("genomics_benchmark.jsonld")

# Write the benchmark to the checkpoint file with save()
benchmark.save(checkpoint_path)

print(f"Checkpoint saved to: {checkpoint_path}")
print(f"File exists: {checkpoint_path.exists()}")
print(f"File size: {checkpoint_path.stat().st_size} bytes")

### Load Checkpoint

Load a benchmark from a checkpoint file:

In [None]:
from karenina import Benchmark

# Load from checkpoint
# (checkpoint_path is the file written in the "Save Checkpoint" example)
loaded = Benchmark.load(checkpoint_path)

# Show the metadata and question count of the loaded benchmark
print(f"Loaded benchmark: {loaded.name}")
print(f"Description: {loaded.description}")
print(f"Version: {loaded.version}")
print(f"Total questions: {loaded.question_count}")

### Checkpoint Format

Checkpoints use JSON-LD format following schema.org conventions:

In [None]:
# View the raw JSON-LD structure
# (`json` is imported in the hidden setup cell; `checkpoint_path` is the file
# written in the "Save Checkpoint" example)

# Read the file as UTF-8 explicitly instead of relying on the platform's
# default text encoding, which differs between systems.
with open(checkpoint_path, encoding="utf-8") as f:
    jsonld_data = json.load(f)

# Display key fields
# .get() is used throughout so a missing key prints None instead of raising
print("JSON-LD Structure:")
print(f"  @type: {jsonld_data.get('@type')}")
print(f"  name: {jsonld_data.get('name')}")
print(f"  version: {jsonld_data.get('version')}")
print(f"  description: {jsonld_data.get('description')}")
print(f"  creator: {jsonld_data.get('creator')}")
print(f"  hasPart (questions): {len(jsonld_data.get('hasPart', []))} items")

### Checkpoint Use Cases

**Sharing Benchmarks:**
Send checkpoint files to collaborators or publish them in repositories.

**Version Control:**
Track checkpoint files in Git to monitor benchmark evolution over time.

**Portability:**
Move benchmarks between systems without database dependencies.

**Inspection:**
Open checkpoint files in text editors to review benchmark structure.

## Summary

In this guide, you learned:

- How to create benchmarks with `Benchmark.create()`
- How to attach metadata (name, description, version, creator)
- How to organize benchmarks by domain and version
- How to save/load benchmarks using SQLite database storage
- How to save/load checkpoints as portable JSON-LD files

## Next Steps

Once you have a benchmark defined, you can:

- **Add questions** to populate it with evaluation content
- **Set up templates** for structured evaluation
- **Configure verification** to run assessments
- **Save and load** benchmarks using checkpoints or a database