In [None]:
# Mock Setup - Hidden in rendered documentation
# This cell is tagged with "hide-cell" in notebook metadata

import hashlib
import json
import sys
import tempfile
from datetime import datetime
from pathlib import Path
from unittest.mock import MagicMock, patch

# Add karenina to path
sys.path.insert(0, "/Users/carli/Projects/karenina-monorepo/karenina/src")

# Temporary directory for file operations
TEMP_DIR = Path(tempfile.mkdtemp(prefix="karenina_docs_"))

# Import after path is set
from karenina.schemas.workflow.template_results import TemplateResults
from karenina.schemas.workflow.verification.result import VerificationResult
from karenina.schemas.workflow.verification.result_components import (
    VerificationResultMetadata,
    VerificationResultRubric,
    VerificationResultTemplate,
)
from karenina.schemas.workflow.verification_result_set import VerificationResultSet


# Mock LLM response generator
class MockLLMResponse:
    """Minimal stand-in for a LangChain chat message.

    Carries the message text in ``content`` and a fixed token-usage payload
    (50 total tokens) in ``response_metadata``.
    """

    def __init__(self, content: str = "Mock response"):
        token_usage = {"total_tokens": 50}
        self.content = content
        self.response_metadata = {"token_usage": token_usage}

    def __str__(self):
        # Converting the message to a string yields just its text.
        text = self.content
        return text


class MockStructuredOutput:
    """Fake parsed-answer object meant to stand in for any answer template.

    Four commonly used fields (``count``, ``target``, ``subunits``,
    ``diseases``) always exist, either from the keyword arguments or from the
    built-in fallbacks. Any other keyword argument becomes an attribute too,
    unless an attribute of that name already exists on the object.
    """

    def __init__(self, **kwargs):
        # Realistic fallbacks; the literal is rebuilt on every call so the
        # ``diseases`` list is never shared between instances.
        fallbacks = {
            "count": 46,
            "target": "BCL2",
            "subunits": 4,
            "diseases": ["asthma", "bronchitis", "pneumonia"],
        }
        for field, fallback in fallbacks.items():
            setattr(self, field, kwargs.get(field, fallback))

        # Remaining keyword arguments: skip names that are already taken
        # (the four fields above, or methods such as ``dict``).
        for name in kwargs:
            if hasattr(self, name):
                continue
            setattr(self, name, kwargs[name])

    def dict(self):
        """Return the public attributes (names not starting with ``_``) as a dict."""
        public = {}
        for name, value in self.__dict__.items():
            if name.startswith("_"):
                continue
            public[name] = value
        return public

    def model_dump(self):
        """Alias of :meth:`dict`."""
        return self.dict()


def create_mock_chat_model():
    """Create a mock chat model that returns predictable responses.

    The returned ``MagicMock`` is configured as follows:

    * ``invoke(...)`` returns a ``MockLLMResponse`` with the text
      ``"46 chromosomes"``; ``ainvoke(...)`` returns a coroutine that resolves
      to the same kind of response.
    * ``with_structured_output(...)`` returns a second mock whose ``invoke`` /
      ``ainvoke`` yield a default ``MockStructuredOutput``.
    * ``bind_tools(...)`` returns the model itself.

    Returns:
        MagicMock: the configured mock chat model.
    """
    # Local import so this helper does not depend on the cell's import list.
    from unittest.mock import AsyncMock

    mock = MagicMock()
    mock.invoke.return_value = MockLLMResponse("46 chromosomes")
    # ainvoke must be awaitable: a plain MagicMock would hand back the response
    # object directly and ``await model.ainvoke(...)`` would raise TypeError.
    mock.ainvoke = AsyncMock(return_value=MockLLMResponse("46 chromosomes"))

    structured_mock = MagicMock()
    structured_mock.invoke.return_value = MockStructuredOutput()
    structured_mock.ainvoke = AsyncMock(return_value=MockStructuredOutput())

    mock.with_structured_output.return_value = structured_mock
    mock.bind_tools.return_value = mock
    return mock


def compute_result_id(question_id: str, answering_model: str, parsing_model: str, timestamp: str) -> str:
    """Derive a deterministic 16-hex-character ID for one verification result.

    The four arguments are combined with two fixed fields (an empty MCP server
    list and a ``None`` replicate) into a JSON document with sorted keys; the
    ID is the first 16 hex digits of that document's SHA-256 digest.
    """
    payload = dict(
        question_id=question_id,
        answering_model=answering_model,
        parsing_model=parsing_model,
        timestamp=timestamp,
        replicate=None,
        answering_mcp_servers=[],
    )
    # sort_keys makes the serialisation independent of the insertion order above.
    canonical = json.dumps(payload, sort_keys=True, ensure_ascii=True).encode("utf-8")
    digest = hashlib.sha256(canonical).hexdigest()
    return digest[:16]


def create_mock_verification_result(question_id: str, question_text: str, answer: str, passed: bool = True):
    """Assemble a VerificationResult from canned values, without calling a model.

    Args:
        question_id: Question identifier; also hashed (MD5) into the template ID.
        question_text: Question text stored in the metadata.
        answer: Used as the raw answer and as both the parsed LLM and parsed
            ground-truth value.
        passed: Stored as the template's ``verify_result``.

    Returns:
        A VerificationResult with template, rubric and metadata sections filled.
    """
    model_name = "gpt-4.1-mini"  # reported as both the answering and the parsing model
    created_at = datetime.now().isoformat()
    # An MD5 hex digest is exactly 32 characters long.
    template_hash = hashlib.md5(str(question_id).encode()).hexdigest()

    # Template section: the response "parses" to the same value as the ground truth.
    token_usage = {
        "answer_generation": {"total_tokens": 50},
        "parsing": {"total_tokens": 30},
        "total": {"total_tokens": 80},
    }
    template_section = VerificationResultTemplate(
        raw_llm_response=f"The answer is {answer}.",
        parsed_llm_response={"value": answer},
        parsed_gt_response={"value": answer},
        verify_result=passed,
        template_verification_performed=True,
        usage_metadata=token_usage,
        abstention_check_performed=True,
        abstention_detected=False,
    )

    # Rubric section: one numeric trait and one boolean trait.
    trait_scores = {"Conciseness": 4, "Clarity": True}
    rubric_section = VerificationResultRubric(
        rubric_evaluation_performed=True,
        llm_trait_scores=trait_scores,
    )

    # Metadata section, including the deterministic result ID.
    metadata_section = VerificationResultMetadata(
        question_id=question_id,
        template_id=template_hash,
        completed_without_errors=True,
        question_text=question_text,
        raw_answer=answer,
        answering_model=model_name,
        parsing_model=model_name,
        execution_time=1.5,
        timestamp=created_at,
        result_id=compute_result_id(question_id, model_name, model_name, created_at),
    )

    return VerificationResult(
        metadata=metadata_section,
        template=template_section,
        rubric=rubric_section,
    )


# Store original run_verification
_original_run_verification = None


def mock_run_verification(self, config):
    """Replacement for ``Benchmark.run_verification`` that fabricates results.

    Every finished question gets one mock result. The answer comes from the
    first canned rule with a keyword occurring in the lower-cased question
    text; if no rule matches, the question's own ``raw_answer`` is used and
    the result is marked as passed.

    If there are no finished questions, the call is forwarded to the saved
    original method when one exists; otherwise an empty result set is returned.
    """
    finished_questions = self.get_finished_questions(ids_only=False)

    if len(finished_questions) == 0:
        if _original_run_verification:
            return _original_run_verification(self, config)
        return VerificationResultSet(results=[], template_results=TemplateResults(results=[]))

    # (keywords, canned answer, pass flag) - checked in order, first match wins.
    canned_rules = (
        (("chromosomes",), "46", True),
        (("venetoclax", "bcl2"), "BCL2", True),
        (("hemoglobin", "subunits"), "4", True),
        (("inflammatory", "lung"), "asthma, bronchitis, pneumonia", True),
    )

    fabricated = []
    for entry in finished_questions:
        entry_id = entry["id"]
        text = entry["question"]
        fallback_answer = entry.get("raw_answer", "")
        lowered = text.lower()

        rule = next(
            (r for r in canned_rules if any(word in lowered for word in r[0])),
            None,
        )
        if rule is None:
            canned_answer, verdict = fallback_answer, True
        else:
            _, canned_answer, verdict = rule

        fabricated.append(
            create_mock_verification_result(
                question_id=entry_id, question_text=text, answer=canned_answer, passed=verdict
            )
        )

    template_view = TemplateResults(results=fabricated)
    return VerificationResultSet(
        results=fabricated,
        template_results=template_view,
        rubric_results=None,
    )


# Patch the LLM provider entry points so the examples never reach a real API.
# NOTE(review): the karenina schema modules were already imported earlier in
# this cell; patching an attribute on a provider module only affects lookups
# made through that module after this point - confirm that is sufficient.


def _mock_chat_model_factory(*args, **kwargs):
    """Ignore all constructor arguments and return a fresh mock chat model.

    Accepts positional as well as keyword arguments so that a call such as
    ``ChatOpenAI("model-name")`` does not fail with a TypeError.
    """
    return create_mock_chat_model()


_llm_patches = [
    patch("langchain_openai.ChatOpenAI", side_effect=_mock_chat_model_factory),
    patch("langchain_anthropic.ChatAnthropic", side_effect=_mock_chat_model_factory),
    patch("langchain_google_genai.ChatGoogleGenerativeAI", side_effect=_mock_chat_model_factory),
    patch(
        "karenina.infrastructure.llm.interface.init_chat_model_unified",
        side_effect=_mock_chat_model_factory,
    ),
]

# Patches stay active until _cleanup() stops them at interpreter exit.
for p in _llm_patches:
    p.start()

# Patch Benchmark.run_verification
from karenina.benchmark import Benchmark

# Keep a handle on the real method first: mock_run_verification forwards to it
# when there are no finished questions, and _cleanup restores it at exit.
_original_run_verification = Benchmark.run_verification
# Replace the method on the class itself, so every Benchmark instance uses the mock.
Benchmark.run_verification = mock_run_verification


def temp_path(filename: str) -> Path:
    """Return the path of ``filename`` inside the notebook's TEMP_DIR."""
    return TEMP_DIR.joinpath(filename)


# Cleanup
import atexit
import shutil


def _cleanup():
    """Undo the mock setup; registered to run at interpreter exit.

    Restores the original ``Benchmark.run_verification``, stops every LLM
    patch, and deletes the temporary directory. Each step is best effort: a
    failure to stop one patch must not prevent the remaining cleanup.
    """
    Benchmark.run_verification = _original_run_verification
    for p in _llm_patches:
        try:
            p.stop()
        except Exception:
            # Deliberately ignored (e.g. the patch was never started or was
            # already stopped). Catching Exception rather than using a bare
            # ``except`` lets KeyboardInterrupt/SystemExit propagate.
            continue
    shutil.rmtree(TEMP_DIR, ignore_errors=True)


# Run _cleanup automatically when the interpreter exits.
atexit.register(_cleanup)

# Status lines for whoever executes this setup cell.
print("✓ Mock setup complete")
print(f"✓ Temp directory: {TEMP_DIR}")
# NOTE(review): this path is a literal copy of the sys.path entry inserted at
# the top of the cell; it is not derived from where karenina was actually loaded.
print("✓ Karenina package loaded from: /Users/carli/Projects/karenina-monorepo/karenina/src")
print("✓ Mock verification results enabled - examples will show realistic output")

# Saving and Loading Benchmarks

This guide covers how to persist, restore, and export benchmarks using Karenina's checkpoint and database systems.

## Understanding Persistence

Karenina provides two main approaches for persisting benchmarks:

1. **Checkpoints (JSON-LD files)**: Portable, human-readable files perfect for sharing and version control
2. **Database storage (SQLite)**: Structured storage with query capabilities for production use

You can use both approaches together: databases for primary storage and checkpoints for backups and sharing.

## Checkpoint Files (JSON-LD)

Checkpoints are JSON-LD files that capture the complete state of a benchmark.

### What Gets Saved

A checkpoint includes:
- Benchmark metadata (name, description, version)
- All questions with their metadata
- Answer templates
- Rubrics (global and question-specific)
- Verification results (if available)

### JSON-LD Format

Karenina uses the **JSON-LD (JSON for Linked Data)** format, following schema.org conventions.

**Benefits:**
- **Structured and semantic**: Machine-readable with clear data relationships
- **Human-readable**: Open in any text editor to inspect contents
- **Cross-platform**: Works across different environments
- **Version-compatible**: Maintains backward compatibility

## Saving Checkpoints

### Basic Save

Save your benchmark to a JSON-LD checkpoint file:

In [None]:
from pathlib import Path

from karenina import Benchmark

# Basic save
# benchmark.save(Path("genomics_benchmark.jsonld"))

# Save to specific directory
# benchmark.save(Path("benchmarks/genomics_benchmark.jsonld"))

print("Code example: Saving checkpoints")

### What Happens When You Save

In [None]:
from karenina import Benchmark

# Create and populate benchmark
benchmark = Benchmark.create(name="Genomics Knowledge Benchmark", version="1.0.0")

benchmark.add_question(
    question="How many chromosomes are in a human somatic cell?", raw_answer="46", author={"name": "Bio Curator"}
)

benchmark.add_question(
    question="What is the approved drug target of Venetoclax?", raw_answer="BCL2", author={"name": "Bio Curator"}
)

# Save checkpoint
checkpoint_path = temp_path("genomics_benchmark.jsonld")
benchmark.save(checkpoint_path)

print(f"✓ Saved checkpoint to {checkpoint_path.name}")
print(f"  Questions: {benchmark.question_count}")
print(f"  Size: {checkpoint_path.stat().st_size} bytes")

**Expected Output:**
```
✓ Saved checkpoint to genomics_benchmark.jsonld
  Questions: 2
  Size: 4532 bytes
```

## Loading Checkpoints

### Basic Load

Load a benchmark from a checkpoint file:

In [None]:
from pathlib import Path

from karenina import Benchmark

# Load benchmark
benchmark = Benchmark.load(checkpoint_path)

print(f"Loaded benchmark: {benchmark.name}")
print(f"Version: {benchmark.version}")
print(f"Questions: {benchmark.question_count}")

# Access questions
question_ids = benchmark.get_question_ids()
for qid in question_ids[:3]:
    question = benchmark.get_question(qid)
    print(f"  • {question['question'][:50]}...")

**Expected Output:**
```
Loaded benchmark: Genomics Knowledge Benchmark
Version: 1.0.0
Questions: 2
  • How many chromosomes are in a human somatic cell?...
  • What is the approved drug target of Venetoclax?...
```

### Verify Loaded Data

In [None]:
def load_and_verify(checkpoint_path: Path):
    """Load a benchmark checkpoint and run basic sanity checks on it.

    Args:
        checkpoint_path: Location of the checkpoint file to load.

    Returns:
        The loaded benchmark, or ``None`` when loading or validation failed
        (the reason is printed instead of raised).
    """
    try:
        benchmark = Benchmark.load(checkpoint_path)

        # Basic validation. Explicit raises are used instead of ``assert``
        # because assert statements are removed when Python runs with -O,
        # which would silently skip these checks.
        if not benchmark.question_count > 0:
            raise ValueError("No questions found")
        if not benchmark.name:
            raise ValueError("Missing benchmark name")

        print(f"✓ Successfully loaded: {benchmark.name}")
        print(f"  Questions: {benchmark.question_count}")

        # Count how many questions already carry an answer template.
        all_questions = benchmark.get_all_questions(ids_only=False)
        questions_with_templates = sum(1 for q in all_questions if q.get("answer_template") is not None)
        print(f"  Templates: {questions_with_templates}/{benchmark.question_count}")

        return benchmark

    except Exception as e:
        # Example-level boundary: report any failure and signal it with None.
        print(f"✗ Failed to load: {e}")
        return None


# Load with validation
benchmark = load_and_verify(checkpoint_path)

## Database Storage

Database storage provides structured persistence with query capabilities.

### Quick Database Example

In [None]:
from pathlib import Path

from karenina import Benchmark, DBConfig

# Create benchmark
benchmark = Benchmark.create(name="Genomics Knowledge Benchmark", version="1.0.0")

# Add questions
benchmark.add_question(question="How many chromosomes are in a human somatic cell?", raw_answer="46")

# Save to database with checkpoint backup
db_path = temp_path("benchmarks.db")
checkpoint_db_path = temp_path("genomics_from_db.jsonld")

benchmark.save_to_db(storage=f"sqlite:///{db_path}", checkpoint_path=checkpoint_db_path)

print("✓ Saved to database and checkpoint")

# Load from database
loaded = Benchmark.load_from_db(benchmark_name="Genomics Knowledge Benchmark", storage=f"sqlite:///{db_path}")

print(f"✓ Loaded from database: {loaded.name}")

### When to Use Database vs Checkpoints

| Use Case | Recommended Approach |
|----------|---------------------|
| **Development and prototyping** | Checkpoints only |
| **Sharing benchmarks** | Checkpoints (portable files) |
| **Production deployment** | Database primary, checkpoints for backup |
| **Version control (Git)** | Checkpoints (diff-friendly) |
| **Multi-user collaboration** | Database with checkpoint backups |
| **Query and analytics** | Database |
| **Backups** | Checkpoints |

**Best Practice:** Use both!

In [None]:
# Example: Save to database for primary storage
# benchmark.save_to_db("sqlite:///production.db")

# Also save checkpoint for backup/sharing
# benchmark.save(Path("backups/genomics_v1.0.0.jsonld"))

print("Code example: Using both database and checkpoints")

## Automatic Database Storage During Verification

Karenina can automatically save verification results to a database as soon as a verification run completes. This is especially useful for long-running verification jobs, where you want results persisted without a separate manual save step.

**Configure automatic storage:**

In [None]:
from karenina.schemas import ModelConfig, VerificationConfig
from karenina.storage import DBConfig

# Create database configuration
db_config = DBConfig(
    storage_url="sqlite:///benchmarks.db",
    auto_create=True,  # Create tables if they don't exist
)

print("✓ Database configuration created")
print(f"  Storage URL: {db_config.storage_url}")
print(f"  Auto create: {db_config.auto_create}")
print(f"  Dialect: {db_config.dialect}")

**How it works:**

1. When `db_config` is set in `VerificationConfig`, verification results are automatically saved to the specified database after completion
2. The `AUTOSAVE_DATABASE` environment variable controls this behavior (defaults to `"true"`)
3. Results are saved with metadata including run name, timestamp, and configuration details
4. This happens transparently without requiring manual `save_to_db()` calls

**Benefits:**

- **No data loss**: Results are persisted immediately after verification completes
- **Automatic**: No need to remember to call `save_to_db()` after verification
- **Production-ready**: Ideal for automated pipelines and long-running jobs
- **Queryable**: Results are immediately available for database queries and analytics

**Disabling auto-save:**

If you need to disable automatic database storage temporarily:

In [None]:
# Set environment variable (example)
# export AUTOSAVE_DATABASE="false"

# Or use db_config=None in VerificationConfig
config_no_db = VerificationConfig(
    answering_models=[
        ModelConfig(id="gpt-4.1-mini", model_provider="openai", model_name="gpt-4.1-mini", interface="langchain")
    ],
    parsing_models=[
        ModelConfig(id="gpt-4.1-mini", model_provider="openai", model_name="gpt-4.1-mini", interface="langchain")
    ],
    db_config=None,  # No automatic database storage
)

print("✓ Configured without automatic database storage")

**Example with full workflow:**

In [None]:
from pathlib import Path

from karenina import Benchmark
from karenina.schemas import ModelConfig, VerificationConfig
from karenina.storage import DBConfig

# Load benchmark
benchmark = Benchmark.load(checkpoint_path)

# Configure database
db_config = DBConfig(storage_url=f"sqlite:///{db_path}", auto_create=True)

# Configure verification with automatic database storage
model_config = ModelConfig(
    id="gpt-4.1-mini", model_provider="openai", model_name="gpt-4.1-mini", temperature=0.7, interface="langchain"
)

config = VerificationConfig(
    answering_models=[model_config],
    parsing_models=[model_config],
    evaluation_mode="template_and_rubric",  # Required when rubric_enabled=True
    rubric_enabled=True,
    replicate_count=3,
    db_config=db_config,  # Automatic storage enabled
)

print("✓ Verification configured with automatic database storage")
print(f"  Replicate count: {config.replicate_count}")
print(f"  Evaluation mode: {config.evaluation_mode}")
print(f"  Rubric enabled: {config.rubric_enabled}")

## Exporting Verification Results

After running verification, export results for analysis and reporting.

### Export to CSV

CSV format is ideal for spreadsheet analysis:

In [None]:
from pathlib import Path

from karenina.schemas import ModelConfig, VerificationConfig

# Run verification first
config = VerificationConfig(
    answering_models=[
        ModelConfig(id="gpt-4.1-mini", model_provider="openai", model_name="gpt-4.1-mini", interface="langchain")
    ],
    parsing_models=[
        ModelConfig(id="gpt-judge", model_provider="openai", model_name="gpt-4.1-mini", interface="langchain")
    ],
)

results = benchmark.run_verification(config)

# Export to CSV
csv_path = temp_path("results.csv")
benchmark.export_verification_results_to_file(file_path=csv_path, format="csv")

print(f"✓ Exported to {csv_path.name}")
print(f"  File size: {csv_path.stat().st_size} bytes")

**CSV Output Structure:**

| question_id | question | expected_answer | model_answer | template_passed | answering_model | parsing_model | timestamp |
|-------------|----------|-----------------|--------------|-----------------|-----------------|---------------|-----------|
| abc123... | How many chromosomes... | 46 | There are 46 chromosomes... | True | gpt-4.1-mini | gpt-judge | 2024-03-15 14:30:22 |

### Export to JSON

JSON format is ideal for programmatic analysis:

In [None]:
# Export to JSON
json_path = temp_path("results.json")
benchmark.export_verification_results_to_file(file_path=json_path, format="json")

print(f"✓ Exported to {json_path.name}")
print(f"  File size: {json_path.stat().st_size} bytes")

**JSON Output Structure:**
```json
{
  "benchmark_name": "Genomics Knowledge Benchmark",
  "export_timestamp": "2024-03-15T14:30:22",
  "total_results": 3,
  "results": [
    {
      "question_id": "abc123...",
      "question": "How many chromosomes are in a human somatic cell?",
      "expected_answer": "46",
      "raw_response": "There are 46 chromosomes in a human somatic cell.",
      "parsed_response": {"count": 46},
      "verify_result": true,
      "answering_model_id": "gpt-4.1-mini",
      "parsing_model_id": "gpt-judge",
      "timestamp": "2024-03-15T14:30:22"
    }
  ]
}
```

### Export Specific Questions

Export results for a subset of questions:

In [None]:
# Get the IDs of chromosome-related questions
all_questions = benchmark.get_all_questions(ids_only=False)
chromosome_qids = [q["id"] for q in all_questions if "chromosome" in q["question"].lower()]

print(f"Found {len(chromosome_qids)} chromosome-related questions")

# Export only chromosome questions
if chromosome_qids:
    chromosome_csv_path = temp_path("chromosome_results.csv")
    benchmark.export_verification_results_to_file(
        file_path=chromosome_csv_path, format="csv", question_ids=chromosome_qids
    )
    print(f"✓ Exported {len(chromosome_qids)} chromosome questions")

## Checkpoint Management

### Incremental Checkpoints

Save checkpoints at key stages of your workflow:

In [None]:
from pathlib import Path

from karenina import Benchmark
from karenina.schemas import LLMRubricTrait, ModelConfig, Rubric

# Create checkpoint directory
checkpoint_dir = TEMP_DIR / "checkpoints"
checkpoint_dir.mkdir(exist_ok=True)

# Create benchmark
benchmark = Benchmark.create(name="Genomics Knowledge Benchmark", version="1.0.0")

# Add questions
benchmark.add_question(question="How many chromosomes are in a human somatic cell?", raw_answer="46")
benchmark.add_question(question="What is the approved drug target of Venetoclax?", raw_answer="BCL2")

# Checkpoint 1: After adding questions
cp1 = checkpoint_dir / "01_questions_added.jsonld"
benchmark.save(cp1)
print("✓ Checkpoint 1: Questions added")

# Generate templates
model_config = ModelConfig(id="gpt-4.1-mini", model_provider="openai", model_name="gpt-4.1-mini", interface="langchain")
benchmark.generate_all_templates(model="gpt-4.1-mini")

# Checkpoint 2: After template generation
cp2 = checkpoint_dir / "02_templates_generated.jsonld"
benchmark.save(cp2)
print("✓ Checkpoint 2: Templates generated")

# Create rubric
rubric = Rubric(llm_traits=[LLMRubricTrait(name="Conciseness", description="Rate conciseness 1-5", kind="score")])
benchmark.set_global_rubric(rubric)

# Checkpoint 3: After rubrics
cp3 = checkpoint_dir / "03_rubrics_created.jsonld"
benchmark.save(cp3)
print("✓ Checkpoint 3: Rubrics created")

print(f"\nAll checkpoints saved to: {checkpoint_dir}")
for cp in sorted(checkpoint_dir.glob("*.jsonld")):
    print(f"  - {cp.name}")

### Timestamped Backups

Create timestamped backups automatically:

In [None]:
from pathlib import Path

# Create backup directory
backup_dir = TEMP_DIR / "backups"
backup_dir.mkdir(exist_ok=True)


def save_with_timestamp(benchmark, base_name: str):
    """Save ``benchmark`` into ``backup_dir`` under a timestamped filename.

    Args:
        benchmark: Object whose ``save(path)`` method writes the checkpoint.
        base_name: Filename prefix; ``_<YYYYmmdd_HHMMSS>.jsonld`` is appended.

    Returns:
        Path of the file that was written.
    """
    # Imported here so the example runs when copied on its own; the cell
    # itself only imports Path.
    from datetime import datetime

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{base_name}_{timestamp}.jsonld"
    path = backup_dir / filename

    benchmark.save(path)

    print(f"✓ Backup saved: {filename}")
    return path


# Save with timestamp
save_with_timestamp(benchmark, "genomics_benchmark")

**Expected Output:**
```
✓ Backup saved: genomics_benchmark_20240315_143022.jsonld
```

## Comparing Checkpoints

Compare two checkpoints to see what changed:

In [None]:
def compare_checkpoints(path1: Path, path2: Path):
    """Print a side-by-side summary of two benchmark checkpoints.

    Both checkpoints are loaded, then three sections are printed, each with
    one line per checkpoint labelled by its filename: the question count, the
    number of questions that have an answer template, and whether a global
    rubric is set.
    """
    from karenina import Benchmark

    first = Benchmark.load(path1)
    second = Benchmark.load(path2)

    print("=== Comparing Checkpoints ===")
    print(f"Checkpoint 1: {path1.name}")
    print(f"Checkpoint 2: {path2.name}")
    print()

    # (label, benchmark) pairs, reused for every section below.
    labelled = [(path1.name, first), (path2.name, second)]

    # Question counts
    print("Questions:")
    for label, bench in labelled:
        print(f"  {label}: {bench.question_count}")

    # Questions that carry an answer template
    question_sets = [bench.get_all_questions(ids_only=False) for _, bench in labelled]
    template_counts = [
        len([q for q in questions if q.get("answer_template") is not None])
        for questions in question_sets
    ]
    print("\nTemplates:")
    for (label, _), count in zip(labelled, template_counts):
        print(f"  {label}: {count}")

    # Presence of a global rubric
    rubric_flags = [
        "Yes" if bench.get_global_rubric() is not None else "No"
        for _, bench in labelled
    ]
    print("\nGlobal Rubric:")
    for (label, _), flag in zip(labelled, rubric_flags):
        print(f"  {label}: {flag}")


# Compare before and after rubrics
compare_checkpoints(cp2, cp3)

## Portability and Sharing

### Sharing Benchmarks with Collaborators

Checkpoints are portable and can be easily shared:

In [None]:
# Prepare benchmark for sharing
share_path = temp_path("genomics_benchmark_v1.0.0.jsonld")
benchmark.save(share_path)

# Collaborator loads it
from karenina import Benchmark

shared_benchmark = Benchmark.load(share_path)
print(f"Loaded shared benchmark: {shared_benchmark.name}")
print(f"Version: {shared_benchmark.version}")

**Sharing checklist:**

- ✅ Save to descriptive filename with version
- ✅ Include README with benchmark purpose and usage
- ✅ Document any special requirements (API keys, models)
- ✅ Test loading on a different machine

### Version Control with Git

Checkpoints work well with Git:

```bash
# Add checkpoint to Git
git add genomics_benchmark_v1.0.0.jsonld
git commit -m "Add genomics benchmark v1.0.0"
git push

# Track benchmark evolution over time
git log -- genomics_benchmark_v1.0.0.jsonld
```

**Git best practices:**

- Use semantic versioning for checkpoint filenames
- Include descriptive commit messages
- Tag important versions: `git tag v1.0.0`
- Use `.gitignore` for temporary checkpoints

## Complete Workflow Example

Here's a complete example showing checkpoints, database storage, and export:

In [None]:
from pathlib import Path

from karenina import Benchmark
from karenina.schemas import LLMRubricTrait, ModelConfig, Rubric, VerificationConfig

# 1. Create benchmark
benchmark = Benchmark.create(
    name="Genomics Knowledge Benchmark",
    description="Testing LLM knowledge of genomics",
    version="1.0.0",
    creator="Bio Team",
)

# 2. Add questions
questions = [
    ("How many chromosomes are in a human somatic cell?", "46"),
    ("What is the approved drug target of Venetoclax?", "BCL2"),
    ("How many protein subunits does hemoglobin A have?", "4"),
]

for q, a in questions:
    benchmark.add_question(question=q, raw_answer=a, author={"name": "Bio Curator"})

# Checkpoint 1
step1 = checkpoint_dir / "step1_questions.jsonld"
benchmark.save(step1)
print("✓ Checkpoint 1: Questions added")

# 3. Generate templates
model_config = ModelConfig(
    id="gpt-4.1-mini", model_provider="openai", model_name="gpt-4.1-mini", temperature=0.1, interface="langchain"
)
benchmark.generate_all_templates(model="gpt-4.1-mini")

# Checkpoint 2
step2 = checkpoint_dir / "step2_templates.jsonld"
benchmark.save(step2)
print("✓ Checkpoint 2: Templates generated")

# 4. Create rubric
rubric = Rubric(
    llm_traits=[
        LLMRubricTrait(name="Conciseness", description="Rate conciseness 1-5", kind="score"),
        LLMRubricTrait(name="Clarity", description="Is the answer clear?", kind="boolean"),
    ]
)
benchmark.set_global_rubric(rubric)

# Checkpoint 3
step3 = checkpoint_dir / "step3_rubrics.jsonld"
benchmark.save(step3)
print("✓ Checkpoint 3: Rubrics created")

# 5. Run verification
config = VerificationConfig(
    answering_models=[model_config],
    parsing_models=[model_config],
    evaluation_mode="template_and_rubric",  # Required when rubric_enabled=True
    rubric_enabled=True,
)
results = benchmark.run_verification(config)

# Checkpoint 4
step4 = checkpoint_dir / "step4_verified.jsonld"
benchmark.save(step4)
print("✓ Checkpoint 4: Verification complete")

# 6. Save to database with checkpoint
final_checkpoint = temp_path("genomics_benchmark_v1.0.0.jsonld")
benchmark.save_to_db(storage=f"sqlite:///{db_path}", checkpoint_path=final_checkpoint)
print("✓ Saved to database with checkpoint")

# 7. Export results
results_csv = temp_path("results.csv")
results_json = temp_path("results.json")
benchmark.export_verification_results_to_file(file_path=results_csv, format="csv")
benchmark.export_verification_results_to_file(file_path=results_json, format="json")
print("✓ Exported results to CSV and JSON")

# 8. Create timestamped backup
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
backup_path = backup_dir / f"genomics_{timestamp}.jsonld"
benchmark.save(backup_path)
print(f"✓ Backup saved: {backup_path.name}")

print("\n=== Summary ===")
print(f"Benchmark: {benchmark.name} v{benchmark.version}")
print(f"Questions: {benchmark.question_count}")
print(f"Verification results: {len(results)}")
print("Checkpoints: 4")
print(f"Database: Yes (sqlite:///{db_path.name})")
print("Exports: CSV and JSON")
print("Backup: Yes")

**Expected Output:**
```
✓ Checkpoint 1: Questions added
✓ Checkpoint 2: Templates generated
✓ Checkpoint 3: Rubrics created
✓ Checkpoint 4: Verification complete
✓ Saved to database with checkpoint
✓ Exported results to CSV and JSON
✓ Backup saved: genomics_20240315_143022.jsonld

=== Summary ===
Benchmark: Genomics Knowledge Benchmark v1.0.0
Questions: 3
Verification results: 3
Checkpoints: 4
Database: Yes (sqlite:///benchmarks.db)
Exports: CSV and JSON
Backup: Yes
```

## Best Practices

### Naming Conventions

Use descriptive, versioned filenames:

In [None]:
# ✅ Good: Descriptive with version and date
# benchmark.save(Path("genomics_benchmark_v1.0.0_20240315.jsonld"))
# benchmark.save(Path("drug_targets_benchmark_v2.1.0.jsonld"))

# ❌ Bad: Generic or unclear names
# benchmark.save(Path("benchmark.jsonld"))
# benchmark.save(Path("test.jsonld"))

print("Examples of good checkpoint names:")
print("  - genomics_benchmark_v1.0.0_20240315.jsonld")
print("  - drug_targets_benchmark_v2.1.0.jsonld")
print("  - clinical_questions_v1.2.3-beta.jsonld")

### Directory Organization

Organize checkpoints logically:

```
project/
├── benchmarks/
│   ├── genomics_v1.0.0.jsonld
│   ├── drug_targets_v1.0.0.jsonld
│   └── proteins_v1.0.0.jsonld
├── checkpoints/
│   └── genomics/
│       ├── 01_questions.jsonld
│       ├── 02_templates.jsonld
│       ├── 03_rubrics.jsonld
│       └── 04_verified.jsonld
├── backups/
│   ├── genomics_20240315.jsonld
│   └── genomics_20240316.jsonld
└── exports/
    ├── results.csv
    └── results.json
```

### Backup Strategy

Implement a regular backup strategy:

In [None]:
from datetime import datetime
from pathlib import Path


def backup_benchmark(benchmark, backup_dir: Path):
    """Write at most one backup of ``benchmark`` per calendar day.

    The backup directory is created if needed. The file is named
    ``<benchmark.name>_<YYYYmmdd>.jsonld``; if a file with that name already
    exists, nothing is written and a notice is printed instead.
    """
    backup_dir.mkdir(parents=True, exist_ok=True)

    today = datetime.now().strftime("%Y%m%d")
    target = backup_dir / f"{benchmark.name}_{today}.jsonld"

    if target.exists():
        # Today's backup is already present; leave it untouched.
        print(f"  Daily backup already exists for {today}")
        return

    benchmark.save(target)
    print(f"✓ Daily backup: {target.name}")


# Backup after important changes
backup_benchmark(benchmark, backup_dir)

### Cleanup Old Checkpoints

Remove old temporary checkpoints periodically:

In [None]:
from datetime import datetime, timedelta
from pathlib import Path


def cleanup_old_checkpoints(checkpoint_dir: Path, keep_days: int = 30):
    """Delete stale temporary checkpoints directly inside ``checkpoint_dir``.

    A file is deleted when it matches ``*.jsonld``, its name contains
    "checkpoint" (case-insensitive), and its modification time is more than
    ``keep_days`` days in the past. Each deletion and a final summary are printed.
    """
    threshold = datetime.now() - timedelta(days=keep_days)
    deleted_count = 0

    for candidate in checkpoint_dir.glob("*.jsonld"):
        # Only files marked as checkpoints by name count as temporary.
        if "checkpoint" not in candidate.name.lower():
            continue

        modified_at = datetime.fromtimestamp(candidate.stat().st_mtime)
        if modified_at >= threshold:
            # Still within the retention window.
            continue

        candidate.unlink()
        deleted_count += 1
        print(f"  Removed: {candidate.name}")

    print(f"✓ Removed {deleted_count} old checkpoints (older than {keep_days} days)")


# Note: This would remove files matching criteria
# cleanup_old_checkpoints(checkpoint_dir, keep_days=30)

print("Cleanup function defined (not executed in demo)")

## Next Steps

After saving and loading benchmarks:

- [Run Verification](verification.md) - Evaluate LLM responses
- [Advanced Features](../advanced/deep-judgment.md) - Use deep-judgment for detailed feedback
- [Share Benchmarks](#portability-and-sharing) - Collaborate with your team

## Related Documentation

- [Defining Benchmarks](defining-benchmark.md) - Benchmark creation and database persistence
- [Verification](verification.md) - Run evaluations
- [Templates](templates.md) - Structured answer evaluation
- [Rubrics](rubrics.md) - Qualitative assessment criteria
- [Quick Start](../quickstart.md) - End-to-end workflow example