In [None]:
# Mock Setup - Hidden in rendered documentation
# This cell is tagged with "hide-cell" in notebook metadata

import tempfile
import sys
import os
import hashlib
import json
from pathlib import Path
from unittest.mock import Mock, MagicMock, patch, PropertyMock
from typing import Any, Dict, List
from datetime import datetime

# Add karenina to path
# NOTE(review): absolute path on one developer's machine. On other machines the
# imports below presumably rely on karenina being installed some other way — confirm.
sys.path.insert(0, "/Users/carli/Projects/karenina-monorepo/karenina/src")

# Temporary directory for file operations (make global for all cells)
# mkdtemp creates the directory right away; the atexit hook registered at the
# end of this cell removes it again.
TEMP_DIR = Path(tempfile.mkdtemp(prefix="karenina_docs_"))

# Import after path is set
from karenina.schemas.workflow.verification.result import VerificationResult
from karenina.schemas.workflow.verification.result_components import (
    VerificationResultMetadata,
    VerificationResultTemplate,
    VerificationResultRubric,
)
from karenina.schemas.workflow.verification_result_set import VerificationResultSet
from karenina.schemas.workflow.template_results import TemplateResults

# Mock LLM response generator
class MockLLMResponse:
    """Minimal stand-in for a LangChain-style chat message.

    Exposes the two attributes the examples read: ``content`` (the text) and
    ``response_metadata`` (a fixed token-usage payload).
    """

    def __init__(self, content: str = "Mock response") -> None:
        token_usage = {"total_tokens": 50}
        self.content = content
        self.response_metadata = {"token_usage": token_usage}

    def __str__(self) -> str:
        # The textual form of the message is simply its content.
        return self.content

class MockStructuredOutput:
    """Stand-in for a parsed structured-output object.

    Four attributes (``count``, ``target``, ``subunits``, ``diseases``) always
    exist, taking their value from the keyword arguments when given and from a
    built-in default otherwise. Any further keyword becomes an attribute as
    well, unless an attribute of that name already exists on the object.
    """

    def __init__(self, **kwargs):
        fallbacks = {
            'count': 46,
            'target': 'BCL2',
            'subunits': 4,
            'diseases': ['asthma', 'bronchitis', 'pneumonia'],
        }
        for field_name, fallback in fallbacks.items():
            setattr(self, field_name, kwargs.get(field_name, fallback))

        # Extra keywords are attached only when the name is still free
        # (so e.g. a keyword called "dict" never shadows the method).
        for extra_name, extra_value in kwargs.items():
            if hasattr(self, extra_name):
                continue
            setattr(self, extra_name, extra_value)

    def dict(self):
        """Return the instance attributes whose names do not start with ``_``."""
        public = {}
        for attr_name, attr_value in vars(self).items():
            if attr_name.startswith('_'):
                continue
            public[attr_name] = attr_value
        return public

    def model_dump(self):
        """Alias of :meth:`dict`."""
        return self.dict()

def create_mock_chat_model():
    """Build a MagicMock that behaves like a chat model with canned replies.

    Returns:
        A ``MagicMock`` where
        - ``invoke(...)`` returns a ``MockLLMResponse("46 chromosomes")``;
        - ``ainvoke(...)`` is awaitable and resolves to the same kind of response;
        - ``with_structured_output(...)`` returns a second mock whose
          ``invoke`` / ``ainvoke`` yield a default ``MockStructuredOutput``;
        - ``bind_tools(...)`` returns the model itself.
    """
    # Local import: the cell's import line does not bring AsyncMock into scope.
    from unittest.mock import AsyncMock

    mock = MagicMock()
    mock.invoke.return_value = MockLLMResponse("46 chromosomes")
    # A plain MagicMock attribute returns its value synchronously, so
    # `await model.ainvoke(...)` would fail; AsyncMock makes the call awaitable.
    mock.ainvoke = AsyncMock(return_value=MockLLMResponse("46 chromosomes"))

    structured_mock = MagicMock()
    structured_mock.invoke.return_value = MockStructuredOutput()
    structured_mock.ainvoke = AsyncMock(return_value=MockStructuredOutput())

    mock.with_structured_output.return_value = structured_mock
    mock.bind_tools.return_value = mock
    return mock

def compute_result_id(question_id: str, answering_model: str, parsing_model: str, timestamp: str) -> str:
    """Derive a short, deterministic identifier for one verification result.

    The four arguments, together with an empty MCP-server list and a ``None``
    replicate, are serialised as key-sorted ASCII JSON; the identifier is the
    first 16 hex characters of that string's SHA-256 digest.
    """
    payload = json.dumps(
        {
            "answering_mcp_servers": [],
            "answering_model": answering_model,
            "parsing_model": parsing_model,
            "question_id": question_id,
            "replicate": None,
            "timestamp": timestamp,
        },
        sort_keys=True,
        ensure_ascii=True,
    )
    digest = hashlib.sha256(payload.encode("utf-8")).hexdigest()
    return digest[:16]

def create_mock_verification_result(question_id: str, question_text: str, answer: str, passed: bool = True):
    """Assemble a ``VerificationResult`` filled with canned values.

    Args:
        question_id: Identifier stored in the metadata; its MD5 hex digest is
            used as the template id.
        question_text: Question text stored in the metadata.
        answer: Used as the raw answer and as the single ``"value"`` entry of
            both the parsed LLM response and the parsed ground truth.
        passed: Value recorded as the template ``verify_result``.

    Returns:
        A ``VerificationResult`` with metadata, template and rubric components.
    """
    model_name = "gpt-4.1-mini"
    created_at = datetime.now().isoformat()
    question_digest = hashlib.md5(str(question_id).encode()).hexdigest()[:32]

    token_usage = {
        "answer_generation": {"total_tokens": 50},
        "parsing": {"total_tokens": 30},
        "total": {"total_tokens": 80}
    }
    template_part = VerificationResultTemplate(
        raw_llm_response=f"The answer is {answer}.",
        parsed_llm_response={"value": answer},
        parsed_gt_response={"value": answer},
        verify_result=passed,
        template_verification_performed=True,
        usage_metadata=token_usage,
        abstention_check_performed=True,
        abstention_detected=False,
    )

    rubric_part = VerificationResultRubric(
        rubric_evaluation_performed=True,
        llm_trait_scores={"Conciseness": 4, "Clarity": True},
    )

    # The same model name is recorded for answering and parsing, and feeds the result id.
    metadata_part = VerificationResultMetadata(
        question_id=question_id,
        template_id=question_digest,
        completed_without_errors=True,
        question_text=question_text,
        raw_answer=answer,
        answering_model=model_name,
        parsing_model=model_name,
        execution_time=1.5,
        timestamp=created_at,
        result_id=compute_result_id(question_id, model_name, model_name, created_at),
    )

    return VerificationResult(metadata=metadata_part, template=template_part, rubric=rubric_part)

# Placeholders for the real Benchmark methods. They are filled in further down,
# once Benchmark is imported, so the cleanup hook can put the originals back.
_original_run_verification = None
_original_generate_all_templates = None

def mock_generate_all_templates(self, **kwargs):
    """Replacement for ``Benchmark.generate_all_templates`` that calls no LLM.

    Every question that has an id and whose ``answer_template`` is ``None``
    gets a small placeholder dict written into ``self._base.data``. Keyword
    arguments are accepted and ignored. Always returns an empty dict.
    """
    for entry in self.get_all_questions(ids_only=False):
        question_id = entry.get('id')
        # Skip questions without an id and questions that already have a template.
        if not question_id or entry.get('answer_template') is not None:
            continue
        self._base.data[question_id]['answer_template'] = {
            'template_type': 'mock',
            'generated_at': datetime.now().isoformat(),
        }
    return {}

def mock_run_verification(self, config):
    """Replacement for ``Benchmark.run_verification`` that fabricates results.

    Uses the finished questions, or all questions when none are finished. Each
    question's text is matched (case-insensitively, first match wins) against a
    small keyword table to choose a canned answer; without a match the
    question's own ``raw_answer`` is used and the result is marked as passed.
    ``config`` is accepted but not consulted.

    Returns:
        A ``VerificationResultSet``; empty when the benchmark has no questions.
    """
    selected = self.get_finished_questions(ids_only=False)
    if len(selected) == 0:
        selected = self.get_all_questions(ids_only=False)
        if len(selected) == 0:
            return VerificationResultSet(results=[], template_results=TemplateResults(results=[]))

    canned_answers = [
        {"keywords": ["chromosomes"], "answer": "46", "passed": True},
        {"keywords": ["venetoclax", "bcl2"], "answer": "BCL2", "passed": True},
        {"keywords": ["hemoglobin", "subunits"], "answer": "4", "passed": True},
        {"keywords": ["inflammatory", "lung"], "answer": "asthma, bronchitis, pneumonia", "passed": True},
    ]

    results = []
    for question in selected:
        q_id = question.get('id', question.get('question_id', ''))
        q_text = question.get('question', '')
        fallback_answer = question.get('raw_answer', '')
        lowered = q_text.lower()

        # First table entry with any keyword contained in the question text, if any.
        hit = next(
            (entry for entry in canned_answers if any(kw in lowered for kw in entry["keywords"])),
            None,
        )
        if hit is None:
            outcome, chosen_answer = True, fallback_answer
        else:
            outcome, chosen_answer = hit["passed"], hit["answer"]

        results.append(create_mock_verification_result(
            question_id=q_id,
            question_text=q_text,
            answer=chosen_answer,
            passed=outcome
        ))

    return VerificationResultSet(
        results=results,
        template_results=TemplateResults(results=results),
        rubric_results=None,
    )

def _mock_model_factory(*args, **kwargs):
    """Stand-in constructor: ignore every argument and return a fresh mock chat model.

    Accepts positional as well as keyword arguments so the patched callables
    cannot fail with a TypeError regardless of how they are invoked.
    """
    return create_mock_chat_model()


# Replace the chat-model entry points with the factory above for the lifetime
# of the notebook; the patchers are stopped again in _cleanup().
_llm_patches = [
    patch(target, side_effect=_mock_model_factory)
    for target in (
        'langchain_openai.ChatOpenAI',
        'langchain_anthropic.ChatAnthropic',
        'langchain_google_genai.ChatGoogleGenerativeAI',
        'karenina.infrastructure.llm.interface.init_chat_model_unified',
    )
]
for p in _llm_patches:
    p.start()

# Swap the two Benchmark methods for their mock counterparts, remembering the
# originals so they can be restored on exit.
from karenina.benchmark import Benchmark
_original_run_verification = Benchmark.run_verification
_original_generate_all_templates = Benchmark.generate_all_templates
Benchmark.run_verification = mock_run_verification
Benchmark.generate_all_templates = mock_generate_all_templates

import atexit
import shutil

def _cleanup():
    """Undo the mock setup: restore Benchmark, stop the patchers, remove TEMP_DIR.

    Registered with ``atexit``. Stopping the patchers and removing the
    directory are best-effort and never raise ordinary errors.
    """
    Benchmark.run_verification = _original_run_verification
    Benchmark.generate_all_templates = _original_generate_all_templates
    for p in _llm_patches:
        try:
            p.stop()
        except Exception:
            # Best-effort: one patcher failing to stop must not prevent the rest
            # of the cleanup. Unlike a bare `except`, this lets
            # KeyboardInterrupt / SystemExit propagate.
            pass
    shutil.rmtree(TEMP_DIR, ignore_errors=True)

# Run the cleanup when the interpreter (kernel) exits.
atexit.register(_cleanup)

# Status report for whoever executes the hidden setup cell.
for status_line in (
    "✓ Mock setup complete",
    f"✓ Temp directory: {TEMP_DIR}",
    "✓ Karenina package loaded from: /Users/carli/Projects/karenina-monorepo/karenina/src",
    "✓ Mock verification results enabled - examples will show realistic output",
):
    print(status_line)

# Configuration Presets

Configuration presets allow you to save, load, and share complete verification configurations, eliminating the need to manually reconfigure settings for recurring benchmark scenarios.

## What are Presets?

**Configuration presets** are saved snapshots of your verification settings that can be quickly reloaded for future benchmark runs. A preset captures:

- **Model configurations**: Answering and parsing models with all their settings
- **Evaluation settings**: Replication count, evaluation mode, parsing-only flag
- **Rubric settings**: Enabled status, trait selection, evaluation mode
- **Advanced features**: Deep-judgment, abstention detection, few-shot configuration

Presets make it easy to:

- **Reuse configurations**: Quickly switch between different benchmark setups
- **Ensure consistency**: Use the same configuration across multiple runs
- **Share setups**: Export and share configurations with teammates
- **Organize scenarios**: Maintain separate configs for testing, production, experiments

## Why Use Presets?

### 1. Save Time

Instead of manually reconfiguring models and settings for every run, load a saved preset. Compare the two approaches:

In [None]:
# Without presets: the full configuration is repeated by hand on every run ❌
manual_config_example = """
config = VerificationConfig(
    answering_models=[model1, model2],
    parsing_models=[parser],
    replicate_count=3,
    rubric_enabled=True,
    deep_judgment_enabled=True,
    deep_judgment_max_excerpts_per_attribute=3,
    # ... 15 more parameters ...
)"""

print("Example of manual configuration (not executed):")
print(manual_config_example)

In [None]:
# With presets: Load saved configuration ✓
from karenina.schemas import VerificationConfig
from pathlib import Path

# (In real use, you would specify an actual preset file)
# config = VerificationConfig.from_preset(Path("my-setup.json"))
# The load call above is intentionally left commented out, so the message
# below is illustrative only — nothing is read from disk in this cell.
print("Config loaded from preset file!")

### 2. Maintain Consistency

Presets ensure the same configuration is used across runs, eliminating configuration drift:

In [None]:
# Printed illustration (not executed): the same preset file drives both runs
consistency_demo = (
    "Run 1: Today",
    "  config = VerificationConfig.from_preset(Path('production-config.json'))",
    "  results1 = benchmark.run_verification(config)",
    "",
    "Run 2: Next week (identical configuration guaranteed)",
    "  config = VerificationConfig.from_preset(Path('production-config.json'))",
    "  results2 = benchmark.run_verification(config)",
    "",
    "✓ Same configuration = comparable results",
)
print("\n".join(consistency_demo))

### 3. Share Configurations

Share preset files with teammates or across projects:

In [None]:
# Printed illustration (not executed): sharing a preset is a plain file copy
sharing_demo = (
    "# Export preset",
    "cp presets/genomics-standard.json shared/",
    "",
    "# Teammate imports preset",
    "cp shared/genomics-standard.json presets/",
)
print("\n".join(sharing_demo))

## Saving a Preset

### Basic Usage

Save your current configuration with a descriptive name:

In [None]:
from karenina import Benchmark
from karenina.schemas import VerificationConfig, ModelConfig
from pathlib import Path

# Create and configure your verification setup
# One model definition, used below as both the answering and the parsing model.
model_config = ModelConfig(
    id="gpt-4.1-mini",
    model_provider="openai",
    model_name="gpt-4.1-mini",
    temperature=0.0,
    interface="langchain"
)

# Note: When rubric_enabled=True, must also set evaluation_mode='template_and_rubric'
config = VerificationConfig(
    answering_models=[model_config],
    parsing_models=[model_config],
    replicate_count=3,
    rubric_enabled=True,
    evaluation_mode="template_and_rubric",
    deep_judgment_enabled=True
)

# Create a presets directory for our example
# (TEMP_DIR is the temporary working directory prepared by the notebook setup.)
example_presets_dir = TEMP_DIR / "presets"
example_presets_dir.mkdir(exist_ok=True)

# Save as a preset
# save_preset returns a metadata mapping; its 'filepath' entry is used below
# and in the following cells as the location of the preset file.
metadata = config.save_preset(
    name="Genomics Standard Config",
    description="Standard setup for genomics benchmarks with deep-judgment",
    presets_dir=example_presets_dir
)

print(f"Preset saved to: {metadata['filepath']}")

### What Gets Saved?

**✓ Included:**

- All model configurations (answering_models, parsing_models)
- Evaluation settings (replicate_count, parsing_only, evaluation_mode)
- Rubric settings (rubric_enabled, rubric_trait_names)
- Advanced features (abstention_enabled, deep_judgment_*, few_shot_config)

**✗ Excluded:**

- Job-specific metadata (run_name)
- Database configuration (db_config)

### Preset File Structure

Presets are saved as JSON files in the `presets/` directory:

In [None]:
# Show a condensed view of the preset file written in the previous cell
import json

preset_file = Path(metadata['filepath'])
if not preset_file.exists():
    print(f"Preset file not found: {preset_file}")
else:
    with open(preset_file) as f:
        data = json.load(f)

    saved_config = data["config"]

    # Top-level identification fields first ...
    summary = {key: data[key] for key in ("id", "name", "description")}

    # ... then a trimmed config: only id/provider per model plus three key settings ...
    summary["config"] = {
        **{
            role: [
                {"id": m["id"], "model_provider": m["model_provider"]}
                for m in saved_config[role]
            ]
            for role in ("answering_models", "parsing_models")
        },
        **{
            key: saved_config[key]
            for key in ("replicate_count", "rubric_enabled", "deep_judgment_enabled")
        },
    }

    # ... and finally the timestamps.
    summary["created_at"] = data["created_at"]
    summary["updated_at"] = data["updated_at"]

    print("Preset file structure:")
    print(json.dumps(summary, indent=2))

## Loading a Preset

!!! tip "CLI Preset Management"
    You can also manage presets from the command line:

    - `karenina preset list` - List all available presets
    - `karenina preset show NAME` - Display preset configuration
    - `karenina preset delete NAME` - Delete a preset
    - `karenina verify checkpoint.jsonld --preset NAME` - Run verification with preset

    See **[CLI Verification](../using-karenina/cli-verification.md#preset-management)** for complete CLI preset documentation.

### Basic Usage

Load a saved preset and use it for verification:

In [None]:
from karenina import Benchmark
from karenina.schemas import VerificationConfig
from pathlib import Path

# Rebuild the configuration from the preset file saved above
config = VerificationConfig.from_preset(preset_file)

answering_ids = [m.id for m in config.answering_models]
parsing_ids = [m.id for m in config.parsing_models]

print("✓ Preset loaded successfully")
print(f"  Answering models: {answering_ids}")
print(f"  Parsing models: {parsing_ids}")
print(f"  Replicate count: {config.replicate_count}")
print(f"  Rubric enabled: {config.rubric_enabled}")
print(f"  Deep-judgment enabled: {config.deep_judgment_enabled}")

### Custom Preset Directory

Specify a custom location for presets using an environment variable:

In [None]:
import os
from pathlib import Path

# Create a directory and publish it through the KARENINA_PRESETS_DIR variable
custom_dir = TEMP_DIR / "my_presets"
custom_dir.mkdir(exist_ok=True)
os.environ["KARENINA_PRESETS_DIR"] = str(custom_dir)

print(f"Custom preset directory set: {custom_dir}")
print("\nTo use: config.save_preset(name='My Preset') will save to this directory")

## Complete Example

Here's an end-to-end workflow showing preset creation and usage:

In [None]:
from karenina import Benchmark
from karenina.schemas import VerificationConfig, ModelConfig
from pathlib import Path

# ============================================================
# STEP 1: Create benchmark with genomics questions
# ============================================================

benchmark = Benchmark.create(
    name="Genomics Knowledge Benchmark",
    description="Testing LLM knowledge of genomics and molecular biology",
    version="1.0.0"
)

# (question text, expected answer) pairs
questions = [
    ("What is the approved drug target of Venetoclax?", "BCL2"),
    ("How many chromosomes are in a human somatic cell?", "46"),
    ("How many protein subunits does hemoglobin A have?", "4"),
]

# Register each pair with the benchmark
for question_text, expected_answer in questions:
    benchmark.add_question(
        question=question_text,
        raw_answer=expected_answer,
        author={"name": "Genomics Curator"}
    )

print(f"✓ Created benchmark with {len(questions)} questions")

In [None]:
# ============================================================
# STEP 2: Configure models and settings
# ============================================================

# One model definition, used below as both the answering and the parsing model.
# Later cells in this guide reuse `model_config` as well.
model_config = ModelConfig(
    id="gpt-4.1-mini",
    model_provider="openai",
    model_name="gpt-4.1-mini",
    temperature=0.0,
    interface="langchain"
)

# Configuration for testing (fast)
# Single replicate, rubric evaluation switched off.
test_config = VerificationConfig(
    answering_models=[model_config],
    parsing_models=[model_config],
    replicate_count=1,
    rubric_enabled=False
)

# Configuration for production (comprehensive)
# Note: rubric_enabled=True requires evaluation_mode='template_and_rubric'
# Three replicates with rubric, deep-judgment and abstention all switched on.
production_config = VerificationConfig(
    answering_models=[model_config],
    parsing_models=[model_config],
    replicate_count=3,
    rubric_enabled=True,
    evaluation_mode="template_and_rubric",
    deep_judgment_enabled=True,
    abstention_enabled=True
)

print("✓ Configured test and production configurations")

In [None]:
# ============================================================
# STEP 3: Save presets
# ============================================================

# Both presets are written to the same directory under the temporary folder.
presets_dir = TEMP_DIR / "presets"

# Save test configuration
# The returned metadata's 'filepath' entry is reused in STEP 5 to load this preset.
test_metadata = test_config.save_preset(
    name="Quick Test",
    description="Fast configuration for smoke tests",
    presets_dir=presets_dir
)
print(f"✓ Test preset saved: {test_metadata['filepath']}")

# Save production configuration
# The returned metadata's 'filepath' entry is reused in STEP 6 to load this preset.
prod_metadata = production_config.save_preset(
    name="Production Full",
    description="Comprehensive configuration with all features enabled",
    presets_dir=presets_dir
)
print(f"✓ Production preset saved: {prod_metadata['filepath']}")

In [None]:
# ============================================================
# STEP 4: Generate templates
# ============================================================

print("\nGenerating templates...")
# NOTE(review): presumably `model` / `model_provider` select the LLM that writes
# the answer templates — confirm against the Benchmark API reference.
benchmark.generate_all_templates(
    model="gemini-2.0-flash",
    model_provider="google_genai"
)
print("✓ Templates generated")

In [None]:
# ============================================================
# STEP 5: Run quick test using preset
# ============================================================

print("\nRunning quick test...")
# Rebuild the test configuration from the file saved in STEP 3
# (this rebinds `test_config` to the loaded copy).
test_config = VerificationConfig.from_preset(
    Path(test_metadata['filepath'])
)
test_results = benchmark.run_verification(test_config)
# `results` holds one entry per verification result in the returned set.
print(f"✓ Quick test complete: {len(test_results.results)} questions")

In [None]:
# ============================================================
# STEP 6: Run production verification using preset
# ============================================================

print("\nRunning production verification...")
# Rebuild the production configuration from the file saved in STEP 3
prod_config = VerificationConfig.from_preset(
    Path(prod_metadata['filepath'])
)
prod_results = benchmark.run_verification(prod_config)
print(f"✓ Production verification complete: {len(prod_results.results)} questions")

# Analyze results
# The pass/fail flag lives on each result's nested `template` component.
passed = sum(
    1
    for r in prod_results.results
    if r.template is not None and r.template.verify_result
)
total = len(prod_results.results)
if total:
    print(f"Pass rate: {passed}/{total} ({passed/total*100:.1f}%)")
else:
    # Avoid dividing by zero when the benchmark produced no results
    print("Pass rate: n/a (no results)")

In [None]:
# Save final benchmark
# The target is a .jsonld file inside the temporary working directory.
benchmark_path = TEMP_DIR / "genomics_benchmark_final.jsonld"
benchmark.save(benchmark_path)
print(f"\n✓ Benchmark saved to: {benchmark_path}")

## Managing Presets

### Listing Available Presets

List all presets in the presets directory:

In [None]:
from pathlib import Path
import json

presets_dir = TEMP_DIR / "presets"

if presets_dir.exists():
    # glob() yields files in arbitrary order; sort so the listing is stable
    preset_files = sorted(presets_dir.glob("*.json"))
    print(f"Found {len(preset_files)} presets:\n")

    # A dedicated loop name keeps the `preset_file` variable from the
    # "Loading a Preset" cell intact if that cell is re-run afterwards.
    for listed_preset in preset_files:
        with open(listed_preset) as f:
            data = json.load(f)

        print(f"  📄 {listed_preset.name}")
        print(f"     Name: {data['name']}")
        print(f"     Description: {data.get('description', 'N/A')}")
        print(f"     Created: {data['created_at']}")
        print()
else:
    print("No presets directory found")

### Updating a Preset

To update a preset, load it, modify the configuration, and save it with a new name (overwrites are not allowed):

In [None]:
from pathlib import Path

# Load existing preset
# (prod_metadata is the metadata returned when "Production Full" was saved.)
preset_path = Path(prod_metadata['filepath'])
config = VerificationConfig.from_preset(preset_path)

# Show current settings
print("Current settings:")
print(f"  Replicate count: {config.replicate_count}")
print(f"  Abstention enabled: {config.abstention_enabled}")

# Modify configuration
# The production config above was created with abstention_enabled=True, so the
# second assignment only illustrates how a flag is set; the replicate count is
# the setting that is actually changed here.
config.replicate_count = 5  # Increase replication
config.abstention_enabled = True  # Enable abstention detection

# Save as a new preset (cannot overwrite existing)
updated_metadata = config.save_preset(
    name="Production Full Updated",  # New name to avoid overwrite error
    description="Updated with 5 replicates and abstention detection",
    presets_dir=presets_dir
)

print(f"\n✓ Preset updated: {updated_metadata['filepath']}")
print(f"\nNew settings:")
print(f"  Replicate count: {config.replicate_count}")
print(f"  Abstention enabled: {config.abstention_enabled}")

### Deleting a Preset

Delete a preset file:

In [None]:
from pathlib import Path

# First, create a dummy preset so there is something to delete
dummy_config = VerificationConfig(
    answering_models=[model_config],
    parsing_models=[model_config],
    replicate_count=1
)
dummy_metadata = dummy_config.save_preset(
    name="Old Config",
    presets_dir=presets_dir
)

# Take the file location from the metadata returned by save_preset instead of
# guessing how the preset name is turned into a file name.
test_preset_path = Path(dummy_metadata['filepath'])

# Now delete it
if test_preset_path.exists():
    test_preset_path.unlink()
    print(f"✓ Deleted preset: {test_preset_path}")
else:
    print(f"Preset not found: {test_preset_path}")

## Common Preset Scenarios

### Scenario 1: Quick Test vs Full Evaluation

Create two presets for different thoroughness levels:

In [None]:
# Quick test: Minimal configuration for fast feedback
# (`model_config` and `presets_dir` come from the Complete Example above.)
quick_config = VerificationConfig(
    answering_models=[model_config],
    parsing_models=[model_config],
    replicate_count=1,
    rubric_enabled=False,
    deep_judgment_enabled=False,
    abstention_enabled=False
)

quick_preset = quick_config.save_preset(
    name="Quick Test Scenario",
    description="Fast smoke test configuration",
    presets_dir=presets_dir
)

# Full evaluation: Comprehensive configuration
# Note: rubric_enabled=True requires evaluation_mode='template_and_rubric'
full_config = VerificationConfig(
    answering_models=[model_config],
    parsing_models=[model_config],
    replicate_count=5,
    rubric_enabled=True,
    evaluation_mode="template_and_rubric",
    deep_judgment_enabled=True,
    abstention_enabled=True,
    deep_judgment_max_excerpts_per_attribute=3
)

# `full_preset` is reused by later cells (Best Practices, Troubleshooting).
full_preset = full_config.save_preset(
    name="Full Evaluation Scenario",
    description="Comprehensive configuration with all features",
    presets_dir=presets_dir
)

# The metadata returned by save_preset carries the preset's name.
print("✓ Created two presets:")
print(f"  1. {quick_preset['name']} - Fast testing")
print(f"  2. {full_preset['name']} - Full evaluation")


In [None]:
# Usage example (printed, not executed). The snippet refers to the metadata
# returned by save_preset in the previous cell, so the paths always match the
# presets that were actually created.
print("Usage:")
print()
print("# During development: Use quick test")
print("config = VerificationConfig.from_preset(Path(quick_preset['filepath']))")
print("dev_results = benchmark.run_verification(config)")
print()
print("# Before release: Use full evaluation")
print("config = VerificationConfig.from_preset(Path(full_preset['filepath']))")
print("final_results = benchmark.run_verification(config)")

### Scenario 2: Multi-Model Comparison

Create a preset for comparing multiple models:

In [None]:
# Define models to compare
gpt4_mini = ModelConfig(
    id="gpt-4.1-mini",
    model_provider="openai",
    model_name="gpt-4.1-mini",
    temperature=0.0,
    interface="langchain"
)

claude_sonnet = ModelConfig(
    id="claude-sonnet-4",
    model_provider="anthropic",
    model_name="claude-sonnet-4",
    temperature=0.0,
    interface="langchain"
)

# Multi-model comparison configuration
# Note: rubric_enabled=True requires evaluation_mode='template_and_rubric'
comparison_config = VerificationConfig(
    answering_models=[gpt4_mini, claude_sonnet],  # Both models answer
    parsing_models=[gpt4_mini],  # One model judges
    replicate_count=3,
    rubric_enabled=True,
    evaluation_mode="template_and_rubric"
)

# (`presets_dir` is the preset directory defined in the earlier cells.)
comparison_preset = comparison_config.save_preset(
    name="GPT-4 vs Claude Comparison",
    description="Compare GPT-4 and Claude on genomics questions",
    presets_dir=presets_dir
)

# List the model ids on each side of the comparison
print("✓ Created multi-model comparison preset")
print(f"  Answering models: {[m.id for m in comparison_config.answering_models]}")
print(f"  Parsing models: {[m.id for m in comparison_config.parsing_models]}")

In [None]:
# Usage example (printed, not executed)
print("Usage:")
print()
print("# Load and run comparison")
print("config = VerificationConfig.from_preset(")
print("    Path('presets/gpt-4-vs-claude-comparison.json')")
print(")")
print("results = benchmark.run_verification(config)")
print()
# Each result belongs to a single answering model, so label the parsed
# response with that model instead of printing the same field twice.
print("# Analyze by model")
print("for result in results.results:")
print("    print(f'Question: {result.metadata.question_text}')")
print("    print(f'  {result.metadata.answering_model}: {result.template.parsed_llm_response}')")

### Scenario 3: Feature-Specific Configurations

Create presets for testing specific features:

In [None]:
# Deep-judgment focused
# (`model_config` and `presets_dir` come from the earlier cells.)
# Only the deep-judgment options are set explicitly; everything else is left
# at whatever VerificationConfig uses when an option is omitted.
deep_judgment_config = VerificationConfig(
    answering_models=[model_config],
    parsing_models=[model_config],
    replicate_count=1,
    deep_judgment_enabled=True,
    deep_judgment_max_excerpts_per_attribute=5,
    deep_judgment_fuzzy_match_threshold=0.80
)

deep_judgment_preset = deep_judgment_config.save_preset(
    name="Deep Judgment Test",
    description="Testing deep-judgment parsing with 5 excerpts per attribute",
    presets_dir=presets_dir
)

# Abstention detection focused
# Only abstention detection is switched on explicitly here.
abstention_config = VerificationConfig(
    answering_models=[model_config],
    parsing_models=[model_config],
    replicate_count=1,
    abstention_enabled=True
)

abstention_preset = abstention_config.save_preset(
    name="Abstention Detection Test",
    description="Testing abstention detection on safety questions",
    presets_dir=presets_dir
)

print("✓ Created feature-specific presets:")
print(f"  1. {deep_judgment_preset['name']}")
print(f"  2. {abstention_preset['name']}")

## Best Practices

### 1. Use Descriptive Names

**Good names:**

- "GPT-4 Production Config"
- "Quick Smoke Test"
- "Claude with Deep Judgment"
- "Multi-Model Comparison Setup"

**Avoid:**

- Vague names: "Test 1", "Config", "Setup"
- Timestamp-only names: "2025-11-03"
- Overly long names (keep under 50 characters)

### 2. Add Meaningful Descriptions

Include context about when and why to use the preset:

In [None]:
# A description that says what the preset contains and when to use it
# (printed, not executed)
description_example = """
config.save_preset(
    name="Production Genomics",
    description="Standard production configuration for genomics benchmarks. "
                "Uses 3 replicates, enables rubrics and deep-judgment. "
                "Suitable for final evaluations before publication."
)
"""

print("Example:")
print(description_example)

### 3. Organize by Purpose

Create separate presets for different scenarios:

In [None]:
# Printed illustration (not executed): a name prefix groups presets by purpose
organization_demo = (
    "# Development presets",
    'quick_test_config.save_preset(name="Dev: Quick Test", description="...")',
    'debug_config.save_preset(name="Dev: Debug Mode", description="...")',
    "",
    "# Production presets",
    'standard_config.save_preset(name="Prod: Standard", description="...")',
    'comprehensive_config.save_preset(name="Prod: Comprehensive", description="...")',
    "",
    "# Experiment presets",
    'ablation_config.save_preset(name="Exp: Ablation Study", description="...")',
)
print("\n".join(organization_demo))

### 4. Version Control Your Presets

Track preset files in version control:

In [None]:
# Printed illustration (not executed): committing preset files to Git
version_control_demo = (
    "# Example Git commands for versioning presets",
    "git add presets/",
    "git commit -m 'Add genomics benchmark presets'",
    "",
    "This allows you to:",
    "  - Track changes to configurations over time",
    "  - Revert to previous configurations",
    "  - Share presets with teammates",
    "  - Document configuration evolution",
)
print("\n".join(version_control_demo))

### 5. Test Presets After Loading

Verify that loaded presets work as expected:

In [None]:
# Load and verify preset configuration
# (`full_preset` is the metadata returned when "Full Evaluation Scenario" was saved.)
config = VerificationConfig.from_preset(Path(full_preset['filepath']))

# Verify configuration
# The values are only printed here; compare them with what you expect for
# this preset before relying on it.
print(f"Answering models: {len(config.answering_models)}")
print(f"Parsing models: {len(config.parsing_models)}")
print(f"Replicate count: {config.replicate_count}")
print(f"Deep-judgment: {config.deep_judgment_enabled}")
print(f"Abstention: {config.abstention_enabled}")
print()
print("✓ Configuration verified successfully")


## Troubleshooting

### Issue 1: Preset File Not Found

**Symptom**: `FileNotFoundError` when loading preset

**Solution**:

In [None]:
from pathlib import Path

# A relative path like this one is resolved against the current working directory.
preset_path = Path("presets/my-config.json")

# Check for the file first; when it is missing, list what is available instead
# of letting from_preset fail.
if not preset_path.exists():
    print(f"Preset not found: {preset_path}")
    print("Available presets:")
    # `presets_dir` is the preset directory defined earlier in this guide.
    for p in presets_dir.glob("*.json"):
        print(f"  - {p.name}")
else:
    config = VerificationConfig.from_preset(preset_path)
    print("✓ Preset loaded successfully")

### Issue 2: Invalid Preset Configuration

**Symptom**: `ValidationError` when loading preset

**Solution**:

In [None]:
import json
from pathlib import Path

# Example: open a known-good preset and report which expected fields it has
preset_path = Path(full_preset['filepath'])

with open(preset_path) as f:
    data = json.load(f)

# A missing 'config' entry is treated as an empty mapping for the nested checks
config_section = data.get('config', {})

print("Preset structure check:")
print(f"  ✓ Has 'config' field: {'config' in data}")
print(f"  ✓ Has 'answering_models' in config: {'answering_models' in config_section}")
print(f"  ✓ Has 'parsing_models' in config: {'parsing_models' in config_section}")
print(f"  ✓ Has 'name' field: {'name' in data}")

## Next Steps

Once you have presets configured, continue with these guides:

- [Verification](../using-karenina/verification.md) - Run verifications with presets
- [Deep-Judgment](deep-judgment.md) - Configure deep-judgment in presets
- [Abstention Detection](abstention-detection.md) - Configure abstention in presets
- [Few-Shot Prompting](few-shot.md) - Add few-shot configuration to presets

## Related Documentation

- [Verification](../using-karenina/verification.md) - Core verification workflow
- [Saving and Loading](../using-karenina/saving-loading.md) - Checkpoint management
- [Deep-Judgment](deep-judgment.md) - Multi-stage parsing
- [Abstention Detection](abstention-detection.md) - Refusal detection