In [None]:
# Mock Setup - Hidden in rendered documentation
# This cell is tagged with "hide-cell" in notebook metadata

import hashlib
import json
import sys
import tempfile
from datetime import datetime
from pathlib import Path
from unittest.mock import MagicMock, patch

# Add karenina to path
# NOTE(review): this is an absolute, machine-specific location. On another
# machine the entry presumably points nowhere and the imports below would rely
# on an installed karenina instead - confirm before publishing the notebook.
sys.path.insert(0, "/Users/carli/Projects/karenina-monorepo/karenina/src")

# Temporary directory for file operations
# Created eagerly with the "karenina_docs_" prefix; _cleanup() removes it at exit.
TEMP_DIR = Path(tempfile.mkdtemp(prefix="karenina_docs_"))

# Import after path is set
from karenina.schemas.workflow.template_results import TemplateResults
from karenina.schemas.workflow.verification.result import VerificationResult
from karenina.schemas.workflow.verification.result_components import (
    VerificationResultMetadata,
    VerificationResultRubric,
    VerificationResultTemplate,
)
from karenina.schemas.workflow.verification_result_set import VerificationResultSet


# Mock LLM response generator
class MockLLMResponse:
    """Lightweight stand-in for a LangChain message.

    Exposes the two attributes the examples read: ``content`` (the reply text)
    and ``response_metadata`` (a fixed token-usage record). ``str()`` of an
    instance yields the reply text.
    """

    def __init__(self, content: str = "Mock response"):
        self.content = content
        # Constant usage figure so every mocked call reports the same cost.
        token_usage = {"total_tokens": 50}
        self.response_metadata = {"token_usage": token_usage}

    def __str__(self):
        text = self.content
        return text


class MockStructuredOutput:
    """Fake structured-output object with canned fields plus any extras.

    Four fields (``count``, ``target``, ``subunits``, ``diseases``) are always
    present; each takes the keyword argument of the same name when supplied and
    a canned value otherwise. Any other keyword argument becomes an attribute
    unless an attribute of that name already exists on the instance or class.
    """

    def __init__(self, **kwargs):
        canned = {
            "count": 46,
            "target": "BCL2",
            "subunits": 4,
            "diseases": ["asthma", "bronchitis", "pneumonia"],
        }
        for field, fallback in canned.items():
            setattr(self, field, kwargs.get(field, fallback))
        # Extras are attached last; names that already resolve (including the
        # methods below) are left untouched.
        for name, value in kwargs.items():
            if hasattr(self, name):
                continue
            setattr(self, name, value)

    def dict(self):
        """Return the instance attributes whose names do not start with ``_``."""
        public = {}
        for name, value in vars(self).items():
            if name.startswith("_"):
                continue
            public[name] = value
        return public

    def model_dump(self):
        """Alias of :meth:`dict` under the ``model_dump`` name."""
        return self.dict()


def create_mock_chat_model():
    """Create a mock chat model that returns predictable responses.

    The returned ``MagicMock`` answers ``invoke`` with a fixed
    ``MockLLMResponse``, answers ``with_structured_output(...)`` with a second
    mock whose ``invoke`` yields a default ``MockStructuredOutput``, and returns
    itself from ``bind_tools(...)``.

    The async entry points (``ainvoke``) are ``AsyncMock`` objects, so they can
    be awaited. A plain ``MagicMock`` attribute would hand back the response
    object directly and ``await model.ainvoke(...)`` would raise ``TypeError``.

    Returns:
        MagicMock: the configured chat-model mock.
    """
    # Local import keeps this block self-contained; AsyncMock lives next to
    # MagicMock in the standard library.
    from unittest.mock import AsyncMock

    chat_model = MagicMock()
    chat_model.invoke.return_value = MockLLMResponse("46 chromosomes")
    chat_model.ainvoke = AsyncMock(return_value=MockLLMResponse("46 chromosomes"))

    structured = MagicMock()
    structured.invoke.return_value = MockStructuredOutput()
    structured.ainvoke = AsyncMock(return_value=MockStructuredOutput())

    chat_model.with_structured_output.return_value = structured
    # Binding tools yields the same mock so chained calls keep working.
    chat_model.bind_tools.return_value = chat_model
    return chat_model


def compute_result_id(question_id: str, answering_model: str, parsing_model: str, timestamp: str) -> str:
    """Return a deterministic 16-character identifier for one verification run.

    The four arguments are combined with a ``None`` replicate and an empty
    MCP-server list, serialized as key-sorted ASCII JSON, hashed with SHA-256,
    and the first 16 hex characters of the digest are returned.
    """
    payload = dict(
        question_id=question_id,
        answering_model=answering_model,
        parsing_model=parsing_model,
        timestamp=timestamp,
        replicate=None,
        answering_mcp_servers=[],
    )
    # sort_keys makes the serialization independent of insertion order.
    canonical = json.dumps(payload, sort_keys=True, ensure_ascii=True)
    digest = hashlib.sha256(canonical.encode("utf-8")).hexdigest()
    return digest[:16]


def create_mock_verification_result(question_id: str, question_text: str, answer: str, passed: bool = True):
    """Assemble a ``VerificationResult`` filled with canned data.

    Args:
        question_id: Identifier stored in the metadata; its MD5 hex digest is
            used as the template id.
        question_text: Question text stored in the metadata.
        answer: Value used for the raw response sentence, both parsed
            responses, and the metadata's raw answer.
        passed: Value stored as the template's ``verify_result``.

    Returns:
        A ``VerificationResult`` built from template, rubric and metadata
        sections, timestamped with the current local time.
    """
    model_name = "gpt-4.1-mini"
    created_at = datetime.now().isoformat()
    # An MD5 hex digest is always 32 characters long, so it is used whole.
    template_id = hashlib.md5(str(question_id).encode()).hexdigest()

    token_usage = {
        "answer_generation": {"total_tokens": 50},
        "parsing": {"total_tokens": 30},
        "total": {"total_tokens": 80},
    }
    template_section = VerificationResultTemplate(
        raw_llm_response=f"The answer is {answer}.",
        parsed_llm_response={"value": answer},
        parsed_gt_response={"value": answer},
        verify_result=passed,
        template_verification_performed=True,
        usage_metadata=token_usage,
        abstention_check_performed=True,
        abstention_detected=False,
    )

    trait_scores = {"Conciseness": 4, "Clarity": True}
    rubric_section = VerificationResultRubric(
        rubric_evaluation_performed=True,
        llm_trait_scores=trait_scores,
    )

    # The same model name is used for both answering and parsing.
    result_id = compute_result_id(question_id, model_name, model_name, created_at)
    metadata_section = VerificationResultMetadata(
        question_id=question_id,
        template_id=template_id,
        completed_without_errors=True,
        question_text=question_text,
        raw_answer=answer,
        answering_model=model_name,
        parsing_model=model_name,
        execution_time=1.5,
        timestamp=created_at,
        result_id=result_id,
    )

    return VerificationResult(
        metadata=metadata_section,
        template=template_section,
        rubric=rubric_section,
    )


# Store original run_verification
# Placeholder; rebound to the real Benchmark.run_verification once Benchmark is imported.
_original_run_verification = None


def mock_run_verification(self, config):
    """Stand-in for ``Benchmark.run_verification`` that fabricates results.

    With no finished questions, the call is forwarded to the saved original
    method when one has been recorded, otherwise an empty result set is
    returned. With finished questions, one mock result is produced per
    question: the first canned entry whose keyword occurs in the lower-cased
    question text supplies the answer and pass flag; when nothing matches, the
    question's own ``raw_answer`` is used and the result is marked as passed.
    """
    finished = self.get_finished_questions(ids_only=False)

    if len(finished) == 0:
        if _original_run_verification:
            return _original_run_verification(self, config)
        return VerificationResultSet(results=[], template_results=TemplateResults(results=[]))

    # Keyword -> canned answer table; first matching entry wins.
    canned = [
        {"keywords": ["chromosomes"], "answer": "46", "passed": True},
        {"keywords": ["venetoclax", "bcl2"], "answer": "BCL2", "passed": True},
        {"keywords": ["hemoglobin", "subunits"], "answer": "4", "passed": True},
        {"keywords": ["inflammatory", "lung"], "answer": "asthma, bronchitis, pneumonia", "passed": True},
        {"keywords": ["2+2", "two plus two"], "answer": "4", "passed": True},
        {"keywords": ["3+3", "three plus three"], "answer": "6", "passed": True},
    ]

    results = []
    for question in finished:
        q_id, q_text = question["id"], question["question"]
        fallback_answer = question.get("raw_answer", "")
        lowered = q_text.lower()

        hit = next(
            (entry for entry in canned if any(kw in lowered for kw in entry["keywords"])),
            None,
        )
        if hit is None:
            verdict, answer = True, fallback_answer
        else:
            verdict, answer = hit["passed"], hit["answer"]

        results.append(
            create_mock_verification_result(question_id=q_id, question_text=q_text, answer=answer, passed=verdict)
        )

    return VerificationResultSet(
        results=results,
        template_results=TemplateResults(results=results),
        rubric_results=None,
    )


# Patch the LLM entry points so no example can construct a real provider client.
# NOTE(review): the karenina schema modules are already imported above, so any
# module that bound these names with `from ... import ...` before this point
# keeps the unpatched object - confirm none of them do.
def _mock_chat_model_factory(*args, **kwargs):
    """Ignore all constructor arguments and return a fresh mock chat model.

    Accepting positional as well as keyword arguments keeps the patch working
    when a patched factory is called positionally.
    """
    return create_mock_chat_model()


_llm_patches = [
    patch("langchain_openai.ChatOpenAI", side_effect=_mock_chat_model_factory),
    patch("langchain_anthropic.ChatAnthropic", side_effect=_mock_chat_model_factory),
    patch("langchain_google_genai.ChatGoogleGenerativeAI", side_effect=_mock_chat_model_factory),
    patch(
        "karenina.infrastructure.llm.interface.init_chat_model_unified",
        side_effect=_mock_chat_model_factory,
    ),
]

# Patches stay active until _cleanup() stops them.
for p in _llm_patches:
    p.start()

# Patch Benchmark.run_verification
from karenina.benchmark import Benchmark

# Keep the real method so mock_run_verification can delegate to it (when no
# questions are finished) and _cleanup() can restore it at exit.
_original_run_verification = Benchmark.run_verification
# Class-level replacement: every Benchmark instance now uses the mock.
Benchmark.run_verification = mock_run_verification


def temp_path(filename: str) -> Path:
    """Return ``filename`` joined onto the notebook's temporary directory."""
    return TEMP_DIR.joinpath(filename)


# Cleanup
import atexit
import shutil


def _cleanup():
    """Undo the mock setup: restore the real method, stop patches, drop temp files.

    Stopping patches is best-effort: a failure on one patch is reported on
    stderr and the remaining patches are still stopped. Only ``Exception``
    subclasses are caught, so ``KeyboardInterrupt`` and ``SystemExit`` are no
    longer swallowed. The temporary directory is removed even if stopping the
    patches is interrupted.
    """
    Benchmark.run_verification = _original_run_verification
    try:
        for llm_patch in _llm_patches:
            try:
                llm_patch.stop()
            except Exception as exc:
                print(f"Warning: failed to stop mock patch: {exc}", file=sys.stderr)
    finally:
        shutil.rmtree(TEMP_DIR, ignore_errors=True)


# Run _cleanup when the interpreter exits so patches and temp files do not linger.
atexit.register(_cleanup)

# Status banner for whoever executes the hidden cell.
print("✓ Mock setup complete")
print(f"✓ Temp directory: {TEMP_DIR}")
# NOTE(review): this path is a literal repeated from the sys.path entry above,
# not read back from the imported package - keep the two in sync.
print("✓ Karenina package loaded from: /Users/carli/Projects/karenina-monorepo/karenina/src")
print("✓ Mock verification results enabled - examples will show realistic output")

# Manual Trace System

This guide explains how to use the Manual Trace System to evaluate pre-generated LLM responses without making live API calls during verification.

## Overview

The Manual Trace System enables you to provide pre-generated answer traces directly to the Karenina verification engine, bypassing the real-time LLM answer generation step. This feature is designed for standalone backend usage, allowing you to:

- Evaluate pre-recorded LLM responses (from previous experiments, other systems, or manual collection)
- Compare different answer generation approaches without re-running models
- Test verification/rubric systems with controlled answers
- Integrate external LLM outputs into Karenina's evaluation framework

The system supports both simple string traces and rich LangChain message lists with automatic tool call metrics extraction.

## Key Capabilities

- **Programmatic Trace Management**: `ManualTraces` class for managing traces tied to benchmarks
- **Flexible Registration**: Register traces by question hash (MD5) or question text with automatic mapping
- **Dual Format Support**:
  - Simple string traces (plain text answers)
  - LangChain message lists (AIMessage, ToolMessage, etc.) with automatic preprocessing
- **Agent Metrics Extraction**: Automatic tool call counting and failure detection from message lists
- **Batch Registration**: Efficient bulk trace registration with `register_traces()`
- **Post-Config Population**: Populate traces after `ModelConfig` creation for flexible workflows
- **Preset Compatibility**: Manual configs work with preset system (traces excluded from serialization)
- **Session-Based Storage**: Thread-safe, time-bounded trace storage with automatic cleanup
- **Backward Compatible**: Maintains compatibility with existing GUI-based manual trace upload

## Quick Start

### Basic Usage with String Traces

In [None]:
import hashlib

from karenina.benchmark import Benchmark
from karenina.infrastructure.llm.manual_traces import ManualTraces, get_manual_trace_count
from karenina.schemas import ModelConfig, VerificationConfig

# Note: Using finished=True instead of answer_template for notebook examples
# This avoids source code extraction issues with dynamically defined classes

# Create benchmark
benchmark = Benchmark("my_experiment")
benchmark.add_question(question="What is 2+2?", raw_answer="4", finished=True)  # Question marked as finished

# Initialize manual traces
manual_traces = ManualTraces(benchmark)

# Register trace by question text
manual_traces.register_trace("What is 2+2?", "The answer is 4. I computed this by adding 2 and 2.", map_to_id=True)

# Report through the public API rather than the benchmark's private question
# cache: the count comes from the trace store, and the hash is the MD5 of the
# UTF-8 encoded question text, which is the key the trace is registered under.
question_hash = hashlib.md5("What is 2+2?".encode("utf-8")).hexdigest()
print(f"Registered {get_manual_trace_count()} traces")
print(f"Question text maps to hash: {question_hash}...")

In [None]:
# Create manual config
# Only the interface and the trace container are given; id and model_name are
# left for ModelConfig to fill in for the manual interface.
manual_config = ModelConfig(interface="manual", manual_traces=manual_traces)

# Create judge config
# The judge is a regular LangChain-backed model used to parse the traces.
judge_config = ModelConfig(
    id="gpt-4.1-mini",
    model_provider="openai",
    model_name="gpt-4.1-mini",
    temperature=0.0,
    interface="langchain",
    system_prompt="You are an expert judge.",
)

# Show the resulting identifiers so the two roles are easy to tell apart.
print(f"Manual config ID: {manual_config.id}")
print(f"Manual config interface: {manual_config.interface}")
print(f"Judge config: {judge_config.id}")

In [None]:
# Create verification config
# Answers come from the manual traces; the judge model parses them.
config = VerificationConfig(answering_models=[manual_config], parsing_models=[judge_config])

# Run verification
results = benchmark.run_verification(config)

print("Verification complete!")
print(f"Results: {len(results.results)} question(s) evaluated")
# Count the results whose template check came back truthy.
print(f"Passed: {sum(1 for r in results.results if r.template.verify_result)}")

## Architecture

### Core Components

#### 1. `ManualTraces` Class
**Location**: `karenina/src/karenina/infrastructure/llm/manual_traces.py`

**Purpose**: High-level API for managing manual traces for a specific benchmark

**Key Methods**:
- `__init__(benchmark)` - Initialize with benchmark for question mapping
- `register_trace(question_identifier, trace, map_to_id=False)` - Register single trace
- `register_traces(traces_dict, map_to_id=False)` - Batch register traces
- `_question_text_to_hash(question_text)` - Convert text to MD5 hash with validation
- `_preprocess_trace(trace)` - Handle both string and LangChain message formats

#### 2. `ManualTraceManager` Class
**Location**: `karenina/src/karenina/infrastructure/llm/manual_traces.py`

**Purpose**: Session-based thread-safe storage for manual traces

**Key Features**:
- Thread-safe storage with `threading.RLock()`
- Session timeout (default: 1 hour) with automatic cleanup
- Storage for both traces and agent metrics
- MD5 hash validation

#### 3. `ManualLLM` Class
**Location**: `karenina/src/karenina/infrastructure/llm/manual_llm.py`

**Purpose**: LangChain-compatible LLM that returns precomputed traces

**Key Methods**:
- `invoke(messages)` - Return precomputed trace as AIMessage
- `get_agent_metrics()` - Retrieve agent metrics for trace
- `with_structured_output(schema)` - Compatibility method

#### 4. `ModelConfig` Integration

**New Field**: `manual_traces: Any = Field(default=None, exclude=True)`

**Validation**:
- Enforces `manual_traces` requirement for manual interface
- Auto-sets `id="manual"` and `model_name="manual"` for manual interface
- Validates that MCP tools are not used with manual interface

## User Workflows

### Workflow 1: Basic String Traces

In [None]:
from karenina.benchmark import Benchmark
from karenina.infrastructure.llm.manual_traces import ManualTraces
from karenina.schemas import ModelConfig, VerificationConfig

# Create benchmark
benchmark = Benchmark("simple_example")
benchmark.add_question(question="What is 2+2?", raw_answer="4", finished=True)
benchmark.add_question(question="What is 3+3?", raw_answer="6", finished=True)

# Initialize manual traces
manual_traces = ManualTraces(benchmark)

# Register traces by question text
# map_to_id=True: the first argument is question text, not a precomputed hash.
manual_traces.register_trace("What is 2+2?", "The answer is 4. I added 2 and 2 to get 4.", map_to_id=True)

manual_traces.register_trace("What is 3+3?", "The answer is 6. I added 3 and 3 to get 6.", map_to_id=True)

# NOTE: the number printed is the count of finished questions in the benchmark,
# which equals the number of traces here because one trace was registered per question.
print(f"Registered {len(list(benchmark.get_finished_questions()))} traces")
# List each finished question, truncated to its first 30 characters.
for q in benchmark.get_finished_questions(ids_only=False):
    print(f"  - {q['question'][:30]}...")

In [None]:
# Create manual config
# Reuses the manual_traces container populated in the previous cell.
manual_config = ModelConfig(interface="manual", manual_traces=manual_traces)

# Create judge config
judge_config = ModelConfig(
    id="gpt-4.1-mini", model_provider="openai", model_name="gpt-4.1-mini", temperature=0.0, interface="langchain"
)

# Run verification
# Manual traces act as the answering model; the judge is the parsing model.
config = VerificationConfig(answering_models=[manual_config], parsing_models=[judge_config])

results = benchmark.run_verification(config)
print(f"Evaluated {len(results.results)} questions")

### Workflow 2: LangChain Message Lists with Tool Calls

When you use LangChain message lists (with `AIMessage`, `ToolMessage`, etc.), the system automatically extracts agent metrics like tool call counts and failures.

In [None]:
from langchain_core.messages import AIMessage, ToolMessage

from karenina.infrastructure.llm.manual_traces import ManualTraces

# Create a new benchmark for this example
# Benchmark is already imported by an earlier cell.
benchmark = Benchmark("agent_workflow")
benchmark.add_question(question="What is 6 times 7?", raw_answer="42", finished=True)

# Initialize manual traces
manual_traces = ManualTraces(benchmark)

# Register trace with tool calls
# The conversation: an opening AI message, two tool results, then the final answer.
messages = [
    AIMessage(content="I need to calculate this"),
    ToolMessage(name="calculator", content="Result: 42", tool_call_id="call_calc_001"),
    ToolMessage(name="validator", content="Validation passed", tool_call_id="call_valid_002"),
    AIMessage(content="The answer is 42. I verified this using a calculator and validator."),
]

# A message list is passed where the string-trace examples pass plain text.
manual_traces.register_trace("What is 6 times 7?", messages, map_to_id=True)

print("Registered trace with tool calls")
print("Agent metrics automatically extracted:")
# NOTE: the figures below are literal strings describing the expected metrics
# for this trace; they are not read back from the trace store.
print("  - tool_calls: 2")
print("  - unique_tools_used: 2 (calculator, validator)")
print("  - iterations: 1")

### Workflow 3: Batch Registration

For efficiency, you can register multiple traces at once using a dictionary.

In [None]:
from langchain_core.messages import AIMessage, ToolMessage

# Create benchmark
benchmark = Benchmark("batch_example")
benchmark.add_question(question="Question 1?", raw_answer="Answer 1", finished=True)
benchmark.add_question(question="Question 2?", raw_answer="Answer 2", finished=True)
benchmark.add_question(question="Question 3?", raw_answer="Answer 3", finished=True)

# Initialize manual traces
manual_traces = ManualTraces(benchmark)

# Prepare traces dictionary
# Keys are question texts; values may be plain strings or message lists.
traces = {
    "Question 1?": "Answer 1 with explanation.",
    "Question 2?": [
        AIMessage(content="Thinking..."),
        ToolMessage(name="tool", content="data", tool_call_id="call_1"),
        AIMessage(content="Answer 2 with context."),
    ],
    "Question 3?": "Answer 3 with details.",
}

# Batch register all at once
# map_to_id=True applies to every key in the dictionary.
manual_traces.register_traces(traces, map_to_id=True)

# The count printed is the size of the dictionary that was submitted.
print(f"Batch registered {len(traces)} traces")
print("All traces now available for verification")

### Workflow 4: Register by Question Hash

You can also register traces using the MD5 hash directly instead of question text.

In [None]:
# hashlib is imported here so the example runs when copied on its own.
import hashlib

# Create benchmark
benchmark = Benchmark("hash_example")
benchmark.add_question(question="What is 2+2?", raw_answer="4", finished=True)

# Initialize manual traces
manual_traces = ManualTraces(benchmark)

# Compute hash manually or get from CSV mapper export
# The hash is the MD5 hex digest of the exact question text bytes.
question_hash = hashlib.md5(b"What is 2+2?").hexdigest()

print(f"Question hash: {question_hash}")

# Register by hash (map_to_id=False, default)
manual_traces.register_trace(question_hash, "The answer is 4.", map_to_id=False)

print("Registered trace using hash directly")

### Workflow 5: Populate Traces After Config Creation

You can create the config structure first and populate traces later, enabling flexible workflows.

In [None]:
# Create benchmark
benchmark = Benchmark("delayed_population")
benchmark.add_question(question="What is 2+2?", raw_answer="4", finished=True)

# 1. Create ManualTraces and ModelConfig upfront
# The config holds a reference to manual_traces, which is still empty here.
manual_traces = ManualTraces(benchmark)
manual_config = ModelConfig(interface="manual", manual_traces=manual_traces)

print("Created config with empty traces")


# 2. Later, populate traces (e.g., from file, database, API)
def load_traces_from_source():
    # Stub standing in for a real loader; returns a fixed question -> trace mapping.
    return {"What is 2+2?": "The answer is 4."}


# Register each loaded trace on the same container the config already references.
for question_text, trace_content in load_traces_from_source().items():
    manual_traces.register_trace(question_text, trace_content, map_to_id=True)

print("Populated traces from external source")

# 3. Run verification with populated traces
judge_config = ModelConfig(id="gpt-4.1-mini", model_provider="openai", model_name="gpt-4.1-mini", interface="langchain")

config = VerificationConfig(answering_models=[manual_config], parsing_models=[judge_config])

results = benchmark.run_verification(config)
print(f"Verification complete: {len(results.results)} results")

## Implementation Details

### Question Mapping

**Hash Generation**:
- Uses MD5 hash of UTF-8 encoded question text: `hashlib.md5(question_text.encode("utf-8")).hexdigest()`
- Same algorithm as `Question.id` property in `schemas/domain/question.py`
- Results in 32-character hexadecimal string

**Validation**:
- When `map_to_id=True`, question text is searched in benchmark's `_questions_cache`
- Raises `ValueError` if question not found, with computed hash and available count
- Exact text matching (case-sensitive, including whitespace)

### Trace Format Processing

**String Traces**:
- Stored as-is with no preprocessing
- No agent metrics extracted (`metrics = None`)
- Simplest format for basic answer evaluation

**LangChain Message Lists**:
1. **Validation**: Must be list of `BaseMessage` objects (AIMessage, ToolMessage, etc.)
2. **Metrics Extraction**:
   - Calls `_extract_agent_metrics(response)` from `verification_utils.py`
   - Extracts: tool calls, tool failures, iterations
3. **Harmonization**:
   - Calls `harmonize_agent_response(response)` from `mcp_utils.py`
   - Converts message list to unified string format
4. **Storage**: Both harmonized trace and metrics stored together

### Agent Metrics Structure

When using LangChain message lists, the following metrics are automatically extracted:

In [None]:
# Illustrative agent-metrics record
example_metrics = dict(
    tool_calls=3,  # how many tool invocations occurred
    unique_tools_used=2,  # how many distinct tools were involved
    failed_tool_calls=0,  # how many invocations failed
    iterations=1,  # how many agent iterations ran
)

print("Agent Metrics Structure:")
# One indented "name: value" line per metric, in insertion order.
print("\n".join(f"  {name}: {amount}" for name, amount in example_metrics.items()))

### Session-Based Storage

**Design**:
- Global singleton `ManualTraceManager` instance
- Thread-safe with `threading.RLock()`
- Session timeout: 1 hour (3600 seconds)
- Automatic cleanup of expired traces

**Cleanup Strategy**:
1. **Timer-Based**: `threading.Timer` triggers cleanup after timeout
2. **Activity-Based**: Timer resets on any trace access
3. **Trace-Level**: Individual traces have timestamps, expired traces removed
4. **Session-Level**: If no activity for timeout period, entire session clears

### ModelConfig Validation

**Requirements for Manual Interface**:
1. `interface` must be `"manual"`
2. `manual_traces` must not be `None` (raises `ValueError` if missing)
3. `id` defaults to `"manual"` if not provided
4. `model_name` defaults to `"manual"` if not provided
5. `mcp_urls_dict` must be `None` (raises `ValueError` if MCP configured)

**Preset Compatibility**:
- `manual_traces` field marked with `Field(exclude=True)`
- Automatically excluded from Pydantic serialization
- Presets save config structure but not trace data
- Traces must be re-populated when loading preset

## Best Practices

### 1. Question Text Matching

**Do**:
- Use exact question text from benchmark (case-sensitive, including whitespace)
- Use `map_to_id=True` when working with question text
- Verify question text matches benchmark before registration

**Don't**:
- Modify question text (trim whitespace, change case, etc.)
- Assume approximate matching will work
- Register traces for questions not in benchmark

**Tip**: Export CSV mapper from benchmark to see exact question text and hashes

### 2. Trace Format Selection

**Use String Traces When**:
- Answers are simple text without tool calls
- No agent metrics needed
- The simplest workflow is sufficient

**Use LangChain Message Lists When**:
- Preserving tool call history is important
- Agent metrics (tool calls, failures) are valuable
- Comparing agent-based vs. non-agent-based approaches
- Debugging tool usage patterns

### 3. Error Handling

**Common Errors**:

1. **Question Not Found**:
```
ValueError: Question not found in benchmark: 'What is 2+2?...'
Computed hash: 936dbc8755f623c951d96ea2b03e13bc
```
**Fix**: Verify exact question text matches benchmark, check for whitespace/case differences

2. **Invalid Hash Format**:
```
ManualTraceError: Invalid question hash format: 'short'
```
**Fix**: Ensure hash is 32-character hexadecimal MD5

3. **Missing Manual Traces**:
```
ValueError: manual_traces is required when interface='manual'
```
**Fix**: Pass `manual_traces` to ModelConfig constructor

4. **MCP Configuration Conflict**:
```
ValueError: MCP tools are not supported with manual interface
```
**Fix**: Remove `mcp_urls_dict` from manual ModelConfig

### 4. Performance Optimization

**Batch Registration**:
- Use `register_traces()` instead of multiple `register_trace()` calls
- Reduces overhead for large trace sets
- More readable code

**Memory Management**:
- Monitor trace count with `get_manual_trace_count()`
- Check memory usage with `get_memory_usage_info()`
- Clear traces with `clear_manual_traces()` when done

**Session Cleanup**:
- Traces auto-expire after 1 hour of inactivity
- Manual cleanup with `clear_manual_traces()` if needed
- Activity resets timeout (any trace access)

### 5. Testing and Validation

**Before Running Verification**, verify traces were registered correctly:

In [None]:
import hashlib

from karenina.infrastructure.llm.manual_traces import (
    clear_manual_traces,
    get_manual_trace,
    get_manual_trace_count,
    has_manual_trace,
)

# Start from an empty trace store: storage is shared across the whole session,
# so traces registered by earlier examples would otherwise inflate the count
# compared below.
clear_manual_traces()

# Create benchmark and register a trace
benchmark = Benchmark("validation_example")
benchmark.add_question(question="What is 2+2?", raw_answer="4", finished=True)

manual_traces = ManualTraces(benchmark)
manual_traces.register_trace("What is 2+2?", "The answer is 4.", map_to_id=True)

# Verify trace was registered
# Traces are looked up by the MD5 hex digest of the exact question text.
question_hash = hashlib.md5(b"What is 2+2?").hexdigest()
print(f"Question hash: {question_hash}")
print(f"Has trace: {has_manual_trace(question_hash)}")

# Retrieve and inspect trace
trace = get_manual_trace(question_hash)
print(f"Registered trace: {trace[:50]}...")

# Check trace count
# One trace per finished question is expected for this benchmark.
expected_count = len(list(benchmark.get_finished_questions()))
actual_count = get_manual_trace_count()
print(f"Expected traces: {expected_count}")
print(f"Actual traces: {actual_count}")

if actual_count != expected_count:
    print(f"Warning: Expected {expected_count} traces, have {actual_count}")
else:
    print("All traces registered correctly!")

### 6. Preset Workflow

**Saving Presets with Manual Configs**:

In [None]:
# Example: Saving preset with manual config
benchmark = Benchmark("preset_example")
benchmark.add_question(question="What is 2+2?", raw_answer="4", finished=True)

manual_traces = ManualTraces(benchmark)
manual_traces.register_trace("What is 2+2?", "The answer is 4.", map_to_id=True)

manual_config = ModelConfig(interface="manual", manual_traces=manual_traces)
judge_config = ModelConfig(id="gpt-4.1-mini", model_provider="openai", model_name="gpt-4.1-mini", interface="langchain")

config = VerificationConfig(answering_models=[manual_config], parsing_models=[judge_config])

# Manual traces excluded from serialization
config_dict = config.model_dump()
# manual_traces is a field of the answering ModelConfig, so the check has to
# look inside that model's serialized entry. The top-level dictionary never has
# such a key, which would make the check pass trivially.
answering_model_dict = config_dict["answering_models"][0]
print("manual_traces in config dict:", "manual_traces" in answering_model_dict)
print("Preset can be saved without trace data")

In [None]:
# Example: Loading preset and re-populating manual traces
# The preset-service calls are left commented out as an outline; only the
# trace re-population below actually runs.

# 1. Load preset (manual_traces will be None)
# loaded_config = preset_service.load_preset("my_preset")

# 2. Re-populate manual traces
# Reuses the `benchmark` object created in the previous cell.
manual_traces = ManualTraces(benchmark)
traces_dict = {"What is 2+2?": "The answer is 4."}
manual_traces.register_traces(traces_dict, map_to_id=True)

# 3. Update config with traces
# loaded_config.answering_models[0].manual_traces = manual_traces

# 4. Run verification
# benchmark.run_verification(loaded_config)

# Recap of the steps for readers of the rendered output.
print("When loading presets with manual configs:")
print("1. Load preset (manual_traces will be None)")
print("2. Re-populate manual traces from your data source")
print("3. Update config with new manual_traces")
print("4. Run verification")

## Complete Examples

### Example 1: Simple String Traces

In [None]:
from karenina.benchmark import Benchmark
from karenina.infrastructure.llm.manual_traces import ManualTraces
from karenina.schemas import ModelConfig, VerificationConfig

# Create benchmark
benchmark = Benchmark("complete_example_1")
benchmark.add_question(question="What is 2+2?", raw_answer="4", finished=True)
benchmark.add_question(question="What is 3+3?", raw_answer="6", finished=True)

# Initialize manual traces
manual_traces = ManualTraces(benchmark)

# Register traces by question text
# The text must match the benchmark question exactly because map_to_id=True looks it up.
manual_traces.register_trace("What is 2+2?", "The answer is 4. I added 2 and 2 to get 4.", map_to_id=True)

manual_traces.register_trace("What is 3+3?", "The answer is 6. I added 3 and 3 to get 6.", map_to_id=True)

# Create manual config
manual_config = ModelConfig(interface="manual", manual_traces=manual_traces)

# Create judge config
judge_config = ModelConfig(
    id="gpt-4.1-mini", model_provider="openai", model_name="gpt-4.1-mini", temperature=0.0, interface="langchain"
)

# Run verification
config = VerificationConfig(answering_models=[manual_config], parsing_models=[judge_config])

results = benchmark.run_verification(config)
print(f"Example 1 complete: {len(results.results)} results")
# One line per result: truncated question text and the template verdict.
for r in results.results:
    print(f"  - {r.metadata.question_text[:30]}... passed: {r.template.verify_result}")

### Example 2: Batch Registration with Mixed Formats

In [None]:
from langchain_core.messages import AIMessage, ToolMessage

# Create benchmark
benchmark = Benchmark("complete_example_2")
benchmark.add_question(question="Question 1?", raw_answer="Answer 1", finished=True)
benchmark.add_question(question="Question 2?", raw_answer="Answer 2", finished=True)
benchmark.add_question(question="Question 3?", raw_answer="Answer 3", finished=True)

# Initialize manual traces
manual_traces = ManualTraces(benchmark)

# Prepare traces dictionary with mixed formats
# Questions 1 and 3 use plain strings; question 2 uses a message list with a tool result.
traces = {
    "Question 1?": "Answer 1 is a simple string trace.",
    "Question 2?": [
        AIMessage(content="Let me think about this..."),
        ToolMessage(name="search", content="Found relevant data", tool_call_id="call_1"),
        AIMessage(content="Answer 2 based on search results."),
    ],
    "Question 3?": "Answer 3 is another string trace.",
}

# Batch register all at once
manual_traces.register_traces(traces, map_to_id=True)

# The summary below is descriptive text; the count is the size of `traces`.
print(f"Example 2: Batch registered {len(traces)} traces")
print("  - Question 1: String trace")
print("  - Question 2: Message list with tool calls")
print("  - Question 3: String trace")

### Example 3: Delayed Trace Population

In [None]:
# Create benchmark
benchmark = Benchmark("complete_example_3")
benchmark.add_question(question="What is 2+2?", raw_answer="4", finished=True)

# Step 1: Create config structure early
# manual_traces is still empty when the config is built.
manual_traces = ManualTraces(benchmark)
manual_config = ModelConfig(interface="manual", manual_traces=manual_traces)

# Step 2: Pass config around, set up verification structure
judge_config = ModelConfig(id="gpt-4.1-mini", model_provider="openai", model_name="gpt-4.1-mini", interface="langchain")

config = VerificationConfig(answering_models=[manual_config], parsing_models=[judge_config])


# Step 3: Later, populate traces (e.g., from file load, API call)
def load_traces_from_file(filepath):
    # Stub for illustration: `filepath` is not read; a fixed mapping is returned.
    return {"What is 2+2?": "The answer is 4."}


traces_data = load_traces_from_file("experiment_traces.json")

# Register on the same container the config was created with.
for question_text, trace_content in traces_data.items():
    manual_traces.register_trace(question_text, trace_content, map_to_id=True)

# Step 4: Run verification with populated traces
results = benchmark.run_verification(config)
print(f"Example 3 complete: {len(results.results)} results")

## Summary

The Manual Trace System enables:

1. **Pre-Generated Answer Evaluation** - Evaluate LLM responses without making live API calls
2. **Flexible Trace Formats** - Support for both simple strings and rich message lists
3. **Agent Metrics Extraction** - Automatic tool call and failure tracking
4. **Efficient Workflows** - Batch registration, delayed population, preset compatibility
5. **Production-Ready** - Thread-safe, session-based, with automatic cleanup

**Key Workflow**: Create benchmark → Initialize `ManualTraces` → Register traces → Create `ModelConfig` with `interface="manual"` → Run verification

---

**Related Documentation**:
- **Quick Start**: Basic verification workflow
- **Verification**: Complete verification documentation
- **Configuration**: Model and provider configuration
- **TaskEval**: Evaluate agent workflow traces (similar concept for pre-existing outputs)