In [None]:
# Mock Setup - Hidden in rendered documentation
# This cell sets up mocking infrastructure for executable examples

import hashlib
import json
import sys
import tempfile
from datetime import datetime
from pathlib import Path
from unittest.mock import MagicMock, patch

# Add karenina to path so the imports below resolve to a source checkout.
# NOTE(review): this is a machine-specific absolute path. On any other machine
# the entry matches nothing and karenina must already be importable - confirm
# whether this should be derived from the notebook location instead.
sys.path.insert(0, "/Users/carli/Projects/karenina-monorepo/karenina/src")

# Temporary directory for file operations; it is deleted again by the
# atexit-registered _cleanup() defined later in this cell.
TEMP_DIR = Path(tempfile.mkdtemp(prefix="karenina_docs_"))

# Import after path is set
from karenina.schemas.results import TemplateResults, VerificationResultSet
from karenina.schemas.verification import (
    VerificationResult,
    VerificationResultMetadata,
    VerificationResultRubric,
    VerificationResultTemplate,
)


# Mock LLM response generator
class MockLLMResponse:
    """Stand-in for the message object a chat model returns.

    Carries the reply text in ``content`` and a fixed token-usage record
    (50 total tokens) in ``response_metadata``.
    """

    def __init__(self, content: str = "Mock response"):
        usage = {"total_tokens": 50}
        self.content = content
        self.response_metadata = {"token_usage": usage}

    def __str__(self):
        # The string form of a response is just its reply text.
        reply_text = self.content
        return reply_text


class MockStructuredOutput:
    """Generic structured-output stand-in usable with any answer template.

    Three fields (``finding``, ``status``, ``gene``) always exist and fall
    back to canned values; every other keyword argument becomes an attribute
    unless the instance already has something under that name.
    """

    def __init__(self, **kwargs):
        # Fields that are always present, with the value used when not supplied
        baseline = (
            ("finding", "BCL2 inhibition shows promise in treating certain cancers"),
            ("status", "In clinical trials"),
            ("gene", "BCL2"),
        )
        for field, fallback in baseline:
            setattr(self, field, kwargs.get(field, fallback))

        for field, value in kwargs.items():
            # Never overwrite what is already there (baseline fields, methods)
            if hasattr(self, field):
                continue
            setattr(self, field, value)

    def dict(self):
        """Return the public (non-underscore) attributes as a plain dict."""
        public = {}
        for name, value in vars(self).items():
            if name.startswith("_"):
                continue
            public[name] = value
        return public

    def model_dump(self):
        """Alias of :meth:`dict`."""
        return self.dict()


def create_mock_chat_model():
    """Create a mock chat model that returns predictable responses.

    The returned MagicMock answers ``invoke`` synchronously and ``ainvoke``
    as an awaitable, both with the same canned MockLLMResponse.
    ``with_structured_output(...)`` yields a second mock whose ``invoke`` /
    ``ainvoke`` return a default MockStructuredOutput, and ``bind_tools(...)``
    returns the model itself.

    Returns:
        MagicMock: the configured mock chat model.
    """
    # Imported here so the function does not depend on the file-level import list.
    from unittest.mock import AsyncMock

    reply_text = "BCL2 inhibition shows promise in treating certain cancers"

    mock = MagicMock()
    mock.invoke.return_value = MockLLMResponse(reply_text)
    # A plain MagicMock return value is not awaitable, so ``await
    # model.ainvoke(...)`` would raise TypeError; AsyncMock returns a coroutine.
    mock.ainvoke = AsyncMock(return_value=MockLLMResponse(reply_text))

    structured_mock = MagicMock()
    structured_mock.invoke.return_value = MockStructuredOutput()
    structured_mock.ainvoke = AsyncMock(return_value=MockStructuredOutput())
    mock.with_structured_output.return_value = structured_mock

    # Binding tools hands back the same mock, so tool-enabled models reply identically.
    mock.bind_tools.return_value = mock
    return mock


def compute_result_id(question_id: str, answering_model: str, parsing_model: str, timestamp: str) -> str:
    """Derive a stable 16-hex-character identifier for one verification run.

    The four arguments, together with an empty MCP-server list and a null
    replicate, are serialised as key-sorted ASCII JSON. The identifier is the
    first 16 characters of the SHA-256 hex digest of that JSON.
    """
    payload = dict(
        question_id=question_id,
        answering_model=answering_model,
        parsing_model=parsing_model,
        timestamp=timestamp,
        replicate=None,
        answering_mcp_servers=[],
    )
    # sort_keys makes the serialisation independent of the insertion order above.
    canonical = json.dumps(payload, sort_keys=True, ensure_ascii=True).encode("utf-8")
    digest = hashlib.sha256(canonical).hexdigest()
    return digest[:16]


def create_mock_verification_result(question_id: str, question_text: str, answer: str, passed: bool = True):
    """Assemble a populated VerificationResult without calling any model.

    The result combines a template section (raw/parsed responses built from
    ``answer``, the ``passed`` verdict, fixed token counts), a rubric section
    with two fixed trait scores, and metadata stamped with the current time
    and a result id derived from the question, model names and timestamp.
    """
    model_name = "gpt-4.1-mini"
    created_at = datetime.now().isoformat()
    # An MD5 hex digest is already 32 characters, so no slicing is needed.
    template_id = hashlib.md5(str(question_id).encode()).hexdigest()

    # Template section
    token_usage = {
        "answer_generation": {"total_tokens": 150},
        "parsing": {"total_tokens": 30},
        "total": {"total_tokens": 180},
    }
    template_part = VerificationResultTemplate(
        raw_llm_response=f"Based on web search, {answer}.",
        parsed_llm_response={"finding": answer},
        parsed_gt_response={"finding": answer},
        verify_result=passed,
        template_verification_performed=True,
        usage_metadata=token_usage,
        abstention_check_performed=True,
        abstention_detected=False,
    )

    # Rubric section
    trait_scores = {"Conciseness": 4, "Clarity": True}
    rubric_part = VerificationResultRubric(
        rubric_evaluation_performed=True,
        llm_trait_scores=trait_scores,
    )

    # Metadata section; the same model name serves as answering and parsing model
    result_id = compute_result_id(question_id, model_name, model_name, created_at)
    metadata_part = VerificationResultMetadata(
        question_id=question_id,
        template_id=template_id,
        completed_without_errors=True,
        question_text=question_text,
        raw_answer=answer,
        answering_model=model_name,
        parsing_model=model_name,
        execution_time=2.5,
        timestamp=created_at,
        result_id=result_id,
    )

    return VerificationResult(metadata=metadata_part, template=template_part, rubric=rubric_part)


# Holds the real Benchmark.run_verification once the patch further down is installed
_original_run_verification = None


def mock_run_verification(self, config):
    """Replacement for Benchmark.run_verification that fabricates results.

    With no finished questions it defers to the saved original implementation
    when one exists, otherwise it returns an empty result set. With finished
    questions it builds one mock result per question: the first canned entry
    with a keyword occurring in the lower-cased question text supplies the
    answer and verdict; otherwise the question's own ``raw_answer`` is used
    and the result is marked as passed.
    """
    # Get all finished questions
    finished = self.get_finished_questions(ids_only=False)

    if len(finished) == 0:
        if _original_run_verification:
            return _original_run_verification(self, config)
        return VerificationResultSet(results=[], template_results=TemplateResults(results=[]))

    # (keywords, canned answer, verdict) - checked in order, first match wins
    canned = (
        (
            ("bcl2", "cancer"),
            "BCL2 inhibition shows promise in treating certain cancers",
            True,
        ),
        (
            ("crispr", "hemoglobin"),
            "CRISPR treatments for sickle cell disease are in clinical trials",
            True,
        ),
    )

    results = []
    for question in finished:
        q_id = question["id"]
        q_text = question["question"]
        fallback_answer = question.get("raw_answer", "")
        lowered = q_text.lower()

        hit = next(
            (entry for entry in canned if any(kw in lowered for kw in entry[0])),
            None,
        )
        if hit is None:
            mock_ans, passed = fallback_answer, True
        else:
            _, mock_ans, passed = hit

        results.append(
            create_mock_verification_result(question_id=q_id, question_text=q_text, answer=mock_ans, passed=passed)
        )

    return VerificationResultSet(
        results=results,
        template_results=TemplateResults(results=results),
        rubric_results=None,
    )


# Mock MCP utilities for documentation examples
def mock_fetch_tool_descriptions(mcp_urls_dict, tool_filter=None):
    """Return a fixed tool-name -> description mapping without contacting a server.

    Both arguments are accepted and ignored; every call returns a fresh dict.
    """
    canned_tools = [
        ("web_search", "Search the web for information"),
        ("query_gene", "Query genomics database for gene information"),
    ]
    return dict(canned_tools)


# Patch all LLM providers so that constructing a chat model yields a mock.
# NOTE(review): karenina.schemas was already imported above, so any module that
# bound these classes via ``from <provider> import <Class>`` before this point
# keeps the real class - confirm that karenina resolves providers lazily.
_llm_patches = [
    patch("langchain_openai.ChatOpenAI", side_effect=lambda **kwargs: create_mock_chat_model()),
    patch("langchain_anthropic.ChatAnthropic", side_effect=lambda **kwargs: create_mock_chat_model()),
    patch("langchain_google_genai.ChatGoogleGenerativeAI", side_effect=lambda **kwargs: create_mock_chat_model()),
]

# start() (rather than a ``with`` block) keeps the patches active after this
# cell finishes; they are stopped again in _cleanup().
for p in _llm_patches:
    p.start()

# Patch Benchmark.run_verification, keeping the real method so that
# mock_run_verification can fall back to it and _cleanup() can restore it
from karenina.benchmark import Benchmark

_original_run_verification = Benchmark.run_verification
Benchmark.run_verification = mock_run_verification

# Patch MCP utilities
import karenina.utils.mcp as mcp_package
import karenina.utils.mcp.tools as mcp_tools_module

mcp_tools_module.sync_fetch_tool_descriptions = mock_fetch_tool_descriptions
# The example cells do ``from karenina.utils.mcp import sync_fetch_tool_descriptions``.
# That name is looked up on the package, which holds its own reference, so
# patching only the ``tools`` submodule would leave those cells calling the
# real function. Patch the package namespace as well.
mcp_package.sync_fetch_tool_descriptions = mock_fetch_tool_descriptions


def temp_path(filename: str) -> Path:
    """Return the location of *filename* inside the session's temporary directory."""
    return TEMP_DIR.joinpath(filename)


# Cleanup
import atexit
import shutil


def _cleanup():
    """Undo the setup cell's patches and delete the temporary directory.

    Intended to run at interpreter exit, so every step is best-effort: the
    real ``Benchmark.run_verification`` is restored if one was saved, each LLM
    provider patch is stopped, and the temp directory is removed even when an
    earlier step fails.
    """
    try:
        if _original_run_verification is not None:
            Benchmark.run_verification = _original_run_verification
        for p in _llm_patches:
            try:
                p.stop()
            except Exception:
                # A patch that never started (or was already stopped) must not
                # block the remaining cleanup. Unlike a bare ``except`` this
                # still lets KeyboardInterrupt/SystemExit propagate.
                continue
    finally:
        shutil.rmtree(TEMP_DIR, ignore_errors=True)


# Run _cleanup() automatically when the interpreter exits
atexit.register(_cleanup)

# Summarise what this setup cell did, one line per item
_setup_report = (
    "✓ Mock setup complete",
    f"✓ Temp directory: {TEMP_DIR}",
    "✓ Karenina package loaded from: /Users/carli/Projects/karenina-monorepo/karenina/src",
    "✓ Mock verification results enabled - examples will show realistic output",
    "✓ MCP utilities mocked for demonstration",
)
print(*_setup_report, sep="\n")

# MCP Integration

This guide explains how to integrate Model Context Protocol (MCP) servers to provide tool access for LLMs during verification.

## What is MCP?

**Model Context Protocol (MCP)** is a standardized protocol that enables LLMs to access external tools and data sources. MCP servers provide tools that LLMs can invoke during answer generation, such as:

- Web search
- Database queries
- File system operations
- API calls
- Code execution
- Custom domain-specific tools

**Key Benefits**:

- Extend LLM capabilities beyond text generation
- Access real-time data during verification
- Standardized tool invocation protocol
- Modular tool integration

## Why Use MCP with Karenina?

MCP integration allows LLMs to access external information when answering benchmark questions.

**Use Cases**:

- **Current information**: Search web for recent drug approvals
- **Database access**: Query genomics databases for gene information
- **File operations**: Read configuration files or data files
- **API integration**: Call external APIs for real-time data
- **Custom tools**: Domain-specific tools for specialized benchmarks

**Example**: A benchmark question asks "What is the current FDA approval status of drug X?" The LLM can use an MCP web search tool to find the latest information instead of relying on training data.

## MCP Server Structure

An MCP server provides:

1. **Health Check**: Endpoint to verify that the server is running
2. **Tool Discovery**: List available tools and their schemas
3. **Tool Invocation**: Execute tools with parameters

```
MCP Server (http://localhost:3000/mcp)
├── GET  /health          # Server status
├── GET  /tools           # Available tools
└── POST /invoke          # Execute a tool
```

## Configuration

### Basic Setup

Configure MCP integration via `ModelConfig` using the `mcp_urls_dict` parameter:

In [None]:
from karenina import Benchmark
from karenina.schemas.config import ModelConfig
from karenina.schemas.verification import VerificationConfig

# Create benchmark
benchmark = Benchmark.create(
    name="Genomics Benchmark",
    description="Testing genomics knowledge with tool access",
    version="1.0.0",
)

# Add one question together with its raw answer and author
benchmark.add_question(
    question="What is the latest research on BCL2 protein function?",
    raw_answer="BCL2 regulates apoptosis",
    author={"name": "Research Curator"},
)

question_count = len(benchmark.get_question_ids())
print(f"✓ Created benchmark with {question_count} question(s)")

In [None]:
# Generate templates (using mock model)
# Note: generate_all_templates takes individual parameters, not a ModelConfig
benchmark.generate_all_templates(
    model="gpt-4.1-mini",
    model_provider="openai",
    temperature=0.0,
)
print("✓ Templates generated")

In [None]:
# Configure verification with MCP server using mcp_urls_dict in ModelConfig.
# The dict maps a name chosen here to the MCP server URL.
mcp_servers = {"search_tools": "http://localhost:3000/mcp"}

answering_model_with_mcp = ModelConfig(
    id="agent-with-mcp",
    model_name="gpt-4.1-mini",
    model_provider="openai",
    temperature=0.0,
    interface="langchain",
    mcp_urls_dict=mcp_servers,
)

# Create parsing model config (no MCP servers attached)
parsing_model = ModelConfig(
    id="parsing-model",
    model_name="gpt-4.1-mini",
    model_provider="openai",
    temperature=0.0,
)

config = VerificationConfig(
    answering_models=[answering_model_with_mcp],
    parsing_models=[parsing_model],
    replicate_count=1,
)

print("✓ Verification config created")
print(f"  Answering model has MCP URLs: {answering_model_with_mcp.mcp_urls_dict}")

### Multiple MCP Servers

Configure multiple MCP servers for different tool categories:

In [None]:
# Multiple MCP servers: one entry per server
multi_mcp_model = ModelConfig(
    id="multi-mcp-agent",
    model_name="gpt-4.1-mini",
    model_provider="openai",
    temperature=0.0,
    interface="langchain",
    mcp_urls_dict={
        "search_tools": "http://localhost:3000/mcp",
        "database_tools": "http://localhost:3001/mcp",
    },
)

print("✓ Model configured with multiple MCP servers:")
configured_servers = multi_mcp_model.mcp_urls_dict
for server_name in configured_servers:
    print(f"  - {server_name}: {configured_servers[server_name]}")

## Example MCP Tools

### Web Search Tool

Enables LLMs to search for current information.

**Example usage**: LLM searches for "latest BCL2 protein research" to answer a genomics question with recent findings.

### Database Query Tool

Allows LLMs to query databases.

**Example usage**: LLM queries database for "BCL2" to get official gene information, protein function, and chromosome location.

### File Read Tool

Enables LLMs to read data files.

**Example usage**: LLM reads a drug-target database file to answer questions about approved therapeutics.

## Complete Example

This example shows MCP integration for a genomics benchmark with web search:

In [None]:
from pathlib import Path

# Step 1: Create benchmark with questions requiring current data
benchmark = Benchmark.create(
    name="Current Genomics Research",
    description="Testing knowledge of recent genomics discoveries",
    version="1.0.0",
)

# Questions that benefit from tool access
benchmark.add_question(
    question="What are the latest findings on BCL2's role in cancer therapy?",
    raw_answer="BCL2 inhibition shows promise in treating certain cancers",
    author={"name": "Oncology Researcher"},
)
benchmark.add_question(
    question="What is the current status of CRISPR therapies for hemoglobin disorders?",
    raw_answer="CRISPR treatments for sickle cell disease are in clinical trials",
    author={"name": "Gene Therapy Researcher"},
)

question_count = len(benchmark.get_question_ids())
print(f"✓ Created benchmark with {question_count} questions")

In [None]:
# Step 2: Generate templates (one-time)
# Note: generate_all_templates takes individual parameters
benchmark.generate_all_templates(
    model="gpt-4.1-mini",
    model_provider="openai",
    temperature=0.0,
)
print("✓ Templates generated")

In [None]:
# Step 3: Configure verification with MCP web search tool
# (Assumes MCP server running at localhost:3000 with search tool)
web_search_servers = {
    "web_search": "http://localhost:3000/mcp"  # Web search MCP server
}

answering_model_with_mcp = ModelConfig(
    id="web-search-agent",
    model_name="gpt-4.1-mini",
    model_provider="openai",
    temperature=0.0,
    interface="langchain",
    mcp_urls_dict=web_search_servers,
)

# Create parsing model config (no MCP servers attached)
parsing_model = ModelConfig(
    id="parsing-model",
    model_name="gpt-4.1-mini",
    model_provider="openai",
    temperature=0.0,
)

config = VerificationConfig(
    answering_models=[answering_model_with_mcp],
    parsing_models=[parsing_model],
    replicate_count=1,
)

print("✓ Verification configured with MCP web search")

In [None]:
# Step 4: Run verification with tool access
# LLM can search web for current information
results = benchmark.run_verification(config)

result_count = len(results.results)
print(f"✓ Verification complete: {result_count} result(s)")

In [None]:
# Step 5: Analyze results
PREVIEW_WIDTH = 80  # longest question text printed in full

for result in results.results:
    question = benchmark.get_question(result.metadata.question_id)
    question_text = question["question"]
    # Only add an ellipsis when the question was actually cut short
    if len(question_text) > PREVIEW_WIDTH:
        question_text = f"{question_text[:PREVIEW_WIDTH]}..."
    print(f"\nQuestion: {question_text}")

    status = "✓ PASS" if result.template.verify_result else "✗ FAIL"
    print(f"Verification: {status}")
    print(f"Tokens used: {result.template.usage_metadata['total']['total_tokens']}")

## Validating MCP Servers

Before running verification, validate MCP server connectivity:

In [None]:
from karenina.utils.mcp import sync_fetch_tool_descriptions

# Test MCP server connectivity by fetching tool descriptions
server_url = "http://localhost:3000/mcp"
try:
    # Keep only the call that talks to the server inside the try, so that a
    # formatting problem below is not misreported as a connectivity failure.
    tool_descriptions = sync_fetch_tool_descriptions({"local": server_url})
except Exception as e:
    # Broad on purpose: any failure here means the server cannot be used
    print("✗ MCP server validation failed")
    print(f"Error: {e}")
    print("Ensure server is running and accessible")
else:
    print("✓ MCP server is reachable")
    print(f"\nAvailable tools ({len(tool_descriptions)}):")
    for name, desc in tool_descriptions.items():
        # Only add an ellipsis when the description was actually cut short
        preview = desc if len(desc) <= 60 else f"{desc[:60]}..."
        print(f"  - {name}: {preview}")

## Discovering Available Tools

Query an MCP server to see what tools it provides:

In [None]:
from karenina.utils.mcp import sync_fetch_tool_descriptions

# Discover tools from MCP server
server_url = "http://localhost:3000/mcp"
tool_descriptions = sync_fetch_tool_descriptions({"local": server_url})

tool_count = len(tool_descriptions)
print(f"Discovered {tool_count} tools:\n")
for name in tool_descriptions:
    # The trailing newline leaves one blank line after each tool
    print(f"Tool: {name}\nDescription: {tool_descriptions[name]}\n")

## Use Cases

### Use Case 1: Current Information Access

**Scenario**: Benchmark tests LLM knowledge of recent drug approvals.

**Setup**:
- Deploy MCP server with web search tool
- Configure verification with MCP server URL
- Questions ask about recent FDA approvals

**Benefit**: LLM can search for current information instead of relying on training data cutoff.

### Use Case 2: Database Integration

**Scenario**: Questions require querying a genomics database.

**Setup**:
- Deploy MCP server with database query tool
- Configure database connection in MCP server
- Questions ask about specific genes

**Benefit**: The LLM gets accurate, up-to-date gene information from an authoritative database.

### Use Case 3: File-Based Data

**Scenario**: Benchmark uses data files with drug-target mappings.

**Setup**:
- Deploy MCP server with file read tool
- Store drug-target data in structured files
- Configure file system access permissions

**Benefit**: LLM reads data files to answer questions accurately without relying on memorized facts.

### Use Case 4: API Integration

**Scenario**: Questions require real-time API data.

**Setup**:
- Deploy MCP server with API call tools
- Configure API keys and endpoints
- Questions ask about live data

**Benefit**: LLM calls APIs to fetch current data during verification.

## Creating a Simple MCP Server

Example MCP server with a genomics database query tool:

In [None]:
# Example MCP server code (not executed - for reference only)
# The listing is emitted with print() as one string; none of it runs in this
# notebook. It defines an in-memory gene table and three endpoints:
# GET /health, GET /tools and POST /invoke.
print("""
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()

# Mock genomics database
GENOMICS_DB = {
    "BCL2": {
        "full_name": "B-cell lymphoma 2",
        "chromosome": "18",
        "function": "Regulates apoptosis"
    },
    "HBB": {
        "full_name": "Hemoglobin subunit beta",
        "chromosome": "11",
        "function": "Oxygen transport"
    }
}

class ToolInvocation(BaseModel):
    tool_name: str
    parameters: dict

@app.get("/health")
def health_check():
    return {"status": "healthy"}

@app.get("/tools")
def list_tools():
    return {
        "tools": [
            {
                "name": "query_gene",
                "description": "Query genomics database for gene information",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "gene_symbol": {
                            "type": "string",
                            "description": "Gene symbol (e.g., BCL2, HBB)"
                        }
                    },
                    "required": ["gene_symbol"]
                }
            }
        ]
    }

@app.post("/invoke")
def invoke_tool(invocation: ToolInvocation):
    if invocation.tool_name == "query_gene":
        gene = invocation.parameters.get("gene_symbol", "").upper()
        if gene in GENOMICS_DB:
            return {"result": GENOMICS_DB[gene]}
        return {"error": f"Gene {gene} not found in database"}
    return {"error": "Unknown tool"}

# Run: uvicorn mcp_server:app --port 3000
""")

# Usage instructions, also printed as text rather than executed
print("# Example usage:")
print("# Start MCP server:")
print("#   uvicorn mcp_server:app --port 3000")
print("")
print("# In another terminal, run Karenina verification with MCP:")
print("#   python verify_with_mcp.py")

## Anthropic Prompt Caching

When using Anthropic models (Claude) with MCP tools via the `langchain` interface, **prompt caching is enabled by default** to reduce costs and latency. This caches repetitive prompt content like system prompts, tool definitions, and conversation history on Anthropic's servers.

### How It Works

1. **First request**: System prompt, tools, and the user message are sent to the API and cached
2. **Subsequent requests**: Cached content is retrieved rather than reprocessed
3. **Cache expiration**: Content expires after the TTL (5 minutes or 1 hour)

### Configuration

Prompt caching is configured via `AgentMiddlewareConfig` in `ModelConfig`:

In [None]:
from karenina.schemas.config import AgentMiddlewareConfig, ModelConfig, PromptCachingConfig

# Caching settings, spelled out in full
caching = PromptCachingConfig(
    enabled=True,  # Default: True for Anthropic models
    ttl="5m",  # Cache lifetime: "5m" (5 minutes) or "1h" (1 hour)
    min_messages_to_cache=0,  # Min messages before caching starts
    unsupported_model_behavior="warn",  # "ignore", "warn", or "raise"
)

model_config = ModelConfig(
    id="cached-claude",
    model_provider="anthropic",
    model_name="claude-sonnet-4-5-20250929",
    temperature=0.0,
    interface="langchain",
    mcp_urls_dict={"biocontext": "https://mcp.biocontext.ai/mcp/"},
    agent_middleware=AgentMiddlewareConfig(prompt_caching=caching),
)

# Read the values back from the model config to show what was stored
active_caching = model_config.agent_middleware.prompt_caching
print("✓ Model configured with Anthropic prompt caching")
print(f"  Caching enabled: {active_caching.enabled}")
print(f"  TTL: {active_caching.ttl}")

### Configuration Options

| Parameter | Default | Description |
|-----------|---------|-------------|
| `enabled` | `True` | Enable/disable prompt caching |
| `ttl` | `"5m"` | Cache time-to-live: `"5m"` or `"1h"` |
| `min_messages_to_cache` | `0` | Minimum messages before caching activates |
| `unsupported_model_behavior` | `"warn"` | Behavior for non-Anthropic models: `"ignore"`, `"warn"`, or `"raise"` |

### Disabling Prompt Caching

To disable prompt caching for Anthropic models:

In [None]:
# Disable prompt caching by passing enabled=False in the caching config
caching_off = AgentMiddlewareConfig(
    prompt_caching=PromptCachingConfig(enabled=False),
)

no_cache_model = ModelConfig(
    id="no-cache-claude",
    model_provider="anthropic",
    model_name="claude-sonnet-4-5-20250929",
    temperature=0.0,
    interface="langchain",
    mcp_urls_dict={"biocontext": "https://mcp.biocontext.ai/mcp/"},
    agent_middleware=caching_off,
)

caching_enabled = no_cache_model.agent_middleware.prompt_caching.enabled
print("✓ Model configured with prompt caching disabled")
print(f"  Caching enabled: {caching_enabled}")

### Requirements

- **Provider**: `anthropic` only
- **Interface**: `langchain` only
- **Dependency**: `langchain-anthropic` must be installed

Prompt caching does **not** provide conversation memory - it only reduces API costs by caching tokens. For conversation persistence, use a checkpointer.

See the [LangChain documentation](https://docs.langchain.com/oss/python/integrations/middleware/anthropic#prompt-caching) for more details.

## Best Practices

### Server Configuration

**Do**:

- Validate MCP server before verification
- Use HTTPS in production
- Implement authentication for MCP servers
- Set appropriate timeout limits
- Log tool invocations for debugging

**Don't**:

- Expose MCP servers publicly without authentication
- Allow unrestricted file system access
- Skip server validation before use
- Use untrusted MCP servers

### Tool Design

**Do**:

- Provide clear tool descriptions
- Use typed parameters with JSON schema
- Return structured data
- Handle errors gracefully
- Document tool capabilities

**Don't**:

- Create tools with side effects (prefer read-only)
- Skip parameter validation
- Return unstructured text
- Allow dangerous operations without safeguards

### Security

**Do**:

- Validate all tool parameters
- Restrict tool permissions (principle of least privilege)
- Implement rate limiting
- Monitor tool usage
- Use network firewalls

**Don't**:

- Trust tool input without validation
- Grant excessive permissions
- Skip logging
- Ignore security warnings

## Troubleshooting

### Issue: MCP Server Not Reachable

**Error**: `Connection refused` or timeout errors

**Cause**: MCP server not running or wrong URL.

**Solutions**:
```bash
# Check server is running
curl http://localhost:3000/mcp/health

# Verify port and URL
ps aux | grep mcp

# Check firewall rules
```

### Issue: Tool Not Available

**Error**: `Tool 'web_search' not found`

**Cause**: Tool not registered in MCP server.

**Solution**:
```bash
# List available tools
curl http://localhost:3000/mcp/tools

# Verify tool name spelling matches exactly
```

### Issue: Tool Invocation Fails

**Error**: `Tool invocation failed: invalid parameters`

**Cause**: Parameters don't match tool schema.

**Solution**:
```python
# Check available tools from MCP server
from karenina.utils.mcp import sync_fetch_tool_descriptions

tool_descriptions = sync_fetch_tool_descriptions({"local": "http://localhost:3000/mcp"})
for name, desc in tool_descriptions.items():
    print(f"{name}: {desc}")
```

### Issue: Verification Slower with MCP

**Symptom**: Verification takes much longer with MCP enabled.

**Cause**: Tool invocations add latency.

**Solution**:

- Use faster MCP servers (local is better than remote)
- Cache tool results when possible
- Reduce network latency
- Set appropriate timeouts

## Limitations

**Current Limitations**:

- MCP integration is primarily designed for server/GUI deployment
- Standalone library support is experimental
- Tool invocation tracking may be limited
- Some providers may not support function calling

**Best Use**:

- Use with karenina-server and karenina-gui for full features
- Standalone library works but with reduced visibility into tool usage
- Consider manual traces for reproducible testing instead

## Summary

MCP integration enables:

1. **Tool access** - LLMs can use external tools during verification
2. **Current data** - Access real-time information beyond training data
3. **Database queries** - Query structured databases
4. **File operations** - Read data from files
5. **API calls** - Integrate with external APIs

**Configure** MCP by setting `mcp_urls_dict` in `ModelConfig`, and make sure the MCP server is running and accessible.

**Note**: MCP integration is most powerful when used with the full karenina-server and karenina-gui stack. Standalone library support exists but is more limited.