In [None]:
# Mock Setup - Hidden in rendered documentation
# This cell is tagged with "hide-cell" in notebook metadata

import sys
import tempfile
from pathlib import Path
from unittest.mock import MagicMock, patch

# Put the local Karenina source tree first on the import path.
# NOTE(review): this is an absolute, machine-specific path — confirm it matches the docs build environment.
sys.path.insert(0, "/Users/carli/Projects/karenina-monorepo/karenina/src")

# Fresh scratch directory (prefixed "karenina_docs_") for this notebook run.
TEMP_DIR = Path(tempfile.mkdtemp(prefix="karenina_docs_"))


class MockLLMResponse:
    """Minimal stand-in for a chat-model reply.

    Exposes the reply text as ``content`` and a fixed token-usage record
    (50 total tokens) as ``response_metadata``.
    """

    def __init__(self, content: str = "Mock response"):
        token_usage = {"total_tokens": 50}
        self.content = content
        self.response_metadata = {"token_usage": token_usage}


class MockStructuredOutput:
    """Mock of a parsed structured answer.

    Always exposes ``count``, ``target``, ``subunits`` and ``diseases``; keyword
    arguments override the built-in sample values. Any other keyword argument
    becomes an attribute unless that name already exists on the instance.
    """

    def __init__(self, **kwargs):
        sample_values = {
            "count": 46,
            "target": "BCL2",
            "subunits": 4,
            "diseases": ["asthma", "bronchitis", "pneumonia"],
        }
        for field, fallback in sample_values.items():
            setattr(self, field, kwargs.get(field, fallback))
        # Extra keywords are attached only when they would not shadow an
        # attribute or method that is already present.
        for field, value in kwargs.items():
            if hasattr(self, field):
                continue
            setattr(self, field, value)

    def model_dump(self):
        """Return the instance attributes whose names do not start with ``_`` as a dict."""
        public = {}
        for field, value in vars(self).items():
            if field.startswith("_"):
                continue
            public[field] = value
        return public


def create_mock_chat_model():
    """Build a mock chat model that answers without contacting any provider.

    The returned mock supports the calls configured below:

    * ``invoke`` / ``ainvoke`` return a ``MockLLMResponse`` with the text "46 chromosomes".
    * ``with_structured_output(...)`` returns a second mock whose ``invoke`` / ``ainvoke``
      return a default ``MockStructuredOutput``.
    * ``bind_tools(...)`` returns the same mock.

    Returns:
        MagicMock: the configured mock chat model.
    """
    # Local import: the file-level import only brings in MagicMock and patch.
    from unittest.mock import AsyncMock

    mock = MagicMock()
    mock.invoke.return_value = MockLLMResponse("46 chromosomes")
    # ``ainvoke`` is the async counterpart of ``invoke`` and is awaited by async callers.
    # A plain MagicMock would hand back the response object directly, which cannot be
    # awaited; AsyncMock returns a coroutine that resolves to the response.
    mock.ainvoke = AsyncMock(return_value=MockLLMResponse("46 chromosomes"))

    structured_mock = MagicMock()
    structured_mock.invoke.return_value = MockStructuredOutput()
    structured_mock.ainvoke = AsyncMock(return_value=MockStructuredOutput())

    mock.with_structured_output.return_value = structured_mock
    mock.bind_tools.return_value = mock
    return mock


# One patcher per chat-model entry point; each patched callable returns a fresh
# mock chat model instead of constructing a real client.
_llm_patches = [
    patch(target, side_effect=lambda **kwargs: create_mock_chat_model())
    for target in (
        "langchain_openai.ChatOpenAI",
        "langchain_anthropic.ChatAnthropic",
        "langchain_google_genai.ChatGoogleGenerativeAI",
        "karenina.infrastructure.llm.interface.init_chat_model_unified",
    )
]

# Activate every patcher immediately.
for p in _llm_patches:
    p.start()

import atexit
import shutil

from karenina.benchmark import Benchmark


def _cleanup():
    """Undo the mock setup: stop every LLM patch, then delete the temp directory.

    Teardown is best-effort: a patcher that fails to stop is skipped so the
    remaining patchers and the temp-directory removal still run.
    """
    for llm_patch in _llm_patches:
        try:
            llm_patch.stop()
        except Exception:
            # Deliberately ignored (best-effort teardown). Unlike a bare
            # ``except:``, this lets KeyboardInterrupt / SystemExit propagate.
            pass
    shutil.rmtree(TEMP_DIR, ignore_errors=True)


# Run the teardown automatically at interpreter exit.
atexit.register(_cleanup)

# Single print call; sep="\n" produces the same three output lines.
print(
    "✓ Mock setup complete",
    f"✓ Temp directory: {TEMP_DIR}",
    "✓ Karenina package loaded from: /Users/carli/Projects/karenina-monorepo/karenina/src",
    sep="\n",
)

# Model Configuration

This guide covers how to configure LLM models in Karenina using `ModelConfig`. You'll learn about model providers, interfaces, advanced parameters, and how to pass vendor-specific options.

**Quick Navigation:**

- [What is ModelConfig?](#what-is-modelconfig) - Core concepts and use cases
- [Basic ModelConfig](#basic-modelconfig) - Minimal configuration example
- [ModelConfig Parameters](#modelconfig-parameters) - Required and optional parameters
- [Interfaces](#interfaces) - LangChain, OpenAI endpoint, OpenRouter, manual
- [Model Providers](#model-providers) - OpenAI, Google, Anthropic configuration
- [Temperature Parameter](#temperature-parameter) - Controlling randomness and determinism
- [Extra Keyword Arguments](#extra-keyword-arguments) - Vendor-specific options and API keys
- [System Prompts](#system-prompts) - Custom system prompt configuration
- [MCP Tool Integration](#mcp-tool-integration) - Enable tool use during answer generation
- [Common Configuration Patterns](#common-configuration-patterns) - Typical setup examples
- [Best Practices](#best-practices) - Recommendations for benchmarking and API keys
- [Troubleshooting](#troubleshooting) - Common errors and solutions

## What is ModelConfig?

`ModelConfig` is the configuration object that defines which LLM to use and how to interact with it. It's used in three key places:

1. **Template generation**: LLMs that generate answer templates for questions
2. **Answering models**: LLMs that generate responses to benchmark questions
3. **Parsing models** (judges): LLMs that extract structured data from responses using templates

A single `ModelConfig` can be used for all three roles, or you can use different models for each role.

## Basic ModelConfig

The simplest model configuration:

In [None]:
from karenina.schemas import ModelConfig

# Minimal configuration: identifier, model, provider, temperature and interface.
model_config = ModelConfig(
    id="my-model",
    model_name="gpt-4.1-mini",
    model_provider="openai",
    temperature=0.0,
    interface="langchain",
)

# Echo the stored fields back.
print(
    "ModelConfig created:",
    f"  ID: {model_config.id}",
    f"  Model: {model_config.model_name}",
    f"  Provider: {model_config.model_provider}",
    f"  Interface: {model_config.interface}",
    f"  Temperature: {model_config.temperature}",
    sep="\n",
)

## ModelConfig Parameters

### Required Parameters

| Parameter | Type | Description | Example |
|-----------|------|-------------|---------|
| `id` | `str` | Unique identifier for this model configuration | `"gpt-4.1-mini"`, `"my-custom-model"` |
| `model_name` | `str` | Full model name as recognized by the provider | `"gpt-4.1-mini"`, `"claude-sonnet-4-5"`, `"gemini-2.5-flash"` |
| `interface` | `str` | Interface type (see [Interfaces](#interfaces)) | `"langchain"`, `"openai_endpoint"`, `"openrouter"`, `"manual"` |

### Optional Parameters

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `model_provider` | `str` | Required for `langchain` | Provider name (see [Providers](#model-providers)) |
| `temperature` | `float` | `0.1` | Sampling temperature (0.0-1.0). Use 0.0 for deterministic benchmarking |
| `system_prompt` | `str` | `None` | Optional system prompt override |
| `max_retries` | `int` | `2` | Maximum retry attempts for API calls |
| `endpoint_base_url` | `str` | `None` | Custom endpoint URL (for `openai_endpoint` interface) |
| `endpoint_api_key` | `SecretStr` | `None` | API key for custom endpoint (for `openai_endpoint` interface) |
| `mcp_urls_dict` | `dict[str, str]` | `None` | MCP server URLs for tool use |
| `mcp_tool_filter` | `list[str]` | `None` | Filter specific MCP tools |
| `extra_kwargs` | `dict[str, Any]` | `None` | Additional keyword arguments (see [Extra Keyword Arguments](#extra-keyword-arguments)) |
| `manual_traces` | `ManualTraces` | `None` | Pre-computed traces (for `manual` interface) |

## Interfaces

Karenina supports four interfaces for connecting to LLMs. Choose based on your use case:

### 1. LangChain Interface (`langchain`)

**Default and recommended interface** for most use cases. Uses LangChain's model integrations.

In [None]:
# LangChain interface: the provider is named explicitly via model_provider.
model_config = ModelConfig(
    id="gpt-4.1-mini",
    model_name="gpt-4.1-mini",
    model_provider="openai",
    interface="langchain",
    temperature=0.0,
)

print(
    "LangChain interface configuration:",
    "  ✓ Uses LangChain's standardized model integrations",
    "  ✓ Built-in retry logic and error handling",
    "  ✓ API key from environment variable: OPENAI_API_KEY",
    "\nSupported providers: openai, google_genai, anthropic, and more",
    sep="\n",
)

**When to use:**

- ✅ Working with OpenAI, Google, Anthropic, or other LangChain-supported providers
- ✅ Need standardized interface across multiple providers
- ✅ Want built-in retry logic and error handling

**Requirements:**

- API key must be set in environment variables (see [Configuration](../configuration.md#api-keys))
- Or API key can be passed via `extra_kwargs` (see [Extra Keyword Arguments](#extra-keyword-arguments))

**Supported providers**: `openai`, `google_genai`, `anthropic`, and others

### 2. OpenAI Endpoint Interface (`openai_endpoint`)

Use this interface for **custom endpoints** that implement the OpenAI-compatible API (e.g., vLLM, Ollama, local models).

In [None]:
# Configuration pattern for an OpenAI-compatible endpoint. Left commented out
# because it only makes sense with a server running at endpoint_base_url.
# model_config = ModelConfig(
#     id="local-model",
#     model_name="glm-4.6",
#     interface="openai_endpoint",
#     endpoint_base_url="http://localhost:8000/v1",
#     endpoint_api_key="dummy-key",
#     temperature=0.0
# )

print(
    "OpenAI Endpoint interface configuration pattern:",
    "  ✓ For local models (vLLM, Ollama, etc.)",
    "  ✓ For custom inference servers",
    "  ✓ Requires endpoint_base_url to point to your server",
    "\nExample parameters shown in commented code above",
    sep="\n",
)

**When to use:**

- ✅ Running local models (vLLM, Ollama, etc.)
- ✅ Using custom inference servers
- ✅ OpenAI-compatible APIs

**Requirements:**

- `endpoint_base_url` must point to your server
- Some servers require `endpoint_api_key` (even if just a dummy value)

### 3. OpenRouter Interface (`openrouter`)

Use OpenRouter for unified access to multiple LLM providers through a single API.

In [None]:
# Configuration pattern for OpenRouter. Left commented out because it needs
# OPENROUTER_API_KEY; note that the provider is part of model_name.
# model_config = ModelConfig(
#     id="claude-via-openrouter",
#     model_name="anthropic/claude-3.5-sonnet",
#     interface="openrouter",
#     temperature=0.0
# )

print(
    "OpenRouter interface configuration pattern:",
    "  ✓ Unified billing across multiple providers",
    "  ✓ Easy switching between providers",
    "  ✓ Requires OPENROUTER_API_KEY environment variable",
    "\nNote: model_provider not required - specified in model_name",
    sep="\n",
)

**When to use:**

- ✅ Want unified billing across multiple providers
- ✅ Want to switch between providers easily

**Requirements:**

- `OPENROUTER_API_KEY` must be set in environment variables
- Or the API key can be passed via `extra_kwargs` (see [Extra Keyword Arguments](#extra-keyword-arguments))

**Note**: `model_provider` is not required for OpenRouter interface since the provider is specified in the `model_name`

### 4. Manual Interface (`manual`)

For testing and debugging with pre-computed responses (no LLM API calls).

In [None]:
from karenina import Benchmark
from karenina.infrastructure.llm.manual_traces import ManualTraces

# Create a demo benchmark to attach the pre-computed answers to.
demo_benchmark = Benchmark.create(name="Manual Traces Demo", description="Demonstrating manual interface")

# Add a single question, marked as finished.
demo_benchmark.add_question(question="What is 2+2?", raw_answer="4", finished=True)

# Trace container built from the benchmark.
manual_traces = ManualTraces(demo_benchmark)

# Register the canned answer for the question text above.
# NOTE(review): map_to_id=True presumably resolves the question text to its question ID — confirm.
manual_traces.register_trace("What is 2+2?", "The answer is 4", map_to_id=True)

# Only interface and manual_traces are passed here.
# NOTE(review): the parameter table lists `id` and `model_name` as required — confirm they are optional for "manual".
model_config = ModelConfig(interface="manual", manual_traces=manual_traces)

print("Manual interface configuration:")
print("  ✓ No LLM API calls made")
print("  ✓ Uses pre-computed responses from traces")
print("  ✓ Ideal for testing and debugging")
print("  ✓ No API costs")

**When to use:**

- ✅ Testing workflows without API costs
- ✅ Debugging specific scenarios
- ✅ Evaluating pre-recorded LLM responses
- ✅ Comparing different answer generation approaches

See the **[Manual Traces Guide](../advanced/manual-traces.md)** for comprehensive documentation.

## Model Providers

Model providers are specified with the `model_provider` parameter (required for `langchain` interface).

### Supported Providers

| Provider | Value | Example Models | API Key Required |
|----------|-------|----------------|------------------|
| OpenAI | `"openai"` | `gpt-4.1`, `gpt-4.1-mini`, `gpt-4-turbo` | `OPENAI_API_KEY` |
| Google | `"google_genai"` | `gemini-2.5-flash`, `gemini-2.5-pro` | `GOOGLE_API_KEY` |
| Anthropic | `"anthropic"` | `claude-sonnet-4-5`, `claude-opus-4-5` | `ANTHROPIC_API_KEY` |

### Example Configurations

In [None]:
# One configuration per provider, all using the LangChain interface.
openai_config = ModelConfig(
    id="gpt-4.1-mini",
    model_name="gpt-4.1-mini",
    model_provider="openai",
    interface="langchain",
    temperature=0.0,
)

google_config = ModelConfig(
    id="gemini-flash",
    model_name="gemini-2.5-flash",
    model_provider="google_genai",
    interface="langchain",
    temperature=0.0,
)

# Anthropic API model IDs use hyphens throughout ("claude-sonnet-4-5"), not a dotted version.
anthropic_config = ModelConfig(
    id="claude-sonnet",
    model_name="claude-sonnet-4-5",
    model_provider="anthropic",
    interface="langchain",
    temperature=0.0,
)

print("Provider configurations created:")
for config in [openai_config, google_config, anthropic_config]:
    print(f"  - {config.id}: {config.model_provider}/{config.model_name}")

## Temperature Parameter

The `temperature` parameter controls randomness in model outputs:

- **`0.0`** - Fully deterministic (recommended for benchmarking)
- **`0.1-0.3`** - Low randomness (slight variation)
- **`0.7-0.9`** - High randomness (creative responses)
- **`1.0+`** - Maximum randomness

**For benchmarking**: Always use `temperature=0.0` to ensure reproducible results.

In [None]:
# Same model in both roles; only the temperature differs.
answering_model = ModelConfig(
    id="answering",
    model_name="gpt-4.1-mini",
    model_provider="openai",
    interface="langchain",
    temperature=0.3,
)

parsing_model = ModelConfig(
    id="parsing",
    model_name="gpt-4.1-mini",
    model_provider="openai",
    interface="langchain",
    temperature=0.0,
)

print(
    "Temperature settings:",
    f"  Answering model: {answering_model.temperature} (low variation)",
    f"  Parsing model: {parsing_model.temperature} (fully deterministic)",
    sep="\n",
)

## Extra Keyword Arguments

The `extra_kwargs` field allows you to pass additional keyword arguments to the underlying model interface.

### Example 1: Passing API Key Directly

In [None]:
# Pattern for supplying the key directly (commented out so no key appears in the docs):
# model_config = ModelConfig(
#     id="gpt-4.1-mini",
#     model_name="gpt-4.1-mini",
#     model_provider="openai",
#     interface="langchain",
#     extra_kwargs={"api_key": "sk-..."},
# )

print(
    "Passing API keys via extra_kwargs:",
    "  ✓ Alternative to environment variables",
    "  ✓ Useful for testing with multiple keys",
    "  ✓ Good for temporary key usage",
    "\nNote: .env files still recommended for security",
    sep="\n",
)

### Example 2: Disabling Thinking Mode

In [None]:
# Summary only: the concrete option names are vendor-specific and not shown here.
print(
    "Controlling model behavior with extra_kwargs:",
    "  ✓ Enable/disable thinking modes",
    "  ✓ Control reasoning separation",
    "  ✓ Pass vendor-specific options",
    sep="\n",
)

### Example 3: Passing Generation Parameters

In [None]:
# Generation parameters travel in extra_kwargs rather than as ModelConfig fields.
model_config = ModelConfig(
    id="custom-params",
    model_name="gpt-4.1-mini",
    model_provider="openai",
    interface="langchain",
    temperature=0.0,
    extra_kwargs={
        "max_tokens": 500,
        "top_p": 0.9,
        "frequency_penalty": 0.1,
        "presence_penalty": 0.1,
    },
)

# Read each value back from the stored dict.
print(
    "Generation parameters via extra_kwargs:",
    f"  max_tokens: {model_config.extra_kwargs['max_tokens']}",
    f"  top_p: {model_config.extra_kwargs['top_p']}",
    f"  frequency_penalty: {model_config.extra_kwargs['frequency_penalty']}",
    f"  presence_penalty: {model_config.extra_kwargs['presence_penalty']}",
    sep="\n",
)

### How extra_kwargs Works

The arguments are passed to different places depending on the interface:

| Interface | Where Arguments Go |
|-----------|-------------------|
| `langchain` | Passed to LangChain model constructor |
| `openai_endpoint` | Passed to OpenAI client's chat completion call |
| `openrouter` | Passed to OpenRouter API call |

## System Prompts

You can override the default system prompt for template generation, answering, or parsing:

In [None]:
# system_prompt replaces the default prompt for this model configuration.
answering_model = ModelConfig(
    id="biology-expert",
    model_name="gpt-4.1-mini",
    model_provider="openai",
    interface="langchain",
    temperature=0.0,
    system_prompt="You are a knowledgeable genomics expert. Provide detailed, accurate answers.",
)

print(
    "Custom system prompt configured:",
    f"  ID: {answering_model.id}",
    f"  System prompt: {answering_model.system_prompt}",
    sep="\n",
)

## MCP Tool Integration

Karenina supports Model Context Protocol (MCP) for tool access during answer generation.

In [None]:
# Summary of the two MCP-related ModelConfig fields and where they apply.
print(
    "MCP Tool Integration:",
    "  ✓ mcp_urls_dict: Maps tool categories to MCP server URLs",
    "  ✓ mcp_tool_filter: Optional whitelist of allowed tools",
    "\nSupported interfaces: langchain, openai_endpoint, openrouter",
    "Not supported with: manual interface",
    "\nFor comprehensive documentation, see MCP Integration Guide",
    sep="\n",
)

## Common Configuration Patterns

### Same Model for All Roles

Use a single model configuration for template generation, answering, and parsing (simplest approach):

In [None]:
from karenina import Benchmark

# A single configuration reused for every role.
model_config = ModelConfig(
    id="gpt-4.1-mini",
    model_name="gpt-4.1-mini",
    model_provider="openai",
    interface="langchain",
    temperature=0.0,
)

print(
    "Single model for all roles:",
    f"  ID: {model_config.id}",
    "  ✓ Template generation",
    "  ✓ Answering",
    "  ✓ Parsing",
    "\nSimpler approach - same model everywhere",
    sep="\n",
)

### Different Models for Different Roles

Configure different models for specific tasks (optimal for cost/quality):

In [None]:
# Stronger model for answering. Anthropic API model IDs use hyphens throughout
# ("claude-sonnet-4-5"), not a dotted version or a "claude-4.5-sonnet" ordering.
answering_model = ModelConfig(
    id="sonnet-4.5",
    model_name="claude-sonnet-4-5",
    model_provider="anthropic",
    interface="langchain",
    temperature=0.0,
)

# Cheaper model for the utility role.
utility_model = ModelConfig(
    id="gpt-4.1-mini",
    model_name="gpt-4.1-mini",
    model_provider="openai",
    interface="langchain",
    temperature=0.0,
)

print("Different models for different roles:")
print(f"  Answering: {answering_model.id} ({answering_model.model_provider})")
print(f"  Utility: {utility_model.id} ({utility_model.model_provider})")
print("\nOptimal for cost/quality balance")

### Configuring Multiple Models

Create multiple model configurations for comparison testing:

In [None]:
# Candidate models for a side-by-side comparison, all at temperature 0.0.
models_to_test = [
    ModelConfig(
        id="gpt-4.1-mini",
        model_name="gpt-4.1-mini",
        model_provider="openai",
        interface="langchain",
        temperature=0.0,
    ),
    # Anthropic API model IDs use hyphens throughout ("claude-sonnet-4-5"), not a dotted version.
    ModelConfig(
        id="claude-sonnet",
        model_name="claude-sonnet-4-5",
        model_provider="anthropic",
        interface="langchain",
        temperature=0.0,
    ),
    ModelConfig(
        id="gemini-flash",
        model_name="gemini-2.5-flash",
        model_provider="google_genai",
        interface="langchain",
        temperature=0.0,
    ),
]

print("Models configured for comparison testing:")
for m in models_to_test:
    print(f"  - {m.id}: {m.model_provider}/{m.model_name}")

## Best Practices

### For Benchmarking

- ✅ Always use `temperature=0.0` for reproducible results
- ✅ Use the same parsing model across different answering models for fair comparison
- ✅ Document your model configurations in your project README
- ✅ Use descriptive `id` values (e.g., "gpt-4.1-mini-biology-expert")

### For API Keys

- ✅ Store API keys in `.env` files (see [Configuration](../configuration.md#api-keys))
- ✅ Use different keys for development and production
- ✅ Rotate keys regularly
- ❌ Never commit API keys to version control
- ⚠️ Only pass keys via `extra_kwargs` when necessary (testing, temporary use)

### For Model Selection

- ✅ Use `gpt-4.1-mini` as the default (fast, cost-effective)
- ✅ Use `gpt-5` or `claude-sonnet-4-5` for higher quality (more expensive)
- ✅ Use same model for all roles initially (simpler)
- ✅ Optimize later: cheaper model for parsing/templates, expensive for answering

## Troubleshooting

### API Key Not Found

**Solution**: Set the API key in environment variables:
```bash
export OPENAI_API_KEY="sk-..."
```

Or pass directly via `extra_kwargs`:
```python
extra_kwargs={"api_key": "sk-..."}
```

### Invalid Model Name

**Solution**: Check the correct model name for your provider:

- OpenAI: `gpt-4.1`, `gpt-4.1-mini`, `gpt-4-turbo`
- Google: `gemini-2.5-flash`, `gemini-1.5-pro`
- Anthropic: `claude-sonnet-4-5`, `claude-3-5-sonnet-20241022`

## Next Steps

- **[Running Verification](verification.md)** - Learn about `VerificationConfig` and running benchmarks
- **[Configuration Guide](../configuration.md)** - Environment variables and API key setup
- **[Configuration Presets](../advanced/presets.md)** - Save and load model configurations
- **[Manual Traces](../advanced/manual-traces.md)** - Detailed guide to pre-computed responses
- **[MCP Integration](../advanced/mcp-integration.md)** - Comprehensive tool integration guide
- **[Templates](templates.md)** - Generate and manage answer templates