In [None]:
# Mock Setup - Hidden in rendered documentation
# This cell is tagged with "hide-cell" in notebook metadata

import hashlib
import json
import sys
import tempfile
from datetime import datetime
from pathlib import Path
from unittest.mock import MagicMock, patch

# Put a local karenina source tree first on sys.path so the imports below
# resolve to that checkout.
# NOTE(review): this is an absolute path on one developer's machine. Anywhere
# else the entry does not exist and the imports below will use whatever
# karenina is otherwise importable (or fail) - consider deriving the path
# from the repository root instead.
sys.path.insert(0, "/Users/carli/Projects/karenina-monorepo/karenina/src")

# Scratch directory for files the examples write; it is deleted again by the
# cleanup hook registered with atexit at the end of this cell.
TEMP_DIR = Path(tempfile.mkdtemp(prefix="karenina_docs_"))

# Import after path is set
from karenina.schemas.workflow.template_results import TemplateResults
from karenina.schemas.workflow.verification.result import VerificationResult
from karenina.schemas.workflow.verification.result_components import (
    VerificationResultMetadata,
    VerificationResultRubric,
    VerificationResultTemplate,
)
from karenina.schemas.workflow.verification_result_set import VerificationResultSet


# Mock LLM response generator
class MockLLMResponse:
    """Minimal stand-in for a LangChain-style chat message.

    Exposes the two attributes this notebook's mocks rely on: ``content``
    (the reply text) and ``response_metadata`` (a fixed token-usage record
    reporting 50 total tokens). ``str()`` of an instance is its content.
    """

    def __init__(self, content: str = "Mock response"):
        # Store the reply text first, then attach the canned usage record.
        self.content = content
        usage = {"total_tokens": 50}
        self.response_metadata = {"token_usage": usage}

    def __str__(self):
        # The textual form of a response is just the reply text.
        return self.content


class MockStructuredOutput:
    """Catch-all parsed-answer object for the mocked structured-output call.

    Four attributes (``count``, ``target``, ``subunits``, ``diseases``) are
    always present, taking their value from the keyword arguments when given
    and from a built-in default otherwise. Any other keyword argument becomes
    an attribute too, unless the instance already has an attribute of that
    name (which includes the method names ``dict`` and ``model_dump``).
    """

    def __init__(self, **kwargs):
        # Defaults for the attributes every instance carries. The list is
        # built per call, so instances never share the same list object.
        fallbacks = {
            "count": 46,
            "target": "BCL2",
            "subunits": 4,
            "diseases": ["asthma", "bronchitis", "pneumonia"],
        }
        for name, fallback in fallbacks.items():
            setattr(self, name, kwargs.get(name, fallback))

        # Remaining keyword arguments are attached only when they would not
        # shadow something the instance already exposes.
        for name, value in kwargs.items():
            if hasattr(self, name):
                continue
            setattr(self, name, value)

    def dict(self):
        """Return the instance attributes whose names do not start with ``_``."""
        public = {}
        for name, value in vars(self).items():
            if name.startswith("_"):
                continue
            public[name] = value
        return public

    def model_dump(self):
        """Alias of :meth:`dict`."""
        return self.dict()


def create_mock_chat_model():
    """Create a mock chat model that returns predictable responses.

    Returns:
        A ``MagicMock`` configured so that:

        * ``invoke(...)`` returns a ``MockLLMResponse`` with the text
          "46 chromosomes";
        * ``ainvoke(...)`` returns an awaitable resolving to the same kind of
          response;
        * ``with_structured_output(...)`` returns a second mock whose
          ``invoke`` / ``ainvoke`` yield a default ``MockStructuredOutput``;
        * ``bind_tools(...)`` returns the model itself.
    """
    # Imported here so this function does not rely on the cell's import list.
    from unittest.mock import AsyncMock

    mock = MagicMock()
    mock.invoke.return_value = MockLLMResponse("46 chromosomes")
    # ainvoke must be awaitable: a plain MagicMock attribute would return the
    # response object directly and `await model.ainvoke(...)` would raise
    # TypeError. AsyncMock returns a coroutine that resolves to return_value.
    mock.ainvoke = AsyncMock(return_value=MockLLMResponse("46 chromosomes"))

    structured_mock = MagicMock()
    structured_mock.invoke.return_value = MockStructuredOutput()
    structured_mock.ainvoke = AsyncMock(return_value=MockStructuredOutput())
    mock.with_structured_output.return_value = structured_mock

    # Binding tools yields the same mock, so chained calls keep working.
    mock.bind_tools.return_value = mock
    return mock


def compute_result_id(question_id: str, answering_model: str, parsing_model: str, timestamp: str) -> str:
    """Derive a deterministic 16-character identifier for a verification run.

    The four arguments, together with an empty MCP-server list and a ``None``
    replicate, are serialised as key-sorted ASCII JSON; the result is the
    first 16 hex characters of that string's SHA-256 digest.
    """
    payload = json.dumps(
        {
            "question_id": question_id,
            "answering_model": answering_model,
            "parsing_model": parsing_model,
            "answering_mcp_servers": [],
            "replicate": None,
            "timestamp": timestamp,
        },
        sort_keys=True,  # key order in the literal above is irrelevant
        ensure_ascii=True,
    )
    digest = hashlib.sha256(payload.encode("utf-8")).hexdigest()
    return digest[:16]


def create_mock_verification_result(question_id: str, question_text: str, answer: str, passed: bool = True):
    """Build a canned ``VerificationResult`` for one question.

    Args:
        question_id: Identifier stored in the metadata; also hashed (MD5) to
            produce the template id.
        question_text: Question text stored in the metadata.
        answer: Used as the raw answer, embedded in the fake LLM response
            text, and as the parsed value on both the LLM and ground-truth side.
        passed: Value reported as the template ``verify_result``.

    Returns:
        A ``VerificationResult`` with metadata, template and rubric sections.
        Token counts, rubric scores, execution time and the model name are
        fixed constants; only the timestamp (current local time) varies.
    """
    timestamp = datetime.now().isoformat()
    # An MD5 hex digest is always 32 characters long, so it is used whole.
    template_id = hashlib.md5(str(question_id).encode()).hexdigest()
    model_name = "gpt-4.1-mini"

    # Template section: the "LLM" and the ground truth agree on `answer`.
    token_usage = {
        "answer_generation": {"total_tokens": 50},
        "parsing": {"total_tokens": 30},
        "total": {"total_tokens": 80},
    }
    template = VerificationResultTemplate(
        raw_llm_response=f"The answer is {answer}.",
        parsed_llm_response={"value": answer},
        parsed_gt_response={"value": answer},
        verify_result=passed,
        template_verification_performed=True,
        usage_metadata=token_usage,
        abstention_check_performed=True,
        abstention_detected=False,
    )

    # Rubric section: one numeric and one boolean trait score.
    trait_scores = {"Conciseness": 4, "Clarity": True}
    rubric = VerificationResultRubric(
        rubric_evaluation_performed=True,
        llm_trait_scores=trait_scores,
    )

    # Metadata section: the same model name is used for answering and parsing.
    metadata = VerificationResultMetadata(
        question_id=question_id,
        template_id=template_id,
        completed_without_errors=True,
        question_text=question_text,
        raw_answer=answer,
        answering_model=model_name,
        parsing_model=model_name,
        execution_time=1.5,
        timestamp=timestamp,
        result_id=compute_result_id(question_id, model_name, model_name, timestamp),
    )

    return VerificationResult(metadata=metadata, template=template, rubric=rubric)


# Store original methods
_original_run_verification = None
_original_generate_all_templates = None


def mock_generate_all_templates(self, *args, **kwargs):
    """Stand-in for ``Benchmark.generate_all_templates`` that does nothing.

    Accepts and ignores any arguments and returns a new empty dict, so the
    notebook can call the method without triggering template generation.
    """
    # No templates are produced in the documentation build.
    return dict()


def mock_run_verification(self, config):
    """Stand-in for ``Benchmark.run_verification`` that fabricates results.

    Every question returned by ``self.get_finished_questions(ids_only=False)``
    gets one mock result. The question text is matched (case-insensitively,
    by substring) against a small keyword table; the first matching entry
    supplies the answer and pass flag. Questions with no match are reported
    as passed with their own ``raw_answer`` (empty string when absent).

    Args:
        config: Accepted for signature compatibility; not read.

    Returns:
        A ``VerificationResultSet``; empty when there are no finished questions.
    """
    finished = self.get_finished_questions(ids_only=False)
    if not len(finished):
        # Nothing to verify: hand back an empty result set.
        return VerificationResultSet(results=[], template_results=TemplateResults(results=[]))

    # Keyword table; entries are tried in this order and the first hit wins.
    canned_answers = [
        {"keywords": ["chromosomes"], "answer": "46", "passed": True},
        {"keywords": ["venetoclax", "bcl2"], "answer": "BCL2", "passed": True},
        {"keywords": ["hemoglobin", "subunits"], "answer": "4", "passed": True},
        {"keywords": ["inflammatory", "lung"], "answer": "asthma, bronchitis, pneumonia", "passed": True},
    ]

    results = []
    for question in finished:
        q_id = question["id"]
        q_text = question["question"]
        fallback_answer = question.get("raw_answer", "")
        lowered = q_text.lower()

        match = next(
            (entry for entry in canned_answers if any(kw in lowered for kw in entry["keywords"])),
            None,
        )
        if match is None:
            passed, mock_ans = True, fallback_answer
        else:
            passed, mock_ans = match["passed"], match["answer"]

        results.append(
            create_mock_verification_result(question_id=q_id, question_text=q_text, answer=mock_ans, passed=passed)
        )

    return VerificationResultSet(
        results=results,
        template_results=TemplateResults(results=results),
        rubric_results=None,
    )


# Replace the chat-model entry points with factories that return mock models.
# NOTE(review): the karenina schema imports at the top of this cell have
# already executed, so this only precedes modules imported afterwards (such as
# karenina.benchmark below). Any module that bound these names earlier via
# `from ... import ...` keeps the real objects - confirm that is acceptable.
_llm_patches = [
    # Each side_effect accepts keyword arguments only; calling a patched
    # target with positional arguments would raise TypeError.
    patch("langchain_openai.ChatOpenAI", side_effect=lambda **kwargs: create_mock_chat_model()),
    patch("langchain_anthropic.ChatAnthropic", side_effect=lambda **kwargs: create_mock_chat_model()),
    patch("langchain_google_genai.ChatGoogleGenerativeAI", side_effect=lambda **kwargs: create_mock_chat_model()),
    patch(
        "karenina.infrastructure.llm.interface.init_chat_model_unified",
        side_effect=lambda **kwargs: create_mock_chat_model(),
    ),
]

# The patch objects are inert until started. They stay active until
# _cleanup() stops them.
# NOTE(review): starting a patch has to import its target module, so a missing
# provider package would presumably fail here - confirm all are installed in
# the docs build environment.
for p in _llm_patches:
    p.start()

# Patch Benchmark methods
from karenina.benchmark import Benchmark

# Keep the real methods so _cleanup() can restore them, then swap in the mocks
# on the class itself (affects every Benchmark instance in this process).
_original_run_verification = Benchmark.run_verification
_original_generate_all_templates = Benchmark.generate_all_templates
Benchmark.run_verification = mock_run_verification
Benchmark.generate_all_templates = mock_generate_all_templates


def temp_path(filename: str) -> Path:
    """Return the location of ``filename`` inside the notebook's temp directory."""
    return TEMP_DIR.joinpath(filename)


# Cleanup
import atexit
import shutil


def _cleanup():
    """Undo the mock setup performed by this cell.

    Restores the original ``Benchmark.run_verification`` and
    ``Benchmark.generate_all_templates`` methods, stops every LLM patch and
    removes the temporary directory. All steps are best-effort: a patch that
    fails to stop does not prevent the remaining patches or the directory
    removal from being handled.
    """
    Benchmark.run_verification = _original_run_verification
    Benchmark.generate_all_templates = _original_generate_all_templates
    for p in _llm_patches:
        try:
            p.stop()
        except Exception:
            # Deliberately ignored so teardown continues with the next patch.
            # Only Exception is caught (not a bare `except:`), so
            # KeyboardInterrupt and SystemExit still propagate.
            pass
    # ignore_errors=True: a missing or partially removed directory is fine.
    shutil.rmtree(TEMP_DIR, ignore_errors=True)


# Run _cleanup automatically when the interpreter exits.
atexit.register(_cleanup)

# Status banner for whoever executes the hidden setup cell.
print("✓ Mock setup complete")
print(f"✓ Temp directory: {TEMP_DIR}")
# NOTE(review): this line repeats the hard-coded path literal; it does not
# report where the karenina package was actually imported from.
print("✓ Karenina package loaded from: /Users/carli/Projects/karenina-monorepo/karenina/src")
print("✓ Mock verification results enabled - examples will show realistic output")

# Few-Shot Prompting

Few-shot prompting is a technique where example question-answer pairs are provided to the LLM before asking the main question, helping guide responses toward expected formats, styles, and content.

## What is Few-Shot Prompting?

**Few-shot prompting** provides the LLM with examples of the task before asking it to perform the same task. For example:

```
Question: What is the approved drug target of Venetoclax?
Answer: BCL2

Question: How many chromosomes are in a human somatic cell?
Answer: 46

Question: How many protein subunits does hemoglobin A have?
Answer: [Model will answer here]
```

This technique can significantly improve:

- **Response quality**: Models learn from good examples
- **Consistency**: Responses follow demonstrated patterns
- **Format adherence**: Models match example structure
- **Accuracy**: Examples clarify expectations

## Why Use Few-Shot Prompting?

### 1. Improve Answer Quality

Models perform better when shown examples:

- Without few-shot: Verbose answer like "Hemoglobin A is a tetrameric protein consisting of two alpha and two beta subunits..."
- With few-shot: Concise answer like "4"

In [None]:
# Typical reply without few-shot examples: a full explanatory sentence
verbose_answer = "Hemoglobin A is a tetrameric protein consisting of two alpha and two beta subunits..."

# Typical reply with few-shot examples: as brief as the examples themselves
concise_answer = "4"

# Show the first 50 characters of the long reply next to the short one
print("Verbose: " + verbose_answer[:50] + "...")
print("Concise: " + concise_answer)

### 2. Enforce Formatting

Guide models to specific answer formats by showing examples.

In [None]:
# Both examples answer with a bare number, which is the format being taught
few_shot_examples = [
    dict(question="How many chromosomes...", answer="46"),
    dict(question="How many subunits...", answer="4"),
]

print("Few-shot examples that demonstrate format:")
for ex in few_shot_examples:
    # One question line followed by one answer line per example
    print(f"  Q: {ex['question']}\n  A: {ex['answer']}")
print()
print("Model learns to give brief numerical answers")

### 3. Demonstrate Style

Show models the desired response style (e.g., technical nomenclature).

In [None]:
# Examples show technical nomenclature
few_shot_examples = [
    {"question": "What is the target of Venetoclax?", "answer": "BCL2"},
    {"question": "What does TP53 encode?", "answer": "tumor protein p53"},
]

print("Examples that demonstrate technical nomenclature:")
for ex in few_shot_examples:
    print(f"  {ex['answer']}")
print("\nModel learns to use standard nomenclature")

## Basic Configuration

### Simple K-Shot Mode

Use the same number of examples for all questions.

In [None]:
from karenina import Benchmark
from karenina.schemas import FewShotConfig, ModelConfig, VerificationConfig

# Create few-shot config with k=3 (use 3 examples per question)
few_shot_config = FewShotConfig(enabled=True, global_mode="k-shot", global_k=3)

# Create verification config with few-shot
model_config = ModelConfig(
    id="gpt-4.1-mini", model_provider="openai", model_name="gpt-4.1-mini", temperature=0.0, interface="langchain"
)

config = VerificationConfig(
    answering_models=[model_config], parsing_models=[model_config], few_shot_config=few_shot_config
)

print(f"Few-shot enabled: {few_shot_config.enabled}")
print(f"Global mode: {few_shot_config.global_mode}")
print(f"Global k: {few_shot_config.global_k}")

### Use All Available Examples

Use every available example for each question.

In [None]:
# Configure to use all examples
few_shot_config_all = FewShotConfig(enabled=True, global_mode="all")

print(f"Global mode: {few_shot_config_all.global_mode}")
print("\nWhen to use: Maximum context, small number of high-quality examples.")

## Adding Examples to Questions

### When Creating Questions

Add few-shot examples when creating questions.

In [None]:
from karenina import Benchmark

benchmark = Benchmark.create(name="Genomics Benchmark")

# Add question with few-shot examples
benchmark.add_question(
    question="What is the approved drug target of Venetoclax?",
    raw_answer="BCL2",
    few_shot_examples=[
        {"question": "What is the approved drug target of Imatinib?", "answer": "BCR-ABL tyrosine kinase"},
        {"question": "What is the approved drug target of Trastuzumab?", "answer": "HER2"},
        {"question": "What is the approved drug target of Rituximab?", "answer": "CD20"},
    ],
)

print("Question added with few-shot examples")

### Adding Examples Later

Add examples to existing questions.

In [None]:
# Example: Adding few-shot examples to an existing question
# (In practice, you would load your benchmark first)
# The steps below are left commented out: they refer to a benchmark file on disk.

# Load benchmark
# benchmark = Benchmark.load("genomics_benchmark.jsonld")

# Get question (here: the first question ID in the benchmark)
# question_id = list(benchmark.questions.keys())[0]
# question = benchmark.get_question(question_id)

# Add few-shot examples
# question.few_shot_examples = [
#     {"question": "How many autosomal chromosome pairs...", "answer": "22"},
#     {"question": "How many sex chromosomes...", "answer": "2"},
# ]

# Save updated benchmark
# benchmark.save("genomics_benchmark.jsonld")

print("Examples shown above demonstrate adding examples to existing questions")

## Complete Example

Here's an end-to-end workflow using few-shot prompting with a genomics benchmark.

In [None]:
from pathlib import Path

from karenina import Benchmark
from karenina.schemas import FewShotConfig, ModelConfig, VerificationConfig

# ============================================================
# STEP 1: Create benchmark with genomics questions
# ============================================================

benchmark = Benchmark.create(
    name="Genomics Knowledge Benchmark",
    description="Testing LLM knowledge of genomics with few-shot prompting",
    version="1.0.0",
)

print("Step 1: Created benchmark")

In [None]:
# ============================================================
# STEP 2: Add questions with few-shot examples
# ============================================================

# Question 1: Drug target with similar drug examples
benchmark.add_question(
    question="What is the approved drug target of Venetoclax?",
    raw_answer="BCL2",
    author={"name": "Pharma Curator"},
    few_shot_examples=[
        {"question": "What is the approved drug target of Imatinib?", "answer": "BCR-ABL tyrosine kinase"},
        {"question": "What is the approved drug target of Trastuzumab?", "answer": "HER2"},
    ],
)

# Question 2: Numerical answer with similar numerical examples
benchmark.add_question(
    question="How many chromosomes are in a human somatic cell?",
    raw_answer="46",
    author={"name": "Genetics Curator"},
    few_shot_examples=[
        {"question": "How many autosomal chromosome pairs are in humans?", "answer": "22"},
        {"question": "How many sex chromosomes are in humans?", "answer": "2"},
    ],
)

# Question 3: Protein structure with similar structure examples
benchmark.add_question(
    question="How many protein subunits does hemoglobin A have?",
    raw_answer="4",
    author={"name": "Biochemistry Curator"},
    few_shot_examples=[
        {"question": "How many subunits does RNA polymerase have?", "answer": "5"},
        {"question": "How many catalytic subunits does DNA polymerase III have?", "answer": "3"},
    ],
)

print("Step 2: Added 3 questions with few-shot examples")

In [None]:
# ============================================================
# STEP 3: Generate templates
# ============================================================

print("Step 3: Generating templates...")
benchmark.generate_all_templates(model="gpt-4.1-mini", model_provider="openai", temperature=0.0, interface="langchain")
print("✓ Templates generated")

In [None]:
# ============================================================
# STEP 4: Configure few-shot prompting
# ============================================================

# Create model config for verification
model_config = ModelConfig(
    id="gpt-4.1-mini", model_provider="openai", model_name="gpt-4.1-mini", temperature=0.0, interface="langchain"
)

# Option A: Use k-shot mode (same number of examples per question)
few_shot_config = FewShotConfig(
    enabled=True,
    global_mode="k-shot",
    global_k=2,  # Use 2 examples per question
)

print("Step 4: Configured few-shot (k-shot, k=2)")
print(f"  Enabled: {few_shot_config.enabled}")
print(f"  Mode: {few_shot_config.global_mode}")
print(f"  K value: {few_shot_config.global_k}")

In [None]:
# ============================================================
# STEP 5: Run verification with few-shot
# ============================================================

print("\nStep 5: Running verification with few-shot prompting...")
config = VerificationConfig(
    answering_models=[model_config], parsing_models=[model_config], few_shot_config=few_shot_config
)

results = benchmark.run_verification(config)
print(f"✓ Verification complete: {len(results.results)} questions")

In [None]:
# ============================================================
# STEP 6: Analyze results
# ============================================================

# NOTE(review): `verify_result` and `question_id` are read directly from each
# result below, while the raw response is read via `result.template`. The mock
# results in this notebook are built with those values nested under
# `.template` and `.metadata` - confirm that VerificationResult also exposes
# them at the top level.

# Check if we have results (in mock environment, questions may not be "finished")
if len(results.results) > 0:
    # Count the results flagged as verified and report them as a percentage.
    passed = sum(1 for r in results.results if r.verify_result)
    print(f"\nPass rate: {passed}/{len(results.results)} ({passed / len(results.results) * 100:.1f}%)")

    # Show individual results
    for result in results.results:
        question = benchmark.get_question(result.question_id)
        print(f"\nQuestion: {question['question']}")
        print(f"  Expected: {question['raw_answer']}")
        # Only the first 50 characters of the raw response are shown.
        print(f"  Model answer: {result.template.raw_llm_response[:50]}...")
        print(f"  Correct: {'✓' if result.verify_result else '✗'}")
else:
    print("\nNo results to display - in production, this would show verification results")
    print("Mock environment: Questions need templates to be verified")

In [None]:
# Save benchmark with results
save_path = temp_path("genomics_benchmark_few_shot.jsonld")
benchmark.save(str(save_path))
print(f"\n✓ Benchmark saved to: {save_path}")

## Advanced Configurations

### Different K Values Per Question

Use different numbers of examples for different questions.

In [None]:
# Configure different k values per question
few_shot_config_per_question = FewShotConfig.k_shot_for_questions(
    question_k_mapping={
        "question_id_1": 5,  # Use 5 examples for complex question
        "question_id_2": 2,  # Use 2 examples for simple question
        "question_id_3": 3,  # Use 3 examples
    },
    global_k=3,  # Fallback for questions not in mapping
)

print("Per-question k-shot configuration:")
for qid, cfg in few_shot_config_per_question.question_configs.items():
    print(f"  {qid}: k={cfg.k}")
print(f"\nGlobal fallback k: {few_shot_config_per_question.global_k}")
print("\nWhen to use: Questions have varying complexity levels.")

### Custom Example Selection by Index

Manually select specific examples by their position.

In [None]:
# Select specific examples by index (0-based)
# In practice, get question IDs from your benchmark
# question_ids = list(benchmark.questions.keys())

# Example configuration (using placeholder IDs)
few_shot_config_custom = FewShotConfig.from_index_selections(
    {
        "question_1": [0, 1],  # Use first 2 examples
        "question_2": [0, 2],  # Use 1st and 3rd examples
        "question_3": [1, 2, 3],  # Use 2nd, 3rd, and 4th examples
    }
)

print("Custom index selection configuration:")
for qid, cfg in few_shot_config_custom.question_configs.items():
    print(f"  {qid}: indices={cfg.selected_examples}")
print("\nWhen to use: Fine-grained control over which examples are used.")

### Adding External Examples

Add examples that aren't from the question's available pool.

In [None]:
# Create config with global external examples
few_shot_config_external = FewShotConfig(
    enabled=True,
    global_mode="k-shot",
    global_k=2,
    global_external_examples=[
        {"question": "What is the molecular weight of glucose?", "answer": "180.16 g/mol"},
        {"question": "What is the pH of neutral water?", "answer": "7.0"},
    ],
)

print(f"Global external examples: {len(few_shot_config_external.global_external_examples)}")
for i, ex in enumerate(few_shot_config_external.global_external_examples, 1):
    print(f"  {i}. Q: {ex['question'][:40]}...")
    print(f"     A: {ex['answer']}")
print("\nWhen to use: Want to include domain-specific high-quality examples for all questions.")

## Modes Overview

### "k-shot" Mode

Use the first k examples for each question. Best for consistent number of examples across all questions.

In [None]:
# k-shot mode
k_shot_mode = FewShotConfig(
    global_mode="k-shot",
    global_k=3,  # Use 3 examples
)
print(f"Mode: {k_shot_mode.global_mode}")
print(f"K value: {k_shot_mode.global_k}")
print("Best for: Consistent number of examples across all questions.")

### "all" Mode

Use all available examples for each question. Best for small number of high-quality examples.

In [None]:
# all mode
all_mode = FewShotConfig(global_mode="all")
print(f"Mode: {all_mode.global_mode}")
print("Best for: Small number of high-quality examples, maximum context.")

### "custom" Mode

Manually select specific examples. Best for fine-grained control.

In [None]:
# custom mode via from_index_selections
custom_mode = FewShotConfig.from_index_selections(
    {
        "question_1": [0, 2, 4],  # Select by index
    }
)
print(f"Mode: {custom_mode.global_mode}")
print("Best for: Fine-grained control, curated example selection.")

### "none" Mode

Disable few-shot for specific questions. Best for testing impact of few-shot.

In [None]:
# none mode for specific questions
from karenina.schemas import QuestionFewShotConfig

none_mode = FewShotConfig(
    global_mode="k-shot",
    global_k=3,
    question_configs={
        "special_question_id": QuestionFewShotConfig(mode="none")  # No examples
    },
)
print(f"Global mode: {none_mode.global_mode}")
print("Special question has mode='none'")
print("Best for: Testing impact of few-shot on specific questions.")

## Prompt Format

Few-shot prompts are constructed in a simple Q&A format:

```
Question: What is the approved drug target of Imatinib?
Answer: BCR-ABL tyrosine kinase

Question: What is the approved drug target of Trastuzumab?
Answer: HER2

Question: What is the approved drug target of Venetoclax?
Answer: [Model generates answer here]
```

The LLM sees the examples before answering, learning from their format and content.

## When to Use Few-Shot

### ✅ Use Few-Shot When:

- **Enforcing formats**: Need specific answer structure (numerical, gene symbols, etc.)
- **Improving conciseness**: Models tend to be verbose, examples show brevity
- **Demonstrating style**: Want technical nomenclature or specific terminology
- **Complex tasks**: Task benefits from seeing examples
- **Consistency matters**: Need similar answers across similar questions

### ❌ Don't Use Few-Shot When:

- **Simple tasks**: Model already performs well without examples
- **Token limits**: The model has a limited context window that the examples would crowd
- **No good examples**: Don't have high-quality representative examples
- **Testing baselines**: Measuring model performance without assistance
- **Fast iteration**: Few-shot would add unnecessary complexity during initial testing

## Best Practices

### 1. Start with K-Shot Mode

Begin with k-shot before moving to custom selection.

In [None]:
# Start simple
simple_config = FewShotConfig(global_mode="k-shot", global_k=3)
print("Start with simple k-shot configuration")
print(f"Mode: {simple_config.global_mode}, K: {simple_config.global_k}")
print("Can iterate to custom if needed")

### 2. Use 2-3 Examples

More examples aren't always better. Start small.

In [None]:
# Good starting point
config_k2 = FewShotConfig(global_mode="k-shot", global_k=2)

# Can increase if needed
config_k5 = FewShotConfig(global_mode="k-shot", global_k=5)

print("Recommended: Start with k=2-3")
print(f"  k=2: {config_k2.global_k} examples")
print(f"  k=5: {config_k5.global_k} examples (increase if needed)")
print("\nWhy: Diminishing returns after 3-5 examples, increased token costs.")

### 3. Choose Representative Examples

Select examples that represent the task well.

In [None]:
# Good: Similar domain, clear answers
good_examples = [
    {"question": "What is the target of Venetoclax?", "answer": "BCL2"},
    {"question": "What is the target of Imatinib?", "answer": "BCR-ABL"},
]

print("Good examples (similar domain, clear answers):")
for ex in good_examples:
    print(f"  {ex['answer']}")

print("\nAvoid: Unrelated domain examples like math problems or general knowledge")

### 4. Match Example Format to Expected Answers

Examples should match the format you want.

In [None]:
# For concise numerical answers
numerical_examples = [
    {"question": "How many chromosomes...", "answer": "46"},
    {"question": "How many subunits...", "answer": "4"},
]

print("Numerical format examples:")
for ex in numerical_examples:
    print(f"  {ex['answer']}")

print("\nTip: Match example style to desired output style")

### 5. Test With and Without Few-Shot

Measure the impact of few-shot prompting.

In [None]:
# Example of comparing with and without few-shot
# In practice, you would run both verifications

print("Testing methodology:")
print("")
print("1. Baseline (no few-shot):")
print("   config_baseline = VerificationConfig(")
print("       answering_models=[model_config],")
print("       parsing_models=[model_config]")
print("   )")
print("")
print("2. With few-shot:")
print("   config_few_shot = VerificationConfig(")
print("       answering_models=[model_config],")
print("       parsing_models=[model_config],")
print("       few_shot_config=FewShotConfig(global_mode='k-shot', global_k=3)")
print("   )")
print("")
print("3. Compare pass rates to measure improvement")

### 6. Monitor Token Usage

More examples consume more tokens:

- Each example: ~50-200 tokens (depending on length)
- 3 examples: ~150-600 tokens
- 10 examples: ~500-2000 tokens

**Watch for**: Context window limits, increased API costs.

### 7. Use External Examples Sparingly

Only add external examples when necessary.

In [None]:
# Good: Add domain-specific high-quality examples
good_external = FewShotConfig(
    global_external_examples=[{"question": "High-quality domain example", "answer": "Perfect answer"}]
)

print("Good: 1-2 high-quality external examples")
print("Bad: Too many unrelated external examples (50+)")
print("")
print(f"External examples count: {len(good_external.global_external_examples)}")

## Troubleshooting

### Issue 1: Examples Not Being Used

**Symptom**: Few-shot enabled but no improvement in results.

In [None]:
# Verify few-shot is enabled
config_check = FewShotConfig(enabled=True, global_mode="k-shot", global_k=3)

print(f"Few-shot enabled: {config_check.enabled}")
print(f"Global mode: {config_check.global_mode}")
print(f"Global k: {config_check.global_k}")

print("\nSolutions:")
print("1. Verify few-shot is enabled")
print("2. Check questions have examples")
print("3. Check mode isn't 'none'")

### Issue 2: Too Many Examples

**Symptom**: LLM context limit exceeded, slow responses.

In [None]:
# Solution: Reduce k value
reduced_config = FewShotConfig(global_mode="k-shot", global_k=2)

print("Solution: Reduce k value")
print(f"Reduced k to: {reduced_config.global_k}")
print("")
print("Alternative: Switch to custom selection to pick specific examples")

### Issue 3: Poor Example Quality

**Symptom**: Few-shot makes results worse.

In [None]:
# Solution: Use custom selection to pick better examples
better_config = FewShotConfig.from_index_selections(
    {
        "question_1": [0, 2],  # Skip poor example at index 1
    }
)

print("Solutions:")
print("1. Review example quality")
print("2. Use custom selection to skip poor examples")
print("3. Add external high-quality examples")
print("")
print(f"Custom mode: {better_config.global_mode}")

### Issue 4: Inconsistent Results

**Symptom**: Results vary between runs.

In [None]:
# Solution: Set temperature to 0
deterministic_config = ModelConfig(
    id="gpt-4.1-mini",
    model_provider="openai",
    model_name="gpt-4.1-mini",
    temperature=0.0,  # Deterministic
    interface="langchain",
)

print("Solutions:")
print("1. Set temperature to 0")
print(f"   Temperature: {deterministic_config.temperature}")
print("")
print("2. Use deterministic example selection")
print("   K-shot mode uses question ID as seed for reproducibility")

## Performance Considerations

### Token Usage

Few-shot prompting increases token consumption:

| Examples | Estimated Tokens (Input) | Cost Impact |
|----------|-------------------------|-------------|
| 0 (no few-shot) | Baseline | Baseline |
| 2 examples | +100-400 tokens | +5-10% |
| 5 examples | +250-1000 tokens | +10-20% |
| 10 examples | +500-2000 tokens | +20-40% |

### Latency

More examples slightly increase latency:

- Prompt processing time: ~50-100ms per 100 additional input tokens
- 3 examples: +50-200ms additional latency

**Recommendation:** Start with k=2-3 to balance quality and cost.

## Next Steps

Once you have few-shot prompting configured, explore these related guides:

- [Verification](../using-karenina/verification.md) - Run verifications with few-shot
- [Presets](presets.md) - Save few-shot configurations in presets
- [Deep-Judgment](deep-judgment.md) - Combine with deep-judgment parsing
- [Templates](../using-karenina/templates.md) - Design templates that work with few-shot

## Related Documentation

- [Verification](../using-karenina/verification.md) - Core verification workflow
- [Adding Questions](../using-karenina/adding-questions.md) - How to add questions with examples
- [Presets](presets.md) - Save few-shot configurations
- [Templates](../using-karenina/templates.md) - Template creation