In [None]:
# Mock Setup - Hidden in rendered documentation
# This cell is tagged with "hide-cell" in notebook metadata

import hashlib
import json
import os
import sys
import tempfile
from datetime import datetime
from pathlib import Path
from unittest.mock import MagicMock, patch

# Add karenina to path
# NOTE(review): this is an absolute path to one developer's checkout. On a machine
# where it does not exist, the imports below presumably rely on an installed
# karenina package instead - confirm before relying on this cell elsewhere.
sys.path.insert(0, "/Users/carli/Projects/karenina-monorepo/karenina/src")

# Temporary directory for file operations
# mkdtemp() creates the directory right away; it is removed by _cleanup() at interpreter exit.
TEMP_DIR = Path(tempfile.mkdtemp(prefix="karenina_docs_"))

# Import after path is set
from karenina.schemas.workflow.template_results import TemplateResults
from karenina.schemas.workflow.verification.result import VerificationResult
from karenina.schemas.workflow.verification.result_components import (
    VerificationResultMetadata,
    VerificationResultRubric,
    VerificationResultTemplate,
)
from karenina.schemas.workflow.verification_result_set import VerificationResultSet


# Mock LLM response generator
class MockLLMResponse:
    """Minimal stand-in for a LangChain chat message.

    Carries the two attributes the mocks in this notebook set: ``content``
    (the reply text) and ``response_metadata`` (a fixed token-usage record).
    """

    def __init__(self, content: str = "Mock response") -> None:
        # Token usage is a constant; the mock never measures anything.
        self.response_metadata = {"token_usage": {"total_tokens": 50}}
        self.content = content

    def __str__(self) -> str:
        """Return the reply text, so printing the object shows the content."""
        return self.content


class MockStructuredOutput:
    """Mock parsed-answer object with a few pre-filled attributes.

    ``count``, ``target``, ``subunits`` and ``diseases`` always exist (taken
    from the keyword arguments when given, otherwise from fixed defaults).
    Any other keyword argument becomes an attribute as well, unless the
    instance already exposes something under that name.
    """

    def __init__(self, **kwargs):
        # Built per call so every instance gets its own ``diseases`` list.
        defaults = {
            "count": 46,
            "target": "BCL2",
            "subunits": 4,
            "diseases": ["asthma", "bronchitis", "pneumonia"],
        }
        for name, fallback in defaults.items():
            setattr(self, name, kwargs.get(name, fallback))

        for name, value in kwargs.items():
            # Skips the four defaults set above and anything that would
            # shadow an existing attribute or method (e.g. ``dict``).
            if hasattr(self, name):
                continue
            setattr(self, name, value)

    def dict(self):
        """Return the instance attributes whose names do not start with ``_``."""
        public = {}
        for key, value in vars(self).items():
            if key.startswith("_"):
                continue
            public[key] = value
        return public

    def model_dump(self):
        """Alias for :meth:`dict`."""
        return self.dict()


def create_mock_chat_model():
    """Create a mock chat model that returns predictable responses.

    The returned ``MagicMock`` supports the calls configured here:

    * ``invoke(...)`` returns a ``MockLLMResponse``.
    * ``ainvoke(...)`` is an ``AsyncMock``, so it can be awaited and yields a
      ``MockLLMResponse`` (a plain ``MagicMock`` return value is not awaitable).
    * ``with_structured_output(...)`` returns a second mock whose
      ``invoke``/``ainvoke`` produce a ``MockStructuredOutput``.
    * ``bind_tools(...)`` returns the model itself.

    Returns:
        The configured ``MagicMock``.
    """
    # Imported locally: the module-level import only brings in MagicMock and patch.
    from unittest.mock import AsyncMock

    answer = "46 chromosomes"

    mock = MagicMock()
    mock.invoke.return_value = MockLLMResponse(answer)
    mock.ainvoke = AsyncMock(return_value=MockLLMResponse(answer))

    structured_mock = MagicMock()
    structured_mock.invoke.return_value = MockStructuredOutput()
    structured_mock.ainvoke = AsyncMock(return_value=MockStructuredOutput())

    mock.with_structured_output.return_value = structured_mock
    mock.bind_tools.return_value = mock
    return mock


def compute_result_id(question_id: str, answering_model: str, parsing_model: str, timestamp: str) -> str:
    """Return the first 16 hex characters of a SHA-256 over the result's identity.

    The identity is serialized as JSON with sorted keys, so the same four
    arguments always give the same ID. ``answering_mcp_servers`` and
    ``replicate`` are part of the hashed payload but fixed to ``[]`` and
    ``None`` here.
    """
    payload = json.dumps(
        {
            "question_id": question_id,
            "answering_model": answering_model,
            "parsing_model": parsing_model,
            "timestamp": timestamp,
            "answering_mcp_servers": [],
            "replicate": None,
        },
        sort_keys=True,  # key order above is irrelevant to the hash
        ensure_ascii=True,
    )
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()[:16]


def create_mock_verification_result(
    question_id: str, question_text: str, answer: str, passed: bool = True, embedding_check: dict | None = None
):
    """Build a mock ``VerificationResult`` for one question.

    Args:
        question_id: Identifier stored in the metadata and hashed into the
            template ID and result ID.
        question_text: Question text stored in the metadata.
        answer: Used as the raw answer, as both parsed responses, and inside
            the fabricated raw LLM response.
        passed: Value stored as the template's ``verify_result``.
        embedding_check: Optional dict with the keys ``performed``,
            ``similarity``, ``override`` and ``model``. When it is ``None``
            or empty, the result records that no embedding check ran.

    Returns:
        A ``VerificationResult`` with metadata, template and rubric sections.
    """
    model_name = "gpt-4.1-mini"
    timestamp = datetime.now().isoformat()
    # An MD5 hex digest is already exactly 32 characters long.
    template_id = hashlib.md5(str(question_id).encode()).hexdigest()

    # Treat "no dict" and "empty dict" alike: every lookup falls back to its default.
    emb = embedding_check or {}

    token_usage = {
        "answer_generation": {"total_tokens": 50},
        "parsing": {"total_tokens": 30},
        "total": {"total_tokens": 80},
    }

    # Template section, including the embedding check fields
    template = VerificationResultTemplate(
        raw_llm_response=f"The answer is {answer}.",
        parsed_llm_response={"value": answer},
        parsed_gt_response={"value": answer},
        verify_result=passed,
        template_verification_performed=True,
        usage_metadata=token_usage,
        abstention_check_performed=True,
        abstention_detected=False,
        embedding_check_performed=emb.get("performed", False),
        embedding_similarity_score=emb.get("similarity"),
        embedding_override_applied=emb.get("override", False),
        embedding_model_used=emb.get("model"),
    )

    # Rubric section with two fixed trait scores
    trait_scores = {"Conciseness": 4, "Clarity": True}
    rubric = VerificationResultRubric(
        rubric_evaluation_performed=True,
        llm_trait_scores=trait_scores,
    )

    # Metadata section; the same model name is used for answering and parsing
    metadata = VerificationResultMetadata(
        question_id=question_id,
        template_id=template_id,
        completed_without_errors=True,
        question_text=question_text,
        raw_answer=answer,
        answering_model=model_name,
        parsing_model=model_name,
        execution_time=1.5,
        timestamp=timestamp,
        result_id=compute_result_id(question_id, model_name, model_name, timestamp),
    )

    return VerificationResult(metadata=metadata, template=template, rubric=rubric)


# Store original run_verification
# Starts as None; it is assigned the real Benchmark.run_verification further
# down, just before that method is replaced by the mock.
_original_run_verification = None

# Store the benchmark questions globally for the mock
# mock_add_question appends one dict per added question, and
# mock_run_verification reads this list when the benchmark reports no
# finished questions. The list is shared by every Benchmark in the session.
_benchmark_questions = []


def mock_run_verification(self, config):
    """Stand-in for ``Benchmark.run_verification`` that fabricates results.

    No model is called. One mock ``VerificationResult`` is built per question.
    A question whose text contains a known keyword gets a canned answer, a
    failing initial verification, and embedding-check metadata describing an
    applied override. Any other question passes with its own raw answer and
    no embedding check.

    Args:
        self: The ``Benchmark`` instance this function is patched onto.
        config: Accepted for signature compatibility; not read here.

    Returns:
        A ``VerificationResultSet`` wrapping the mock results. It is empty when
        there are neither finished questions nor recorded questions.
    """
    # Get all finished questions
    finished = self.get_finished_questions(ids_only=False)

    # With nothing finished, fall back to the questions recorded by mock_add_question
    questions_to_process = _benchmark_questions if len(finished) == 0 else finished

    # If still no questions, return empty results
    if len(questions_to_process) == 0:
        return VerificationResultSet(results=[], template_results=TemplateResults(results=[]))

    results = []

    # Map question keywords to expected answers and embedding check scenarios
    mock_data = [
        {
            "keywords": ["venetoclax", "bcl2"],
            "answer": "BCL2",
            "passed": False,  # Initial verification fails
            "embedding": {"performed": True, "similarity": 0.9123, "override": True, "model": "all-MiniLM-L6-v2"},
        },
        {
            "keywords": ["chromosomes"],
            "answer": "46",
            "passed": False,  # Initial verification fails
            "embedding": {"performed": True, "similarity": 0.8801, "override": True, "model": "all-MiniLM-L6-v2"},
        },
        {
            "keywords": ["hemoglobin", "subunits"],
            "answer": "4",
            "passed": False,  # Initial verification fails
            "embedding": {"performed": True, "similarity": 0.8634, "override": True, "model": "all-MiniLM-L6-v2"},
        },
    ]

    for question in questions_to_process:
        # Handle both dict format and Question object format
        if isinstance(question, dict):
            q_id = question.get("id", "unknown")
            q_text = question.get("question", "")
            raw_answer = question.get("raw_answer", "")
        else:
            # Question object
            q_id = getattr(question, "id", "unknown")
            q_text = getattr(question, "question", "")
            raw_answer = getattr(question, "raw_answer", "")

        # Defaults for questions that match no scenario
        passed = True
        mock_ans = raw_answer
        embedding_data = None
        q_text_lower = q_text.lower()

        # First scenario with any keyword in the question text wins
        for data in mock_data:
            if any(kw in q_text_lower for kw in data["keywords"]):
                passed = data["passed"]
                mock_ans = data["answer"]
                embedding_data = data["embedding"]
                break

        # Generate a deterministic question_id if needed.
        # Built-in hash() is salted per process for str, so a content digest
        # is used to get the same ID on every run.
        if not q_id or q_id == "unknown":
            digest = hashlib.sha256(q_text.encode("utf-8")).hexdigest()
            q_id = f"question-{int(digest, 16) % 1000000:07d}"

        results.append(
            create_mock_verification_result(
                question_id=q_id, question_text=q_text, answer=mock_ans, passed=passed, embedding_check=embedding_data
            )
        )

    template_results = TemplateResults(results=results)

    return VerificationResultSet(
        results=results,
        template_results=template_results,
        rubric_results=None,
    )


# Patch add_question to store questions globally
_original_add_question = None


def mock_add_question(self, *args, **kwargs):
    """Add the question through the real method and remember it for the mock run.

    The real ``add_question`` is called first with the arguments unchanged.
    A dict holding the returned ID plus the keyword arguments is then appended
    to ``_benchmark_questions``. The first two positional arguments, when
    present, are recorded as ``question`` and ``raw_answer``.

    Returns:
        Whatever the real ``add_question`` returned (used here as the question ID).
    """
    qid = _original_add_question(self, *args, **kwargs)

    record = {"id": qid, **kwargs}
    # zip stops at the shorter sequence, so extra positional arguments are ignored
    for field, value in zip(("question", "raw_answer"), args):
        record[field] = value

    _benchmark_questions.append(record)
    return qid


# Patch the LLM provider entry points before Benchmark is imported below
def _mock_chat_model_factory(*args, **kwargs):
    """Return a fresh mock chat model, ignoring every constructor argument.

    Accepts positional as well as keyword arguments so that any call style
    used on the patched targets produces a mock instead of a TypeError.
    """
    return create_mock_chat_model()


_llm_patches = [
    patch(target, side_effect=_mock_chat_model_factory)
    for target in (
        "langchain_openai.ChatOpenAI",
        "langchain_anthropic.ChatAnthropic",
        "langchain_google_genai.ChatGoogleGenerativeAI",
        "karenina.infrastructure.llm.interface.init_chat_model_unified",
    )
]

# Patches are started manually (not via a context manager) so they stay active
# for the whole session; _cleanup() stops them at exit.
for p in _llm_patches:
    p.start()

# Patch Benchmark methods
from karenina.benchmark import Benchmark

# Keep the real methods so _cleanup() can restore them
_original_run_verification = Benchmark.run_verification
_original_add_question = Benchmark.add_question
Benchmark.run_verification = mock_run_verification
Benchmark.add_question = mock_add_question


def temp_path(filename: str) -> Path:
    """Return ``filename`` resolved inside the notebook's temporary directory."""
    return TEMP_DIR.joinpath(filename)


# Cleanup
import atexit
import shutil


def _cleanup():
    """Undo the notebook's monkey-patching and delete the temporary directory.

    Restores the two original ``Benchmark`` methods, stops every LLM patch,
    and removes ``TEMP_DIR``. Cleanup is best-effort: a patch that fails to
    stop is skipped, and the directory is removed even if an earlier step raised.
    """
    try:
        Benchmark.run_verification = _original_run_verification
        Benchmark.add_question = _original_add_question
        for p in _llm_patches:
            try:
                p.stop()
            except Exception:
                # Deliberately ignored so one failing patch cannot block the
                # rest. Unlike a bare ``except``, this lets
                # KeyboardInterrupt/SystemExit through.
                continue
    finally:
        shutil.rmtree(TEMP_DIR, ignore_errors=True)


# Undo the patches and remove the temp directory when the interpreter exits
atexit.register(_cleanup)

# Status banner shown when the setup cell is executed
for _status_line in (
    "✓ Mock setup complete",
    f"✓ Temp directory: {TEMP_DIR}",
    "✓ Karenina package loaded from: /Users/carli/Projects/karenina-monorepo/karenina/src",
    "✓ Mock verification results enabled - examples will show realistic output",
    "✓ Embedding check scenarios configured for demonstration",
):
    print(_status_line)

# Embedding Check (Semantic Fallback)

Embedding check provides a semantic fallback mechanism that can rescue verification failures when answers are semantically correct despite structural differences.

## What is Embedding Check?

**Embedding check** is an optional feature that uses sentence embeddings to detect semantically equivalent answers that fail strict template matching. When verification fails, this feature computes the semantic similarity between the expected answer and the model's response. If similarity exceeds a configurable threshold, an LLM validates semantic equivalence and can override the initial failure.

**Key benefits:**

- **Reduces false negatives**: Catches paraphrased but correct answers
- **Flexible evaluation**: Handles structural variations without changing templates
- **Semantic awareness**: Uses deep learning embeddings for meaning comparison
- **LLM validation**: Confirms equivalence with parsing model judgment
- **Zero overhead when disabled**: Nothing runs unless the feature is enabled, and even then only on failed verifications

## How Embedding Check Works

Embedding check activates **only when initial verification returns `False`**:

**1. Initial Verification Fails**

Template-based verification returns `False` due to structural mismatch.

**2. Compute Embedding Similarity**

Uses SentenceTransformer models to generate embeddings for both the expected answer and the model's response, then computes cosine similarity (0.0-1.0).

**3. Check Threshold**

If similarity score exceeds the configured threshold (default: 0.85), proceed to LLM validation.

**4. LLM Semantic Validation**

The parsing model evaluates whether the two answers are semantically equivalent, providing a yes/no judgment with reasoning.

**5. Override Result**

If the LLM confirms semantic equivalence, the verification result is overridden from `False` to `True`.

## Common Use Cases

### Use Case 1: Paraphrased Answers

**Scenario**: LLM provides the correct answer with different wording.

**Example**:

- Expected Answer: "BCL2"
- Model Response: "The BCL-2 protein"

**Result**:

- Initial verification: `False` (different structure)
- Embedding similarity: `0.91`
- Semantic check: `True` (same protein mentioned)
- Final result: `True` (overridden) ✓

### Use Case 2: Numerical Format Differences

**Scenario**: Same number in different representations.

**Example**:

- Expected Answer: "46"
- Model Response: "Forty-six chromosomes"

**Result**:

- Initial verification: `False` (string "46" ≠ "Forty-six chromosomes")
- Embedding similarity: `0.88`
- Semantic check: `True` (same numerical value)
- Final result: `True` (overridden) ✓

### Use Case 3: Structural Variations

**Scenario**: Correct information in different structure.

**Example**:

- Expected Answer: "4"
- Model Response: "Hemoglobin A consists of four protein subunits"

**Result**:

- Initial verification: `False` (template expects just number)
- Embedding similarity: `0.86`
- Semantic check: `True` (correct count mentioned)
- Final result: `True` (overridden) ✓

## Enabling Embedding Check

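Embedding check is **disabled by default**. Enable it using the environment variables below; the complete example later in this guide also sets the matching `VerificationConfig` fields.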

### Installation

Embedding check requires the `sentence-transformers` library. Install it with the optional dependency:

```bash
pip install karenina[embeddings]
```

### Configuration

In [None]:
# Embedding check is driven by three environment variables:
#   EMBEDDING_CHECK            - enable the feature
#   EMBEDDING_CHECK_MODEL      - embedding model (default: all-MiniLM-L6-v2)
#   EMBEDDING_CHECK_THRESHOLD  - similarity threshold 0.0-1.0 (default: 0.85)
embedding_env = {
    "EMBEDDING_CHECK": "true",
    "EMBEDDING_CHECK_MODEL": "all-MiniLM-L6-v2",
    "EMBEDDING_CHECK_THRESHOLD": "0.85",
}
os.environ.update(embedding_env)

print("Embedding check configuration:")
for name in embedding_env:
    print(f"  {name} = {os.getenv(name)}")

### Supported Embedding Models

Any SentenceTransformer model is supported. Popular choices:

| Model | Speed | Accuracy | Use Case |
|-------|-------|----------|----------|
| `all-MiniLM-L6-v2` (default) | Fast | Good | General purpose, balanced |
| `all-mpnet-base-v2` | Slower | Better | Higher accuracy needed |
| `multi-qa-MiniLM-L6-cos-v1` | Fast | Good | Question-answering tasks |
| `paraphrase-multilingual-MiniLM-L12-v2` | Medium | Good | Multilingual support |
| `all-distilroberta-v1` | Fast | Medium | Fast inference |

## Complete Example

Here's an end-to-end workflow using embedding check with a genomics benchmark:

In [None]:
from karenina import Benchmark
from karenina.schemas import ModelConfig, VerificationConfig

# 1. Enable embedding check (same values as the configuration cell above)
os.environ.update(
    {
        "EMBEDDING_CHECK": "true",
        "EMBEDDING_CHECK_MODEL": "all-MiniLM-L6-v2",
        "EMBEDDING_CHECK_THRESHOLD": "0.85",
    }
)

# 2. Create benchmark with genomics questions
benchmark = Benchmark.create(
    name="Genomics Knowledge Benchmark",
    description="Testing LLM knowledge of genomics and molecular biology",
    version="1.0.0",
)

# Each entry: (question text, expected answer, curator name)
genomics_questions = [
    ("What is the approved drug target of Venetoclax?", "BCL2", "Pharma Curator"),
    ("How many chromosomes are in a human somatic cell?", "46", "Genetics Curator"),
    ("How many protein subunits does hemoglobin A have?", "4", "Biochemistry Curator"),
]

for question_text, expected_answer, curator in genomics_questions:
    benchmark.add_question(question=question_text, raw_answer=expected_answer, author={"name": curator})

print(f"Created benchmark: {benchmark.name}")
# NOTE(review): this counts *finished* questions - confirm that questions added
# without a template are included, otherwise the number shown may be 0.
finished_questions = benchmark.get_finished_questions()
print(f"Added {len(finished_questions)} questions")

In [None]:
# 3. Generate templates
model_config = ModelConfig(
    id="gpt-4.1-mini", model_provider="openai", model_name="gpt-4.1-mini", temperature=0.0, interface="langchain"
)

print("Generating templates...")
benchmark.generate_all_templates(
    model=model_config.model_name,
    model_provider=model_config.model_provider,
    temperature=model_config.temperature,
    interface=model_config.interface,
)

print(f"Generated {len(benchmark.get_finished_questions())} templates")

In [None]:
# 4. Run verification with embedding check
print("Running verification...")

# Embedding check is switched on in the config as well as in the environment
embedding_settings = {
    "embedding_check_enabled": True,
    "embedding_check_model": "all-MiniLM-L6-v2",
    "embedding_check_threshold": 0.85,
}
config = VerificationConfig(
    answering_models=[model_config],
    parsing_models=[model_config],
    **embedding_settings,
)

results = benchmark.run_verification(config)

processed = len(results)
print(f"Verification complete! Processed {processed} questions")

In [None]:
# 5. Analyze embedding check results
print("\n=== Embedding Check Results ===")
override_count = 0

for result in results.results:
    # Only results with a template section that went through the embedding check
    if not (result.template and result.template.embedding_check_performed):
        continue

    meta = result.metadata
    tmpl = result.template
    print(f"\nQuestion: {meta.question_text[:60]}...")
    print(f"  Expected: {meta.raw_answer}")
    print(f"  Model Response: {tmpl.raw_llm_response[:60]}...")
    print(f"  Similarity Score: {tmpl.embedding_similarity_score:.4f}")
    print(f"  Model Used: {tmpl.embedding_model_used}")
    print(f"  Override Applied: {tmpl.embedding_override_applied}")

    if tmpl.embedding_override_applied:
        override_count += 1
        print("  ✓ Verification overridden: False → True")

print(f"\nTotal overrides: {override_count}")

## Understanding Results

### Result Metadata

When embedding check runs, results include additional metadata:

In [None]:
# Show the embedding check metadata of the first result that has any.
# Note: the fields live on result.template, not on the result itself.
first_checked = next(
    (r for r in results.results if r.template and r.template.embedding_check_performed),
    None,
)

if first_checked is not None:
    details = first_checked.template
    print("Embedding check metadata:")
    for field_name in (
        "embedding_check_performed",
        "embedding_similarity_score",
        "embedding_override_applied",
        "embedding_model_used",
    ):
        print(f"  {field_name}: {getattr(details, field_name)}")

### Filtering for Overrides

Find all cases where embedding check rescued a failed verification:

In [None]:
# Collect every result whose verdict was overridden by the embedding check
overridden = []
for candidate in results.results:
    if candidate.template and candidate.template.embedding_override_applied:
        overridden.append(candidate)

print(f"Found {len(overridden)} overridden verifications")

for result in overridden:
    score = result.template.embedding_similarity_score
    print(f"  {result.metadata.question_id}: similarity={score:.4f}")

### Computing Override Statistics

In [None]:
# Calculate embedding check statistics in a single pass over the results
total_questions = len(results)
embedding_checks_performed = 0
overrides_applied = 0

for r in results.results:
    if not r.template:
        continue  # no template section, nothing to count
    if r.template.embedding_check_performed:
        embedding_checks_performed += 1
    if r.template.embedding_override_applied:
        overrides_applied += 1

print(f"Total questions: {total_questions}")
print(f"Embedding checks performed: {embedding_checks_performed}")
print(f"Overrides applied: {overrides_applied}")
# The rate is relative to the checks that ran, so skip it when none did
if embedding_checks_performed > 0:
    print(f"Override rate: {overrides_applied / embedding_checks_performed * 100:.1f}%")

## Performance Considerations

### When Disabled

- **Zero overhead**: Feature not loaded or executed
- **No dependencies required**: sentence-transformers not needed

### When Enabled

Embedding check only runs on **failed verifications**, so the impact depends on your failure rate.

**Cost impact:**

Embedding check adds one additional LLM call (semantic validation) for each failed verification where similarity exceeds the threshold. This uses your configured parsing model.

## Tuning the Similarity Threshold

The similarity threshold (default: 0.85) controls when LLM validation is triggered.

### Threshold Guidelines

| Threshold | Behavior | Use Case |
|-----------|----------|----------|
| **0.80-0.85** | Moderate selectivity | General purpose, balanced |
| **0.85-0.90** (default) | Higher selectivity | Reduce false overrides |
| **0.90-0.95** | Very selective | Only very similar answers |
| **0.75-0.80** | Lower selectivity | Catch more paraphrases |

### Finding the Right Threshold

**Start with default (0.85):**

```python
os.environ["EMBEDDING_CHECK_THRESHOLD"] = "0.85"
```

**Too many false overrides?** → Increase threshold:

```python
os.environ["EMBEDDING_CHECK_THRESHOLD"] = "0.90"
```

**Missing valid paraphrases?** → Decrease threshold:

```python
os.environ["EMBEDDING_CHECK_THRESHOLD"] = "0.80"
```

### Threshold Experimentation

Test different thresholds on a sample. Note: Computing real similarity scores requires `sentence-transformers` to be installed. In this demo, the scores are simulated so the cell runs without it:

In [None]:
# Example: Testing different thresholds
# Real scores come from compute_embedding_similarity, which needs
# sentence-transformers (pip install karenina[embeddings]).
# This cell uses fixed, simulated scores instead so it runs without that dependency.


# Test cases (expected answer, model response)
test_cases = [
    ("BCL2", "The BCL-2 protein"),
    ("46", "Forty-six chromosomes"),
    ("4", "Four protein subunits"),
    ("hemoglobin", "haemoglobin"),  # Spelling variant
]

# Try different thresholds
thresholds = [0.75, 0.80, 0.85, 0.90, 0.95]

print("Testing similarity scores across different answer pairs:\n")

# Simulated similarity scores, one per test case and in the same order
mock_similarities = [0.91, 0.88, 0.86, 0.84]

for (expected, response), similarity in zip(test_cases, mock_similarities):
    print(f"Expected: '{expected}'")
    print(f"Response: '{response}'")
    print(f"Similarity: {similarity:.4f}")

    for threshold in thresholds:
        # A score equal to the threshold counts as reaching it
        if similarity >= threshold:
            would_trigger = "✓"
        else:
            would_trigger = "✗"
        print(f"  Threshold {threshold}: {would_trigger}")
    print()

## When to Use Embedding Check

### ✅ Use Embedding Check When:

- **Paraphrased answers are common**: Models often rephrase correct answers
- **Format flexibility needed**: Accept "46" and "forty-six" as equivalent
- **Reducing false negatives**: Minimize cases where correct answers are marked wrong
- **Testing creative models**: Models that elaborate or rephrase more frequently
- **Multi-language evaluation**: Detecting equivalent meanings across languages

### ❌ Don't Use Embedding Check When:

- **Strict format required**: Exact format is critical (e.g., gene symbols, IDs)
- **High precision needed**: False positives are more costly than false negatives
- **Templates handle variations**: Templates already account for expected variations
- **Performance is critical**: Cannot afford extra 500-2000ms per failed verification
- **Deterministic evaluation required**: Need reproducible results without LLM judgment

## Best Practices

### 1. Enable Selectively

Don't enable embedding check for all benchmarks. Use it when you know paraphrasing is common:

In [None]:
# Good: Enable for natural language questions
# (set the variable before working with a benchmark whose answers are free text)
os.environ["EMBEDDING_CHECK"] = "true"
# benchmark_nl = Benchmark.load("natural_language_qa.jsonld")

# Good: Disable for strict format questions
# NOTE: this assignment overwrites the one above, so EMBEDDING_CHECK is left
# set to "false" once this cell has run.
os.environ["EMBEDDING_CHECK"] = "false"
# benchmark_ids = Benchmark.load("gene_id_extraction.jsonld")

print("Tip: Enable embedding check selectively based on question type")

### 2. Monitor Override Rates

Track how often embedding check overrides results:

In [None]:
# Calculate override rate from results
total_results = len(results)
overridden_count = sum(1 for r in results.results if r.template and r.template.embedding_override_applied)
# An empty result set has no overrides; report 0.0 instead of dividing by zero
override_rate = overridden_count / total_results if total_results else 0.0

print(f"Override rate: {override_rate * 100:.1f}%")

if override_rate > 0.20:  # More than 20% overrides
    print("Warning: High override rate. Consider:")
    print("  - Reviewing template definitions")
    print("  - Adjusting similarity threshold")
    print("  - Examining overridden cases manually")

### 3. Review Overridden Cases

Manually inspect overridden verifications to ensure quality:

In [None]:
# Print every overridden case for manual inspection
print("Reviewing overridden cases:\n")
for result in results.results:
    tmpl = result.template
    if not (tmpl and tmpl.embedding_override_applied):
        continue

    print(f"Question: {result.metadata.question_text}")
    print(f"Expected: {result.metadata.raw_answer}")
    print(f"Got: {tmpl.raw_llm_response}")
    print(f"Similarity: {tmpl.embedding_similarity_score:.4f}")
    print()

    # In manual review, you would validate:
    # is_correct = input("Is this override correct? (y/n): ")
    # if is_correct.lower() != 'y':
    #     print("⚠ False override detected - consider higher threshold")

print("Manual review complete")

### 4. Choose the Right Model

**For most use cases:** Use default `all-MiniLM-L6-v2` (fast, good accuracy)

**For higher accuracy:** Use `all-mpnet-base-v2` (slower, better)

**For question-answering:** Use `multi-qa-MiniLM-L6-cos-v1` (optimized for Q&A)

In [None]:
# High-accuracy configuration example
print("High-accuracy configuration:")

# A more accurate (slower) model, paired with a higher threshold
high_accuracy_env = {
    "EMBEDDING_CHECK_MODEL": "all-mpnet-base-v2",
    "EMBEDDING_CHECK_THRESHOLD": "0.90",
}

for name, value in high_accuracy_env.items():
    os.environ[name] = value
    print(f"  {name} = {os.getenv(name)}")

### 5. Combine with Deep-Judgment

Embedding check works well with deep-judgment parsing for maximum transparency:

In [None]:
# Turn embedding check on in the environment
os.environ["EMBEDDING_CHECK"] = "true"

# Example configuration with both features
from karenina.schemas import ModelConfig, VerificationConfig

model_config = ModelConfig(
    id="gpt-4.1-mini",
    model_provider="openai",
    model_name="gpt-4.1-mini",
    temperature=0.0,
    interface="langchain",
)

# Deep-judgment settings, kept together so they are easy to spot
deep_judgment_settings = {
    "deep_judgment_enabled": True,
    "deep_judgment_max_excerpts_per_attribute": 3,
}

config_example = VerificationConfig(
    answering_models=[model_config],
    parsing_models=[model_config],
    embedding_check_enabled=True,  # Enable embedding check
    **deep_judgment_settings,  # Also enable deep-judgment
)

print(
    "Configuration example with both embedding check and deep-judgment enabled\n"
    "This provides maximum transparency for verification results"
)

## Integration with Other Features

### Embedding Check + Deep-Judgment

Use embedding check to catch paraphrases, deep-judgment for transparency:

In [None]:
# Example: embedding check and deep-judgment in one configuration
feature_flags = {
    "embedding_check_enabled": True,
    "deep_judgment_enabled": True,
    "deep_judgment_max_excerpts_per_attribute": 3,
}
config_both = VerificationConfig(
    answering_models=[model_config],
    parsing_models=[model_config],
    **feature_flags,
)

summary_lines = [
    "Both features enabled:",
    "  - Embedding check: Semantic fallback for paraphrased answers",
    "  - Deep-judgment: Multi-stage parsing with evidence extraction",
]
print("\n".join(summary_lines))

### Embedding Check + Abstention Detection

Both features can run together. Abstention detection identifies refusals; embedding check handles paraphrases:

In [None]:
# Example: embedding check and abstention detection in one configuration
feature_flags = {
    "embedding_check_enabled": True,
    "abstention_check_enabled": True,
}
config_abstention = VerificationConfig(
    answering_models=[model_config],
    parsing_models=[model_config],
    **feature_flags,
)

summary_lines = [
    "Both features enabled:",
    "  - Abstention detection: Identifies model refusals",
    "  - Embedding check: Handles paraphrased correct answers",
]
print("\n".join(summary_lines))

## Troubleshooting

### Issue 1: Embedding Check Not Running

**Symptom**: `embedding_check_performed` is always `False`

**Solutions**:

1. Verify environment variable is set: `os.getenv("EMBEDDING_CHECK")`
2. Check that initial verification is failing (embedding check only runs on failures)
3. Ensure sentence-transformers is installed: `pip install karenina[embeddings]`

### Issue 2: No Overrides Applied

**Symptom**: Embedding checks run but never override results

**Solutions**:

1. Lower the similarity threshold: `os.environ["EMBEDDING_CHECK_THRESHOLD"] = "0.80"`
2. Review similarity scores to see if they're below threshold
3. Try a more accurate embedding model: `all-mpnet-base-v2`

### Issue 3: Too Many Overrides

**Symptom**: High override rate (>20%) suggesting false positives

**Solutions**:

1. Raise the similarity threshold: `os.environ["EMBEDDING_CHECK_THRESHOLD"] = "0.90"`
2. Review templates to ensure they're capturing expected variations
3. Manually inspect overridden cases to identify patterns

### Issue 4: Slow Performance

**Symptom**: Verification takes too long with embedding check enabled

**Solutions**:

1. Use faster embedding model: `all-MiniLM-L6-v2` or `all-distilroberta-v1`
2. Increase threshold to reduce LLM validation calls
3. Improve templates to reduce initial verification failures
4. Disable embedding check for benchmarks where it's not needed

## Next Steps

Once you have embedding check configured, explore these related guides:

- **[Deep-Judgment Parsing](deep-judgment.md)** - Multi-stage parsing with evidence extraction
- **[Abstention Detection](abstention-detection.md)** - Identify model refusals
- **[Verification](../using-karenina/verification.md)** - Complete verification workflow
- **[Saving and Loading](../using-karenina/saving-loading.md)** - Persist benchmarks

## Related Documentation

- **[Verification](../using-karenina/verification.md)** - Core verification workflow
- **[Templates](../using-karenina/templates.md)** - Answer template creation
- **[Deep-Judgment](deep-judgment.md)** - Multi-stage parsing
- **[Abstention Detection](abstention-detection.md)** - Refusal detection