In [None]:
# Mock Setup - Hidden in rendered documentation
# This cell is tagged with "hide-cell" in notebook metadata

import hashlib
import json
import sys
import tempfile
from datetime import datetime
from pathlib import Path
from unittest.mock import MagicMock, patch

# Add karenina to path.
# NOTE(review): this is a hard-coded absolute path from one developer's
# machine. Elsewhere the directory presumably does not exist, in which case
# the karenina imports below would resolve against whatever copy is installed
# in the environment — confirm that this is the intended fallback.
sys.path.insert(0, "/Users/carli/Projects/karenina-monorepo/karenina/src")

# Temporary directory for file operations. It is created eagerly when this
# cell runs and is removed again by _cleanup(), which is registered with
# atexit further down in this cell.
TEMP_DIR = Path(tempfile.mkdtemp(prefix="karenina_docs_"))

# Import karenina modules after path is set
from karenina.schemas.workflow.verification.result import VerificationResult
from karenina.schemas.workflow.verification.result_components import (
    VerificationResultDeepJudgment,
    VerificationResultMetadata,
    VerificationResultTemplate,
)


# Mock LLM response generator
class MockLLMResponse:
    """Minimal stand-in for a LangChain message.

    Exposes the two attributes the notebook relies on: ``content`` (the
    response text) and ``response_metadata`` (a fixed token-usage payload).
    ``str()`` of an instance yields the content.
    """

    def __init__(self, content: str = "Mock response"):
        self.content = content
        # Constant usage numbers keep any token accounting deterministic.
        token_usage = {"total_tokens": 50}
        self.response_metadata = {"token_usage": token_usage}

    def __str__(self):
        return self.content


class MockStructuredOutput:
    """Mock structured-output object with Venetoclax defaults.

    ``drug_target`` and ``mechanism`` are always present (taken from the
    keyword arguments when supplied, otherwise defaulted). Every other keyword
    argument becomes an attribute unless an attribute of that name already
    exists on the instance or class (so ``dict`` / ``model_dump`` are never
    overwritten).
    """

    def __init__(self, **kwargs):
        self.drug_target = kwargs.get("drug_target", "BCL-2")
        self.mechanism = kwargs.get("mechanism", "Inhibits BCL-2 protein")
        for name, value in kwargs.items():
            if hasattr(self, name):
                # Already set above, or would shadow a method: leave it alone.
                continue
            setattr(self, name, value)

    def dict(self):
        """Return the public attributes (names not starting with ``_``) as a dict."""
        public = {}
        for name, value in vars(self).items():
            if name.startswith("_"):
                continue
            public[name] = value
        return public

    def model_dump(self):
        """Alias of :meth:`dict`."""
        return self.dict()


def create_mock_chat_model():
    """Create a mock chat model that returns predictable responses.

    The returned ``MagicMock`` is configured as follows:

    * ``invoke(...)`` returns a ``MockLLMResponse`` with content ``"BCL-2"``.
    * ``ainvoke(...)`` is an ``AsyncMock``, so ``await model.ainvoke(...)``
      yields the same kind of response.
    * ``with_structured_output(...)`` returns a second mock whose
      ``invoke`` / ``ainvoke`` produce a default ``MockStructuredOutput``.
    * ``bind_tools(...)`` returns the model itself.

    Returns:
        The configured ``MagicMock``.
    """
    # Local import: the cell-level import only brings in MagicMock and patch.
    from unittest.mock import AsyncMock

    mock = MagicMock()
    mock.invoke.return_value = MockLLMResponse("BCL-2")
    # A plain MagicMock attribute would hand back the response object itself,
    # and awaiting a non-awaitable raises TypeError. AsyncMock returns a
    # coroutine that resolves to the configured value.
    mock.ainvoke = AsyncMock(return_value=MockLLMResponse("BCL-2"))

    structured_mock = MagicMock()
    structured_mock.invoke.return_value = MockStructuredOutput()
    structured_mock.ainvoke = AsyncMock(return_value=MockStructuredOutput())

    mock.with_structured_output.return_value = structured_mock
    mock.bind_tools.return_value = mock
    return mock


def compute_result_id(question_id: str, answering_model: str, parsing_model: str, timestamp: str) -> str:
    """Return a deterministic 16-character hex ID for a verification run.

    The four arguments, together with an empty ``answering_mcp_servers`` list
    and a ``None`` ``replicate``, are serialised as key-sorted ASCII JSON. The
    ID is the first 16 hex digits of that string's SHA-256 digest.
    """
    identity = {
        "question_id": question_id,
        "answering_model": answering_model,
        "parsing_model": parsing_model,
        "timestamp": timestamp,
        "answering_mcp_servers": [],
        "replicate": None,
    }
    # sort_keys makes the serialisation independent of the literal's key order.
    serialised = json.dumps(identity, sort_keys=True, ensure_ascii=True)
    digest = hashlib.sha256(serialised.encode("utf-8")).hexdigest()
    return digest[:16]


def create_mock_deep_judgment_result():
    """Build a canned ``VerificationResultDeepJudgment`` for the Venetoclax example.

    The result reports all three stages as completed in three model calls,
    with one high-confidence excerpt and one reasoning string for each of the
    ``drug_target`` and ``mechanism`` attributes. It has no retries, no
    attributes lacking excerpts, and search disabled.
    """
    excerpts_by_attribute = {
        "drug_target": [
            {"text": "targets the BCL-2 protein", "confidence": "high", "similarity_score": 0.95},
        ],
        "mechanism": [
            {"text": "inhibits BCL-2, which promotes apoptosis", "confidence": "high", "similarity_score": 0.92},
        ],
    }
    reasoning_by_attribute = {
        "drug_target": "The excerpt 'targets the BCL-2 protein' explicitly states BCL-2 as the target protein. This directly answers the question.",
        "mechanism": "The response explains that venetoclax inhibits BCL-2, which promotes apoptosis in cancer cells. This describes the mechanism of action.",
    }
    stages_done = ["excerpts", "reasoning", "parameters"]

    return VerificationResultDeepJudgment(
        deep_judgment_enabled=True,
        deep_judgment_performed=True,
        extracted_excerpts=excerpts_by_attribute,
        attribute_reasoning=reasoning_by_attribute,
        deep_judgment_stages_completed=stages_done,
        deep_judgment_model_calls=3,
        deep_judgment_excerpt_retry_count=0,
        attributes_without_excerpts=[],
        deep_judgment_search_enabled=False,
        hallucination_risk_assessment=None,
    )


def create_mock_verification_result(question_id: str, question_text: str, deep_judgment: bool = True):
    """Assemble a ``VerificationResult`` filled with canned Venetoclax data.

    Args:
        question_id: Stored in the metadata; also seeds the MD5-derived
            ``template_id`` and the ``result_id``.
        question_text: Stored verbatim in the metadata.
        deep_judgment: When true, attach the mock deep-judgment section;
            otherwise that section is ``None``.

    Returns:
        A ``VerificationResult`` with metadata and template sections, no
        rubric, and an optional deep-judgment section.
    """
    created_at = datetime.now().isoformat()
    model_name = "gpt-4.1-mini"

    # Template section: a passing verification with fixed token usage.
    template_section = VerificationResultTemplate(
        raw_llm_response="Venetoclax targets the BCL-2 protein, which is an anti-apoptotic protein. By inhibiting BCL-2, venetoclax promotes apoptosis in cancer cells.",
        parsed_llm_response={"drug_target": "BCL-2", "mechanism": "Inhibits BCL-2"},
        parsed_gt_response={"drug_target": "BCL-2", "mechanism": "Inhibits BCL-2"},
        verify_result=True,
        template_verification_performed=True,
        usage_metadata={
            "answer_generation": {"total_tokens": 50},
            "parsing": {"total_tokens": 30},
            "total": {"total_tokens": 80},
        },
        abstention_check_performed=True,
        abstention_detected=False,
    )

    # Metadata section: the same model name is used for answering and parsing.
    metadata_section = VerificationResultMetadata(
        question_id=question_id,
        template_id=hashlib.md5(str(question_id).encode()).hexdigest()[:32],
        completed_without_errors=True,
        question_text=question_text,
        raw_answer="BCL-2",
        answering_model=model_name,
        parsing_model=model_name,
        execution_time=2.5,
        timestamp=created_at,
        result_id=compute_result_id(question_id, model_name, model_name, created_at),
    )

    deep_judgment_section = None
    if deep_judgment:
        deep_judgment_section = create_mock_deep_judgment_result()

    return VerificationResult(
        metadata=metadata_section,
        template=template_section,
        rubric=None,
        deep_judgment=deep_judgment_section,
    )


# Holder for the real Benchmark.run_verification. It stays None until the
# patching code later in this cell stores the original method here;
# mock_run_verification() and _cleanup() both read it.
_original_run_verification = None


def mock_run_verification(self, config):
    """Replacement for ``Benchmark.run_verification`` that fabricates results.

    One mock result is produced per finished question. If the benchmark has no
    finished questions, the call is delegated to the saved original method
    when there is one; otherwise an empty result set is returned.

    Args:
        self: The benchmark instance the method is bound to.
        config: Verification config; only ``deep_judgment_enabled`` is read,
            and it is treated as ``False`` when the attribute is absent.

    Returns:
        A ``VerificationResultSet`` wrapping the generated results.
    """
    from karenina.schemas.workflow.template_results import TemplateResults
    from karenina.schemas.workflow.verification_result_set import VerificationResultSet

    finished_questions = self.get_finished_questions(ids_only=False)

    if len(finished_questions) == 0:
        # Nothing to fabricate: defer to the real implementation if it was saved.
        if _original_run_verification:
            return _original_run_verification(self, config)
        return VerificationResultSet(results=[], template_results=TemplateResults(results=[]))

    use_deep_judgment = getattr(config, "deep_judgment_enabled", False)

    mock_results = [
        create_mock_verification_result(
            question_id=entry["id"],
            question_text=entry["question"],
            deep_judgment=use_deep_judgment,
        )
        for entry in finished_questions
    ]

    wrapped_templates = TemplateResults(results=mock_results)
    return VerificationResultSet(
        results=mock_results,
        template_results=wrapped_templates,
        rubric_results=None,
    )


# Patch the LLM provider entry points so that no cell can reach a real model.
# NOTE(review): karenina schema modules are already imported earlier in this
# cell. Any module that bound one of these names via `from ... import` before
# this point would presumably keep the real object — confirm the patch targets
# are the names actually looked up at call time.
def _mock_chat_model_factory(*args, **kwargs):
    """Ignore the caller's arguments and return a fresh mock chat model.

    Accepts positional as well as keyword arguments so that a patched
    constructor or factory called positionally does not raise TypeError.
    """
    return create_mock_chat_model()


_llm_patches = [
    patch("langchain_openai.ChatOpenAI", side_effect=_mock_chat_model_factory),
    patch("langchain_anthropic.ChatAnthropic", side_effect=_mock_chat_model_factory),
    patch("langchain_google_genai.ChatGoogleGenerativeAI", side_effect=_mock_chat_model_factory),
    patch(
        "karenina.infrastructure.llm.interface.init_chat_model_unified",
        side_effect=_mock_chat_model_factory,
    ),
]

for p in _llm_patches:
    p.start()

# Patch Benchmark.run_verification, keeping the real method so that the mock
# can fall back to it and _cleanup() can restore it.
from karenina.benchmark import Benchmark

_original_run_verification = Benchmark.run_verification
Benchmark.run_verification = mock_run_verification


def temp_path(filename: str) -> Path:
    """Return ``filename`` joined onto the notebook's temporary directory."""
    return TEMP_DIR.joinpath(filename)


# Cleanup
import atexit
import shutil


def _cleanup():
    """Undo the notebook's monkey-patching and delete its temp directory.

    Restores ``Benchmark.run_verification``, stops every LLM patcher, and
    removes ``TEMP_DIR``. Intended to run at interpreter exit, so failures
    while stopping a patcher are ignored rather than raised.
    """
    Benchmark.run_verification = _original_run_verification
    for p in _llm_patches:
        try:
            p.stop()
        except Exception:
            # Best-effort teardown: one patcher failing to stop must not
            # prevent the rest of the cleanup. Unlike a bare `except:`, this
            # still lets KeyboardInterrupt and SystemExit propagate.
            pass
    shutil.rmtree(TEMP_DIR, ignore_errors=True)


# Make sure patches are reverted and the temp directory is removed on exit.
atexit.register(_cleanup)

# Status banner, emitted as one newline-separated print call.
print(
    "✓ Mock setup complete",
    f"✓ Temp directory: {TEMP_DIR}",
    "✓ Karenina package loaded from: /Users/carli/Projects/karenina-monorepo/karenina/src",
    "✓ Mock deep-judgment verification results enabled",
    sep="\n",
)

# Deep-Judgment Parsing

Deep-judgment is an advanced parsing mode that provides enhanced transparency and accountability by extracting verbatim evidence from LLM responses before drawing conclusions. This guide explains what it is, when to use it, and how to configure it.

## What is Deep-Judgment?

**Deep-judgment parsing** is a multi-stage evaluation process that goes beyond standard template-based verification. Instead of directly extracting structured data from LLM responses, deep-judgment performs a three-stage analysis:

1. **Excerpt Extraction**: Identifies verbatim quotes that support each answer attribute
2. **Reasoning Generation**: Explains how the excerpts map to attribute values
3. **Parameter Extraction**: Extracts final structured values with full context

This approach creates an **audit trail** showing exactly what evidence the LLM provided and how it was interpreted.

### Standard Parsing vs Deep-Judgment

**Standard Parsing** (default):
```
LLM Response → Parse Attributes → Verify Correctness
```

**Deep-Judgment Parsing**:
```
LLM Response → Extract Excerpts → Generate Reasoning → Parse Attributes → Verify Correctness
```

## Why Use Deep-Judgment?

### 1. Transparency

Every extracted attribute is backed by explicit evidence from the LLM response. You can see exactly which parts of the answer support each claim.

**Example**:
```
Question: "What is the approved drug target of Venetoclax?"

LLM Response: "Venetoclax targets the BCL-2 protein, which is an anti-apoptotic
protein. By inhibiting BCL-2, venetoclax promotes apoptosis in cancer cells."

Standard Parsing:
  drug_target: "BCL-2" ✓

Deep-Judgment Parsing:
  Excerpt: "targets the BCL-2 protein"
  Reasoning: "The response explicitly states BCL-2 as the target protein"
  drug_target: "BCL-2" ✓
```

## How Deep-Judgment Works

Deep-judgment uses a **three-stage autoregressive process** where each stage builds on the previous one:

### Stage 1: Excerpt Extraction

The parsing model identifies **verbatim quotes** from the LLM response that support each template attribute.

**For each attribute**:

- Extract 0-3 excerpts (configurable)
- Assign confidence level: low/medium/high
- Validate excerpts actually exist in the response (fuzzy matching)
- If no excerpts are found, request an explanation from the LLM

**Example**:
```python
# Question: "What is the approved drug target of Venetoclax?"
# Response: "Venetoclax targets BCL-2, a key anti-apoptotic protein"

excerpts = {
    "drug_target": [
        {
            "text": "targets BCL-2",
            "confidence": "high",
            "similarity_score": 0.95
        }
    ]
}
```

In [None]:
# Example: Stage 1 output maps each attribute name to a list of excerpt records
excerpts = {
    "drug_target": [
        {
            "text": "targets BCL-2",
            "confidence": "high",
            "similarity_score": 0.95,
        },
    ],
}

print("Extracted Excerpts:")
for attr, exc_list in excerpts.items():
    print(f"\nAttribute: {attr}")
    for exc in exc_list:
        # One call per excerpt; sep="\n" puts each field on its own line.
        print(
            f"  Text: {exc['text']}",
            f"  Confidence: {exc['confidence']}",
            f"  Similarity: {exc['similarity_score']}",
            sep="\n",
        )

### Stage 2: Reasoning Generation

The parsing model explains how the excerpts from Stage 1 inform each attribute value.

**Example**:
```python
reasoning = {
    "drug_target": (
        "The excerpt 'targets BCL-2' explicitly states BCL-2 "
        "as the protein target. This directly answers the question."
    )
}
```

In [None]:
# Example: Stage 2 output maps each attribute name to a free-text justification
reasoning = {
    "drug_target": (
        "The excerpt 'targets BCL-2' explicitly states BCL-2 "
        "as the protein target."
    ),
    "mechanism": (
        "The response explains that venetoclax inhibits BCL-2, "
        "which promotes apoptosis in cancer cells."
    ),
}

print("Attribute Reasoning:")
for attr, reason in reasoning.items():
    # print's default single-space separator supplies the gap after the colon.
    print(f"\n{attr}:", reason)

### Stage 3: Parameter Extraction

Using the reasoning context from Stage 2, the parsing model extracts structured attribute values using standard template parsing.

**Example**:
```python
parsed_answer = {
    "drug_target": "BCL-2"
}
```

### Validation and Auto-Fail

If any attribute has **missing excerpts** (no verbatim evidence found), verification **automatically fails** even if the final parsed answer seems correct. This ensures all claims are backed by explicit evidence.

**Exception**: If abstention is detected (LLM refused to answer), auto-fail is skipped since abstention takes priority.

## Enabling Deep-Judgment

Deep-judgment is disabled by default. Enable it in your verification configuration:

In [None]:
from karenina import Benchmark
from karenina.schemas import ModelConfig, VerificationConfig

# One model definition, reused below for both answering and parsing
model_config = ModelConfig(
    id="gpt-4.1-mini",
    model_provider="openai",
    model_name="gpt-4.1-mini",
    temperature=0.0,
    interface="langchain",
)

# Deep-judgment is opt-in: switch it on with a single flag
config = VerificationConfig(
    answering_models=[model_config],
    parsing_models=[model_config],
    deep_judgment_enabled=True,
)

print(f"Deep-judgment enabled: {config.deep_judgment_enabled}")

### Advanced Configuration

You can tune deep-judgment behavior with additional parameters:

In [None]:
# Same models as before, with each deep-judgment tuning knob spelled out
config = VerificationConfig(
    answering_models=[model_config],
    parsing_models=[model_config],
    deep_judgment_enabled=True,  # master switch
    deep_judgment_max_excerpts_per_attribute=3,  # cap on excerpts per attribute (1-5)
    deep_judgment_fuzzy_match_threshold=0.80,  # excerpt similarity cut-off (0.0-1.0)
    deep_judgment_excerpt_retry_attempts=2,  # excerpt retries (0-5)
)

# Echo the effective settings, one per line
print(
    "Deep-Judgment Configuration:",
    f"  Enabled: {config.deep_judgment_enabled}",
    f"  Max excerpts per attribute: {config.deep_judgment_max_excerpts_per_attribute}",
    f"  Fuzzy match threshold: {config.deep_judgment_fuzzy_match_threshold}",
    f"  Retry attempts: {config.deep_judgment_excerpt_retry_attempts}",
    sep="\n",
)

**Configuration Parameters**:

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `deep_judgment_enabled` | `bool` | `False` | Enable/disable deep-judgment parsing |
| `deep_judgment_max_excerpts_per_attribute` | `int` | `3` | Maximum excerpts to extract per attribute (1-5) |
| `deep_judgment_fuzzy_match_threshold` | `float` | `0.80` | Similarity threshold for excerpt validation (0.0-1.0). Higher = stricter. |
| `deep_judgment_excerpt_retry_attempts` | `int` | `2` | Retry attempts when excerpt validation fails (0-5) |

## Search-Enhanced Deep-Judgment

**Search-enhanced deep-judgment** extends the standard three-stage process with an additional validation layer that checks extracted excerpts against external evidence sources. This helps detect potential hallucinations by verifying that the information in excerpts can be corroborated by external search results.

In [None]:
# Search validation is layered on top of deep-judgment, so both flags are set
config = VerificationConfig(
    answering_models=[model_config],
    parsing_models=[model_config],
    deep_judgment_enabled=True,  # required base feature
    deep_judgment_search_enabled=True,  # add the external-evidence check
    deep_judgment_search_tool="tavily",  # Tavily search (default)
)

# Echo the effective settings, one per line
print(
    "Search-Enhanced Deep-Judgment:",
    f"  Deep-judgment enabled: {config.deep_judgment_enabled}",
    f"  Search enabled: {config.deep_judgment_search_enabled}",
    f"  Search tool: {config.deep_judgment_search_tool}",
    sep="\n",
)

## Understanding Results

When deep-judgment is enabled, verification results include additional metadata about the extraction process.

In [None]:
# Create a mock result to show the structure
mock_result = create_mock_verification_result(
    question_id="q1", question_text="What is the approved drug target of Venetoclax?", deep_judgment=True
)

# Access deep-judgment results
# NOTE(review): the mock result is assembled from nested sections (metadata,
# template, deep_judgment), yet the fields below are read directly off the
# result object. Presumably VerificationResult exposes flat accessors for
# them — confirm against the schema before relying on this cell.
if mock_result.deep_judgment_performed:
    print("Deep-Judgment Results:")
    print("\nMetadata:")
    print(f"  Question: {mock_result.question_text}")
    print(f"  Stages Completed: {mock_result.deep_judgment_stages_completed}")
    print(f"  Model Calls: {mock_result.deep_judgment_model_calls}")
    print(f"  Excerpt Retries: {mock_result.deep_judgment_excerpt_retry_count}")

In [None]:
# Walk the per-attribute excerpt lists and print each entry
if mock_result.extracted_excerpts:
    print("\nExtracted Excerpts:")
    for attr, excerpts in mock_result.extracted_excerpts.items():
        print(f"\n  Attribute: {attr}")
        for exc in excerpts:
            if not exc.get("explanation"):
                # Normal case: an excerpt was found in the response
                print(f'    Text: "{exc["text"]}"')
                print(f"    Confidence: {exc['confidence']}")
                print(f"    Similarity: {exc['similarity_score']:.2f}")
                continue
            # Entry carries an explanation: report the excerpt as missing
            print(f"    [Missing] {exc['explanation']}")

In [None]:
# Print the reasoning string recorded for every attribute
if mock_result.attribute_reasoning:
    print("\nReasoning:")
    for attr, reasoning in mock_result.attribute_reasoning.items():
        # print's default single-space separator supplies the gap after the colon
        print(f"  {attr}:", reasoning)

In [None]:
# Report whether any attribute ended up without supporting excerpts
if not mock_result.attributes_without_excerpts:
    print("\n✓ All attributes have supporting excerpts")
else:
    print(f"\n⚠️  AUTO-FAIL: Missing excerpts for {', '.join(mock_result.attributes_without_excerpts)}")

### Excerpt Structure

Each excerpt includes:

```python
{
    "text": str,              # Verbatim quote from response (empty if missing)
    "confidence": str,        # "low" | "medium" | "high" | "none"
    "similarity_score": float, # 0.0-1.0 (fuzzy match validation score)
    "explanation": str        # Optional: why excerpt couldn't be found
}
```

In [None]:
# Example: one attribute backed by two excerpts of differing confidence
example_with_excerpts = {
    "drug_target": [
        {
            "text": "targets BCL-2 protein",
            "confidence": "high",
            "similarity_score": 0.95,
        },
        {
            "text": "inhibits BCL-2",
            "confidence": "medium",
            "similarity_score": 0.87,
        },
    ]
}

print("Example with excerpts:")
for attr, excerpts in example_with_excerpts.items():
    print(f"\n{attr}:")
    for exc in excerpts:
        # Fill the line template straight from the excerpt record's keys.
        print('  - "{text}" (confidence: {confidence}, similarity: {similarity_score})'.format(**exc))

## Use Cases

### When to Use Deep-Judgment

✅ **High-stakes evaluation** where evidence transparency is critical:

- Medical diagnosis benchmarks
- Legal document analysis
- Scientific fact-checking
- Regulatory compliance

✅ **Debugging parsing failures**:

- Understanding why verification fails
- Identifying gaps in LLM responses
- Refining question or template design

✅ **Quality assurance**:

- Ensuring responses contain sufficient evidence
- Validating that answers aren't just plausible-sounding
- Auditing LLM reasoning processes

### When NOT to Use Deep-Judgment

❌ **High-volume verification** where speed is critical:

- Deep-judgment is 3-5x slower than standard parsing
- Uses 3-5 LLM calls per question vs. 1 call for standard

❌ **Low-stakes evaluation** where audit trails aren't needed:

- Quick prototyping
- Informal testing
- Cost-sensitive applications

## Performance Considerations

### Execution Time

Deep-judgment significantly increases verification time:

- **Standard parsing**: 1 LLM call per question (~500-2000ms)
- **Deep-judgment parsing**: 3-5 LLM calls per question (~1500-10000ms)
  - Stage 1 (excerpts): 1 call + retries
  - Stage 2 (reasoning): 1 call
  - Stage 3 (parameters): 1 call

**Impact**: 3-5x slower than standard verification

## Configuration Tips

### Fuzzy Match Threshold

Controls how strictly excerpts must match the original response:

```python
# Lenient matching (accepts paraphrases)
deep_judgment_fuzzy_match_threshold=0.70

# Default matching (balanced)
deep_judgment_fuzzy_match_threshold=0.80

# Strict matching (only very close matches)
deep_judgment_fuzzy_match_threshold=0.90
```

**Trade-offs**:

- **Lower threshold (0.60-0.75)**: More lenient, may accept paraphrased excerpts
- **Higher threshold (0.85-0.95)**: Stricter, only accepts near-exact matches

## Best Practices

### 1. Start with Standard Parsing

Begin with standard parsing for your entire benchmark. Only enable deep-judgment when you need to:

- Debug specific parsing failures
- Audit high-stakes results
- Understand model behavior

### 2. Use Clear, Evidence-Based Templates

Design templates that expect explicit evidence in responses.

## Related Features

Deep-judgment works alongside other advanced features:

- **Abstention Detection**: Detects when LLMs refuse to answer. Takes priority over deep-judgment auto-fail.
- **Rubrics**: Assess qualitative aspects. Use together with deep-judgment for comprehensive evaluation.
- **Verification**: Core verification system. Deep-judgment enhances standard verification with evidence extraction.