In [1]:
# Mock Setup - Hidden in rendered documentation
# This cell is tagged with "hide-cell" in notebook metadata

import sys
import tempfile
from pathlib import Path
from unittest.mock import MagicMock, patch

# Add karenina to path
# NOTE(review): this is an absolute path tied to one developer machine —
# confirm how other environments are expected to locate the package.
sys.path.insert(0, "/Users/carli/Projects/karenina-monorepo/karenina/src")

# Temporary directory for file operations
# Created eagerly with a "karenina_docs_" prefix; `_cleanup` (registered with
# atexit further down) removes it again.
TEMP_DIR = Path(tempfile.mkdtemp(prefix="karenina_docs_"))


# Mock LLM response generator
class MockLLMResponse:
    """Minimal message stand-in: carries reply text on a ``content`` attribute.

    ``str()`` of an instance yields that same text.
    """

    def __init__(self, content: str = "BCL2"):
        # The canned reply text; defaults to "BCL2".
        self.content = content

    def __str__(self):
        # String form is exactly the stored text, with no decoration.
        return self.content


class MockStructuredOutput:
    """Fake parsed-template result with a fixed set of default fields.

    Keyword arguments override the defaults; unknown keywords become extra
    attributes unless the instance already exposes that name.
    """

    def __init__(self, **kwargs):
        # Built per call so the default list is never shared between instances.
        defaults = {
            "count": 46,
            "target": "BCL2",
            "subunits": 4,
            "diseases": ["asthma", "bronchitis", "pneumonia"],
            "mentions_bcl2_protein": True,
            "mentions_apoptosis_regulation": False,
        }
        for field, fallback in defaults.items():
            setattr(self, field, kwargs.get(field, fallback))

        # Anything else the caller passed is attached as-is, but names the
        # instance already has (defaults, methods such as ``dict``) are skipped.
        for key, value in kwargs.items():
            if hasattr(self, key):
                continue
            setattr(self, key, value)

    def dict(self):
        """Return the instance attributes whose names do not start with ``_``."""
        public = {}
        for attr, val in self.__dict__.items():
            if attr.startswith("_"):
                continue
            public[attr] = val
        return public

    def model_dump(self):
        """Alias of :meth:`dict`."""
        return self.dict()


def create_mock_chat_model(default_response: str = "BCL2"):
    """Create a mock chat model that returns predictable responses.

    Args:
        default_response: Text wrapped in the ``MockLLMResponse`` produced by
            ``invoke`` and ``ainvoke``.

    Returns:
        A ``MagicMock`` where ``invoke`` returns a ``MockLLMResponse``,
        ``ainvoke`` is awaitable and resolves to a ``MockLLMResponse``,
        ``with_structured_output()`` returns a mock whose ``invoke`` /
        ``ainvoke`` yield a ``MockStructuredOutput``, and ``bind_tools()``
        returns the mock itself.
    """
    # Imported here so the function does not depend on the file-level import
    # line being extended.
    from unittest.mock import AsyncMock

    mock = MagicMock()
    mock.invoke.return_value = MockLLMResponse(default_response)
    # A plain MagicMock attribute returns its value synchronously, so
    # ``await mock.ainvoke(...)`` would fail with TypeError. AsyncMock makes
    # the call return a coroutine that resolves to the response.
    mock.ainvoke = AsyncMock(return_value=MockLLMResponse(default_response))

    structured_mock = MagicMock()
    structured_mock.invoke.return_value = MockStructuredOutput()
    structured_mock.ainvoke = AsyncMock(return_value=MockStructuredOutput())
    mock.with_structured_output.return_value = structured_mock

    # Binding tools hands back the same mock, so chained calls keep working.
    mock.bind_tools.return_value = mock
    return mock


def create_mock_benchmark():
    """Build and return the demo ``Benchmark`` used in the documentation."""
    # Imported inside the function, i.e. only when it is actually called.
    from karenina import Benchmark

    return Benchmark.create(
        name="Demo Benchmark",
        description="Mock benchmark for documentation",
        version="1.0.0",
        creator="Documentation",
    )


# Patch all LLM providers before any imports
_LLM_PATCH_TARGETS = (
    "langchain_openai.ChatOpenAI",
    "langchain_anthropic.ChatAnthropic",
    "langchain_google_genai.ChatGoogleGenerativeAI",
    "karenina.infrastructure.llm.interface.init_chat_model_unified",
)

# One patcher per target. Each replacement accepts keyword arguments only,
# ignores them, and hands back a fresh mock chat model.
_llm_patches = [
    patch(target, side_effect=lambda **kwargs: create_mock_chat_model())
    for target in _LLM_PATCH_TARGETS
]

# Activate every patcher; they stay active until `_cleanup` stops them.
for p in _llm_patches:
    p.start()


# Helper to replace file paths in examples
def temp_path(filename: str) -> Path:
    """Return the path of *filename* placed under the docs temp directory."""
    return TEMP_DIR.joinpath(filename)


# Cleanup on kernel shutdown
import atexit
import shutil


def _cleanup():
    """Stop every LLM patcher and delete the temporary docs directory.

    Stopping is best-effort: a failure on one patcher is ignored so the
    remaining patchers are still stopped. The temp directory is removed even
    if stopping is interrupted.
    """
    try:
        for p in _llm_patches:
            try:
                p.stop()
            except Exception:
                # Deliberately ignored (e.g. patcher never started). Unlike a
                # bare ``except:``, this lets KeyboardInterrupt/SystemExit
                # propagate.
                pass
    finally:
        shutil.rmtree(TEMP_DIR, ignore_errors=True)


# Run `_cleanup` when the interpreter exits.
atexit.register(_cleanup)

# Three-line status banner, emitted in a single write.
print(
    "\n".join(
        [
            "✓ Mock setup complete",
            f"✓ Temp directory: {TEMP_DIR}",
            "✓ Karenina package loaded from: /Users/carli/Projects/karenina-monorepo/karenina/src",
        ]
    )
)

✓ Mock setup complete
✓ Temp directory: /var/folders/34/129m5tdd04vf10ptyj12w6f80000gp/T/karenina_docs_opgi7rlv
✓ Karenina package loaded from: /Users/carli/Projects/karenina-monorepo/karenina/src


# Rubrics

Rubrics provide qualitative evaluation criteria that go beyond basic template verification. They let you assess answer traits such as clarity, conciseness, and safety, as well as domain-specific requirements.

**Quick Navigation:**

- [What Are Rubrics?](#what-are-rubrics) - Core concepts and capabilities
- [Why Use Rubrics?](#why-use-rubrics) - Quality assessment, domain validation, compliance
- [Rubric Scope](#rubric-scope) - Global vs question-specific rubrics
- [Four Types of Rubric Traits](#four-types) - LLM-based, regex-based, callable-based, metric-based
- [Creating a Global Rubric](#creating-global) - Apply traits to all questions
- [Creating Question-Specific Rubrics](#question-specific) - Apply traits to specific questions
- [Working with Rubric Results](#working-with-results) - Access and analyze evaluation results
- [Complete Example](#complete-example) - End-to-end workflow with all trait types

## Four Types of Rubric Traits

Karenina supports four types of evaluation traits:

| Trait Type | Description | Best For |
|------------|-------------|----------|
| **LLMRubricTrait** | AI-evaluated traits using LLM judgment | Subjective qualities (clarity, tone, completeness) |
| **RegexTrait** | Pattern-matching traits using regular expressions | Format compliance, keyword presence/absence |
| **CallableTrait** | Custom Python function-based traits | Complex validation logic (word counts, structure) |
| **MetricRubricTrait** | Confusion matrix-based metrics for classification | Precision, recall, F1 for item identification |

In [None]:
from karenina.schemas import LLMRubricTrait

# Two score-kind LLM traits; they differ only in name and rating prompt.

# Conciseness: 1 (very verbose) .. 5 (extremely concise)
conciseness_trait = LLMRubricTrait(
    kind="score",
    name="Conciseness",
    description="Rate the conciseness of the answer on a scale from 1 (very verbose) to 5 (extremely concise).",
)

# Clarity: 1 (confusing) .. 5 (crystal clear)
clarity_trait = LLMRubricTrait(
    kind="score",
    name="Clarity",
    description="Rate how clear and understandable the answer is, from 1 (confusing) to 5 (crystal clear).",
)

print(f"✓ Created score-based traits: {conciseness_trait.name}, {clarity_trait.name}")
# Show the score bounds each trait object reports.
print(f"  Conciseness range: {conciseness_trait.min_score}-{conciseness_trait.max_score}")
print(f"  Clarity range: {clarity_trait.min_score}-{clarity_trait.max_score}")

**Boolean Mode (pass/fail):**

- Provides **yes/no judgment** on whether criteria are met
- Returns: `true` (pass) or `false` (fail)
- Best for: Questions with clear criteria that either are or aren't satisfied

# Boolean (pass/fail) trait: checks for safety concerns
safety_trait = LLMRubricTrait(
    kind="boolean",  # Note: use "boolean", not "binary"
    name="Safety Concerns",
    description="Does the answer include any mention of safety concerns or warnings?",
)

print(f"✓ Created boolean trait: {safety_trait.name}")

### 2. Regex-Based Traits (`RegexTrait`)

**What they are:** Deterministic pattern-matching traits that check if answers match specific regex patterns. These provide **100% reproducible** validation without any LLM judgment.

**When to use:**

- Checking for **required terminology** or keywords
- Validating **format compliance** (dates, gene symbols, IDs)
- Detecting **prohibited content** (profanity, inappropriate terms)
- Ensuring **specific patterns** are present or absent

from karenina.schemas import RegexTrait

# Regex trait that passes when the answer contains the token "BH3"
bh3_trait = RegexTrait(
    pattern=r"\bBH3\b",
    case_sensitive=False,
    invert_result=False,  # Pass if pattern found
    name="Mentions BH3 Proteins",
    description="Answer must mention BH3 proteins (the mechanism of BCL2 inhibition)",
)

print(f"✓ Created regex trait: {bh3_trait.name}")

# Try the trait on a sample answer and show what it reports
test_answer = "The answer involves BH3 proteins binding to BCL2."
result = bh3_trait.evaluate(test_answer)
print(f"  Test answer: {test_answer}")
print(f"  BH3 pattern found: {result}")

**Using `invert_result` Parameter:**

The `invert_result` parameter changes the pass/fail logic:

| `invert_result` | Pattern Matches | Result |
|----------------|----------------|--------|
| `False` (default) | Pattern **found** in answer | ✅ Pass |
| `False` (default) | Pattern **NOT found** in answer | ❌ Fail |
| `True` | Pattern **found** in answer | ❌ Fail |
| `True` | Pattern **NOT found** in answer | ✅ Pass |

### 3. Callable Traits (`CallableTrait`)

**What they are:** Custom Python function-based traits for complex validation logic that cannot be expressed with regex patterns.

**When to use:**

- Complex validation requiring **Python logic** (word counts, sentence structure analysis)
- **Custom scoring algorithms** beyond simple pattern matching
- When regex patterns are too limited but you want deterministic evaluation

**⚠️ SECURITY WARNING**: `CallableTrait` uses cloudpickle serialization. Deserializing a pickled callable can execute arbitrary code, so only load callable traits from sources you trust.

# Define evaluation function
def check_word_count(text: str) -> bool:
    """Return True when *text* has 50 or more whitespace-separated words."""
    words = text.split()
    return len(words) >= 50

# Create callable trait
from karenina.schemas import CallableTrait
# Wrap the plain function as a boolean-kind callable trait
word_count_trait = CallableTrait.from_callable(
    func=check_word_count,
    kind="boolean",
    name="Minimum Word Count",
    description="Answer must have at least 50 words",
)

print(f"✓ Created callable trait: {word_count_trait.name}")

# Evaluate against one answer below the threshold and one above it
short_answer = "This is a short answer."
long_answer = 10 * "This is a much longer answer with many more words. "

print(f"  Short answer (fewer than 50 words): {word_count_trait.evaluate(short_answer)}")
print(f"  Long answer (more than 50 words): {word_count_trait.evaluate(long_answer)}")

### 4. Metric-Based Traits (`MetricRubricTrait`)

Confusion matrix-based traits for quantitative classification evaluation.

**Available Metrics:**

- **Precision**: TP / (TP + FP)
- **Recall**: TP / (TP + FN)
- **F1 Score**: 2 × (Precision × Recall) / (Precision + Recall)
- **Accuracy**: (TP + TN) / (TP + TN + FP + FN)
- **Specificity**: TN / (TN + FP)

from karenina.schemas import MetricRubricTrait

# TP-Only Mode: Identifying inflammatory diseases
# NOTE(review): the label says TP-only, yet fp_instructions is passed as
# well — confirm which evaluation mode this configuration selects.
inflammatory_trait = MetricRubricTrait(
    name="Inflammatory Disease Identification",
    description="Evaluate accuracy of identifying inflammatory lung diseases",
    metrics=["precision", "recall", "f1"],
    tp_instructions=["asthma", "bronchitis", "pneumonia", "pleurisy"],
    fp_instructions=["emphysema", "pulmonary fibrosis", "sarcoidosis"],
)

# Echo back what the trait object reports about itself
print(f"✓ Created metric trait: {inflammatory_trait.name}")
print(f"  Evaluation mode: {inflammatory_trait.evaluation_mode}")
print(f"  Metrics: {inflammatory_trait.metrics}")
print(f"  TP instructions: {inflammatory_trait.tp_instructions}")

### When to Use Each Trait Type

| Need | Use This Trait Type |
|------|---------------------|
| Subjective quality assessment | LLM-Based (`LLMRubricTrait`) |
| Exact keyword or format validation | Regex-Based (`RegexTrait`) |
| Complex validation logic | Callable-Based (`CallableTrait`) |
| Classification accuracy metrics | Metric-Based (`MetricRubricTrait`) |
| Nuanced scoring (1-5) | LLM-Based (`LLMRubricTrait`, kind="score") |
| Yes/no judgment | LLM-Based (`LLMRubricTrait`, kind="boolean") |
| Deterministic pattern matching | Regex-Based (`RegexTrait`) |
| Custom scoring algorithms | Callable-Based (`CallableTrait`, kind="score") |

## Creating a Global Rubric

Global rubrics apply to **all questions** in your benchmark. They're perfect for general quality traits like clarity and conciseness.

from karenina import Benchmark
from karenina.schemas import Rubric, LLMRubricTrait

# Start from an empty benchmark
benchmark = Benchmark.create(name="Genomics Knowledge Benchmark")

# Register two questions with their reference answers
benchmark.add_question(
    question="How many chromosomes are in a human somatic cell?",
    raw_answer="46",
)
benchmark.add_question(
    question="What is the approved drug target of Venetoclax?",
    raw_answer="BCL2",
)

# Create global rubric with LLM-based traits
# Note: use llm_traits, not traits
global_rubric = Rubric(
    name="Answer Quality Assessment",
    llm_traits=[
        LLMRubricTrait(
            kind="score",
            name="Conciseness",
            description="Rate the conciseness of the answer on a scale from 1 (very verbose) to 5 (extremely concise).",
        ),
        LLMRubricTrait(
            kind="score",
            name="Clarity",
            description="Rate how clear and understandable the answer is, from 1 (confusing) to 5 (crystal clear).",
        ),
    ],
)

# Set as global rubric - applies to ALL questions
benchmark.set_global_rubric(global_rubric)

print(f"✓ Created benchmark with {len(benchmark.get_all_questions())} questions")
print(f"✓ Set global rubric with traits: {global_rubric.get_llm_trait_names()}")
# No placeholders in this line, so a plain string literal is enough.
print("  These traits will be evaluated for EVERY question")

## Creating Question-Specific Rubrics

Question-specific rubrics apply to **a single question only**. They're perfect for domain validation and specialized requirements.

from karenina.schemas import Rubric, RegexTrait

# Rubric attached to the Venetoclax question only
venetoclax_rubric = Rubric(
    name="Drug Mechanism Validation",
    regex_traits=[  # Note: use regex_traits, not traits
        RegexTrait(
            pattern=r"\bBH3\b",
            case_sensitive=False,
            name="Mentions BH3 Proteins",
            description="Answer must mention BH3 proteins",
        ),
    ],
)

benchmark2 = Benchmark.create(name="Question-Specific Rubric Example")

# First question carries the rubric via the `rubric` argument
benchmark2.add_question(
    question="What is the approved drug target of Venetoclax?",
    raw_answer="BCL2",
    rubric=venetoclax_rubric,
)

# Second question is added without a rubric argument
benchmark2.add_question(
    question="How many chromosomes in a human cell?",
    raw_answer="46",
)

print("✓ Created benchmark with question-specific rubric")
print(f"  Venetoclax question has rubric: {venetoclax_rubric.get_regex_trait_names()}")
print("  Chromosomes question has no question-specific rubric")

## Working with Rubric Results

After running verification with rubrics, you can access the results:

from karenina.schemas import ModelConfig, VerificationConfig

# Configure verification
# The same model configuration is used for both answering and parsing below.
model_config = ModelConfig(
    id="gpt-4.1-mini",
    model_provider="openai",
    model_name="gpt-4.1-mini",
    temperature=0.0,
    interface="langchain"
)

config = VerificationConfig(
    answering_models=[model_config],
    parsing_models=[model_config]
)

# Run verification (using our mock)
# `benchmark` is the object built in an earlier cell.
results = benchmark.run_verification(config)

# Access rubric results for each question
# NOTE(review): assumes iterating `results` yields result objects exposing
# `.metadata` and `.rubric` — confirm against run_verification's return type.
for result in results:
    # Only the first 50 characters of the question text are shown.
    print(f"\nQuestion: {result.metadata.question_text[:50]}...")

    # LLM-based trait scores
    # NOTE(review): the "/5" suffix presumes score-kind traits on a 1-5 scale;
    # confirm what is stored here for boolean-kind traits.
    if result.rubric.llm_trait_scores:
        print("  LLM Trait Scores:")
        for trait_name, score in result.rubric.llm_trait_scores.items():
            print(f"    {trait_name}: {score}/5")

    # Regex trait results
    # Each value is treated as truthy = pass, falsy = fail.
    if result.rubric.regex_trait_results:
        print("  Regex Trait Results:")
        for trait_name, passed in result.rubric.regex_trait_results.items():
            status = "✓ Pass" if passed else "✗ Fail"
            print(f"    {trait_name}: {status}")

    # Metric trait results
    # Nested mapping: trait name -> {metric name -> value}; values are
    # formatted with two decimals, so they are presumably numeric — verify.
    if result.rubric.metric_trait_metrics:
        print("  Metric Trait Results:")
        for trait_name, metrics in result.rubric.metric_trait_metrics.items():
            print(f"    {trait_name}:")
            for metric_name, value in metrics.items():
                print(f"      {metric_name}: {value:.2f}")

## Complete Example

Here is a complete workflow that combines a global rubric (LLM-based traits) with a question-specific rubric (a regex-based trait), then generates templates and saves the benchmark:

from karenina import Benchmark
from karenina.schemas import (
    Rubric, LLMRubricTrait, RegexTrait, MetricRubricTrait,
    ModelConfig, VerificationConfig
)

# 1. Create benchmark
benchmark = Benchmark.create(
    name="Genomics Knowledge Benchmark",
    description="Testing LLM knowledge of genomics",
    version="1.0.0"
)

# 2. Create global rubric (applies to ALL questions)
global_rubric = Rubric(
    name="General Quality Assessment",
    llm_traits=[
        LLMRubricTrait(
            name="Conciseness",
            description="Rate conciseness from 1 (verbose) to 5 (concise).",
            kind="score"
        ),
        LLMRubricTrait(
            name="Clarity",
            description="Rate clarity from 1 (confusing) to 5 (clear).",
            kind="score"
        )
    ]
)

benchmark.set_global_rubric(global_rubric)

# 3. Add questions with different rubrics
# This question is added without a question-specific rubric.
benchmark.add_question(
    question="How many chromosomes are in a human somatic cell?",
    raw_answer="46"
)

# Question with regex-based question-specific rubric
venetoclax_rubric = Rubric(
    name="Drug Mechanism Validation",
    regex_traits=[
        RegexTrait(
            name="Mentions BH3 Proteins",
            description="Answer must mention BH3 proteins",
            pattern=r"\bBH3\b",
            case_sensitive=False
        )
    ]
)

benchmark.add_question(
    question="What is the approved drug target of Venetoclax?",
    raw_answer="BCL2",
    rubric=venetoclax_rubric,
)

# 4. Generate templates
model_config = ModelConfig(
    id="gpt-4.1-mini",
    model_provider="openai",
    model_name="gpt-4.1-mini",
    temperature=0.1,
    interface="langchain"
)

# Read the generation settings from model_config instead of repeating the
# literals, so the configuration is defined in exactly one place.
results = benchmark.generate_all_templates(
    model=model_config.model_name,
    model_provider=model_config.model_provider,
    temperature=model_config.temperature,
    interface=model_config.interface
)

print(f"✓ Generated {len(results)} templates")

# 5. Save benchmark
# temp_path() places the file inside the docs temp directory.
save_path = temp_path("genomics_benchmark_with_rubrics.jsonld")
benchmark.save(save_path)

print(f"✓ Benchmark saved to {save_path.name}")

## Next Steps

Once you have rubrics configured:

- [Run verification](verification.md) to apply both template and rubric evaluation
- [Analyze results](verification.md#accessing-verification-results) to understand performance across different criteria
- [Save and load benchmarks](saving-loading.md) to preserve your rubric configurations
- [Export results](saving-loading.md#exporting-verification-results) to CSV or JSON for further analysis

## Related Documentation

- [Adding Questions](adding-questions.md) - Populate your benchmark with questions
- [Templates](templates.md) - Structured answer evaluation for factual correctness
- [Verification](verification.md) - Run evaluations with multiple models
- [Quick Start](../quickstart.md) - End-to-end workflow example