In [1]:
# Mock Setup - Hidden in rendered documentation
# This cell sets up mocking infrastructure for TaskEval examples

import sys
import tempfile
from pathlib import Path
from unittest.mock import MagicMock, patch

# Add karenina to path. The checkout location can be overridden with the
# KARENINA_SRC environment variable; the default is the original author's
# local checkout. The entry is only added when the directory really exists,
# so on other machines an installed karenina package is used and sys.path is
# not polluted with a dead entry.
import os

_KARENINA_SRC = os.environ.get("KARENINA_SRC", "/Users/carli/Projects/karenina-monorepo/karenina/src")
if os.path.isdir(_KARENINA_SRC):
    sys.path.insert(0, _KARENINA_SRC)

# Temporary directory for file operations (created eagerly, unique per run)
TEMP_DIR = Path(tempfile.mkdtemp(prefix="karenina_docs_"))

# Import after path is set - make these available globally
from karenina.benchmark.task_eval import TaskEval
from karenina.schemas.domain import LLMRubricTrait, RegexTrait, Rubric
from karenina.schemas.workflow import ModelConfig, VerificationConfig


# Mock LLM response generator
class MockLLMResponse:
    """Minimal stand-in for a LangChain-style chat message.

    Carries the reply text in ``content`` and a fixed token-usage record in
    ``response_metadata``; converting the object to ``str`` yields the text.
    """

    def __init__(self, content: str = "Mock response"):
        usage_record = {"total_tokens": 50}
        self.content = content
        self.response_metadata = {"token_usage": usage_record}

    def __str__(self):
        text = self.content
        return text


class MockStructuredOutput:
    """Fake structured-output object with a few preset fields.

    Four fields always exist (``endpoint_created``, ``has_error_handling``,
    ``action_taken``, ``result``); keyword arguments override their defaults.
    Any other keyword becomes an attribute too, unless an attribute of that
    name already exists on the instance or class.
    """

    def __init__(self, **kwargs):
        # Preset fields first, in a fixed order, taking overrides from kwargs.
        preset_fields = (
            ("endpoint_created", True),
            ("has_error_handling", True),
            ("action_taken", "implement_api"),
            ("result", "success"),
        )
        for field_name, fallback in preset_fields:
            setattr(self, field_name, kwargs.get(field_name, fallback))

        # Remaining keywords are attached only when they do not collide with
        # something already present (preset fields, methods, ...).
        for extra_name, extra_value in kwargs.items():
            if hasattr(self, extra_name):
                continue
            setattr(self, extra_name, extra_value)

    def dict(self):
        """Return the instance attributes whose names do not start with ``_``."""
        public_fields = {}
        for attr_name, attr_value in vars(self).items():
            if attr_name.startswith("_"):
                continue
            public_fields[attr_name] = attr_value
        return public_fields

    def model_dump(self):
        """Alias for :meth:`dict`."""
        return self.dict()


def create_mock_chat_model():
    """Create a mock chat model that returns predictable responses.

    Returns:
        A ``MagicMock`` configured so that:

        * ``invoke(...)`` returns a ``MockLLMResponse`` with a fixed text;
        * ``ainvoke(...)`` returns an awaitable resolving to the same kind of
          response;
        * ``with_structured_output(...)`` returns a second mock whose
          ``invoke``/``ainvoke`` yield a default ``MockStructuredOutput``;
        * ``bind_tools(...)`` returns the model mock itself.
    """
    # Local import: the setup cell only imports MagicMock and patch at the top.
    from unittest.mock import AsyncMock

    reply_text = "Successfully implemented REST API with proper error handling"

    mock = MagicMock()
    mock.invoke.return_value = MockLLMResponse(reply_text)
    # The async entry point must hand back an awaitable. With a plain
    # MagicMock, ``await mock.ainvoke(...)`` fails with TypeError because the
    # configured return value is not awaitable.
    mock.ainvoke = AsyncMock(return_value=MockLLMResponse(reply_text))

    structured_mock = MagicMock()
    structured_mock.invoke.return_value = MockStructuredOutput()
    structured_mock.ainvoke = AsyncMock(return_value=MockStructuredOutput())

    mock.with_structured_output.return_value = structured_mock
    mock.bind_tools.return_value = mock
    return mock


# Patch the LLM provider entry points so that models constructed from here on
# are mocks. NOTE(review): the karenina imports above have already executed, so
# a name karenina bound at import time (e.g. ``from langchain_openai import
# ChatOpenAI``) would NOT be replaced by these patches — presumably karenina
# resolves providers lazily; confirm if a real client ever gets created.
_LLM_PATCH_TARGETS = (
    "langchain_openai.ChatOpenAI",
    "langchain_anthropic.ChatAnthropic",
    "langchain_google_genai.ChatGoogleGenerativeAI",
    "karenina.infrastructure.llm.interface.init_chat_model_unified",
)

_llm_patches = [
    # The factory ignores whatever it is called with. It accepts positional
    # as well as keyword arguments so that a positional call does not raise
    # TypeError inside the mock.
    patch(target, side_effect=lambda *args, **kwargs: create_mock_chat_model())
    for target in _LLM_PATCH_TARGETS
]

for p in _llm_patches:
    p.start()

# Cleanup
import atexit
import shutil


def _cleanup():
    """Undo the mock setup: stop every LLM patcher and delete ``TEMP_DIR``.

    Teardown is best-effort. A patcher that fails to stop does not prevent the
    remaining patchers from being stopped, and errors while removing the
    temporary directory are ignored.
    """
    for patcher in _llm_patches:
        try:
            patcher.stop()
        except Exception:
            # Deliberately ignored (e.g. a patcher that was never started or
            # is already stopped). ``Exception`` rather than a bare ``except``
            # so KeyboardInterrupt/SystemExit still propagate.
            continue
    shutil.rmtree(TEMP_DIR, ignore_errors=True)


# Run the teardown when the interpreter shuts down.
atexit.register(_cleanup)

# Status banner confirming that the setup cell ran to the end.
print(
    "✓ Mock setup complete",
    "✓ Karenina package loaded",
    "✓ Mock TaskEval evaluation enabled",
    sep="\n",
)

✓ Mock setup complete
✓ Karenina package loaded
✓ Mock TaskEval evaluation enabled


# TaskEval: Task-Centric Trace Evaluation

Evaluate pre-logged agent workflow outputs by attaching verification criteria to existing traces.

## What is TaskEval?

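**TaskEval** is a trace-centric evaluation framework that inverts the traditional benchmarking workflow: instead of posing questions to a model and then checking its answers, you log the outputs an agent has already produced and attach verification criteria (rubrics) to those traces afterwards.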


## Task-Centric vs Question-Centric

| Use Case | Use Benchmark | Use TaskEval |
|----------|--------------|-------------|
| Test LLM knowledge | ✅ | ❌ |
| Evaluate agent workflows | ❌ | ✅ |

## Quick Start

Here's a minimal example showing the task-centric workflow.

In [2]:
# Quick Start Example: log traces, then attach a rubric to them

# Step 1 - the TaskEval instance that collects the traces
task = TaskEval(task_id="agent_code_generation")

# Step 2 - record the agent's execution, one log call per entry
for trace_line in (
    "Reasoning: Need to implement a REST API endpoint",
    "Plan: Use FastAPI with proper error handling",
    "Implementation: Created /api/users endpoint",
    "Testing: Verified with test cases",
):
    task.log(trace_line)

# Step 3 - verification criteria: a single LLM-judged trait scored 1-5
clarity_trait = LLMRubricTrait(
    name="clarity",
    description="How clear is the plan?",
    kind="score",
    min_score=1,
    max_score=5,
)
task.add_rubric(Rubric(llm_traits=[clarity_trait]))

print("✓ TaskEval setup complete")

✓ TaskEval setup complete


### Configuring and Running Evaluation

In [3]:
# Evaluation settings: one parsing model, and parsing_only=True so that no
# answer generation takes place.
config = VerificationConfig(
    parsing_models=[
        ModelConfig(id="parser", model_provider="openai", model_name="gpt-4o-mini"),
    ],
    parsing_only=True,
)

# Check the logged traces against the attached verification criteria
result = task.evaluate(config)

print("Evaluation complete!")
print(result.summary())

Failed to parse abstention detection response as JSON: Expecting value: line 1 column 1 (char 0)


Batch evaluation failed: Could not parse response into BatchRubricScores: Successfully implemented REST API with proper error handling


Rubric evaluator initialization/configuration failed for question 39430513e323dbe155b2201a8b877bd2: Failed to evaluate rubric traits using batch strategy: Could not parse response into BatchRubricScores: Successfully implemented REST API with proper error handling


Evaluation complete!
0/1 traces passed


### Displaying Results

TaskEval provides multiple ways to access results.

In [4]:
# Formatted report for the whole evaluation
print(result.display())

# Per-question details from the global evaluation; only the first
# verification result of each question is shown.
verification_results = result.global_eval.verification_results
for question_id, vr_list in verification_results.items():
    vr = vr_list[0]
    print(f"\nQuestion: {question_id}")
    print(f"Passed: {vr.template.verify_result}")
    if not vr.rubric:
        continue
    # LLM trait scores are printed only when there are any.
    if vr.rubric.llm_trait_scores:
        print(f"LLM Traits: {vr.rubric.llm_trait_scores}")

════════════════════════════════════════════════════════════════════════════════
                        TASK EVALUATION RESULTS
════════════════════════════════════════════════════════════════════════════════
Task ID: agent_code_generation
Timestamp: 2026-01-04 11:35:49

────────────────────────────────────────────────────────────
GLOBAL EVALUATION
────────────────────────────────────────────────────────────
Verification Results:
  Question: rubric_only_eval
     Status: ⚠ NO RESULT
     Output: "Reasoning: Need to implement a REST API endpoint

Plan: Use FastAPI with proper error handling

Implementation: Created /api/users endpoint

Testing: Verified with test cases"

════════════════════════════════════════════════════════════════════════════════
SUMMARY: 0/1 traces passed
════════════════════════════════════════════════════════════════════════════════

Question: rubric_only_eval
Passed: None


## Dict Trace Logging

**Dict traces** are the recommended way to log structured agent outputs.

In [5]:
# Dict traces example - structured logging
task_dict = TaskEval(task_id="structured_eval")

# One dict trace; each key becomes a separate evaluation point
structured_trace = {
    "analysis": "Examined requirements and constraints",
    "plan": "Generated 3-step implementation plan",
    "execution": "Implemented all steps successfully",
    "testing": "Verified with test cases",
}
task_dict.log(structured_trace)

# A single LLM-judged trait scored 1-5
completeness_trait = LLMRubricTrait(name="completeness", kind="score", min_score=1, max_score=5)
task_dict.add_rubric(Rubric(llm_traits=[completeness_trait]))

config = VerificationConfig(
    parsing_models=[
        ModelConfig(id="parser", model_provider="openai", model_name="gpt-4o-mini"),
    ],
    parsing_only=True,
)
result_dict = task_dict.evaluate(config)

# List the question ids that were produced for the dict trace
print("Dict trace results:")
for question_id in result_dict.global_eval.verification_results.keys():
    print(f"  - {question_id}")

Failed to parse abstention detection response as JSON: Expecting value: line 1 column 1 (char 0)


Batch evaluation failed: Could not parse response into BatchRubricScores: Successfully implemented REST API with proper error handling


Rubric evaluator initialization/configuration failed for question 07ad92091c4a1f320394a53da4151b75: Failed to evaluate rubric traits using batch strategy: Could not parse response into BatchRubricScores: Successfully implemented REST API with proper error handling


Failed to parse abstention detection response as JSON: Expecting value: line 1 column 1 (char 0)


Batch evaluation failed: Could not parse response into BatchRubricScores: Successfully implemented REST API with proper error handling


Rubric evaluator initialization/configuration failed for question 407414879a82b1978bf9b4c6905d9737: Failed to evaluate rubric traits using batch strategy: Could not parse response into BatchRubricScores: Successfully implemented REST API with proper error handling


Failed to parse abstention detection response as JSON: Expecting value: line 1 column 1 (char 0)


Batch evaluation failed: Could not parse response into BatchRubricScores: Successfully implemented REST API with proper error handling


Rubric evaluator initialization/configuration failed for question 6c94baa466119ca82916fafb26b0cffb: Failed to evaluate rubric traits using batch strategy: Could not parse response into BatchRubricScores: Successfully implemented REST API with proper error handling


Failed to parse abstention detection response as JSON: Expecting value: line 1 column 1 (char 0)


Batch evaluation failed: Could not parse response into BatchRubricScores: Successfully implemented REST API with proper error handling


Rubric evaluator initialization/configuration failed for question ce1b3b8ca5c2dc8d09f88c295f59f30d: Failed to evaluate rubric traits using batch strategy: Could not parse response into BatchRubricScores: Successfully implemented REST API with proper error handling


Dict trace results:
  - dict_key_analysis
  - dict_key_execution
  - dict_key_plan
  - dict_key_testing


## Rubric Types

- **LLMRubricTrait**: Qualitative assessment (score or boolean)
- **RegexTrait**: Deterministic pattern matching (boolean)
- **MetricRubricTrait**: Quantitative metrics (precision, recall, F1)

In [6]:
# Rubric example combining two trait types: an LLM-judged score and a regex check
task_rubrics = TaskEval(task_id="comprehensive_rubrics")

workflow_trace = {
    "requirements": "Analyzed requirements",
    "architecture": "Designed 3-tier architecture",
    "implementation": "Implemented with error handling",
    "testing": "Created unit tests",
}
task_rubrics.log(workflow_trace)

# LLM trait: scored 1-5. Regex trait: matches a triple-backtick code fence.
clarity_score_trait = LLMRubricTrait(name="clarity", kind="score", min_score=1, max_score=5)
code_fence_trait = RegexTrait(name="has_code", pattern=r"```")
comprehensive_rubric = Rubric(
    llm_traits=[clarity_score_trait],
    regex_traits=[code_fence_trait],
)

task_rubrics.add_rubric(comprehensive_rubric)

config = VerificationConfig(
    parsing_models=[
        ModelConfig(id="parser", model_provider="openai", model_name="gpt-4o-mini"),
    ],
    parsing_only=True,
)
result_rubrics = task_rubrics.evaluate(config)

print("Comprehensive rubric results:")

Failed to parse abstention detection response as JSON: Expecting value: line 1 column 1 (char 0)


Batch evaluation failed: Could not parse response into BatchRubricScores: Successfully implemented REST API with proper error handling


Rubric evaluator initialization/configuration failed for question d82176653c6d79efae93171e60ee45f3: Failed to evaluate rubric traits using batch strategy: Could not parse response into BatchRubricScores: Successfully implemented REST API with proper error handling


Failed to parse abstention detection response as JSON: Expecting value: line 1 column 1 (char 0)


Batch evaluation failed: Could not parse response into BatchRubricScores: Successfully implemented REST API with proper error handling


Rubric evaluator initialization/configuration failed for question caa8286b87c14f40d780df79bcacaac3: Failed to evaluate rubric traits using batch strategy: Could not parse response into BatchRubricScores: Successfully implemented REST API with proper error handling


Failed to parse abstention detection response as JSON: Expecting value: line 1 column 1 (char 0)


Batch evaluation failed: Could not parse response into BatchRubricScores: Successfully implemented REST API with proper error handling


Rubric evaluator initialization/configuration failed for question 0fd9fbdd2414e2701d8b526e7694edc9: Failed to evaluate rubric traits using batch strategy: Could not parse response into BatchRubricScores: Successfully implemented REST API with proper error handling


Failed to parse abstention detection response as JSON: Expecting value: line 1 column 1 (char 0)


Batch evaluation failed: Could not parse response into BatchRubricScores: Successfully implemented REST API with proper error handling


Rubric evaluator initialization/configuration failed for question ce1b3b8ca5c2dc8d09f88c295f59f30d: Failed to evaluate rubric traits using batch strategy: Could not parse response into BatchRubricScores: Successfully implemented REST API with proper error handling


Comprehensive rubric results:


## Step-Specific Evaluation

Evaluate different workflow phases with phase-specific rubrics. For simplicity, the example below attaches its rubric globally rather than to an individual step.

In [7]:
# Step-specific evaluation example
task_steps = TaskEval(task_id="multi_step_agent")

# The rubric is attached globally to keep this example simple
plan_quality_trait = LLMRubricTrait(name="plan_quality", kind="score", min_score=1, max_score=5)
planning_rubric = Rubric(llm_traits=[plan_quality_trait])
task_steps.add_rubric(planning_rubric)

# One dict trace with two keys
task_steps.log({"analysis": "Analyzed the problem", "plan": "Created plan"})

config = VerificationConfig(
    parsing_models=[
        ModelConfig(id="parser", model_provider="openai", model_name="gpt-4o-mini"),
    ],
    parsing_only=True,
)
result_steps = task_steps.evaluate(config)

# Report how many questions ended up in the global evaluation
print("Step-specific results:")
print(f"Questions evaluated: {len(result_steps.global_eval.verification_results)}")

Failed to parse abstention detection response as JSON: Expecting value: line 1 column 1 (char 0)


Batch evaluation failed: Could not parse response into BatchRubricScores: Successfully implemented REST API with proper error handling


Rubric evaluator initialization/configuration failed for question 07ad92091c4a1f320394a53da4151b75: Failed to evaluate rubric traits using batch strategy: Could not parse response into BatchRubricScores: Successfully implemented REST API with proper error handling


Failed to parse abstention detection response as JSON: Expecting value: line 1 column 1 (char 0)


Batch evaluation failed: Could not parse response into BatchRubricScores: Successfully implemented REST API with proper error handling


Rubric evaluator initialization/configuration failed for question 6c94baa466119ca82916fafb26b0cffb: Failed to evaluate rubric traits using batch strategy: Could not parse response into BatchRubricScores: Successfully implemented REST API with proper error handling


Step-specific results:
Questions evaluated: 2


## Best Practices

1. Use **Dict Traces** for structured evaluation
2. Choose appropriate **Rubric Types** for your use case
3. Use **Step IDs** for multi-step workflows
4. Use **Replicates** for critical evaluations