# Analyzing Verification Results

This notebook demonstrates how to explore and analyze verification results in Karenina.
It covers the full workflow: inspecting individual `VerificationResult` objects,
converting results to pandas DataFrames with all three builders, filtering and
aggregating, and exporting.

For the full reference on result structure, see
[VerificationResult Structure](../07-analyzing-results/verification-result.md).
For DataFrame details, see [DataFrame Analysis](../07-analyzing-results/dataframe-analysis.md).

In [None]:
# Setup cell: creates mock VerificationResult objects for documentation examples.
# This cell is hidden in the rendered documentation.
import datetime

from karenina.schemas.results import VerificationResultSet
from karenina.schemas.verification import VerificationResult
from karenina.schemas.verification.model_identity import ModelIdentity
from karenina.schemas.verification.result_components import (
    VerificationResultMetadata,
    VerificationResultRubric,
    VerificationResultTemplate,
)

# One timezone-aware ISO-8601 timestamp, shared by every mock result below.
_ts = datetime.datetime.now(tz=datetime.UTC).isoformat()

# The single parsing model used for all mock results.
_parsing = ModelIdentity(
    model_name="gpt-4o-mini",
    interface="langchain",
)

# The two answering models that the examples compare.
_answering_gpt4o = ModelIdentity(
    model_name="gpt-4o",
    interface="langchain",
)
_answering_claude = ModelIdentity(
    model_name="claude-sonnet-4-20250514",
    interface="claude_agent_sdk",
)


def _make_result(
    qid, question_text, answering, verified, response,
    rubric_scores=None, regex_scores=None, callable_scores=None,
    parsed_gt=None, parsed_llm=None, replicate=None,
    exec_time=1.5,
):
    """Assemble one mock ``VerificationResult`` for the documentation examples.

    Args:
        qid: Question identifier; its first 8 characters also form the
            ``tmpl_``-prefixed template id.
        question_text: Question text stored in the metadata.
        answering: ``ModelIdentity`` of the answering model.
        verified: Value stored as the template's ``verify_result``.
        response: Raw LLM response; also used to build the fallback parsed
            payload ``{"answer": response}``.
        rubric_scores: Optional LLM trait scores.
        regex_scores: Optional regex trait scores.
        callable_scores: Optional callable trait scores.
        parsed_gt: Parsed ground-truth payload; a falsy value triggers the
            fallback payload.
        parsed_llm: Parsed LLM payload; a falsy value triggers the fallback
            payload.
        replicate: Replicate value passed to both the result id computation
            and the metadata.
        exec_time: Execution time recorded in the metadata.

    Returns:
        A ``VerificationResult`` carrying metadata and a template section,
        plus a rubric section when at least one score mapping is truthy.
    """
    result_id = VerificationResultMetadata.compute_result_id(
        qid, answering, _parsing, _ts, replicate
    )

    # Each fallback is built separately so the two fields never share a dict.
    gt_payload = parsed_gt if parsed_gt else {"answer": response}
    llm_payload = parsed_llm if parsed_llm else {"answer": response}
    template_section = VerificationResultTemplate(
        raw_llm_response=response,
        verify_result=verified,
        template_verification_performed=True,
        parsed_gt_response=gt_payload,
        parsed_llm_response=llm_payload,
    )

    # A rubric section only exists when some trait scores were supplied.
    has_scores = any((rubric_scores, regex_scores, callable_scores))
    rubric_section = (
        VerificationResultRubric(
            rubric_evaluation_performed=True,
            llm_trait_scores=rubric_scores,
            regex_trait_scores=regex_scores,
            callable_trait_scores=callable_scores,
        )
        if has_scores
        else None
    )

    metadata_section = VerificationResultMetadata(
        question_id=qid,
        template_id="tmpl_" + qid[:8],
        completed_without_errors=True,
        question_text=question_text,
        answering=answering,
        parsing=_parsing,
        execution_time=exec_time,
        timestamp=_ts,
        result_id=result_id,
        replicate=replicate,
    )
    return VerificationResult(
        metadata=metadata_section,
        template=template_section,
        rubric=rubric_section,
    )


# 6 results: 2 models x 3 questions
_mock_results = [
    # --- gpt-4o: passes q1 and q2, fails q3 ---
    _make_result(
        qid="q1",
        question_text="What is the capital of France?",
        answering=_answering_gpt4o,
        verified=True,
        response="Paris",
        rubric_scores={"clarity": 4, "conciseness": True},
        regex_scores={"no_hedging": True},
        parsed_gt={"capital": "Paris"},
        parsed_llm={"capital": "Paris"},
        exec_time=1.2,
    ),
    _make_result(
        qid="q2",
        question_text="What is 6 multiplied by 7?",
        answering=_answering_gpt4o,
        verified=True,
        response="42",
        rubric_scores={"clarity": 5, "conciseness": True},
        parsed_gt={"result": "42"},
        parsed_llm={"result": "42"},
        exec_time=0.9,
    ),
    _make_result(
        qid="q3",
        question_text="What element has atomic number 8?",
        answering=_answering_gpt4o,
        verified=False,
        response="Nitrogen",
        rubric_scores={"clarity": 3, "conciseness": False},
        parsed_gt={"element": "Oxygen"},
        parsed_llm={"element": "Nitrogen"},
        exec_time=1.8,
    ),
    # --- claude: passes all three questions ---
    _make_result(
        qid="q1",
        question_text="What is the capital of France?",
        answering=_answering_claude,
        verified=True,
        response="Paris",
        rubric_scores={"clarity": 5, "conciseness": True},
        regex_scores={"no_hedging": True},
        parsed_gt={"capital": "Paris"},
        parsed_llm={"capital": "Paris"},
        exec_time=1.4,
    ),
    _make_result(
        qid="q2",
        question_text="What is 6 multiplied by 7?",
        answering=_answering_claude,
        verified=True,
        response="42",
        rubric_scores={"clarity": 5, "conciseness": True},
        parsed_gt={"result": "42"},
        parsed_llm={"result": "42"},
        exec_time=1.1,
    ),
    _make_result(
        qid="q3",
        question_text="What element has atomic number 8?",
        answering=_answering_claude,
        verified=True,
        response="Oxygen",
        rubric_scores={"clarity": 4, "conciseness": True},
        parsed_gt={"element": "Oxygen"},
        parsed_llm={"element": "Oxygen"},
        exec_time=1.6,
    ),
]

# Wrap the list in the container type that the rest of the notebook explores.
results = VerificationResultSet(results=_mock_results)

## Exploring a VerificationResult

Each call to `run_verification()` returns a `VerificationResultSet` — a collection
of `VerificationResult` objects, one per verified question–model combination
(here, 3 questions × 2 answering models = 6 results). Let's start by
inspecting a single result.

In [None]:
# Use the first entry of the set as the running example for the next cells.
result = results.results[0]

# Metadata is always there; the other sections are reported as present/absent.
print(
    "Sections available:",
    "  metadata:       always present",
    f"  template:       {result.template is not None}",
    f"  rubric:         {result.rubric is not None}",
    f"  deep_judgment:  {result.deep_judgment is not None}",
    sep="\n",
)

### Metadata

The `metadata` section is always present and identifies the question, models,
and execution context.

In [None]:
meta = result.metadata

# What was evaluated.
print(
    "Identification:",
    f"  question_id:  {meta.question_id}",
    f"  template_id:  {meta.template_id}",
    f"  result_id:    {meta.result_id}",
    f"  question:     {meta.question_text}",
    sep="\n",
)

# Which models produced and parsed the answer.
print(
    "\nModels:",
    f"  answering:  {meta.answering_model}",
    f"  parsing:    {meta.parsing_model}",
    sep="\n",
)

# How the run went.
print(
    "\nExecution:",
    f"  completed:  {meta.completed_without_errors}",
    f"  time:       {meta.execution_time:.1f}s",
    sep="\n",
)

### Template Results

The `template` section contains parsed responses and verification outcomes.

In [None]:
# The template section is optional, so guard before reading its fields.
if result.template:
    tmpl = result.template
    print(
        f"Raw LLM response: {tmpl.raw_llm_response!r}",
        f"Parsed LLM:       {tmpl.parsed_llm_response}",
        f"Parsed GT:        {tmpl.parsed_gt_response}",
        f"verify() result:  {tmpl.verify_result}",
        sep="\n",
    )

### Rubric Results

The `rubric` section contains trait scores, split by type for type-safe access.
The `get_all_trait_scores()` convenience method merges them into a flat dictionary.

In [None]:
# The rubric section is optional, so guard before reading its fields.
if result.rubric:
    rub = result.rubric
    print(
        "Trait scores by type:",
        f"  LLM traits:      {rub.llm_trait_scores}",
        f"  Regex traits:    {rub.regex_trait_scores}",
        f"  Callable traits: {rub.callable_trait_scores}",
        f"\nAll scores (flat): {rub.get_all_trait_scores()}",
        sep="\n",
    )

    # Look up a single trait by name; a falsy return means nothing to show.
    if match := rub.get_trait_by_name("clarity"):
        value, trait_type = match
        print(f"\nLookup 'clarity': value={value}, type={trait_type}")

---

## Working with VerificationResultSet

The `VerificationResultSet` provides filtering and grouping before you convert
to DataFrames.

In [None]:
# Derive the three views first: one model only, per model, per question.
filtered = results.filter(answering_models=["langchain:gpt-4o"])
by_model = results.group_by_model()
by_question = results.group_by_question()

print(f"Total results: {len(results.results)}")
print(f"GPT-4o results: {len(filtered.results)}")

# Size of each per-model group
for model, group in by_model.items():
    print(f"  {model}: {len(group.results)} results")

# Size of each per-question group
for qid, group in by_question.items():
    print(f"  {qid}: {len(group.results)} results")

### Result Set Summary

`get_summary()` provides comprehensive statistics in one call.

In [None]:
# get_summary() returns a mapping; the keys read below are the ones reported.
summary = results.get_summary()
print(
    f"Total results:    {summary['num_results']}",
    f"Completed:        {summary['num_completed']}",
    f"With templates:   {summary['num_with_template']}",
    f"With rubrics:     {summary['num_with_rubric']}",
    f"Unique questions: {summary['num_questions']}",
    f"Models used:      {summary['num_models']}",
    sep="\n",
)

---

## Template DataFrames

`TemplateResults` creates one row per parsed field, enabling field-level
comparison between ground truth and LLM-extracted values.

In [None]:
# Build the template view once; later cells reuse `template_results`.
template_results = results.get_template_results()
df = template_results.to_dataframe()

print(f"Shape: {df.shape}", "\nField comparison:", sep="\n")
# Show only the columns needed to compare ground truth with the LLM value.
print(
    df.loc[
        :,
        ["question_id", "answering_model", "field_name",
         "gt_value", "llm_value", "field_match"],
    ].to_string(index=False)
)

### Pass Rate by Model and Question

In [None]:
# Built-in aggregation helpers
# aggregate_pass_rate(by=...) is iterated with .items() as (group key, rate)
# pairs; each rate is formatted as a whole-number percentage.
print("Pass rate by model:")
for model, rate in template_results.aggregate_pass_rate(by="answering_model").items():
    print(f"  {model}: {rate:.0%}")

# Same helper, grouped by question instead of by answering model.
print("\nPass rate by question:")
for qid, rate in template_results.aggregate_pass_rate(by="question_id").items():
    print(f"  {qid}: {rate:.0%}")

### Filtering Template Results

In [None]:
# Two filtered views (failures only, one model only) plus overall statistics.
failed = template_results.filter(failed_only=True)
gpt_results = template_results.filter(answering_models=["langchain:gpt-4o"])
tmpl_summary = template_results.get_template_summary()

print(
    f"Failed results: {len(failed)}",
    f"GPT-4o results: {len(gpt_results)}",
    f"\nSummary: {tmpl_summary['num_passed']} passed, {tmpl_summary['num_failed']} failed",
    f"Pass rate: {tmpl_summary['pass_rate']:.0%}",
    sep="\n",
)

---

## Rubric DataFrames

`RubricResults` creates one row per trait evaluated. Filter by trait type using
the `trait_type` parameter.

In [None]:
# Build the rubric view once; later cells reuse `rubric_results`.
rubric_results = results.get_rubrics_results()
df_rubric = rubric_results.to_dataframe()

# Show the identifying columns next to each trait's score and type.
print(
    "All rubric traits:",
    df_rubric.loc[
        :,
        ["question_id", "answering_model", "trait_name",
         "trait_score", "trait_type"],
    ].to_string(index=False),
    sep="\n",
)

In [None]:
# Restrict the frame to LLM score traits (numeric 1-5 scale).
df_scores = rubric_results.to_dataframe(trait_type="llm_score")
print(
    f"LLM score traits: {len(df_scores)} rows",
    df_scores.loc[
        :,
        ["question_id", "answering_model", "trait_name", "trait_score"],
    ].to_string(index=False),
    sep="\n",
)

### Aggregating Trait Scores

In [None]:
# Average LLM trait scores by model
# The result is consumed below as a two-level mapping:
# answering model -> {trait name -> aggregated score}.
avg_by_model = rubric_results.aggregate_llm_traits(
    strategy="mean", by="answering_model"
)
print("Average LLM trait scores by model:")
for model, traits in avg_by_model.items():
    print(f"  {model}:")
    # One line per trait, with the score rounded to one decimal place.
    for trait, score in traits.items():
        print(f"    {trait}: {score:.1f}")

In [None]:
# get_trait_summary() returns a mapping; report the rubric counts and the
# per-type trait entries it contains.
trait_summary = rubric_results.get_trait_summary()
print(
    f"Results with rubric data: {trait_summary['num_results']}",
    f"LLM traits:     {trait_summary['llm_traits']}",
    f"Regex traits:   {trait_summary['regex_traits']}",
    f"Callable traits: {trait_summary['callable_traits']}",
    sep="\n",
)

---

## Deep Judgment DataFrames

`JudgmentResults` handles deep judgment data, creating one row per
(attribute × excerpt) pair. This is the most granular DataFrame.

In [None]:
# Build the deep judgment view and count the results that carry judgment data.
judgment_results = results.get_judgment_results()
print(f"Results with deep judgment: {len(judgment_results.get_results_with_judgment())}")
# Deep judgment was not enabled in our mock data (the setup cell never
# attaches a deep judgment section), so this is empty.
# When enabled, the DataFrame provides columns for excerpt text,
# confidence scores, similarity scores, and hallucination risk.

---

## Common Analysis Patterns

### Model Comparison with pandas

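Compare models side by side. The example below computes template pass rates per
model; rubric scores can be compared the same way by grouping the rubric
DataFrame by `answering_model`.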

In [None]:

# Template pass rates by model
template_df = results.get_template_results().to_dataframe()

# The template frame has one row per parsed field, so collapse it to one row
# per result before averaging; otherwise multi-field results count twice.
model_pass = (
    template_df
    .drop_duplicates(subset=["result_index"])
    .groupby("answering_model")["verify_result"]
    .agg("mean")
)
print("Template pass rate by model:", model_pass.to_string(), sep="\n")

### Question Difficulty

Identify which questions are hardest by looking at pass rates across all models.

In [None]:
# One row per result, then per-question pass rate and run count. Sorting is
# ascending, so the hardest questions come first.
question_pass = (
    template_df
    .drop_duplicates(subset=["result_index"])
    .groupby("question_id")["verify_result"]
    .agg(pass_rate="mean", num_runs="count")
    .sort_values(by="pass_rate")
)
print("Question difficulty (sorted by pass rate):", question_pass.to_string(), sep="\n")

---

## Exporting Results

### DataFrame Export to CSV

In [None]:
import os
import tempfile

# Export into a throwaway directory instead of an open NamedTemporaryFile.
# Nothing holds the CSV open while pandas writes it or while it is measured,
# and the directory is removed on exit even if the export raises. Unlinking a
# file whose handle is still open fails on Windows.
with tempfile.TemporaryDirectory() as tmp_dir:
    csv_path = os.path.join(tmp_dir, "template_results.csv")
    template_df.to_csv(csv_path, index=False)
    size = os.path.getsize(csv_path)
    print(f"Exported {len(template_df)} rows to CSV ({size} bytes)")

### Exporting Rubric DataFrames

In [None]:
rubric_df = rubric_results.to_dataframe()

# Export into a throwaway directory instead of an open NamedTemporaryFile.
# Nothing holds the CSV open while pandas writes it or while it is measured,
# and the directory is removed on exit even if the export raises. Unlinking a
# file whose handle is still open fails on Windows.
with tempfile.TemporaryDirectory() as tmp_dir:
    csv_path = os.path.join(tmp_dir, "rubric_results.csv")
    rubric_df.to_csv(csv_path, index=False)
    size = os.path.getsize(csv_path)
    print(f"Exported {len(rubric_df)} rubric rows to CSV ({size} bytes)")

---

## Next Steps

- [VerificationResult Structure](../07-analyzing-results/verification-result.md) — Complete field reference
- [DataFrame Analysis](../07-analyzing-results/dataframe-analysis.md) — Full DataFrame API details
- [Exporting Results](../07-analyzing-results/exporting.md) — Benchmark-level export (JSON and CSV)
- [Iterating on Benchmarks](../07-analyzing-results/iterating.md) — Improve templates based on results