In [37]:
# Mock Setup - Hidden in rendered documentation
# This cell is tagged with "hide-cell" in notebook metadata

import tempfile
import sys
import os
import hashlib
import json
from pathlib import Path
from unittest.mock import Mock, MagicMock, patch, PropertyMock
from typing import Any, Dict, List
from datetime import datetime

# Add karenina to path
sys.path.insert(0, "/Users/carli/Projects/karenina-monorepo/karenina/src")

# Temporary directory for file operations
TEMP_DIR = Path(tempfile.mkdtemp(prefix="karenina_docs_"))

# Import after path is set
from karenina.schemas.workflow.verification.result import VerificationResult
from karenina.schemas.workflow.verification.result_components import (
    VerificationResultMetadata,
    VerificationResultTemplate,
    VerificationResultRubric,
    VerificationResultDeepJudgment,
    VerificationResultDeepJudgmentRubric,
)
from karenina.schemas.workflow.verification_result_set import VerificationResultSet
from karenina.schemas.workflow.template_results import TemplateResults
from karenina.schemas.workflow.rubric_results import RubricResults
from karenina.schemas.workflow.judgment_results import JudgmentResults

# Import pandas for DataFrame operations
import pandas as pd

def create_mock_results():
    """Create realistic mock verification results for DataFrame analysis examples."""
    results = []
    
    # Mock data for different questions
    mock_scenarios = [
        {
            "question_id": "q_chromosomes",
            "question_text": "How many chromosomes do humans have?",
            "raw_response": "Humans have 46 chromosomes in total, arranged in 23 pairs.",
            "parsed_gt": {"count": 46, "unit": "chromosomes"},
            "parsed_llm": {"count": 46, "unit": "chromosomes"},
            "verify_result": True,
            "llm_traits": {"Accuracy": 5, "Clarity": 4, "Completeness": 4},
            "embedding_score": 0.95,
        },
        {
            "question_id": "q_drug_target",
            "question_text": "What is the target of venetoclax?",
            "raw_response": "Venetoclax targets the BCL2 protein, which regulates apoptosis.",
            "parsed_gt": {"target": "BCL2", "class": "BCL-2 inhibitor"},
            "parsed_llm": {"target": "BCL2", "class": "BCL-2 inhibitor"},
            "verify_result": True,
            "llm_traits": {"Accuracy": 5, "Clarity": 5, "Completeness": 5},
            "embedding_score": 0.92,
        },
        {
            "question_id": "q_hemoglobin",
            "question_text": "How many subunits does hemoglobin have?",
            "raw_response": "Hemoglobin has 4 subunits consisting of 2 alpha and 2 beta chains.",
            "parsed_gt": {"subunits": 4, "composition": "2 alpha, 2 beta"},
            "parsed_llm": {"subunits": 4, "composition": "2 alpha, 2 beta"},
            "verify_result": True,
            "llm_traits": {"Accuracy": 5, "Clarity": 4, "Completeness": 5},
            "embedding_score": 0.88,
        },
        {
            "question_id": "q_inflammatory_lung",
            "question_text": "Name three inflammatory lung diseases.",
            "raw_response": "Three inflammatory lung diseases are asthma, bronchitis, and pneumonia.",
            "parsed_gt": {"diseases": ["asthma", "bronchitis", "pneumonia"]},
            "parsed_llm": {"diseases": ["asthma", "bronchitis", "pneumonia"]},
            "verify_result": True,
            "llm_traits": {"Accuracy": 4, "Clarity": 5, "Completeness": 4},
            "embedding_score": 0.85,
        },
        {
            "question_id": "q_partial_fail",
            "question_text": "What is the molecular weight of insulin?",
            "raw_response": "Insulin has a molecular weight of approximately 5800 Daltons.",
            "parsed_gt": {"weight": 5808, "unit": "Daltons"},
            "parsed_llm": {"weight": 5800, "unit": "Daltons"},
            "verify_result": False,  # Slight mismatch
            "llm_traits": {"Accuracy": 3, "Clarity": 5, "Completeness": 4},
            "embedding_score": 0.90,
        },
    ]
    
    # Create second model results for comparison
    for model in ["gpt-4o-mini", "claude-3-5-sonnet"]:
        for scenario in mock_scenarios:
            # Vary results slightly by model
            is_gpt = model == "gpt-4o-mini"
            
            timestamp = datetime.now().isoformat()
            template_id = hashlib.md5(f"{scenario['question_id']}_{model}".encode()).hexdigest()[:32]
            
            # Compute result_id
            result_data = {
                "answering_mcp_servers": [],
                "answering_model": model,
                "parsing_model": model,
                "question_id": scenario['question_id'],
                "replicate": None,
                "timestamp": timestamp,
            }
            json_str = json.dumps(result_data, sort_keys=True, ensure_ascii=True)
            result_id = hashlib.sha256(json_str.encode("utf-8")).hexdigest()[:16]
            
            # Create template result
            template = VerificationResultTemplate(
                raw_llm_response=scenario['raw_response'],
                parsed_llm_response=scenario['parsed_llm'],
                parsed_gt_response=scenario['parsed_gt'],
                verify_result=scenario['verify_result'] if is_gpt else True,
                template_verification_performed=True,
                usage_metadata={
                    "answer_generation": {"total_tokens": 50, "input_tokens": 30, "output_tokens": 20},
                    "parsing": {"total_tokens": 30, "input_tokens": 15, "output_tokens": 15},
                    "total": {"total_tokens": 80, "input_tokens": 45, "output_tokens": 35}
                },
                abstention_check_performed=True,
                abstention_detected=False,
                embedding_check_performed=True,
                embedding_similarity_score=scenario['embedding_score'] - (0.05 if not is_gpt else 0),
                embedding_model_used="text-embedding-3-small",
                regex_validations_performed=False,
                recursion_limit_reached=False,
                answering_mcp_servers=[],
            )
            
            # Create rubric result
            rubric = VerificationResultRubric(
                rubric_evaluation_performed=True,
                llm_trait_scores=scenario['llm_traits'],
                regex_trait_scores={},
                callable_trait_scores={},
                metric_trait_scores={},
                metric_trait_confusion_lists={},
            )
            
            # Create metadata
            metadata = VerificationResultMetadata(
                question_id=scenario['question_id'],
                template_id=template_id,
                completed_without_errors=True,
                question_text=scenario['question_text'],
                raw_answer=scenario['raw_response'],
                answering_model=model,
                parsing_model=model,
                execution_time=1.5 if is_gpt else 2.0,
                timestamp=timestamp,
                result_id=result_id,
                keywords=[],
                replicate=None,
                answering_system_prompt="You are a helpful assistant.",
                parsing_system_prompt="Extract structured information.",
                error=None,
                run_name="demo_run",
            )
            
            results.append(VerificationResult(
                metadata=metadata,
                template=template,
                rubric=rubric,
                deep_judgment=None,
                deep_judgment_rubric=None,
            ))
    
    return results

# Create mock result set
mock_results = create_mock_results()
result_set = VerificationResultSet(results=mock_results)

# Extract result wrappers
template_results = result_set.get_template_results()
rubric_results = result_set.get_rubrics_results()
judgment_results = result_set.get_judgment_results()

# Cleanup
import atexit
import shutil

def _cleanup():
    shutil.rmtree(TEMP_DIR, ignore_errors=True)

atexit.register(_cleanup)

print("✓ Mock setup complete")
print(f"✓ Loaded {len(mock_results)} mock verification results")
print(f"✓ Results from {len(result_set.get_model_names())} models: {result_set.get_model_names()}")
print(f"✓ {len(result_set.get_question_ids())} questions available for analysis")

✓ Mock setup complete
✓ Loaded 10 mock verification results
✓ Results from 2 models: ['claude-3-5-sonnet', 'gpt-4o-mini']
✓ 5 questions available for analysis


# Analyzing Verification Results with DataFrames

This guide covers how to analyze verification results using the DataFrame-first approach, providing flexible and powerful data analysis with pandas.

## Overview

### What This Guide Covers

This guide focuses on **analyzing verification results** after you've run verification. The DataFrame-first approach provides a modern, flexible way to wrangle and analyze verification output using pandas DataFrames.

**Typical Workflow**:

1. Run verification (see verification.md for details)
2. Extract results using `result_set.get_templates()`, `get_rubrics()`, or `get_judgments()`
3. Convert to DataFrame for analysis
4. Analyze with pandas

## Quick Start

### Basic Workflow

After running verification, you can analyze results with DataFrames:

In [38]:
# STEP 1: Extract result type wrappers from verification output
template_results = result_set.get_template_results()
rubric_results = result_set.get_rubrics_results()
judgment_results = result_set.get_judgment_results()

print(f"Template results: {len(template_results)}")
print(f"Rubric results: {len(rubric_results)}")
print(f"Judgment results: {len(judgment_results)}")

Template results: 10
Rubric results: 10
Judgment results: 10


In [39]:
# STEP 2: Convert verification results to DataFrames
template_df = template_results.to_dataframe()
rubric_df = rubric_results.to_dataframe()
judgment_df = judgment_results.to_dataframe()

print(f"Template DataFrame shape: {template_df.shape}")
print(f"Rubric DataFrame shape: {rubric_df.shape}")
print(f"Judgment DataFrame shape: {judgment_df.shape}")

Template DataFrame shape: (18, 34)
Rubric DataFrame shape: (30, 17)
Judgment DataFrame shape: (10, 40)


In [40]:
# STEP 3: Analyze with pandas
pass_rate = template_df.groupby('question_id')['field_match'].mean()
print("Pass rate by question:")
print(pass_rate)

Pass rate by question:
question_id
q_chromosomes          1.0
q_drug_target          1.0
q_hemoglobin           1.0
q_inflammatory_lung    1.0
q_partial_fail         0.5
Name: field_match, dtype: float64


### Basic Example: Template Verification Analysis

In [41]:
# Get template results DataFrame
df = template_results.to_dataframe()

# View available columns
print("Available columns:")
print(df.columns.tolist())

Available columns:
['completed_without_errors', 'error', 'recursion_limit_reached', 'question_id', 'template_id', 'question_text', 'keywords', 'replicate', 'answering_mcp_servers', 'answering_model', 'parsing_model', 'answering_system_prompt', 'parsing_system_prompt', 'raw_llm_response', 'field_name', 'gt_value', 'llm_value', 'field_match', 'field_type', 'verify_result', 'embedding_check_performed', 'embedding_similarity_score', 'embedding_model_used', 'embedding_override_applied', 'abstention_check_performed', 'abstention_detected', 'abstention_reasoning', 'abstention_override_applied', 'regex_validations_performed', 'regex_overall_success', 'execution_time', 'timestamp', 'run_name', 'result_index']


In [42]:
# Filter to successful verifications only
successful = df[df['completed_without_errors'] == True]
print(f"Successful verifications: {len(successful)} out of {len(df)}")

Successful verifications: 18 out of 18


In [43]:
# Calculate pass rate by question
pass_rates = successful.groupby('question_id')['field_match'].mean()
print("\nPass rates by question:")
print(pass_rates)


Pass rates by question:
question_id
q_chromosomes          1.0
q_drug_target          1.0
q_hemoglobin           1.0
q_inflammatory_lung    1.0
q_partial_fail         0.5
Name: field_match, dtype: float64


In [44]:
# Find questions with low pass rates
low_performers = pass_rates[pass_rates < 1.0]
print(f"\nQuestions with <100% pass rate: {len(low_performers)}")
print(low_performers)


Questions with <100% pass rate: 1
question_id
q_partial_fail    0.5
Name: field_match, dtype: float64


In [45]:
# Analyze by field
field_performance = successful.groupby('field_name').agg({
    'field_match': ['mean', 'count']
})
print("\nField performance:")
print(field_performance)


Field performance:
            field_match      
                   mean count
field_name                   
class               1.0     2
composition         1.0     2
count               1.0     2
diseases            1.0     2
subunits            1.0     2
target              1.0     2
unit                1.0     4
weight              0.0     2


## DataFrame Methods Reference

### TemplateResults

#### `to_dataframe()`

Convert template verification results to pandas DataFrame with field-level explosion.

**Key Columns**:
- **Status**: `completed_without_errors`, `error`, `recursion_limit_reached`
- **Field Comparison**: `field_name`, `gt_value`, `llm_value`, `field_match`, `field_type`
- **Verification Checks**: `embedding_check_performed`, `embedding_similarity_score`, `abstention_detected`, `regex_validations_performed`
- **Identification**: `question_id`, `template_id`, `answering_model`, `parsing_model`

In [46]:
df = template_results.to_dataframe()

# Each field gets its own row
# For example, a question with 3 fields → 3 rows
print(f"Total rows (exploded by field): {len(df)}")
print(f"Unique questions: {df['question_id'].nunique()}")

Total rows (exploded by field): 18
Unique questions: 5


In [47]:
# Analyze specific fields
weight_fields = df[df['field_name'] == 'weight']
if len(weight_fields) > 0:
    match_rate = weight_fields['field_match'].mean()
    print(f"Weight field match rate: {match_rate:.2%}")

Weight field match rate: 0.00%


#### `to_regex_dataframe()`

Convert regex validation results to DataFrame with pattern explosion.

In [48]:
# For regex validation results
regex_df = template_results.to_regex_dataframe()
print(f"Regex DataFrame rows: {len(regex_df)}")

if len(regex_df) > 0:
    # Analyze pattern success rates
    pattern_stats = regex_df.groupby('pattern_name').agg({
        'matched': 'mean'
    })
    print("\nPattern success rates:")
    print(pattern_stats)
else:
    print("No regex validations performed in this example.")

Regex DataFrame rows: 0
No regex validations performed in this example.


#### `to_usage_dataframe(totals_only=False)`

Convert token usage data to DataFrame.

In [49]:
# Detailed usage by stage
usage_df = template_results.to_usage_dataframe(totals_only=False)
stage_costs = usage_df.groupby('usage_stage')['total_tokens'].sum()
print("Token usage by stage:")
print(stage_costs)

Token usage by stage:
usage_stage
answer_generation    500
parsing              300
Name: total_tokens, dtype: int64


In [50]:
# Total usage only
totals_df = template_results.to_usage_dataframe(totals_only=True)
total_cost = totals_df['total_tokens'].sum()
print(f"\nTotal tokens used: {total_cost}")


Total tokens used: 800


### RubricResults

#### `to_dataframe(trait_type="all")`

Convert rubric evaluation results to DataFrame with trait explosion.

**Parameters**:
- `trait_type`: Filter trait type - "llm_score", "llm_binary", "llm", "regex", "callable", "metric", or "all"

**Key Columns**:
- `trait_name`: Name of the rubric trait
- `trait_type`: Type (llm_score, llm_binary, regex, callable, metric)
- `trait_score`: Score value

In [51]:
# Get all LLM-scored traits
llm_df = rubric_results.to_dataframe(trait_type="llm_score")
print(f"LLM trait DataFrame shape: {llm_df.shape}")
print("\nSample LLM trait scores:")
print(llm_df[['question_id', 'trait_name', 'trait_score']].head(10))

LLM trait DataFrame shape: (30, 17)

Sample LLM trait scores:
           question_id    trait_name  trait_score
0        q_chromosomes      Accuracy            5
1        q_chromosomes       Clarity            4
2        q_chromosomes  Completeness            4
3        q_drug_target      Accuracy            5
4        q_drug_target       Clarity            5
5        q_drug_target  Completeness            5
6         q_hemoglobin      Accuracy            5
7         q_hemoglobin       Clarity            4
8         q_hemoglobin  Completeness            5
9  q_inflammatory_lung      Accuracy            4


In [52]:
# Analyze trait performance
trait_scores = llm_df.groupby('trait_name')['trait_score'].agg(['mean', 'std', 'count'])
print("\nTrait score summary:")
print(trait_scores)


Trait score summary:
              mean       std  count
trait_name                         
Accuracy       4.4  0.843274     10
Clarity        4.6  0.516398     10
Completeness   4.4  0.516398     10


In [53]:
# Find low-scoring traits
low_scores = trait_scores[trait_scores['mean'] < 4.0]
print("\nTraits with average score < 4.0:")
print(low_scores)


Traits with average score < 4.0:
Empty DataFrame
Columns: [mean, std, count]
Index: []


In [54]:
# Compare models
model_comparison = llm_df.pivot_table(
    values='trait_score',
    index='question_id',
    columns='answering_model',
    aggfunc='mean'
)
print("\nModel comparison by question:")
print(model_comparison)


Model comparison by question:
answering_model      claude-3-5-sonnet  gpt-4o-mini
question_id                                        
q_chromosomes                 4.333333     4.333333
q_drug_target                 5.000000     5.000000
q_hemoglobin                  4.666667     4.666667
q_inflammatory_lung           4.333333     4.333333
q_partial_fail                4.000000     4.000000


## Common Patterns

### Pattern 1: Calculate Pass Rates

**Template verification pass rate by question:**

In [55]:
df = template_results.to_dataframe()

# Filter to successful verifications
successful = df[df['completed_without_errors'] == True]

# Calculate pass rate by question
pass_rates = successful.groupby('question_id')['field_match'].mean()
print("Pass rates by question:")
print(pass_rates)

# Get questions below threshold
failing = pass_rates[pass_rates < 1.0]
print(f"\nQuestions with <100% pass rate: {list(failing.index)}")

Pass rates by question:
question_id
q_chromosomes          1.0
q_drug_target          1.0
q_hemoglobin           1.0
q_inflammatory_lung    1.0
q_partial_fail         0.5
Name: field_match, dtype: float64

Questions with <100% pass rate: ['q_partial_fail']


**Rubric trait scores by model:**

In [56]:
rubric_df = rubric_results.to_dataframe(trait_type="llm_score")

# Average score by model
model_scores = rubric_df.groupby('answering_model')['trait_score'].mean()
print("Average trait scores by model:")
print(model_scores)

Average trait scores by model:
answering_model
claude-3-5-sonnet    4.466667
gpt-4o-mini          4.466667
Name: trait_score, dtype: float64


In [57]:
# Detailed breakdown by trait and model
model_trait_scores = rubric_df.pivot_table(
    values='trait_score',
    index='answering_model',
    columns='trait_name',
    aggfunc='mean'
)
print("\nModel scores by trait:")
print(model_trait_scores)


Model scores by trait:
trait_name         Accuracy  Clarity  Completeness
answering_model                                   
claude-3-5-sonnet       4.4      4.6           4.4
gpt-4o-mini             4.4      4.6           4.4


### Pattern 2: Multi-Dimensional Analysis

**Template + Rubric combined analysis:**

In [58]:
# Get DataFrames
template_df = template_results.to_dataframe()
rubric_df = rubric_results.to_dataframe(trait_type="llm_score")

# Aggregate to question level
template_agg = template_df.groupby('question_id')['field_match'].mean()
rubric_agg = rubric_df.groupby('question_id')['trait_score'].mean()

# Merge
combined = pd.DataFrame({
    'template_pass_rate': template_agg,
    'rubric_avg_score': rubric_agg
})

print("Combined template and rubric metrics:")
print(combined)

Combined template and rubric metrics:
                     template_pass_rate  rubric_avg_score
question_id                                              
q_chromosomes                       1.0          4.333333
q_drug_target                       1.0          5.000000
q_hemoglobin                        1.0          4.666667
q_inflammatory_lung                 1.0          4.333333
q_partial_fail                      0.5          4.000000


### Pattern 3: Field-Level Analysis

**Identify problematic fields:**

In [59]:
df = template_results.to_dataframe()
successful = df[df['completed_without_errors'] == True]

# Calculate match rate by field
field_performance = successful.groupby('field_name').agg({
    'field_match': ['mean', 'count', 'sum']
})

# Sort by match rate
field_performance = field_performance.sort_values(('field_match', 'mean'))

print("Field performance (worst to best):")
print(field_performance)

Field performance (worst to best):
            field_match          
                   mean count sum
field_name                       
weight              0.0     2   0
class               1.0     2   2
composition         1.0     2   2
count               1.0     2   2
diseases            1.0     2   2
subunits            1.0     2   2
target              1.0     2   2
unit                1.0     4   4


### Pattern 4: Model Comparison

In [60]:
df = template_results.to_dataframe()

# Pivot: questions × models
model_comparison = df.pivot_table(
    values='field_match',
    index='question_id',
    columns='answering_model',
    aggfunc='mean'
)

print("Model comparison by question:")
print(model_comparison)

Model comparison by question:
answering_model      claude-3-5-sonnet  gpt-4o-mini
question_id                                        
q_chromosomes                      1.0          1.0
q_drug_target                      1.0          1.0
q_hemoglobin                       1.0          1.0
q_inflammatory_lung                1.0          1.0
q_partial_fail                     0.5          0.5


In [61]:
# Calculate relative performance
# First compute the spread before adding string columns
model_comparison = model_comparison.copy()
model_comparison['performance_spread'] = model_comparison.max(axis=1) - model_comparison.min(axis=1)
model_comparison['best_model'] = model_comparison.idxmax(axis=1)

print("\nWith best model and spread:")
print(model_comparison)


With best model and spread:
answering_model      claude-3-5-sonnet  gpt-4o-mini  performance_spread  \
question_id                                                               
q_chromosomes                      1.0          1.0                 0.0   
q_drug_target                      1.0          1.0                 0.0   
q_hemoglobin                       1.0          1.0                 0.0   
q_inflammatory_lung                1.0          1.0                 0.0   
q_partial_fail                     0.5          0.5                 0.0   

answering_model             best_model  
question_id                             
q_chromosomes        claude-3-5-sonnet  
q_drug_target        claude-3-5-sonnet  
q_hemoglobin         claude-3-5-sonnet  
q_inflammatory_lung  claude-3-5-sonnet  
q_partial_fail       claude-3-5-sonnet  


In [62]:
# Find questions with high model variance
high_variance = model_comparison[model_comparison['performance_spread'] > 0]
print(f"\nQuestions with model variance: {len(high_variance)}")
print(high_variance[['best_model', 'performance_spread']])


Questions with model variance: 0
Empty DataFrame
Columns: [best_model, performance_spread]
Index: []


### Pattern 5: Error Analysis

In [63]:
df = template_results.to_dataframe()

# Get failed verifications (where field_match is False)
failed_fields = df[df['field_match'] == False]

if len(failed_fields) > 0:
    print("Failed field comparisons:")
    print(failed_fields[['question_id', 'field_name', 'gt_value', 'llm_value']].to_string())
else:
    print("No failed field comparisons in this dataset.")

Failed field comparisons:
       question_id field_name gt_value llm_value
8   q_partial_fail     weight     5808      5800
17  q_partial_fail     weight     5808      5800


### Pattern 7: Cost Analysis

In [64]:
usage_df = template_results.to_usage_dataframe(totals_only=True)

# Assuming cost per 1K tokens
INPUT_COST_PER_1K = 0.0001
OUTPUT_COST_PER_1K = 0.0003

usage_df['input_cost'] = usage_df['input_tokens'] / 1000 * INPUT_COST_PER_1K
usage_df['output_cost'] = usage_df['output_tokens'] / 1000 * OUTPUT_COST_PER_1K
usage_df['total_cost'] = usage_df['input_cost'] + usage_df['output_cost']

# Cost by model
model_costs = usage_df.groupby('answering_model')['total_cost'].sum()
print("Cost by model:")
for model, cost in model_costs.items():
    print(f"  {model}: ${cost:.4f}")

# Total cost
print(f"\nTotal cost: ${usage_df['total_cost'].sum():.4f}")

Cost by model:
  claude-3-5-sonnet: $0.0001
  gpt-4o-mini: $0.0001

Total cost: $0.0001


## Helper Methods

Helper methods provide convenient aggregations for common operations. They are implemented using the DataFrame API internally.

### TemplateResults Helper Methods

#### `aggregate_pass_rate(by="question_id", strategy="mean")`

In [65]:
# Pass rate by question
question_rates = template_results.aggregate_pass_rate(by="question_id")
print("Pass rates by question:")
for qid, rate in question_rates.items():
    print(f"  {qid}: {rate:.2%}")

Pass rates by question:
  q_chromosomes: 100.00%
  q_drug_target: 100.00%
  q_hemoglobin: 100.00%
  q_inflammatory_lung: 100.00%
  q_partial_fail: 50.00%


In [66]:
# Pass rate by model
model_rates = template_results.aggregate_pass_rate(by="answering_model")
print("\nPass rates by model:")
for model, rate in model_rates.items():
    print(f"  {model}: {rate:.2%}")


Pass rates by model:
  claude-3-5-sonnet: 100.00%
  gpt-4o-mini: 80.00%


#### `aggregate_embedding_scores(by="question_id", strategy="mean")`

In [67]:
# Average embedding score by question
embedding_scores = template_results.aggregate_embedding_scores(by="question_id")
print("Embedding similarity scores by question:")
for qid, score in embedding_scores.items():
    print(f"  {qid}: {score:.3f}")

Embedding similarity scores by question:
  q_chromosomes: 0.925
  q_drug_target: 0.895
  q_hemoglobin: 0.855
  q_inflammatory_lung: 0.825
  q_partial_fail: 0.875


### RubricResults Helper Methods

#### `aggregate_llm_traits(by="question_id", strategy="mean")`

In [68]:
# Get average scores by question
question_traits = rubric_results.aggregate_llm_traits(by="question_id")

print("LLM trait scores by question:")
for qid, traits in question_traits.items():
    print(f"\n{qid}:")
    for trait, score in traits.items():
        print(f"  {trait}: {score:.2f}")

LLM trait scores by question:

q_chromosomes:
  Accuracy: 5.00
  Clarity: 4.00
  Completeness: 4.00

q_drug_target:
  Accuracy: 5.00
  Clarity: 5.00
  Completeness: 5.00

q_hemoglobin:
  Accuracy: 5.00
  Clarity: 4.00
  Completeness: 5.00

q_inflammatory_lung:
  Accuracy: 4.00
  Clarity: 5.00
  Completeness: 4.00

q_partial_fail:
  Accuracy: 3.00
  Clarity: 5.00
  Completeness: 4.00


In [69]:
# Get average scores by model
model_traits = rubric_results.aggregate_llm_traits(by="answering_model")

print("\nLLM trait scores by model:")
for model, traits in model_traits.items():
    print(f"\n{model}:")
    for trait, score in traits.items():
        print(f"  {trait}: {score:.2f}")


LLM trait scores by model:

claude-3-5-sonnet:
  Accuracy: 4.40
  Clarity: 4.60
  Completeness: 4.40

gpt-4o-mini:
  Accuracy: 4.40
  Clarity: 4.60
  Completeness: 4.40


## Performance Tips

### 1. Filter Early

Filter DataFrames before aggregation to reduce computational load:

In [70]:
# Good: Filter first
df = template_results.to_dataframe()
successful = df[df['completed_without_errors'] == True]
pass_rates = successful.groupby('question_id')['field_match'].mean()
print("Pass rates (filtered first):")
print(pass_rates)

Pass rates (filtered first):
question_id
q_chromosomes          1.0
q_drug_target          1.0
q_hemoglobin           1.0
q_inflammatory_lung    1.0
q_partial_fail         0.5
Name: field_match, dtype: float64


### 2. Use Helper Methods for Simple Cases

In [71]:
# Prefer helper for simple aggregation
pass_rates = template_results.aggregate_pass_rate(by="question_id")
print("Pass rates using helper method:")
print(pass_rates)

Pass rates using helper method:
{'q_chromosomes': 1.0, 'q_drug_target': 1.0, 'q_hemoglobin': 1.0, 'q_inflammatory_lung': 1.0, 'q_partial_fail': 0.5}


### 3. Avoid Repeated Conversions

In [72]:
# Good: Convert once
template_df = template_results.to_dataframe()
pass_rates = template_df.groupby('question_id')['field_match'].mean()
field_stats = template_df.groupby('field_name')['field_match'].agg(['mean', 'count'])

print("Field statistics:")
print(field_stats)

Field statistics:
             mean  count
field_name              
class         1.0      2
composition   1.0      2
count         1.0      2
diseases      1.0      2
subunits      1.0      2
target        1.0      2
unit          1.0      4
weight        0.0      2


## Summary

The DataFrame-first approach provides:

- **Familiar API**: Use standard pandas operations
- **Flexibility**: Combine multiple verification aspects in custom ways
- **Performance**: Leverage pandas' optimized operations
- **Helper Methods**: Convenience methods for common operations

For more information:
- See the [DataFrame Quick Reference](dataframe-quick-reference.md)
- Check integration tests for real-world usage examples