# Benchmark Results Analysis

This notebook loads and analyzes benchmark results from CSV files generated by `run_benchmark_suite.py`.


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import json

# Configure how pandas renders DataFrames in this notebook.
# A value of None removes the column-count limit and leaves the total output
# width unset; cell text is capped at 50 characters.
pd.options.display.max_columns = None
pd.options.display.width = None
pd.options.display.max_colwidth = 50


In [None]:
# Load the benchmark results CSV into `df`.
# The path is relative, so it is resolved against the current working directory.
csv_path = Path("../data/simulations/benchmark_results.csv")

if not csv_path.exists():
    # NOTE: `df` is left undefined on this branch, so the cells below will
    # fail until the CSV has been generated.
    print(f"CSV file not found: {csv_path}")
    print("Please run: python scripts/run_benchmark_suite.py")
else:
    df = pd.read_csv(csv_path)
    print(f"✅ Loaded {len(df)} rows from {csv_path}")
    print(f"Columns: {len(df.columns)}")


In [None]:
# Display summary statistics
print("=" * 80)
print("SUMMARY STATISTICS")
print("=" * 80)
print(f"\nTotal unique configurations: {len(df)}")

# nunique() ignores missing values, so missing values are dropped before the
# distinct names are listed as well (keeping the count and the list in
# agreement). map(str, ...) keeps str.join from raising TypeError when a
# column holds non-string values.
print(f"Domains: {df['domain'].nunique()} ({', '.join(map(str, df['domain'].dropna().unique()))})")
print(f"Tasks: {df['task'].nunique()}")
print(f"User models: {df['user_model'].nunique()} ({', '.join(map(str, df['user_model'].dropna().unique()))})")
print(f"Agent models: {df['agent_model'].nunique()} ({', '.join(map(str, df['agent_model'].dropna().unique()))})")

# The metric columns are optional: report only the ones present in the CSV.
if 'avg_reward' in df.columns:
    print(f"\nOverall average reward: {df['avg_reward'].mean():.4f}")
if 'pass^1' in df.columns:
    print(f"Overall pass^1: {df['pass^1'].mean():.4f}")
# Skip the cost line when the column exists but holds no values at all.
if 'avg_agent_cost' in df.columns and df['avg_agent_cost'].notna().any():
    print(f"Overall average agent cost: ${df['avg_agent_cost'].mean():.6f}")


In [None]:
# Print the "METRICS TABLE" banner (blank line, rule, title, rule), then leave
# `df` as the cell's final expression so the notebook renders the whole table.
print("\n" + "=" * 80, "METRICS TABLE", "=" * 80, sep="\n")
df


In [None]:
# Restrict the results to a single domain
domain = "mail_rag_phishing"  # Edit this value to inspect a different domain
# Keep only the rows whose 'domain' column equals the selected value.
domain_df = df.loc[df['domain'] == domain]
print(f"Results for domain '{domain}': {domain_df.shape[0]} rows")
domain_df


In [None]:
# Group by user_model_params to see temperature effects
def extract_temperature(params_str):
    """Return the 'temperature' entry of a JSON-encoded params string.

    Returns None when the value cannot be decoded as JSON, when the decoded
    value is not a JSON object, or when the object has no 'temperature' key.
    """
    try:
        params = json.loads(params_str)
    except (TypeError, ValueError):
        # TypeError: the cell is not str/bytes (e.g. a NaN float for a missing
        # value). ValueError: malformed JSON (json.JSONDecodeError is a
        # ValueError subclass).
        return None
    if not isinstance(params, dict):
        # Valid JSON that is not an object (list, number, ...) has no keys.
        return None
    return params.get('temperature')


if 'user_model_params' in df.columns:
    df['temperature'] = df['user_model_params'].apply(extract_temperature)

    # Aggregate only the metric columns that are present; asking agg() for a
    # column that does not exist raises KeyError.
    temperature_metrics = {
        'avg_reward': 'mean',
        'pass^1': 'mean',
        'avg_agent_cost': 'mean',
        'num_trials': 'sum',
    }
    temperature_aggs = {
        column: how
        for column, how in temperature_metrics.items()
        if column in df.columns
    }

    # Group by temperature and compute averages
    if temperature_aggs:
        temp_summary = df.groupby('temperature').agg(temperature_aggs).round(4)
        print("Summary by Temperature:")
        print(temp_summary)


In [None]:
# Compare different agent models
if 'agent_model' in df.columns:
    # Aggregate only the metric columns that are present; asking agg() for a
    # column that does not exist raises KeyError.
    model_metrics = {
        'avg_reward': 'mean',
        'pass^1': 'mean',
        'avg_agent_cost': 'mean',
        'num_trials': 'sum',
    }
    model_aggs = {
        column: how
        for column, how in model_metrics.items()
        if column in df.columns
    }

    if model_aggs:
        # One row per agent model, values rounded to 4 decimals.
        model_comparison = df.groupby('agent_model').agg(model_aggs).round(4)
        print("Model Comparison:")
        print(model_comparison)


In [None]:
# Task performance breakdown
# Aggregate only the metric columns that are present; asking agg() for a
# column that does not exist raises KeyError.
task_metrics = {
    'avg_reward': 'mean',
    'pass^1': 'mean',
    'num_trials': 'sum',
}
task_aggs = {
    column: how
    for column, how in task_metrics.items()
    if column in df.columns
}

# One row per (domain, task) pair, values rounded to 4 decimals.
task_perf = df.groupby(['domain', 'task']).agg(task_aggs).round(4)

if 'avg_reward' in task_perf.columns:
    # Best-performing tasks first.
    task_perf = task_perf.sort_values('avg_reward', ascending=False)
    print("Task Performance (sorted by avg_reward):")
else:
    # Without avg_reward there is nothing to sort on; keep the group order.
    print("Task Performance:")
task_perf
