# Benchmark Data Analysis

Quick examples for listing benchmark experiments, loading their runs into pandas DataFrames, comparing runs, and plotting and exporting the results.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from bedrock_benchmark.core import BenchmarkCore
from bedrock_benchmark.storage import StorageManager

# Initialize
# Construct the storage manager with the relative path './experiments' and
# pass it to BenchmarkCore; the cells below call `benchmark_core` to list
# experiments/runs and to load runs into DataFrames.
# NOTE(review): the path is relative, so it presumably resolves against the
# notebook's working directory — confirm before running from elsewhere.
storage_manager = StorageManager('./experiments')
benchmark_core = BenchmarkCore(storage_manager)

## List Experiments and Runs

In [None]:
# List experiments
experiments = benchmark_core.list_experiments()
for exp in experiments:
    print(f"{exp.id}: {exp.name} ({len(exp.runs)} runs)")

# Always define `runs`, even when there are no experiments: the cells below
# test `if runs:` / `len(runs)` and would otherwise fail with a NameError.
runs = []

# Get runs from first experiment
if experiments:
    experiment_id = experiments[0].id
    runs = benchmark_core.list_runs(experiment_id)
    print(f"\nRuns in {experiment_id}:")
    for run_id in runs:
        # One summary lookup per run; only the model id and response count are shown.
        summary = benchmark_core.get_run_summary(run_id)
        print(f"  {run_id}: {summary['model_id']} - {summary['total_responses']} responses")
else:
    print("No experiments found")

## Load Single Run

In [None]:
# Load first run into DataFrame
# `runs` is produced by the listing cell. Check that it exists (the same
# `in locals()` pattern the visualization cells use for `df`) so that running
# this cell first, or with no experiments available, skips the load instead
# of raising NameError.
if 'runs' in locals() and runs:
    run_id = runs[0]
    df = benchmark_core.export_run_to_dataframe(run_id)

    print(f"Loaded run {run_id}")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    # Show first few rows
    display(df.head())

    # Basic stats: mean latency and combined input + output token count
    print(f"\nAverage latency: {df['latency_ms'].mean():.2f}ms")
    print(f"Total tokens: {df['input_tokens'].sum() + df['output_tokens'].sum():,}")

## Load Multiple Runs for Comparison

In [None]:
# Load multiple runs (if available)
# Fall back to an empty list when `runs` was never defined (listing cell not
# run yet, or it found no experiments) so this cell reports the shortfall
# instead of raising NameError.
available_runs = runs if 'runs' in locals() else []

if len(available_runs) >= 2:
    comparison_runs = available_runs[:2]  # First 2 runs
    df_multi = benchmark_core.export_multiple_runs_to_dataframe(comparison_runs)

    print(f"Loaded {len(comparison_runs)} runs for comparison")
    print(f"Shape: {df_multi.shape}")

    # Show run distribution (number of rows contributed by each run)
    print("\nRun distribution:")
    print(df_multi['run_id'].value_counts())

    # Compare stats by run: latency mean/std, summed token counts, and the
    # number of non-null responses, all rounded to two decimals.
    comparison_stats = df_multi.groupby('run_id').agg({
        'latency_ms': ['mean', 'std'],
        'input_tokens': 'sum',
        'output_tokens': 'sum',
        'actual_response': 'count'
    }).round(2)

    print("\nComparison by run:")
    display(comparison_stats)
else:
    print(f"Only {len(available_runs)} run available - need 2+ for comparison")

## Quick Visualizations

In [None]:
# Single run visualization
# Only plot when the single-run DataFrame has been loaded and contains rows.
single_run_loaded = 'df' in locals() and not df.empty

if single_run_loaded:
    # One row, two panels: latency histogram (left), token scatter (right).
    fig, panels = plt.subplots(1, 2, figsize=(12, 4))
    latency_ax, tokens_ax = panels

    # Latency histogram
    latency_ax.hist(df['latency_ms'], bins=20, alpha=0.7)
    latency_ax.set(title='Latency Distribution', xlabel='Latency (ms)')

    # Token scatter
    tokens_ax.scatter(df['input_tokens'], df['output_tokens'], alpha=0.6)
    tokens_ax.set(
        title='Input vs Output Tokens',
        xlabel='Input Tokens',
        ylabel='Output Tokens',
    )

    plt.tight_layout()
    plt.show()

In [None]:
# Multi-run comparison visualization
if 'df_multi' in locals() and not df_multi.empty:
    # Latency comparison boxplot.
    # Create the axes up front and hand them to pandas: with `by=` and no
    # `ax`, DataFrame.boxplot opens its own figure, which would leave a
    # separately created plt.figure(figsize=...) blank and its size unused.
    fig, ax = plt.subplots(figsize=(8, 5))
    df_multi.boxplot(column='latency_ms', by='run_id', ax=ax)
    ax.set_title('Latency Comparison by Run')
    fig.suptitle('')  # Remove the automatic "Boxplot grouped by ..." title
    plt.show()

## Export Data

In [None]:
# Save DataFrames to CSV
# Each frame is written only if the cell that creates it has been run;
# output files land in the current working directory, without the index column.
session_vars = locals()

if 'df' in session_vars:
    df.to_csv('single_run_analysis.csv', index=False)
    print("Saved single run data to: single_run_analysis.csv")

if 'df_multi' in session_vars:
    df_multi.to_csv('multi_run_comparison.csv', index=False)
    print("Saved comparison data to: multi_run_comparison.csv")