# Benchmark Data Analysis

Quick examples for loading and analyzing benchmark runs.

In [7]:
import pandas as pd
import matplotlib.pyplot as plt
from bedrock_benchmark.core import BenchmarkCore
from bedrock_benchmark.storage import StorageManager

# Initialize: build the storage manager for the experiments directory, then hand
# it to BenchmarkCore, which every later cell uses for listing and exporting.
EXPERIMENTS_DIR = './experiments'  # relative path, resolved against the kernel's working directory
storage_manager = StorageManager(EXPERIMENTS_DIR)
benchmark_core = BenchmarkCore(storage_manager)

## List Experiments and Runs

In [8]:
# List every experiment together with its run count
experiments = benchmark_core.list_experiments()
for exp in experiments:
    print(f"{exp.id}: {exp.name} ({len(exp.runs)} runs)")

# Choose the experiment whose runs the rest of the notebook analyzes.
# Index 1 (the second listed experiment) stays the preferred choice, so the
# selection is unchanged whenever two or more experiments exist. With a single
# experiment the index is clamped to 0 instead of raising IndexError.
PREFERRED_EXPERIMENT_INDEX = 1

# Define `runs` up front so the `if runs:` / `len(runs)` checks in later cells
# do not fail with NameError when no experiments are found.
runs = []
if experiments:
    experiment_index = min(PREFERRED_EXPERIMENT_INDEX, len(experiments) - 1)
    experiment_id = experiments[experiment_index].id
    runs = benchmark_core.list_runs(experiment_id)
    print(f"\nRuns in {experiment_id}:")
    for run_id in runs:
        # One summary line per run: model id and number of responses
        summary = benchmark_core.get_run_summary(run_id)
        print(f"  {run_id}: {summary['model_id']} - {summary['total_responses']} responses")

nova-family-quick-load-test: Nova Family Quick Load Test (2 runs)
quick-test-with-ids: Quick Test with IDs (2 runs)

Runs in quick-test-with-ids:
  nova-micro-default_20251024-103200_ac51: us.amazon.nova-micro-v1:0 - 109 responses
  nova-lite-default_20251024-103104_cb9d: us.amazon.nova-lite-v1:0 - 109 responses


## Load Single Run

In [9]:
# Export the first listed run to a DataFrame and print a short summary of it
if runs:
    run_id = runs[0]
    df = benchmark_core.export_run_to_dataframe(run_id)

    print(f"Loaded run {run_id}")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    # Preview the first rows with the rich DataFrame renderer
    display(df.head())

    # Headline numbers: mean latency, then input + output tokens combined
    mean_latency_ms = df['latency_ms'].mean()
    print(f"\nAverage latency: {mean_latency_ms:.2f}ms")
    total_tokens = df['input_tokens'].sum() + df['output_tokens'].sum()
    print(f"Total tokens: {total_tokens:,}")

Loaded run nova-micro-default_20251024-103200_ac51
Shape: (109, 14)
Columns: ['run_id', 'item_id', 'prompt', 'expected_response', 'actual_response', 'model_id', 'system_prompt', 'timestamp', 'latency_ms', 'input_tokens', 'output_tokens', 'finish_reason', 'model_param_temperature', 'model_param_max_tokens']


Unnamed: 0,run_id,item_id,prompt,expected_response,actual_response,model_id,system_prompt,timestamp,latency_ms,input_tokens,output_tokens,finish_reason,model_param_temperature,model_param_max_tokens
0,nova-micro-default_20251024-103200_ac51,item_010,,,2,us.amazon.nova-micro-v1:0,Answer with only the essential information. Pr...,2025-10-24 10:32:00.303887,571,25,2,end_turn,0.1,50
1,nova-micro-default_20251024-103200_ac51,item_003,,,William Shakespeare,us.amazon.nova-micro-v1:0,Answer with only the essential information. Pr...,2025-10-24 10:32:00.301605,584,24,2,end_turn,0.1,50
2,nova-micro-default_20251024-103200_ac51,item_001,,,Paris,us.amazon.nova-micro-v1:0,Answer with only the essential information. Pr...,2025-10-24 10:32:00.295482,597,25,1,end_turn,0.1,50
3,nova-micro-default_20251024-103200_ac51,item_006,,,Au,us.amazon.nova-micro-v1:0,Answer with only the essential information. Pr...,2025-10-24 10:32:00.302642,605,26,1,end_turn,0.1,50
4,nova-micro-default_20251024-103200_ac51,item_002,,,4,us.amazon.nova-micro-v1:0,Answer with only the essential information. Pr...,2025-10-24 10:32:00.301168,706,26,2,end_turn,0.1,50



Average latency: 1306.50ms
Total tokens: 3,163


In [10]:
# Re-export the same run, this time also passing the dataset file to the export
# call. In the recorded outputs this fills the `prompt` and `expected_response`
# columns, which are empty when the dataset path is omitted.
dataset_path = "dataset-tmp/load_test_100_with_ids.jsonl"

# `run_id` is only bound when an earlier cell actually found a run. Guard on it,
# in the same `'name' in locals()` style the plotting/export cells use, so this
# cell does not fail with NameError when there are no runs.
if 'run_id' in locals():
    # NOTE: this rebinds `df`, so later cells operate on the dataset-enriched frame
    df = benchmark_core.export_run_to_dataframe(run_id, dataset_path)

    print(f"Loaded run {run_id}")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    # Show first few rows
    display(df.head())

    # Basic stats
    print(f"\nAverage latency: {df['latency_ms'].mean():.2f}ms")
    print(f"Total tokens: {df['input_tokens'].sum() + df['output_tokens'].sum():,}")
else:
    print("No run selected - run the cells above first")

[BenchmarkItem(id='item_001', prompt='What is the capital of France?', expected_response='Paris', metadata={}), BenchmarkItem(id='item_002', prompt='What is 2 + 2?', expected_response='4', metadata={}), BenchmarkItem(id='item_003', prompt='Who wrote Romeo and Juliet?', expected_response='William Shakespeare', metadata={}), BenchmarkItem(id='item_004', prompt='What is the largest planet in our solar system?', expected_response='Jupiter', metadata={}), BenchmarkItem(id='item_005', prompt='What year did World War II end?', expected_response='1945', metadata={})]
Loaded run nova-micro-default_20251024-103200_ac51
Shape: (109, 14)
Columns: ['run_id', 'item_id', 'prompt', 'expected_response', 'actual_response', 'model_id', 'system_prompt', 'timestamp', 'latency_ms', 'input_tokens', 'output_tokens', 'finish_reason', 'model_param_temperature', 'model_param_max_tokens']


Unnamed: 0,run_id,item_id,prompt,expected_response,actual_response,model_id,system_prompt,timestamp,latency_ms,input_tokens,output_tokens,finish_reason,model_param_temperature,model_param_max_tokens
0,nova-micro-default_20251024-103200_ac51,item_010,What is the smallest prime number?,2,2,us.amazon.nova-micro-v1:0,Answer with only the essential information. Pr...,2025-10-24 10:32:00.303887,571,25,2,end_turn,0.1,50
1,nova-micro-default_20251024-103200_ac51,item_003,Who wrote Romeo and Juliet?,William Shakespeare,William Shakespeare,us.amazon.nova-micro-v1:0,Answer with only the essential information. Pr...,2025-10-24 10:32:00.301605,584,24,2,end_turn,0.1,50
2,nova-micro-default_20251024-103200_ac51,item_001,What is the capital of France?,Paris,Paris,us.amazon.nova-micro-v1:0,Answer with only the essential information. Pr...,2025-10-24 10:32:00.295482,597,25,1,end_turn,0.1,50
3,nova-micro-default_20251024-103200_ac51,item_006,What is the chemical symbol for gold?,Au,Au,us.amazon.nova-micro-v1:0,Answer with only the essential information. Pr...,2025-10-24 10:32:00.302642,605,26,1,end_turn,0.1,50
4,nova-micro-default_20251024-103200_ac51,item_002,What is 2 + 2?,4,4,us.amazon.nova-micro-v1:0,Answer with only the essential information. Pr...,2025-10-24 10:32:00.301168,706,26,2,end_turn,0.1,50



Average latency: 1306.50ms
Total tokens: 3,163


## Load Multiple Runs for Comparison

In [6]:
# Compare the first two runs side by side; needs at least two runs to be useful
if len(runs) < 2:
    print(f"Only {len(runs)} run available - need 2+ for comparison")
else:
    comparison_runs = runs[:2]  # first two listed runs
    df_multi = benchmark_core.export_multiple_runs_to_dataframe(comparison_runs)

    print(f"Loaded {len(comparison_runs)} runs for comparison")
    print(f"Shape: {df_multi.shape}")

    # Number of rows contributed by each run
    print("\nRun distribution:")
    print(df_multi['run_id'].value_counts())

    # Per-run aggregates: latency mean/std, token totals, and the count of
    # non-null responses
    per_run_aggregations = {
        'latency_ms': ['mean', 'std'],
        'input_tokens': 'sum',
        'output_tokens': 'sum',
        'actual_response': 'count',
    }
    grouped_by_run = df_multi.groupby('run_id')
    comparison_stats = grouped_by_run.agg(per_run_aggregations).round(2)

    print("\nComparison by run:")
    display(comparison_stats)

Loaded 2 runs for comparison
Shape: (218, 14)

Run distribution:
run_id
nova-lite-default_20251024-103104_cb9d     109
nova-micro-default_20251024-103200_ac51    109
Name: count, dtype: int64

Comparison by run:


Unnamed: 0_level_0,latency_ms,latency_ms,input_tokens,output_tokens,actual_response
Unnamed: 0_level_1,mean,std,sum,sum,count
run_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
nova-lite-default_20251024-103104_cb9d,1576.03,2883.3,2881,265,109
nova-micro-default_20251024-103200_ac51,1306.5,2573.13,2881,282,109


## Quick Visualizations

In [None]:
# Two-panel figure for the single-run DataFrame; skipped when `df` was never
# created or holds no rows
if 'df' in locals() and not df.empty:
    fig, (ax_latency, ax_tokens) = plt.subplots(1, 2, figsize=(12, 4))

    # Left panel: histogram of per-request latency
    ax_latency.hist(df['latency_ms'], bins=20, alpha=0.7)
    ax_latency.set(title='Latency Distribution', xlabel='Latency (ms)')

    # Right panel: input tokens against output tokens, one point per row
    ax_tokens.scatter(df['input_tokens'], df['output_tokens'], alpha=0.6)
    ax_tokens.set(
        title='Input vs Output Tokens',
        xlabel='Input Tokens',
        ylabel='Output Tokens',
    )

    plt.tight_layout()
    plt.show()

In [None]:
# Multi-run comparison visualization; skipped when `df_multi` was never created
# or holds no rows
if 'df_multi' in locals() and not df_multi.empty:
    # Create the figure and axes explicitly and pass `ax` to pandas. Without
    # `ax`, DataFrame.boxplot(by=...) draws on a figure of its own, so a
    # preceding plt.figure(figsize=...) is left empty and its size is not
    # applied to the boxplot.
    fig, ax = plt.subplots(figsize=(8, 5))
    df_multi.boxplot(column='latency_ms', by='run_id', ax=ax)

    # Set labels on the objects we own instead of relying on pyplot's
    # current-figure state
    ax.set_title('Latency Comparison by Run')
    ax.set_ylabel('Latency (ms)')
    fig.suptitle('')  # Remove the default "Boxplot grouped by ..." super-title
    plt.show()

## Export Data

In [None]:
# Save whichever DataFrames exist to CSV (index column omitted).
# Each entry: (variable name, output file, confirmation message).
csv_exports = (
    ('df', 'single_run_analysis.csv', "Saved single run data to: single_run_analysis.csv"),
    ('df_multi', 'multi_run_comparison.csv', "Saved comparison data to: multi_run_comparison.csv"),
)

for frame_name, csv_path, saved_message in csv_exports:
    # A frame is only present if the cell that builds it ran and found data
    if frame_name in locals():
        locals()[frame_name].to_csv(csv_path, index=False)
        print(saved_message)