# Benchmark Analysis

Calculate the mean absolute error (MAE) and the standard deviation of the prediction error for each model.

In [1]:
import pandas as pd

In [2]:
# Load the benchmark results; each model column is compared against the 'label' column.
df = pd.read_csv('benchmark-fixed.csv')

# Every column that is not one of the bookkeeping columns is treated as a model's output.
metadata_cols = ['dataset', 'sample_idx', 'label']
model_cols = [name for name in df.columns if name not in metadata_cols]


def _summarise_model(name):
    """Build one table row (model name, MAE, error std) for the column ``name``.

    Both statistics are rendered as two-decimal strings, so the resulting
    table holds text rather than floats.
    """
    error = df[name].sub(df['label'])
    return {
        'Model': name,
        'MAE': format(error.abs().mean(), '.2f'),
        'Error STD': format(error.std(), '.2f'),
    }


# One row per model, over all datasets pooled together.
metrics = [_summarise_model(name) for name in model_cols]

metrics_df = pd.DataFrame(metrics)
display(metrics_df)


Unnamed: 0,Model,MAE,Error STD
0,meta-llama/Llama-3.2-3B-Instruct,1.78,3.32
1,meta-llama/Llama-3.2-3B,1.65,1.3
2,meta-llama/Llama-3.3-70B-Instruct,,
3,Qwen/Qwen3-4B,,
4,Qwen/Qwen3-30B-A3B-Instruct-2507-FP8,,
5,google/gemma-3-4b-it,,
6,google/gemma-3-1b-it,,
7,google/gemma-3-4b-pt,,
8,openai/gpt-oss-20b,,
9,deepseek-ai/deepseek-llm-7b-chat,,


In [3]:
# Per-dataset breakdown: one row for every (dataset, model) pair, with datasets
# visited in the order they first appear in the 'dataset' column.
metrics = []

for dataset_name in df['dataset'].unique():
    subset = df.loc[df['dataset'] == dataset_name]
    reference = subset['label']
    for model_name in model_cols:
        # Signed error of this model on this dataset's rows only.
        error = subset[model_name].sub(reference)
        row = {'Model': model_name, 'dataset': dataset_name}
        row['MAE'] = format(error.abs().mean(), '.2f')
        row['Error STD'] = format(error.std(), '.2f')
        metrics.append(row)

all_metrics_df = pd.DataFrame(metrics)
display(all_metrics_df)


Unnamed: 0,Model,dataset,MAE,Error STD
0,meta-llama/Llama-3.2-3B-Instruct,data_trivial,1.46,1.83
1,meta-llama/Llama-3.2-3B,data_trivial,1.46,1.22
2,meta-llama/Llama-3.3-70B-Instruct,data_trivial,,
3,Qwen/Qwen3-4B,data_trivial,,
4,Qwen/Qwen3-30B-A3B-Instruct-2507-FP8,data_trivial,,
5,google/gemma-3-4b-it,data_trivial,,
6,google/gemma-3-1b-it,data_trivial,,
7,google/gemma-3-4b-pt,data_trivial,,
8,openai/gpt-oss-20b,data_trivial,,
9,deepseek-ai/deepseek-llm-7b-chat,data_trivial,,


In [4]:
def _summarise_dataset(dataset_name):
    """Build one table row with the model-averaged MAE and error std for a dataset.

    Each statistic is first computed per model column and then averaged across
    models with equal weight. pandas reductions skip NaN by default, so a model
    whose per-column statistic is NaN does not contribute to the average.
    """
    subset = df.loc[df['dataset'] == dataset_name]
    errors = subset[model_cols].sub(subset['label'], axis=0)
    per_model_mae = errors.abs().mean()
    per_model_std = errors.std()
    return {
        'Dataset': dataset_name,
        'MAE': f"{per_model_mae.mean():.2f}",
        'Error STD': f"{per_model_std.mean():.2f}",
    }


# One row per dataset, in order of first appearance in the 'dataset' column.
dataset_metrics = [_summarise_dataset(name) for name in df['dataset'].unique()]

dataset_metrics_df = pd.DataFrame(dataset_metrics)
display(dataset_metrics_df)


Unnamed: 0,Dataset,MAE,Error STD
0,data_trivial,1.46,1.52
1,data_easy,1.45,1.53
2,data_medium,1.94,2.8
3,data_hard,2.0,2.8


Generate markdown

In [9]:
# Write the three summary tables to METRICS.md as markdown, in the order
# per-model, per-dataset-per-model, per-dataset; each table is followed by a
# blank line. DataFrame.to_markdown relies on the optional `tabulate` package.
# The encoding is pinned to UTF-8 so the file's bytes do not depend on the
# platform's locale default.
with open('METRICS.md', 'w', encoding='utf-8') as f:
    for table in (metrics_df, all_metrics_df, dataset_metrics_df):
        f.write(table.to_markdown(index=False) + '\n\n')

Fix a bug where the dataset names in the `dataset` column weren't properly initialized

In [6]:
# Disabled repair script. When uncommented it reads the raw Llama-3.2-3B
# benchmark CSV, rebuilds the 'dataset' column purely from row position
# (rows 0-4999 -> data_trivial, 5000-9999 -> data_easy, and so on), and writes
# the result to 'benchmark-fixed.csv' -- the same file name the analysis in this
# notebook loads.
# NOTE(review): assumes the raw file is ordered by dataset with exactly 5000
# rows per dataset and at most 20000 rows; more rows would make
# dataset_names[i // 5000] raise IndexError -- confirm before re-running.
# NOTE(review): running this rebinds `df` and overwrites 'benchmark-fixed.csv';
# presumably it is left commented out for that reason.
# df = pd.read_csv('benchmark-meta-llama-Llama-3.2-3B.csv')
# dataset_names = ['data_trivial', 'data_easy', 'data_medium', 'data_hard']
# df['dataset'] = [dataset_names[i // 5000] for i in range(len(df))]
# df.to_csv('benchmark-fixed.csv', index=False)