# Notebook to generate plots from the benchmark results

## Definitions

In [None]:
# Hex color used for each model's bars, keyed by the full model name.
# raw_data_to_dataframe copies this into the "model_color" column and falls
# back to "#000000" for models that are not listed here.
MODEL_COLORS = {
    "gpt-4o-mini-2024-07-18": "#8c8c8c",
    "gpt-4o-2024-08-06": "#000000",
    "gpt-4.1-nano-2025-04-14": "#DCB6B6",
    "gpt-4.1-mini-2025-04-14": "#935151",
    "gpt-4.1-2025-04-14": "#933535",
    "o4-mini-2025-04-16": "#929530",
}

# Order in which models are sorted within each level: raw_data_to_dataframe
# uses this list as the category order of the "model" column before sorting.
# NOTE(review): the ordering looks hand-picked (smaller models first?) — confirm.
MODEL_ORDER = [
    "gpt-4.1-nano-2025-04-14",
    "gpt-4o-mini-2024-07-18",
    "gpt-4.1-mini-2025-04-14",
    "gpt-4o-2024-08-06",
    "gpt-4.1-2025-04-14",
    "o4-mini-2025-04-16"
]

# Compact display names, keyed by the full model name. They fill the
# "model_shortname" column, which the bar chart uses as its inner x-axis
# category; models that are not listed keep their full name.
MODEL_SHORT_NAMES = {
    "gpt-4o-mini-2024-07-18": "4o-mini",
    "gpt-4o-2024-08-06": "4o",
    "gpt-4.1-nano-2025-04-14": "4.1-nano",
    "gpt-4.1-mini-2025-04-14": "4.1-mini",
    "gpt-4.1-2025-04-14": "4.1",
    "o4-mini-2025-04-16": "o4-mini"
}

# Human-readable description of each difficulty level. It is appended in
# parentheses to the level name to build the "level_with_explanation" column;
# levels that are not listed are labelled 'Unknown'.
LEVEL_EXPLANATIONS = {
    "level1": "1 card",
    "level2": "2 cards",
    "level3": "4 cards",
    "level4": "8 cards",
}

## Load data

In [None]:
import glob
import json
from pathlib import Path
import pandas as pd

# Default location of the benchmark result files, relative to the working
# directory the notebook runs in.
BENCHMARKS_FOLDER = "../benchmarks"

def load_raw_benchmark_data(folder=None):
    """Load every benchmark result file of a folder into a dictionary.

    Result files are expected to be named ``<level>_<model>.json``. Only the
    first underscore separates the level from the model, so model names may
    themselves contain underscores.

    Args:
        folder: Directory scanned for ``*.json`` files. When omitted,
            ``BENCHMARKS_FOLDER`` is used (looked up at call time, so
            reassigning the constant in a later cell takes effect).

    Returns:
        A dictionary mapping ``(level, model)`` tuples to the parsed JSON
        content of the matching file.

    Raises:
        ValueError: If a file name contains no underscore and therefore
            cannot be split into a level and a model.
    """
    if folder is None:
        folder = BENCHMARKS_FOLDER
    # Escape the folder so that characters such as "[" in its path are taken
    # literally; only the "*.json" part is meant to be a pattern.
    pattern = f"{glob.escape(str(folder))}/*.json"
    data = {}
    # glob returns files in arbitrary order; sorting keeps the insertion
    # order of the result independent of the file system.
    for file in sorted(glob.glob(pattern)):
        # File name without path and extension, e.g. "level1_gpt-4o".
        level, separator, model = Path(file).stem.partition("_")
        if not separator:
            raise ValueError(
                f"Cannot parse benchmark file name {file!r}: "
                "expected '<level>_<model>.json'"
            )
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(file, "r", encoding="utf-8") as f:
            data[(level, model)] = json.load(f)
    return data

def raw_data_to_dataframe(data: dict):
    """Convert raw benchmark data to a pandas DataFrame, one row per result.

    Args:
        data: Mapping from ``(level, model)`` tuples to a result dictionary
            holding the keys "total_score", "total_normalized_score",
            "invalid_hands" and "normalized_invalid_hands".

    Returns:
        A DataFrame sorted by level, then by model. The "model" column is
        categorical: its categories follow ``MODEL_ORDER``, followed by any
        model missing from ``MODEL_ORDER`` (alphabetically), so such models
        keep their name instead of silently becoming NaN. An empty ``data``
        yields an empty DataFrame with the same columns.

    Raises:
        KeyError: If a result dictionary lacks one of the expected keys.
    """
    columns = [
        "level",
        "level_with_explanation",
        "model",
        "model_shortname",
        "model_color",
        "total_score",
        "total_normalized_score",
        "invalid_hands",
        "normalized_invalid_hands",
    ]
    rows = [
        {
            "level": level,
            "level_with_explanation": f"{level} ({LEVEL_EXPLANATIONS.get(level, 'Unknown')})",
            "model": model,
            "model_shortname": MODEL_SHORT_NAMES.get(model, model),
            "model_color": MODEL_COLORS.get(model, "#000000"),
            "total_score": results["total_score"],
            "total_normalized_score": results["total_normalized_score"],
            "invalid_hands": results["invalid_hands"],
            "normalized_invalid_hands": results["normalized_invalid_hands"],
        }
        for (level, model), results in data.items()
    ]
    # Passing the columns explicitly keeps them present when there are no rows.
    df = pd.DataFrame(rows, columns=columns)
    # Values absent from a categorical's categories are replaced by NaN, so
    # models unknown to MODEL_ORDER are appended after the ordered ones.
    unlisted_models = sorted(set(df["model"]) - set(MODEL_ORDER))
    df["model"] = pd.Categorical(
        df["model"], categories=[*MODEL_ORDER, *unlisted_models]
    )
    # Sorting a categorical column follows the category order, not the
    # alphabetical one.
    return df.sort_values(by=["level", "model"])

In [None]:
# Read every benchmark result file, then flatten the results into a table
# with one row per (level, model) pair.
raw_data = load_raw_benchmark_data()
df = raw_data_to_dataframe(raw_data)
# Bare expression on the last line so that the notebook displays the table.
df

## Plot performances by level and model

In [None]:
import plotly.graph_objects as go

# One bar per (level, model) row of df. Passing two sequences as x groups
# the bars by level first and by model within each level.
normalized_scores = df["total_normalized_score"]
score_bars = go.Bar(
    x=[df["level_with_explanation"], df["model_shortname"]],
    y=normalized_scores,
    # Label each bar with its score as a whole percentage.
    text=[format(score, ".0%") for score in normalized_scores],
    marker=dict(color=df["model_color"]),
)

fig = go.Figure(data=[score_bars])
fig.update_xaxes(title="Difficulty level and model")
# Normalized scores are shown on a fixed 0-100% axis.
fig.update_yaxes(title="Normalized score", range=[0, 1], tickformat=",.0%")
fig.show()