# Notebook to generate plots from the benchmark results

## Definitions

In [None]:
# Bar fill color (hex) for each model, keyed by the full model identifier.
# The GPT / OpenAI entries use shades of grey and the Qwen entries use shades of purple.
# Models missing from this mapping fall back to "#000000" when the DataFrame is built.
MODEL_COLORS = {
    "gpt-4.1-nano-2025-04-14": "#A9A8A8",
    "gpt-4o-mini-2024-07-18": "#7a7a7a",
    "gpt-4.1-mini-2025-04-14": "#525252",
    "gpt-4o-2024-08-06": "#272727",
    "gpt-4.1-2025-04-14": "#000000",
    "Qwen2-0.5B-Instruct": "#D7A6E6",
    "Qwen2.5-0.5B-Instruct": "#D295E4",
    "Qwen2.5-1.5B-Instruct": "#CF88E4",
    "Qwen2.5-3B-Instruct": "#C76EE2",
    "Qwen2.5-7B-Instruct": "#C052E2",
    "Qwen2.5-14B-Instruct": "#BA38E2",
    "Qwen2.5-32B-Instruct": "#B522E1",
    "Qwen3-0.6B": "#D7A6E6",  # same color as Qwen2-0.5B-Instruct; told apart by its "/" pattern
    "gpt-oss-20b": "#BFBFBF",
    "o4-mini-2025-04-16": "#666666",
}

# Bar fill pattern for each model, passed to plotly as the marker pattern shape:
# "" draws a solid bar, "/" draws a hatched bar.
# NOTE(review): the hatched entries look like the reasoning models — confirm.
MODEL_PATTERNS = {
    "gpt-4.1-nano-2025-04-14": "",
    "gpt-4o-mini-2024-07-18": "",
    "gpt-4.1-mini-2025-04-14": "",
    "gpt-4o-2024-08-06": "",
    "gpt-4.1-2025-04-14": "",
    "Qwen2-0.5B-Instruct": "",
    "Qwen2.5-0.5B-Instruct": "",
    "Qwen2.5-1.5B-Instruct": "",
    "Qwen2.5-3B-Instruct": "",
    "Qwen2.5-7B-Instruct": "",
    "Qwen2.5-14B-Instruct": "",
    "Qwen2.5-32B-Instruct": "",
    "Qwen3-0.6B": "/",
    "gpt-oss-20b": "/",
    "o4-mini-2025-04-16": "/",
}

# Display order of the models. It defines the categories (and therefore the sort order)
# of the DataFrame's `model` column and the x-axis order of the faceted plot.
MODEL_ORDER = [
    "Qwen2-0.5B-Instruct",
    "Qwen2.5-0.5B-Instruct",
    "Qwen2.5-1.5B-Instruct",
    "Qwen2.5-3B-Instruct",
    "Qwen2.5-7B-Instruct",
    "Qwen2.5-14B-Instruct",
    "Qwen2.5-32B-Instruct",
    "gpt-4.1-nano-2025-04-14",
    "gpt-4o-mini-2024-07-18",
    "gpt-4.1-mini-2025-04-14",
    "gpt-4o-2024-08-06",
    "gpt-4.1-2025-04-14",
    "Qwen3-0.6B",
    "gpt-oss-20b",
    "o4-mini-2025-04-16"
]

# Short display label for each model, used for the bars' x-axis labels.
# Models missing from this mapping keep their full name when the DataFrame is built.
MODEL_SHORT_NAMES = {
    "gpt-4o-mini-2024-07-18": "4o-mini",
    "gpt-4o-2024-08-06": "4o",
    "gpt-4.1-nano-2025-04-14": "4.1-nano",
    "gpt-4.1-mini-2025-04-14": "4.1-mini",
    "gpt-4.1-2025-04-14": "4.1",
    "o4-mini-2025-04-16": "o4-mini",
    "Qwen2-0.5B-Instruct": "Qwen2-0.5B",
    "Qwen2.5-0.5B-Instruct": "Qwen2.5-0.5B",
    "Qwen2.5-1.5B-Instruct": "Qwen2.5-1.5B",
    "Qwen2.5-3B-Instruct": "Qwen2.5-3B",
    "Qwen2.5-7B-Instruct": "Qwen2.5-7B",
    "Qwen2.5-14B-Instruct": "Qwen2.5-14B",
    "Qwen2.5-32B-Instruct": "Qwen2.5-32B",
    "Qwen3-0.6B": "Qwen3-0.6B",
    "gpt-oss-20b": "gpt-oss-20b"
}

# Short description of each difficulty level. It is appended in parentheses to the level
# name to build the `level_with_explanation` plot label; unknown levels get "Unknown".
LEVEL_EXPLANATIONS = {
    "level1": "1 card",
    "level2": "2 cards",
    "level3": "4 cards",
    "level4": "8 cards",
    "level5": "8 cards, 1 of 10 jokers",
    "level6": "8 cards, 2 of 37 jokers",
}

## Load data

In [None]:
import glob
import json
from pathlib import Path
import pandas as pd

BENCHMARKS_FOLDER = "../benchmarks"

def load_raw_benchmark_data(folder=None):
    """Loads benchmark data as a dictionary from level and model name to results.

    Every ``<level>_<model>.json`` file in the folder becomes one entry keyed by
    ``(level, model)``. Only the first underscore separates the level from the
    model, so model names that themselves contain underscores are kept intact.

    Args:
        folder: Directory to scan for ``*.json`` files. Defaults to
            ``BENCHMARKS_FOLDER`` when omitted.

    Returns:
        dict mapping ``(level, model)`` tuples to the parsed JSON content of the
        corresponding file.

    Raises:
        ValueError: If a file name does not contain an underscore.
    """
    if folder is None:
        folder = BENCHMARKS_FOLDER
    data = {}
    # Sorted so that the insertion order does not depend on the file system.
    for file in sorted(glob.glob(f"{folder}/*.json")):
        # Get name of file without path and extension
        name = Path(file).stem
        level, separator, model = name.partition("_")
        if not separator:
            raise ValueError(
                f"Benchmark file name {name!r} does not match '<level>_<model>.json'"
            )
        with open(file, "r", encoding="utf-8") as f:
            data[(level, model)] = json.load(f)
    return data

def raw_data_to_dataframe(data: dict):
    """Converts raw benchmark data to a pandas DataFrame.

    Args:
        data: Mapping from ``(level, model)`` to a result dictionary holding the
            keys ``total_score``, ``total_normalized_score``, ``invalid_hands``
            and ``normalized_invalid_hands``.

    Returns:
        DataFrame with one row per ``(level, model)`` pair, sorted by level and
        then by model. The ``model`` column is categorical: models listed in
        ``MODEL_ORDER`` come first, in that order, followed by any unlisted
        models in alphabetical order. An empty ``data`` yields an empty
        DataFrame with the same columns.

    Raises:
        KeyError: If a result dictionary lacks one of the score keys.
    """
    columns = [
        "level",
        "level_with_explanation",
        "model",
        "model_shortname",
        "model_color",
        "model_pattern",
        "total_score",
        "total_normalized_score",
        "invalid_hands",
        "normalized_invalid_hands",
    ]
    rows = [
        {
            "level": level,
            "level_with_explanation": f"{level} ({LEVEL_EXPLANATIONS.get(level, 'Unknown')})",
            "model": model,
            "model_shortname": MODEL_SHORT_NAMES.get(model, model),
            "model_color": MODEL_COLORS.get(model, "#000000"),
            "model_pattern": MODEL_PATTERNS.get(model, ""),
            "total_score": results["total_score"],
            "total_normalized_score": results["total_normalized_score"],
            "invalid_hands": results["invalid_hands"],
            "normalized_invalid_hands": results["normalized_invalid_hands"],
        }
        for (level, model), results in data.items()
    ]
    # Passing the columns explicitly keeps the schema intact when there are no rows.
    df = pd.DataFrame(rows, columns=columns)
    # Restricting the categories to MODEL_ORDER alone would turn every model that
    # is not listed there into NaN, so unlisted models are appended after it.
    unlisted_models = sorted(set(df["model"]) - set(MODEL_ORDER))
    df["model"] = pd.Categorical(
        df["model"], categories=[*MODEL_ORDER, *unlisted_models]
    )
    return df.sort_values(by=["level", "model"])

In [None]:
# Read every benchmark result file, then flatten the results into one row per (level, model).
raw_data = load_raw_benchmark_data()
df = raw_data_to_dataframe(raw_data)
# Last expression of the cell, so the notebook displays the DataFrame.
df

## Plot performance by level and model

In [None]:
import plotly.graph_objects as go

# Single bar trace on a two-level categorical x-axis: the outer level is the
# difficulty level, the inner level is the model.
normalized_scores = df["total_normalized_score"]
bar_marker = {
    "color": df["model_color"],
    "pattern_shape": df["model_pattern"],
}
score_bars = go.Bar(
    x=[df["level_with_explanation"], df["model_shortname"]],
    y=normalized_scores,
    # Percentage label shown on each bar.
    text=[f'{score:.0%}' for score in normalized_scores],
    marker=bar_marker,
)

fig = go.Figure(data=[score_bars])
# Scores are fractions in [0, 1]; show them as whole percentages.
fig.update_yaxes(tickformat=",.0%", range=[0, 1], title="Normalized score")
fig.update_xaxes(title="Difficulty level and model")
fig.show()

Alternative plot: one facet per difficulty level, with one bar per model.

In [None]:
import plotly.express as px

# Faceted bar chart: one panel per difficulty level, one bar per model.
# The x-axis order follows MODEL_ORDER. The short names are looked up with the same
# fallback used to fill the `model_shortname` column, so a model without a short
# name keeps its full name instead of raising a KeyError.
model_shortname_order = [MODEL_SHORT_NAMES.get(model, model) for model in MODEL_ORDER]

fig = px.bar(
    df,
    x="model_shortname",
    y="total_normalized_score",
    color="model",
    color_discrete_map=MODEL_COLORS,
    pattern_shape="model",
    pattern_shape_map=MODEL_PATTERNS,
    facet_col="level_with_explanation",
    facet_col_wrap=4,
    facet_col_spacing=0.005,
    facet_row_spacing=0.225,
    category_orders={"model_shortname": model_shortname_order},
    barmode="relative",
    range_y=[0, 1],
    text_auto=",.0%",
)
# Scores are fractions in [0, 1]; show them as whole percentages.
fig.update_yaxes(tickformat=",.0%", range=[0, 1], title="")
# Show the model names under every facet, not only under the bottom row.
fig.update_xaxes(title="", showticklabels=True)
fig.update_layout(showlegend=False, height=700)
# Facet titles default to "level_with_explanation=<value>"; keep only the value.
for annotation in fig.layout.annotations:
    annotation.text = annotation.text.replace("level_with_explanation=", "")

fig.show()

TODO:
* Add [Qwen-2.5 models](https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e).
* Add Qwen-3.0 reasoning models
* Add new [open source OpenAI reasoning models](https://huggingface.co/openai/gpt-oss-20b)