In [3]:
from pathlib import Path
import json
import pandas as pd

# ---- Config ----
# Directory scanned (non-recursively) for "*.json" metrics files; each file
# becomes one row of the summary, identified by its file stem.
metrics_dir = Path("metrics")
# Destination of the summary CSV; its parent directory is created if missing.
out_csv = Path("metrics_summary.csv")

def dataset_slug(name: str) -> str:
    """Reduce a dataset file name to its bare slug.

    A trailing ".parquet" is dropped first, then a leading "test-" is
    dropped from what remains; each is removed at most once. For example,
    "test-ai2_arc.parquet" becomes "ai2_arc". Names carrying neither the
    suffix nor the prefix are returned unchanged.
    """
    return name.removesuffix(".parquet").removeprefix("test-")

def pick_two_metrics(ds_block: dict) -> list[tuple[str, float]]:
    """Select up to two headline metrics for one dataset block.

    The block's "type" decides which metric names are considered and in
    what order; unrecognised types fall back to a fixed list of common
    names. "accuracy" is read from the top level of ``ds_block``; every
    other metric is read from its "metrics" mapping (a missing or falsy
    "metrics" entry is treated as empty).

    Returns:
        A list of ``(metric_name, value)`` pairs, in priority order, holding
        the first two candidates whose value is not None. The list may
        contain fewer than two entries, or be empty.
    """
    kind = ds_block.get("type", "")
    nested = ds_block.get("metrics", {}) or {}

    # Metric names to try, in priority order, for each recognised kind.
    known_kinds = (
        ("mcq4", ("accuracy", "macro_f1")),
        ("boolq", ("accuracy", "MCC")),
        ("squad_v2", ("EM", "F1")),
    )
    # Used when the kind is not recognised.
    fallback = ("accuracy", "macro_f1", "F1", "balanced_accuracy", "MCC")
    wanted = next(
        (names for label, names in known_kinds if kind == label),
        fallback,
    )

    found = []
    for metric_name in wanted:
        # "accuracy" sits on the dataset block itself; the rest are nested.
        holder = ds_block if metric_name == "accuracy" else nested
        value = holder.get(metric_name)
        if value is not None:
            found.append((metric_name, value))

    # Only the first two available metrics are reported.
    return found[:2]

# One summary row per metrics JSON file, visited in sorted path order.
rows = []

for jf in sorted(metrics_dir.glob("*.json")):
    with jf.open(encoding="utf-8") as f:
        data = json.load(f)

    # The file name without its extension serves as the model id for the row.
    row = {"model": jf.stem}

    for ds_name, ds_block in (data.get("datasets", {}) or {}).items():
        slug = dataset_slug(ds_name)

        # Headline metrics (at most two) become "<slug>__<metric>" columns.
        row.update(
            (f"{slug}__{metric_name}", value)
            for metric_name, value in pick_two_metrics(ds_block)
        )

        # Per-dataset mean latency in seconds; None when the block has no
        # latency data or no "mean" entry.
        latency = ds_block.get("latency_seconds") or {}
        row[f"{slug}__latency_mean_s"] = latency.get("mean")

    rows.append(row)

# Assemble the summary table: one row per model, columns in sorted order.
# With no input rows, fall back to an empty frame that still has a "model" index.
df = (
    pd.DataFrame(rows).set_index("model").sort_index(axis="columns")
    if rows
    else pd.DataFrame(columns=["model"]).set_index("model")
)

# Persist the table (index included), creating the target directory if needed.
out_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_csv)

# Report what was written, then preview the first rows without column truncation.
print(f"✅ Wrote summary with shape {df.shape} to: {out_csv}")
with pd.option_context("display.max_columns", None, "display.width", 200):
    print(df.head(5))


✅ Wrote summary with shape (4, 9) to: metrics_summary.csv
                          ai2_arc__accuracy  ai2_arc__latency_mean_s  ai2_arc__macro_f1  boolq__MCC  boolq__accuracy  boolq__latency_mean_s  squad_v2__EM  squad_v2__F1  squad_v2__latency_mean_s
model                                                                                                                                                                                            
Qwen3-0.6B-arc_SFT_QLORA              53.33                   0.7249             0.5064      0.0632            63.33                 0.2879          3.33          3.33                    0.3358
Qwen3-0.6B_base                       63.33                   1.9426             0.5813      0.1195            23.33                 0.1797         20.00         20.00                    0.2218
Qwen3-1.7B-arc_SFT_QLORA              80.00                  12.7516             0.7200      0.0674            36.67                 0.3028         16.67         23.3