In [7]:
from pathlib import Path
import json
import pandas as pd
from typing import Optional, Tuple

# ---- Config ----
# Directory that is scanned for per-model metrics files; every ``*.json``
# file found directly inside it becomes one row of the summary.
metrics_dir = Path("metrics")
# Destination path of the aggregated CSV summary.
out_csv = Path("metrics_summary.csv")

def dataset_slug(name: str) -> str:
    """
    Derive a short dataset identifier from a dataset file name.

    The final file extension is dropped and, when present, a leading
    'test-' prefix is removed:
    'test-ai2_arc.parquet' -> 'ai2_arc'
    """
    prefix = "test-"
    stem = Path(name).stem  # file name without its last extension
    return stem[len(prefix):] if stem.startswith(prefix) else stem

def get_metric(ds_block: dict, key: str):
    """
    Look up a metric value for one dataset block.

    The key is read from the block itself first; if it is absent (or its
    value is None) the nested 'metrics' mapping is consulted instead.
    Returns None when the key cannot be found in either place.
    """
    top_level = ds_block.get(key)
    if top_level is not None:
        return top_level

    # A missing or null 'metrics' entry is treated as an empty mapping.
    nested = ds_block.get("metrics") or {}
    return nested.get(key)

# Efficacy policy: for each dataset type, the metric names to try, listed
# from most to least preferred.
EFFICACY_POLICY = {
    "mcq4": [
        "macro_f1",
        "accuracy",
    ],
    "boolq": [
        "macro_f1",
        "MCC",
        "balanced_accuracy",
        "accuracy",
    ],
    "squad_v2": [
        "F1",
        "EM",
    ],
    # Lower is better for this metric; the raw value is kept as-is.
    "math_numeric": [
        "avg_abs_diff",
    ],
}

def choose_efficacy_metric(ds_block: dict) -> Optional[Tuple[str, float]]:
    """
    Pick the single efficacy metric to report for a dataset block.

    The metric names listed in EFFICACY_POLICY for the block's 'type' are
    tried first, in order; after that a generic list of common metric names
    is tried. The first name with a non-None value wins.

    Returns a (metric_name, value) pair, or None if no candidate is present.
    """
    preferred = EFFICACY_POLICY.get(ds_block.get("type", ""), [])

    # Generic fallback for unknown/new dataset types, or for known types
    # whose preferred metrics are all missing.
    generic = (
        "accuracy",
        "macro_f1",
        "F1",
        "balanced_accuracy",
        "MCC",
        "EM",
        "avg_abs_diff",
    )

    for metric_name in (*preferred, *generic):
        metric_value = get_metric(ds_block, metric_name)
        if metric_value is not None:
            return metric_name, metric_value

    return None

rows = []

# Build one summary row per metrics file, visiting files in sorted path order.
for jf in sorted(metrics_dir.glob("*.json")):
    with jf.open("r", encoding="utf-8") as f:
        data = json.load(f)

    # The file name without its extension serves as the model id.
    model_id = jf.stem
    row = {"model": model_id}

    # A missing or null 'datasets' entry is treated as "no datasets".
    datasets = data.get("datasets", {}) or {}
    for ds_name, ds_block in datasets.items():
        slug = dataset_slug(ds_name)

        # 1) Efficacy: at most one metric column per dataset.
        eff = choose_efficacy_metric(ds_block)
        if eff is not None:
            metric_name, value = eff
            row[f"{slug}__{metric_name}"] = value

        # 2) Efficiency: mean latency in seconds; the column is always
        #    written and holds None when the value is unavailable.
        latency_block = ds_block.get("latency_seconds") or {}
        lat_mean = latency_block.get("mean")
        row[f"{slug}__latency_mean_s"] = lat_mean

    rows.append(row)

# Assemble the table: models as the index, columns sorted by label.
if not rows:
    # No metrics files found: keep an empty frame indexed by "model".
    df = pd.DataFrame(columns=["model"]).set_index("model")
else:
    df = pd.DataFrame(rows)
    df = df.set_index("model")
    df = df.sort_index(axis=1)

# Persist the summary, creating the destination directory if necessary.
out_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_csv, index=True)

print(f"✅ Wrote summary with shape {df.shape} to: {out_csv}")
# with pd.option_context("display.max_columns", None, "display.width", 200):
#     print(df)

# Last expression of the notebook cell, so the frame is rendered as output.
df


✅ Wrote summary with shape (7, 8) to: metrics_summary.csv


Unnamed: 0_level_0,OpenMathInstruct-2__avg_abs_diff,OpenMathInstruct-2__latency_mean_s,ai2_arc__latency_mean_s,ai2_arc__macro_f1,boolq__latency_mean_s,boolq__macro_f1,squad_v2__F1,squad_v2__latency_mean_s
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Qwen3-0.6B-arc_SFT_None_Lora32,22870.653666,0.4439,0.1595,0.488,0.1599,0.3367,8.59,0.1958
Qwen3-0.6B-arc_SFT_None_Lora64,22870.688842,0.1811,0.1601,0.4937,0.1587,0.3241,8.09,0.1952
Qwen3-0.6B-boolq_SFT_None_Lora32,25172.04453,4.7384,0.9745,0.5021,0.153,0.3148,8.15,0.1963
Qwen3-0.6B-openmath_SFT_None_Lora32,23919.390779,1.5776,0.8702,0.5095,0.1564,0.3305,8.48,0.1963
Qwen3-0.6B-squad_SFT_None_Lora32,23523.160626,1.9676,0.3095,0.5024,0.1519,0.3148,9.59,0.195
Qwen3-0.6B_base,24834.379745,6.0139,1.1397,0.4932,0.1794,0.394,10.07,0.2274
Qwen3-1.7B_base,742.251114,12.5197,3.4025,0.7986,0.2171,0.0099,30.36,0.2837
