In [4]:
from pathlib import Path
import re
import json
import pandas as pd
from typing import Optional, Tuple, List

# =========================
# Config
# =========================
# We scan both: Testing/metrics (current folder) AND repo-root/metrics
METRIC_DIRS = [Path("metrics"), Path("../metrics")]
OUT_CSV = Path("metrics_summary.csv")
DEBUG = False  # set True to print debug info


# =========================
# Helpers
# =========================
def dataset_slug(name: str) -> str:
    n = Path(name).stem
    if n.startswith("test-"):
        n = n[5:]
    return n

def get_metric(ds_block: dict, key: str):
    val = ds_block.get(key)
    if val is None:
        val = (ds_block.get("metrics") or {}).get(key)
    return val

EFFICACY_POLICY = {
    "mcq4": ["macro_f1", "accuracy"],
    "boolq": ["macro_f1", "MCC", "balanced_accuracy", "accuracy"],  # excluded anyway
    "squad_v2": ["F1", "EM"],
    "math_numeric": ["avg_abs_diff"],  # lower is better
}

def choose_efficacy_metric(ds_block: dict) -> Optional[Tuple[str, float]]:
    kind = (ds_block.get("type") or "").lower()
    for k in EFFICACY_POLICY.get(kind, []):
        v = get_metric(ds_block, k)
        if v is not None:
            return k, v
    for k in ["accuracy", "macro_f1", "F1", "balanced_accuracy", "MCC", "EM", "avg_abs_diff"]:
        v = get_metric(ds_block, k)
        if v is not None:
            return k, v
    return None

def parse_model_info(model_id: str) -> dict:
    m_ds = re.search(r'-([A-Za-z0-9_]+)_SFT_', model_id)
    dataset = (m_ds.group(1).lower() if m_ds else None)

    is_base = 1 if model_id.endswith('_base') else 0
    if is_base:
        dataset = '_base'

    is_nopeft = 1 if 'NoPeft' in model_id else 0

    m_lora = re.search(r'[Ll]ora(\d+)', model_id)
    lora_rank = int(m_lora.group(1)) if m_lora else -1
    has_lora = 1 if m_lora else 0

    dataset_order = 1 if dataset == '_base' else 0
    if has_lora:
        lora_group = 0
    elif is_nopeft:
        lora_group = 1
    else:
        lora_group = 2

    m_size = re.search(r'-(\d+(?:\.\d+)?)B-', model_id)
    size_num = float(m_size.group(1)) if m_size else 0.0

    return {
        "dataset": dataset or "",
        "dataset_order": dataset_order,
        "lora_group": lora_group,
        "lora_rank": lora_rank,
        "is_nopeft": is_nopeft,
        "is_base": is_base,
        "size_num": size_num,
    }

def _gather_eval_jsons(dirs: List[Path]) -> List[Path]:
    files: List[Path] = []
    for d in dirs:
        if d.exists():
            for f in d.rglob("*.json"):
                if f.name == "training_metadata.json":
                    continue
                files.append(f)
    files = sorted(set(f.resolve() for f in files))
    if DEBUG:
        print(f"[DEBUG] Found {len(files)} evaluation JSON(s):")
        for f in files:
            print("   ", f)
    return files

def _collect_models_roots() -> List[Path]:
    """
    Find any 'Models' directory starting from CWD and walking up parents.
    Running from Testing/, this yields '../Models' (and higher if needed).
    """
    roots: List[Path] = []
    here = Path.cwd().resolve()
    for base in [here] + list(here.parents):
        cand = base / "Models"
        if cand.exists() and cand.is_dir():
            roots.append(cand.resolve())
    # Dedup, keep order
    out, seen = [], set()
    for r in roots:
        if r not in seen:
            seen.add(r)
            out.append(r)
    if DEBUG:
        print("[DEBUG] Models roots:", out)
    return out

def _load_training_vram_from_path(p: Path) -> Optional[Tuple[float, float]]:
    try:
        with open(p, "r", encoding="utf-8") as f:
            tm = json.load(f)
        v = ((tm.get("hardware_info") or {}).get("vram_peaks")) or {}
        res = v.get("overall_max_reserved_gb")
        alloc = v.get("overall_max_allocated_gb")
        if (res is not None) or (alloc is not None):
            return res, alloc
    except Exception:
        pass
    return None

def _safe_name(s: str) -> str:
    return re.sub(r'[<>:"/\\|?*\x00-\x1F]', "_", s)

def _find_training_metadata(model_name_in_json: Optional[str], json_stem: str) -> Optional[Tuple[float, float]]:
    """
    Resolve training_metadata.json robustly when running from Testing/.
    We try:
      1) Direct model_name path (absolute or relative) + parents
      2) <any Models root>/<basename(model_name)>/training_metadata.json
      3) <any Models root>/<json_stem>/training_metadata.json
      4) Fallback: search under each Models root for a folder whose name equals
         json_stem or the model_info.model_name inside the file.
    """
    models_roots = _collect_models_roots()

    # 1) Direct model_name path tries (absolute or relative under cwd & parents)
    cand_paths: List[Path] = []
    if model_name_in_json:
        p = Path(model_name_in_json)
        # absolute as given
        cand_paths.append(p / "training_metadata.json")
        # relative under cwd and parents
        here = Path.cwd().resolve()
        for base in [here] + list(here.parents):
            cand_paths.append((base / p / "training_metadata.json"))

    # 2) Models roots + basename(model_name)
    if model_name_in_json:
        base_name = Path(model_name_in_json).name
        for root in models_roots:
            cand_paths.append(root / base_name / "training_metadata.json")

    # 3) Models roots + json_stem
    for root in models_roots:
        cand_paths.append(root / json_stem / "training_metadata.json")

    # First pass: any candidate that exists
    for c in cand_paths:
        if c.exists():
            vr = _load_training_vram_from_path(c)
            if vr:
                if DEBUG:
                    print(f"[DEBUG] Using training metadata (direct): {c}")
                return vr

    # 4) Deep search under each Models root
    for root in models_roots:
        try:
            hits = list(root.rglob("training_metadata.json"))
        except Exception:
            hits = []
        # Prefer parent name match
        for h in hits:
            if h.parent.name in {json_stem, Path(model_name_in_json or '').name}:
                vr = _load_training_vram_from_path(h)
                if vr:
                    if DEBUG:
                        print(f"[DEBUG] Using training metadata (parent match): {h}")
                    return vr
        # Try matching model_info.model_name inside file
        for h in hits:
            try:
                with open(h, "r", encoding="utf-8") as f:
                    tm = json.load(f)
                mi = (tm.get("model_info") or {}).get("model_name")
                if mi and mi in {json_stem, _safe_name(json_stem)}:
                    vr = _load_training_vram_from_path(h)
                    if vr:
                        if DEBUG:
                            print(f"[DEBUG] Using training metadata (model_info match): {h}")
                        return vr
            except Exception:
                continue

    if DEBUG:
        print(f"[DEBUG] No training metadata found for {json_stem}")
    return None


# VRAM columns we always want visible
VRAM_COLS = [
    "train_peak_vram_reserved_gb",
    "train_peak_vram_allocated_gb",
    "eval_peak_vram_reserved_gb",
    "eval_peak_vram_allocated_gb",
]


# =========================
# Main
# =========================
rows: List[dict] = []
vram_cols_order: list[str] = VRAM_COLS.copy()
col_order: list[str] = []

eval_jsons = _gather_eval_jsons(METRIC_DIRS)
if not eval_jsons and DEBUG:
    print("[DEBUG] No evaluation JSONs found. Check METRIC_DIRS or where your evaluator saves files.")

for jf in eval_jsons:
    try:
        with open(jf, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception:
        if DEBUG:
            print(f"[DEBUG] Skipping unreadable JSON: {jf}")
        continue

    model_id = jf.stem  # metrics filename without extension
    row = {"model": model_id}

    # ---- EVAL VRAM ----
    hw = data.get("hardware") or {}
    summ = data.get("summary") or {}
    row["eval_peak_vram_reserved_gb"]  = hw.get("peak_vram_reserved_gb",  summ.get("peak_vram_reserved_gb"))
    row["eval_peak_vram_allocated_gb"] = hw.get("peak_vram_allocated_gb", summ.get("peak_vram_allocated_gb"))

    # ---- TRAIN VRAM ----
    vr = _find_training_metadata(data.get("model_name"), model_id)
    if vr:
        row["train_peak_vram_reserved_gb"], row["train_peak_vram_allocated_gb"] = vr
    else:
        row["train_peak_vram_reserved_gb"] = None
        row["train_peak_vram_allocated_gb"] = None

    # ---- Per-dataset metrics ----
    datasets = data.get("datasets", {}) or {}
    for ds_name, ds_block in datasets.items():
        slug = dataset_slug(ds_name)
        ds_type = (ds_block.get("type") or "").lower()

        if ds_type.startswith("bool") or slug.startswith("bool"):
            continue

        eff = choose_efficacy_metric(ds_block)
        metric_col = None
        if eff is not None:
            metric_name, value = eff
            metric_col = f"{slug}__{metric_name}"
            row[metric_col] = value

        lat_mean = (ds_block.get("latency_seconds") or {}).get("mean")
        latency_col = f"{slug}__latency_mean_s"
        row[latency_col] = lat_mean

        if metric_col is not None and metric_col not in col_order:
            col_order.append(metric_col)
        if latency_col not in col_order:
            col_order.append(latency_col)

    rows.append(row)

# =========================
# Build & save
# =========================
if rows:
    df = pd.DataFrame(rows).set_index("model")

    # Ensure VRAM cols exist even if empty
    for vc in VRAM_COLS:
        if vc not in df.columns:
            df[vc] = pd.NA

    # Column order: VRAM first, then dataset metrics
    final_cols = [c for c in VRAM_COLS if c in df.columns]
    final_cols += [c for c in col_order if c in df.columns and c not in final_cols]
    df = df.reindex(columns=final_cols)

    # Sorting (same as before)
    info_df = df.index.to_series().apply(parse_model_info).apply(pd.Series)
    info_df.index.name = "model"
    df = df.join(info_df)
    df = df.sort_values(
        by=["dataset_order", "dataset", "lora_group", "lora_rank", "size_num", "model"],
        ascending=[True,           True,       True,        False,       True,      True],
    ).drop(columns=["dataset_order", "lora_group", "lora_rank", "is_nopeft", "is_base", "size_num", "dataset"], errors="ignore")
else:
    df = pd.DataFrame(columns=["model"] + VRAM_COLS).set_index("model")

OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUT_CSV, index=True)

print(f"✅ Wrote summary with shape {df.shape} to: {OUT_CSV}")
df


✅ Wrote summary with shape (6, 11) to: metrics_summary.csv


Unnamed: 0_level_0,train_peak_vram_reserved_gb,train_peak_vram_allocated_gb,eval_peak_vram_reserved_gb,eval_peak_vram_allocated_gb,ai2_arc__macro_f1,ai2_arc__latency_mean_s,OpenMathInstruct-2__avg_abs_diff,OpenMathInstruct-2__latency_mean_s,squad_v2__F1,squad_v2__latency_mean_s,OpenMathInstruct-2__accuracy
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Models__Qwen3-0.6B-base,0.0,0.0,1.375,1.28,0.5111,1.7676,2294.642941,7.3468,10.33,0.2334,
Models__Qwen3-0.6B-base_gptq_w4g64,,,1.434,1.323,0.0,29.6305,,29.6095,50.0,29.8376,0.0
Models__Qwen3-0.6B-openmath_SFT_NoPeft_NoQuant,5.688,5.346,1.252,1.219,0.2143,0.0886,6.0,0.0435,10.33,0.2466,
Models__Qwen3-0.6B-openmath_SFT_NoPeft_NoQuant_gptq_w4g64,,,1.434,1.323,0.0,30.167,,29.7323,50.0,29.329,0.0
Qwen3-0.6B-openmath_SFT_LoRa64_NoQuant,2.814,2.188,1.375,1.284,0.661,1.8407,137.154211,3.6479,10.33,0.236,
Qwen3-0.6B-openmath_SFT_LoRa64_QLORA_w4_headbf16,1.877,1.398,1.0,0.638,0.4937,0.6717,50656.7865,8.3269,0.0,0.3752,
