# 12 · Benchmark Models Comparison (SpectraMind V50)

Mission‑grade notebook to **compare SpectraMind V50** against baseline/benchmark models (e.g., Kaggle baselines) using **CLI/Hydra artifacts only** (no ad‑hoc training code).

### What this does
1) Auto‑discovers V50 run folders under `outputs/` and optionally loads **baseline predictions** (paths provided below).
2) Harvests **metrics** (Val GLL/Loss/Coverage) from each run.
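3) Optionally computes **evaluation GLL** from predictions and labels (if labels are available); when no per‑point `sigma` is available the score falls back to negative MSE, which is a proxy rather than a true GLL.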
4) Produces a **leaderboard** (CSV/MD/HTML) and comparison plots.
5) Generates quick **per‑planet overlays** (predictions vs labels) for a small sample.

**Contract**: Thin orchestration. We only read outputs written by the pipeline/CLI; any regeneration is done by calling the CLI (optional cells). All artifacts are written to `outputs/notebooks/12_benchmark_models_comparison/`.

In [None]:
import os, sys, json, shutil, subprocess, platform, zipfile, textwrap
from pathlib import Path
from datetime import datetime
from typing import Optional, List, Dict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook'); sns.set_style('whitegrid')

# All locations hang off the notebook's working directory
# (NOTE(review): presumably the repository root — confirm when launching elsewhere).
ROOT = Path.cwd().resolve()
OUT_ROOT = ROOT.joinpath('outputs')
NB_OUT = OUT_ROOT.joinpath('notebooks', '12_benchmark_models_comparison')
NB_OUT.mkdir(parents=True, exist_ok=True)

# Record the interpreter and OS next to the artifacts this notebook writes.
ENV = dict(python=platform.python_version(), platform=platform.platform())
NB_OUT.joinpath('env_snapshot.json').write_text(json.dumps(ENV, indent=2))

print('ROOT   :', ROOT)
print('OUT    :', OUT_ROOT)
print('NB_OUT :', NB_OUT)

## Parameters
Configure comparison inputs.

- **V50_RUN_TAGS**: optional substrings to filter SpectraMind V50 runs under `outputs/`.
- **BASELINES**: mapping of friendly names to prediction CSV paths (one file per baseline). Each file may be in long format (`planet_id,wavelength_index,mu`) or wide format (one row per planet with `mu_000, mu_001, …` columns).
- **LABELS_HINT**: path to labels CSV if you want to compute evaluation GLL (long format: `planet_id,wavelength_index,y`).

In [None]:
# Filter V50 runs by tags (set to [] to include all).
# A run is kept when ANY tag occurs, case-insensitively, in the run folder's name.
V50_RUN_TAGS: List[str] = []  # e.g., ['v50', 'mamba', 'gat']

# Baseline predictions mapping: friendly name -> predictions CSV (edit paths to your artifacts).
# Entries whose file does not exist are reported and skipped when baselines are loaded.
BASELINES: Dict[str, Path] = {
    # 'kaggle_baseline_1': ROOT/'benchmarks'/'kaggle'/'baseline_0_329'/'predictions.csv',
    # 'kaggle_model_X'  : ROOT/'benchmarks'/'kaggle'/'model_X'/'predictions.csv',
}

# Labels CSV (optional, used to compute evaluation GLL).
# With None, no evaluation GLL is computed and the overlay plot is skipped.
LABELS_HINT: Optional[Path] = None  # e.g., ROOT/'outputs'/'val_labels.csv' or ROOT/'data'/'labels'/'val_labels.csv'

# Echo the effective configuration so it is captured in the notebook output.
print('V50_RUN_TAGS:', V50_RUN_TAGS)
print('BASELINES   :', {k:str(v) for k,v in BASELINES.items()})
print('LABELS_HINT :', LABELS_HINT)

## Discovery & utilities
We collect V50 runs and read each run's metrics/config if available; for baselines we load predictions only.

In [None]:
def list_run_dirs(outputs_root: Path) -> List[Path]:
    """Collect run folders found directly below the known layout roots.

    Two layouts are scanned: date folders (any directory matching ``20*``)
    and a ``runs/`` folder. Every immediate sub-directory of those parents is
    reported once, in discovery order (date folders first, then ``runs/``).
    """
    parents = [entry for entry in outputs_root.glob('20*') if entry.is_dir()]
    legacy_root = outputs_root / 'runs'
    if legacy_root.exists():
        parents.append(legacy_root)

    children = (
        child
        for parent in parents
        for child in parent.iterdir()
        if child.is_dir()
    )
    # dict keys keep first-seen order, which drops duplicates without reordering.
    return list(dict.fromkeys(children))

def match_tags(run_dir: Path, tags: List[str]) -> bool:
    """Tell whether a run folder passes the tag filter.

    An empty filter accepts everything. Otherwise the folder's own name
    (not its parents) must contain at least one tag, ignoring case.
    """
    if not tags:
        return True
    folder = run_dir.name.lower()
    for tag in tags:
        if tag.lower() in folder:
            return True
    return False

def find_artifacts(run_dir: Path) -> dict:
    """Locate the config, metrics and predictions files of one run.

    Returns a dict with keys ``config``, ``metrics_csv`` and
    ``predictions_csv``; each value is a Path or None when nothing was found.
    When several candidates exist, the LAST one listed wins
    (``.hydra/config.yaml`` over ``config.yaml``, ``training_metrics.csv``
    over ``metrics.csv``). For predictions, the lexicographically last
    ``predictions.csv`` anywhere below the run folder is used.
    """
    def _last_existing(*candidates: Path) -> Optional[Path]:
        chosen = None
        for candidate in candidates:
            if candidate.exists():
                chosen = candidate
        return chosen

    prediction_files = sorted(run_dir.rglob('predictions.csv'))
    return {
        'config': _last_existing(run_dir / 'config.yaml', run_dir / '.hydra' / 'config.yaml'),
        'metrics_csv': _last_existing(run_dir / 'metrics.csv', run_dir / 'training_metrics.csv'),
        'predictions_csv': prediction_files[-1] if prediction_files else None,
    }

def load_metrics(path: Optional[Path]) -> Optional[pd.DataFrame]:
    """Read a run's metrics CSV on a best-effort basis.

    Args:
        path: Location of the metrics CSV, or None when the run has none.

    Returns:
        The parsed DataFrame, or None when ``path`` is None, does not exist,
        or cannot be parsed. A parse failure is reported on stdout instead of
        being dropped silently, so a corrupt file is distinguishable from a
        missing one.
    """
    if path is None or not path.exists():
        return None
    try:
        return pd.read_csv(path)
    except Exception as exc:
        # Deliberately broad: one unreadable metrics file must not abort the
        # harvest of the remaining runs.
        print(f'Could not read metrics from {path}:', exc)
        return None

def newest_csv(paths: List[Path]) -> Optional[Path]:
    """Pick the most recently modified CSV among the given hints.

    Each hint may be a CSV file (suffix compared case-insensitively) or a
    directory, which is searched recursively for ``*.csv``. Hints that do not
    exist are ignored. Returns None when no CSV is found.
    """
    pool: List[Path] = []
    for hint in paths:
        if hint.is_file():
            if hint.suffix.lower() == '.csv':
                pool.append(hint)
        elif hint.is_dir():
            pool.extend(hint.rglob('*.csv'))

    if not pool:
        return None
    # Stable sort: among equal timestamps the later-listed candidate wins.
    pool.sort(key=lambda candidate: candidate.stat().st_mtime)
    return pool[-1]

def long_from_predictions_csv(path: Path) -> pd.DataFrame:
    """Load a predictions CSV and normalise it to long format.

    Two schemas are recognised:

    * long: columns ``planet_id``, ``wavelength_index``, ``mu`` (matched
      case-insensitively), optionally with ``sigma``;
    * wide: one row per planet with ``mu_*`` columns, optionally with an id
      column (``planet_id``/``id``/``row_id``/``sample_id``, case-insensitive)
      and ``sigma_*`` columns carrying the same suffixes as the ``mu_*`` ones.
      Without an id column the row number is used as ``planet_id``.

    Wide columns are ordered by their numeric suffix (``mu_2`` before
    ``mu_10``); ``wavelength_index`` is the position in that order.

    Args:
        path: CSV file to read.

    Returns:
        DataFrame with columns ``planet_id``, ``wavelength_index``, ``mu`` and,
        when the file supplies it, ``sigma`` (kept so a per-point uncertainty
        is available to a downstream likelihood computation).

    Raises:
        ValueError: if the file matches neither schema.
    """
    def _by_numeric_suffix(col):
        # Numeric suffixes sort by value; anything else sorts after, by text.
        suffix = str(col).split('_', 1)[1]
        if suffix.isascii() and suffix.isdigit():
            return (0, int(suffix), suffix)
        return (1, 0, suffix)

    def _suffixes(columns):
        return [str(col).split('_', 1)[1] for col in columns]

    df = pd.read_csv(path)
    cols = {str(c).lower(): c for c in df.columns}

    # long
    if {'planet_id', 'wavelength_index', 'mu'}.issubset(cols):
        keep = ['planet_id', 'wavelength_index', 'mu']
        if 'sigma' in cols:
            keep.append('sigma')
        return df.rename(columns={cols[k]: k for k in keep})[keep]

    # wide
    mu_cols = sorted((c for c in df.columns if str(c).startswith('mu_')), key=_by_numeric_suffix)
    if mu_cols:
        id_col = next((cols[k] for k in ('planet_id', 'id', 'row_id', 'sample_id') if k in cols), None)
        ids = np.arange(len(df)) if id_col is None else df[id_col].to_numpy()
        n_wave = len(mu_cols)
        # Row-major flattening keeps each planet's wavelengths contiguous and
        # preserves the id dtype (row-wise iteration would upcast ints to float).
        out = pd.DataFrame({
            'planet_id': np.repeat(ids, n_wave),
            'wavelength_index': np.tile(np.arange(n_wave), len(df)),
            'mu': df[mu_cols].to_numpy(dtype=float).ravel(),
        })
        sigma_cols = sorted((c for c in df.columns if str(c).startswith('sigma_')), key=_by_numeric_suffix)
        # Only attach sigma when it pairs one-to-one with the mu columns.
        if sigma_cols and _suffixes(sigma_cols) == _suffixes(mu_cols):
            out['sigma'] = df[sigma_cols].to_numpy(dtype=float).ravel()
        return out

    raise ValueError(f'Unsupported predictions schema: {path}')

def load_labels(path: Optional[Path]) -> Optional[pd.DataFrame]:
    """Load ground-truth labels and normalise them to long format.

    Accepted schemas:

    * long: columns ``planet_id``, ``wavelength_index``, ``y`` (matched
      case-insensitively);
    * wide (rare): one row per planet; every numeric column other than the
      id column is treated as one wavelength, in file order. Without a
      ``planet_id`` column the row number is used as the id.

    Args:
        path: Labels CSV, or None when no labels are configured.

    Returns:
        DataFrame with columns ``planet_id``, ``wavelength_index``, ``y``;
        None when ``path`` is None, missing, or holds no usable numeric data.
    """
    if path is None or not path.exists():
        return None
    df = pd.read_csv(path)
    c = {str(x).lower(): x for x in df.columns}

    long_cols = ['planet_id', 'wavelength_index', 'y']
    if set(long_cols).issubset(c):
        return df.rename(columns={c[k]: k for k in long_cols})[long_cols]

    # wide case (rare) – convert
    id_col = c.get('planet_id')
    is_numeric = pd.api.types.is_numeric_dtype
    is_bool = pd.api.types.is_bool_dtype
    # The id column must not be counted as a wavelength even when it is numeric.
    value_cols = [
        col for col in df.columns
        if col != id_col and is_numeric(df[col].dtype) and not is_bool(df[col].dtype)
    ]
    if not value_cols:
        return None

    ids = np.arange(len(df)) if id_col is None else df[id_col].to_numpy()
    n_wave = len(value_cols)
    # Row-major flattening keeps each planet's wavelengths contiguous.
    return pd.DataFrame({
        'planet_id': np.repeat(ids, n_wave),
        'wavelength_index': np.tile(np.arange(n_wave), len(df)),
        'y': df[value_cols].to_numpy(dtype=float).ravel(),
    })

def gll_from_long(pred_long: pd.DataFrame, labels_long: pd.DataFrame, sigma_col: Optional[str] = 'sigma') -> Optional[float]:
    """Score long-format predictions against long-format labels.

    Predictions and labels are inner-joined on ``planet_id`` and
    ``wavelength_index``. If the joined frame has a ``sigma_col`` column, the
    mean Gaussian log-likelihood is returned (points whose sigma is not
    strictly positive are ignored). Otherwise the result is the negative mean
    squared error, which is only a proxy and not a true GLL.

    Returns None when there are no labels or the join is empty.
    """
    if labels_long is None:
        return None

    joined = pd.merge(pred_long, labels_long, on=['planet_id', 'wavelength_index'], how='inner')
    if joined.empty:
        return None

    squared_error = (joined['mu'].to_numpy(float) - joined['y'].to_numpy(float)) ** 2

    if sigma_col not in joined.columns:
        # Fallback: negative MSE as a proxy (not true GLL)
        return float(-np.nanmean(squared_error))

    sigma = joined[sigma_col].to_numpy(float)
    # Non-positive sigmas become NaN so nanmean skips those points.
    variance = np.where(sigma <= 0, np.nan, sigma) ** 2
    per_point = squared_error / variance + np.log(2 * np.pi * variance)
    return float(-0.5 * np.nanmean(per_point))

## Harvest V50 runs

In [None]:
# Discover run folders and keep those that pass the tag filter.
RUNS = [p for p in list_run_dirs(OUT_ROOT) if match_tags(p, V50_RUN_TAGS)]
print(f'Found {len(RUNS)} V50 runs (after tag filter).')

LABELS = load_labels(LABELS_HINT)
if LABELS is not None:
    print('Loaded labels:', LABELS.shape)

v50_rows = []
for rd in RUNS:
    art = find_artifacts(rd)
    met = load_metrics(art['metrics_csv'])
    pred_path = art['predictions_csv']
    eval_gll = None
    # Predictions are only needed for scoring, so skip parsing them when
    # there are no labels to score against.
    if LABELS is not None and pred_path and pred_path.exists():
        try:
            pred_long = long_from_predictions_csv(pred_path)
            eval_gll = gll_from_long(pred_long, LABELS)
        except Exception as e:
            # Best-effort: one unreadable run must not abort the harvest.
            print(f'GLL eval failed for {rd.name}:', e)
    row = {'run_dir': str(rd.relative_to(ROOT)), 'type': 'V50', 'eval_gll': eval_gll}
    if met is not None and len(met):
        # The last row of the metrics file is taken as the run's final metrics.
        last = met.iloc[-1]
        row['val_gll'] = last.get('val_gll', np.nan)
        row['val_loss'] = last.get('val_loss', np.nan)
        row['val_coverage'] = last.get('val_coverage', np.nan)
    v50_rows.append(row)

# With zero runs a bare DataFrame would have no columns at all, and later
# cells that index V50['run_dir'] would raise KeyError; keep the base schema.
V50 = pd.DataFrame(v50_rows) if v50_rows else pd.DataFrame(columns=['run_dir', 'type', 'eval_gll'])
display(V50.head())

## Load baselines
We ingest baseline predictions and compute evaluation GLL (if labels provided).

In [None]:
baseline_rows = []
for name, path in BASELINES.items():
    if not path.exists():
        print(f'Baseline {name} not found at {path}'); continue

    eval_gll = None
    if LABELS is not None:
        try:
            pred_long = long_from_predictions_csv(path)
            eval_gll = gll_from_long(pred_long, LABELS)
        except Exception as e:
            # Same best-effort policy as the V50 harvest: report and keep going
            # so one malformed baseline does not abort the whole comparison.
            print(f'GLL eval failed for baseline {name}:', e)

    # Store the path relative to ROOT when possible. A relative input or a
    # file outside the repo would make relative_to() raise ValueError, so fall
    # back to the absolute path (ROOT / <absolute> still resolves to it).
    resolved = path.resolve()
    try:
        shown = resolved.relative_to(ROOT)
    except ValueError:
        shown = resolved
    baseline_rows.append({'run_dir': str(shown), 'type': 'baseline', 'name': name, 'eval_gll': eval_gll})

BL = pd.DataFrame(baseline_rows)
display(BL.head() if not BL.empty else BL)

## Leaderboard
We concatenate the V50 runs and the baselines, then sort in descending order (higher is better) by **eval_gll** when any entry has one, otherwise by **val_gll**.

In [None]:
# V50 rows are labelled with their run-folder name (baseline rows already
# carry a name); both tables are then stacked under a fresh index.
LB = pd.concat(
    [V50.assign(name=V50['run_dir'].map(lambda run: Path(run).name)), BL],
    ignore_index=True,
)

def sort_key(df):
    """Order the leaderboard by the best available metric, highest first.

    ``eval_gll`` is preferred; ``val_gll`` is used only when ``eval_gll`` is
    absent or entirely empty. If neither column has a value, the frame is
    returned unchanged.
    """
    for metric in ('eval_gll', 'val_gll'):
        if metric in df and df[metric].notna().any():
            return df.sort_values([metric], ascending=False)
    return df

# Rank the combined table and renumber rows so the index reflects the rank.
LB = sort_key(LB).reset_index(drop=True)
display(LB.head(20))

# Persist the full leaderboard (not just the displayed head).
LB_CSV = NB_OUT.joinpath('benchmark_leaderboard.csv')
LB.to_csv(LB_CSV, index=False)
print('Wrote:', LB_CSV)

## Plots
- **Bar**: eval_gll (or val_gll) for top entries.
- **Overlay**: example planet predictions vs labels for selected models.

In [None]:
TOPK = min(12, len(LB))
df = LB.head(TOPK).copy()
# Use the first metric that is present AND has at least one value among the
# plotted rows; a column that is missing or entirely empty cannot be charted.
metric = next((m for m in ('eval_gll', 'val_gll') if m in df and df[m].notna().any()), None)
if metric is not None:
    # Create the figure only once there is something to draw, so the
    # no-metric path does not leave an empty figure open.
    plt.figure(figsize=(11,4))
    sns.barplot(x=metric, y='name', data=df, orient='h', hue='type', dodge=False)
    plt.title(f'Top {TOPK} models by {metric}')
    plt.xlabel(metric); plt.ylabel('model/run')
    plt.tight_layout(); plt.savefig(NB_OUT/'bar_top_gll.png', dpi=150); plt.close()
    print('Saved:', NB_OUT/'bar_top_gll.png')
else:
    print('No GLL metric available for plotting.')

### Overlays: predictions vs labels (sample planet)
If labels are present, we take the first planet in the labels file and overlay the predicted spectra of the top three leaderboard entries on its labels.

In [None]:
# Overlay the labels of one planet with the predictions of the leading models.
# Runs only when labels were loaded and the leaderboard has at least one row.
if LABELS is not None and not LABELS.empty and not LB.empty:
    # Choose top 3 (fewer if the leaderboard is shorter); LB is already ranked.
    top = LB.head(min(3,len(LB)))
    # Choose a planet id present in labels: simply the first row's planet.
    pid = LABELS['planet_id'].iloc[0]

    plt.figure(figsize=(10,4))
    # plot labels (black line, drawn first) in wavelength order
    y_df = LABELS[LABELS['planet_id']==pid].sort_values('wavelength_index')
    plt.plot(y_df['wavelength_index'], y_df['y'], color='k', lw=1.6, label='labels (y)')

    # Add each model
    for _,r in top.iterrows():
        if r['type']=='V50':
            # V50 rows store the run folder; re-discover its predictions file.
            # NOTE(review): this is None when the run has no predictions.csv;
            # the load below is then expected to raise and be reported by the except.
            run_dir = ROOT / r['run_dir']
            pred_path = find_artifacts(run_dir)['predictions_csv']
        else:
            # Baseline rows store the predictions file path itself.
            pred_path = ROOT / r['run_dir']
        try:
            p_long = long_from_predictions_csv(pred_path)
            p_pid = p_long[p_long['planet_id']==pid].sort_values('wavelength_index')
            # Models without predictions for this planet are skipped silently.
            if len(p_pid):
                plt.plot(p_pid['wavelength_index'], p_pid['mu'], lw=1.2, label=r.get('name', Path(r['run_dir']).name))
        except Exception as e:
            # Best-effort: a model that cannot be loaded is reported, not fatal.
            print('Overlay failed for', r.get('name', r['run_dir']), ':', e)

    plt.xlabel('wavelength index'); plt.ylabel('value')
    plt.title(f'Overlay — planet {pid}')
    plt.legend(); plt.tight_layout()
    plt.savefig(NB_OUT/'overlay_planet_sample.png', dpi=150); plt.close()
    print('Saved:', NB_OUT/'overlay_planet_sample.png')
else:
    print('Labels not available or leaderboard empty; skip overlay.')

## Export Markdown & HTML leaderboards
Useful for PRs/Wikis/Kaggle writeups.

In [None]:
def md_table(df: pd.DataFrame, cols: List[str]) -> str:
    """Render selected columns of a DataFrame as a Markdown pipe table.

    Requested columns that the frame does not have are dropped. When none of
    them exist the placeholder ``'(no data)'`` is returned. Cell values are
    converted with ``str`` and are not escaped.
    """
    present = [name for name in cols if name in df.columns]
    if not present:
        return '(no data)'

    def _line(cells):
        return '| ' + ' | '.join(cells) + ' |'

    lines = [_line(present), _line(['---'] * len(present))]
    lines.extend(
        _line([str(record.get(name, '')) for name in present])
        for _, record in df.iterrows()
    )
    return '\n'.join(lines)

if not LB.empty:
    # Imported under an alias because this cell binds the name `html` to the page lines below.
    from html import escape as _escape_html

    cols = ['name','type','run_dir','eval_gll','val_gll','val_loss','val_coverage']
    md = '# Benchmark Leaderboard\n\n' + md_table(LB.head(25), cols)
    # Explicit UTF-8, matching the HTML export, so run names with non-ASCII
    # characters do not depend on the platform's default encoding.
    (NB_OUT/'benchmark_leaderboard.md').write_text(md, encoding='utf-8')
    print('Wrote:', NB_OUT/'benchmark_leaderboard.md')

    # The table is embedded as preformatted text; escape &, < and > so names
    # containing them cannot break the page markup.
    html = ['<html><head><meta charset="utf-8"><title>Benchmark Leaderboard</title></head><body>',
            '<h1>Benchmark Leaderboard</h1>', '<pre>', _escape_html(md, quote=False), '</pre>', '</body></html>']
    (NB_OUT/'benchmark_leaderboard.html').write_text('\n'.join(html), encoding='utf-8')
    print('Wrote:', NB_OUT/'benchmark_leaderboard.html')
else:
    print('Leaderboard empty — no export.')

## Optional: DVC add
Register notebook outputs for full reproducibility.

In [None]:
# Optional, non-blocking: register the notebook outputs with DVC when it is installed.
if shutil.which('dvc'):
    try:
        subprocess.run(['dvc','add', str(NB_OUT)], check=False)
        # `dvc add <dir>` writes `<dir>.dvc` and updates the .gitignore located
        # beside the target, so stage that file rather than the one in the
        # current directory (a missing pathspec would make `git add` fail).
        subprocess.run(['git','add', f'{NB_OUT}.dvc', str(NB_OUT.parent/'.gitignore')], check=False)
        subprocess.run(['dvc','status'], check=False)
        print('DVC add done (non‑blocking).')
    except Exception as e:
        # e.g. git is not installed; never fail the notebook over bookkeeping.
        print('DVC step failed (non‑blocking):', e)
else:
    print('DVC not found; skipping.')