# Eval Run Plots

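Plots validation loss, AUC, and accuracy curves over epochs for each run listed in `run_details.json`.
The best value of each curve (lowest loss, highest AUC and accuracy) is highlighted with a star marker, and a summary table of the runs is shown at the end.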

In [None]:
import json
from pathlib import Path
from typing import Any, Dict, List, Optional

import matplotlib.pyplot as plt
import pandas as pd


def resolve_path(path_str: str) -> Path:
    """Return the first existing location for ``path_str``.

    Locations are tried in this order: the path as given, its bare file
    name in the current directory, the file name under ``evals/``, and the
    given path prefixed with ``..``. When none of them exists, the path as
    given is returned (as a ``Path``) without any existence guarantee.
    """
    given = Path(path_str)
    search_order = (
        given,
        Path(given.name),
        Path('evals') / given.name,
        Path('..') / given,
    )
    # next() with a default mirrors "first hit wins, else fall back to input".
    return next((loc for loc in search_order if loc.exists()), given)


def generate_run_name(run: Dict[str, Any]) -> str:
    """Build a short display name for a run from its run_id and parameters.

    The name starts with a compact timestamp derived from ``run['run_id']``.
    When the run_id ends in ``-YYYYMMDD-HHMM...`` (for example
    ``temporal-vit-20260106-044352``) the timestamp is rendered as
    ``MM/DD-HHMM`` (``01/06-0443``); otherwise the last 12 characters of the
    run_id are used. A missing or ``None`` run_id is shown as ``unknown``.

    If ``run['params']`` is a non-empty dict, up to four abbreviated
    parameter hints (e.g. ``dim64``, ``lr3e-04``) are appended in
    parentheses, in the fixed priority order of the abbreviation table.

    The result is meant to tell runs apart in plot legends; it is not
    guaranteed to be unique.

    Args:
        run: A run record; only the ``run_id`` and ``params`` keys are read.

    Returns:
        The display name, e.g. ``"01/06-0443 (dim64, L2, H4, do0.1)"``.
    """
    max_param_hints = 4

    raw_id = run.get('run_id')
    # A JSON null run_id would otherwise crash on .split() below.
    run_id = 'unknown' if raw_id is None else str(raw_id)

    # Only treat the trailing two '-' separated fields as a timestamp when
    # they actually look like one; slicing arbitrary text produced labels
    # such as "/-bar" for ids like "my-run-name-foo-bar".
    parts = run_id.split('-')
    date_part = parts[-2] if len(parts) >= 2 else ''  # e.g., 20260106
    time_part = parts[-1]  # e.g., 044352
    looks_like_timestamp = (
        len(date_part) == 8
        and date_part.isdigit()
        and len(time_part) >= 4
        and time_part[:4].isdigit()
    )
    if looks_like_timestamp:
        short_ts = f"{date_part[4:6]}/{date_part[6:8]}-{time_part[:4]}"
    else:
        short_ts = run_id[-12:] if len(run_id) > 12 else run_id

    params = run.get('params')
    if not (params and isinstance(params, dict)):
        return short_ts

    # Key differentiating parameters, in display priority order.
    param_abbrevs = {
        'n_trials': 'tr',
        'embed_dim': 'dim',
        'n_layers': 'L',
        'n_heads': 'H',
        'dropout': 'do',
        'drop_path': 'dp',
        'lr': 'lr',
        'weight_decay': 'wd',
        'label_smoothing': 'ls',
        'batch_size': 'bs',
    }

    name_parts = []
    for key, abbrev in param_abbrevs.items():
        if key not in params:
            continue
        val = params[key]
        if isinstance(val, float):
            # Scientific notation only for tiny non-zero magnitudes, so that
            # 0.0 renders as "0" rather than "0e+00".
            if 0 < abs(val) < 0.01:
                val_str = f"{val:.0e}"
            else:
                val_str = f"{val:.2g}"
        else:
            val_str = str(val)
        name_parts.append(f"{abbrev}{val_str}")
        if len(name_parts) == max_param_hints:
            break  # further hints would be discarded anyway

    if name_parts:
        return f"{short_ts} ({', '.join(name_parts)})"
    return short_ts


def extract_metrics_df(run: Dict[str, Any]) -> pd.DataFrame:
    """Build a DataFrame from the ``metrics`` records of a run.

    Returns an empty DataFrame when the run has no metrics. When a
    ``val/loss`` column is present, rows without a ``val/loss`` value are
    dropped so that only rows carrying validation results remain.
    """
    records = run.get('metrics', [])
    if not records:
        return pd.DataFrame()

    frame = pd.DataFrame(records)
    if 'val/loss' not in frame.columns:
        return frame
    # Rows lacking val/loss (e.g. test-only entries) are not epoch rows.
    return frame.dropna(subset=['val/loss'])


# Locate run_details.json (the evals/ location is tried first, then the bare
# file name) and parse it.
_primary_path = resolve_path('evals/run_details.json')
run_details_path = (
    _primary_path if _primary_path.exists() else resolve_path('run_details.json')
)

with run_details_path.open('r', encoding='utf-8') as handle:
    payload = json.loads(handle.read())

# An empty or missing "runs" list leaves nothing to plot, so stop here.
runs = payload.get('runs', [])
if not runs:
    raise ValueError('No runs found in run_details.json')

print(f"Found {len(runs)} runs")

In [None]:
# Collect one record per run that actually logged metrics; runs whose
# metrics frame is empty are left out of every plot and of the summary.
run_data = []
for run in runs:
    metrics_frame = extract_metrics_df(run)
    if not metrics_frame.empty:
        run_data.append(dict(
            name=generate_run_name(run),
            run_id=run.get('run_id'),
            df=metrics_frame,
            summary=run.get('summary', {}),
        ))

print(f"Runs with metrics: {len(run_data)}")
for entry in run_data:
    print(f"  - {entry['name']}: {len(entry['df'])} epochs")

In [None]:
# Plot 1: Loss over epochs
fig, ax = plt.subplots(figsize=(10, 6))

for entry in run_data:
    frame = entry['df']

    # Runs that did not log both columns cannot be drawn on this figure.
    has_columns = 'val/loss' in frame.columns and 'step' in frame.columns
    if not has_columns:
        continue

    steps = frame['step'].values
    losses = frame['val/loss'].values

    # One curve per run; keep the handle so the star can reuse its colour.
    (curve,) = ax.plot(steps, losses, marker='o', markersize=4,
                       label=entry['name'])

    # Star and label the lowest validation loss of this run.
    lowest = losses.argmin()
    star_xy = (steps[lowest], losses[lowest])
    ax.scatter([star_xy[0]], [star_xy[1]], s=150, c=curve.get_color(),
               marker='*', edgecolors='black', linewidths=1, zorder=5)
    ax.annotate(f'{star_xy[1]:.3f}', star_xy,
                textcoords='offset points', xytext=(5, 5), fontsize=8)

ax.set_title('Validation Loss over Epochs', fontsize=14, fontweight='bold')
ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('Loss', fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend(loc='best', fontsize=10)
plt.tight_layout()
plt.show()

In [None]:
# Plot 2: AUC over epochs
fig, ax = plt.subplots(figsize=(10, 6))

for rd in run_data:
    df = rd['df']
    name = rd['name']

    # Runs that did not log both columns cannot be drawn on this figure.
    if 'val/auc' not in df.columns or 'step' not in df.columns:
        continue

    auc_series = df['val/auc']
    epochs = df['step'].values
    val_auc = auc_series.values

    # Plot the line
    line, = ax.plot(epochs, val_auc, marker='o', markersize=4, label=name)

    # Highlight best (maximum) AUC.
    # The metrics frame is only filtered on val/loss, so val/auc may still
    # contain NaN. ndarray.argmax() would return the position of the first
    # NaN and put the star on a missing value; Series.argmax() skips NaN.
    if not auc_series.notna().any():
        continue  # nothing valid to highlight for this run
    best_idx = int(auc_series.argmax())
    best_epoch = epochs[best_idx]
    best_val = val_auc[best_idx]
    ax.scatter([best_epoch], [best_val], s=150, c=line.get_color(),
               marker='*', edgecolors='black', linewidths=1, zorder=5)
    ax.annotate(f'{best_val:.4f}', (best_epoch, best_val),
                textcoords='offset points', xytext=(5, -10), fontsize=8)

ax.set_title('Validation AUC over Epochs', fontsize=14, fontweight='bold')
ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('AUC', fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend(loc='best', fontsize=10)
plt.tight_layout()
plt.show()

In [None]:
# Plot 3: Accuracy over epochs
fig, ax = plt.subplots(figsize=(10, 6))

for rd in run_data:
    df = rd['df']
    name = rd['name']

    # Runs that did not log both columns cannot be drawn on this figure.
    if 'val/acc' not in df.columns or 'step' not in df.columns:
        continue

    acc_series = df['val/acc']
    epochs = df['step'].values
    val_acc = acc_series.values

    # Plot the line
    line, = ax.plot(epochs, val_acc, marker='o', markersize=4, label=name)

    # Highlight best (maximum) accuracy.
    # The metrics frame is only filtered on val/loss, so val/acc may still
    # contain NaN. ndarray.argmax() would return the position of the first
    # NaN and put the star on a missing value; Series.argmax() skips NaN.
    if not acc_series.notna().any():
        continue  # nothing valid to highlight for this run
    best_idx = int(acc_series.argmax())
    best_epoch = epochs[best_idx]
    best_val = val_acc[best_idx]
    ax.scatter([best_epoch], [best_val], s=150, c=line.get_color(),
               marker='*', edgecolors='black', linewidths=1, zorder=5)
    ax.annotate(f'{best_val:.3f}', (best_epoch, best_val),
                textcoords='offset points', xytext=(5, -10), fontsize=8)

ax.set_title('Validation Accuracy over Epochs', fontsize=14, fontweight='bold')
ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('Accuracy', fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend(loc='best', fontsize=10)
plt.tight_layout()
plt.show()

In [None]:
# Summary table: one row per plotted run, ordered by best validation AUC.
# Columns are declared up front so the frame keeps its schema (and the sort
# below keeps working) even when no run has metrics.
summary_columns = [
    'Run',
    'Epochs',
    'Best Val AUC',
    'Best AUC Epoch',
    'Test AUC',
    'Test Acc',
]

summary_data = []
for rd in run_data:
    df = rd['df']
    # A run may carry an explicit null summary; treat it like an empty one.
    summary = rd['summary'] or {}

    row = {
        'Run': rd['name'],
        'Epochs': len(df),  # number of rows kept in the metrics frame
        'Best Val AUC': summary.get('best_val_auc'),
        'Best AUC Epoch': summary.get('best_val_auc_step'),
        'Test AUC': summary.get('last_test_auc'),
        'Test Acc': summary.get('last_test_acc'),
    }
    summary_data.append(row)

summary_df = pd.DataFrame(summary_data, columns=summary_columns)
# Missing AUC values sort last (pandas' default na_position).
summary_df = summary_df.sort_values('Best Val AUC', ascending=False)
display(summary_df)