# Eval Run Plots

Plots loss, AUC, and accuracy curves for each run listed in `run_details.json`.
Best values are highlighted with markers. Baseline results are loaded from `baseline_results.json`.

In [None]:
import json
from pathlib import Path
from typing import Any, Dict, List, Optional

import matplotlib.pyplot as plt
import pandas as pd


def resolve_path(path_str: str) -> Path:
    """Return the first existing location for ``path_str``.

    The path is tried as given, then by bare file name in the current
    directory, then by file name under ``evals/``, and finally relative to
    the parent directory. If none of these exist, the path is returned as
    given so the caller can do its own existence check.
    """
    original = Path(path_str)
    search_order = (
        original,
        Path(original.name),
        Path('evals') / original.name,
        Path('..') / original,
    )
    return next((option for option in search_order if option.exists()), original)


def generate_run_name(run: Dict[str, Any]) -> str:
    """Build a short display name for a run from its ID and key parameters.

    The name starts with a compact timestamp taken from the run ID
    (e.g. ``temporal-vit-20260106-044352`` -> ``01/06-0443``). When the run
    ID does not end in ``<8 digits>-<4+ digits>`` the last 12 characters of
    the ID are used instead. If the run carries a ``params`` dict, up to four
    abbreviated hyperparameters are appended in parentheses.

    Args:
        run: Run record; ``run_id`` and ``params`` are read if present.

    Returns:
        A label such as ``"01/06-0443 (dim256, L4, H8, do0.1)"``, or just the
        timestamp / ID tail when no known parameters are available.
    """
    # `or` also covers an explicit null run_id, which would otherwise crash
    # on .split() below.
    run_id = str(run.get('run_id') or 'unknown')
    params = run.get('params')

    # Only slice a timestamp out of the ID when the last two dash-separated
    # parts really look like a date and a time; otherwise fall back to the
    # tail of the ID instead of producing a garbled label.
    parts = run_id.split('-')
    date_part, time_part = (parts[-2], parts[-1]) if len(parts) >= 4 else ('', '')
    looks_like_timestamp = (
        len(date_part) == 8 and date_part.isdigit()
        and len(time_part) >= 4 and time_part.isdigit()
    )
    if looks_like_timestamp:
        short_ts = f"{date_part[4:6]}/{date_part[6:8]}-{time_part[:4]}"
    else:
        short_ts = run_id[-12:] if len(run_id) > 12 else run_id

    if not (params and isinstance(params, dict)):
        return short_ts

    # Key differentiating parameters, in the order they appear in the name.
    param_abbrevs = {
        'n_trials': 'tr',
        'embed_dim': 'dim',
        'n_layers': 'L',
        'n_heads': 'H',
        'dropout': 'do',
        'drop_path': 'dp',
        'lr': 'lr',
        'weight_decay': 'wd',
        'label_smoothing': 'ls',
        'batch_size': 'bs',
    }

    name_parts = []
    for key, abbrev in param_abbrevs.items():
        if key not in params:
            continue
        val = params[key]
        if isinstance(val, float):
            # Scientific notation only for small non-zero magnitudes, so that
            # 0.0 reads as "0" rather than "0e+00".
            if 0 < abs(val) < 0.01:
                val_str = f"{val:.0e}"
            else:
                val_str = f"{val:.2g}"
        else:
            val_str = str(val)
        name_parts.append(f"{abbrev}{val_str}")

    if name_parts:
        # Cap at four parameters to keep legend entries short.
        return f"{short_ts} ({', '.join(name_parts[:4])})"
    return short_ts


def extract_metrics_df(run: Dict[str, Any]) -> pd.DataFrame:
    """Return the run's ``metrics`` entries as a DataFrame.

    An empty DataFrame is returned when the run has no ``metrics`` entries.
    When a ``val/loss`` column is present, rows without a validation loss
    (e.g. test-only rows) are dropped.
    """
    records = run.get('metrics', [])
    if records:
        frame = pd.DataFrame(records)
        has_val_loss = 'val/loss' in frame.columns
        return frame.dropna(subset=['val/loss']) if has_val_loss else frame
    return pd.DataFrame()


def load_baselines(path_str: str = 'evals/baseline_results.json') -> List[Dict[str, Any]]:
    """Read the list of baseline results from a JSON file.

    The location is looked up with ``resolve_path``. If no file is found, a
    hint is printed and an empty list is returned; otherwise the value stored
    under the ``baselines`` key is returned (an empty list when the key is
    absent).
    """
    baseline_file = resolve_path(path_str)
    if baseline_file.exists():
        with baseline_file.open('r', encoding='utf-8') as fh:
            contents = json.load(fh)
        return contents.get('baselines', [])

    print(f"Baseline results not found at {baseline_file}. Run collect_baseline_results.py first.")
    return []


# Load run details: look for the file under its evals/ path first, then by
# bare file name. The last location tried is kept even if it does not exist,
# so a missing file surfaces as the open() error below.
for _candidate in ('evals/run_details.json', 'run_details.json'):
    run_details_path = resolve_path(_candidate)
    if run_details_path.exists():
        break

with run_details_path.open('r', encoding='utf-8') as handle:
    payload = json.load(handle)

# The plots below need at least one run.
runs = payload.get('runs', [])
if not runs:
    raise ValueError('No runs found in run_details.json')

# Baselines are optional; load_baselines() returns [] when the file is absent.
baselines = load_baselines()

for _count, _what in ((len(runs), 'runs'), (len(baselines), 'baseline results')):
    print(f"Found {_count} {_what}")


In [None]:
# Load HP tuning run details (optional): try the evals/ path first, then the
# bare file name. A missing file is reported and leaves hptune_runs empty.
hptune_runs = []
for _candidate in ('evals/hptune_run_details.json', 'hptune_run_details.json'):
    hptune_details_path = resolve_path(_candidate)
    if hptune_details_path.exists():
        break

if not hptune_details_path.exists():
    print(f"HP tuning details not found at {hptune_details_path}")
else:
    with hptune_details_path.open('r', encoding='utf-8') as handle:
        hptune_payload = json.load(handle)
    hptune_runs = hptune_payload.get('runs', [])
    print(f"Found {len(hptune_runs)} HP tuning runs")

def summarize_hptune_runs(runs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Collect the headline AUC numbers of each HP tuning run.

    Args:
        runs: Run records; each may carry ``run_id`` and a ``summary`` dict
            with ``best_val_auc`` and ``last_test_auc``.

    Returns:
        One dict per run with keys ``run_id``, ``val_auc``, ``test_auc`` and
        ``summary``. Runs that report neither AUC are left out; a missing
        run ID is replaced by ``'unknown'``.
    """
    rows = []
    for run in runs or []:
        # `or {}` also covers an explicit null summary, which would otherwise
        # raise on the .get() calls below.
        summary = run.get('summary') or {}
        val_auc = summary.get('best_val_auc')
        test_auc = summary.get('last_test_auc')
        if val_auc is None and test_auc is None:
            continue
        rows.append({
            'run_id': run.get('run_id') or 'unknown',
            'val_auc': val_auc,
            'test_auc': test_auc,
            'summary': summary,
        })
    return rows

# Keep only the HP tuning runs whose summary reports a validation or test AUC;
# the HP tuning bar/gap plots further down read from hptune_data.
hptune_data = summarize_hptune_runs(hptune_runs)
print(f"HP runs with summary metrics: {len(hptune_data)}")


In [None]:
# Prepare data for each run: one entry per run that has per-epoch metrics.
run_data = []
for run in runs:
    df = extract_metrics_df(run)
    if df.empty:
        # Runs without validation rows have nothing to plot.
        continue

    name = generate_run_name(run)
    run_data.append({
        'name': name,
        'run_id': run.get('run_id'),
        'df': df,
        # `or {}` keeps a null summary in the JSON from breaking the .get()
        # lookups that later cells perform on this entry.
        'summary': run.get('summary') or {},
    })

print(f"Runs with metrics: {len(run_data)}")
for rd in run_data:
    print(f"  - {rd['name']}: {len(rd['df'])} epochs")

print('\nBaselines:')
for bl in baselines:
    # A baseline may lack (or null out) its test block; treat that as "no metrics".
    test_metrics = bl.get('test') or {}
    test_auc = test_metrics.get('auc')
    test_acc = test_metrics.get('acc')
    auc_str = f"{test_auc:.4f}" if isinstance(test_auc, float) else 'N/A'
    acc_str = f"{test_acc:.4f}" if isinstance(test_acc, float) else 'N/A'
    print(f"  - {bl['name']}: test_auc={auc_str}, test_acc={acc_str}")


In [None]:
# Plot 1: Loss over epochs
# One validation-loss curve per run, with a star on each run's lowest value.
fig, ax = plt.subplots(figsize=(9, 5))
required_columns = {'step', 'val/loss'}

for rd in run_data:
    df = rd['df']
    if not required_columns.issubset(df.columns):
        # Nothing to draw for runs that did not log both columns.
        continue

    steps = df['step'].values
    losses = df['val/loss'].values
    (curve,) = ax.plot(steps, losses, marker='o', markersize=4, label=rd['name'])

    # Mark the minimum loss in the curve's own colour and print its value.
    lowest = losses.argmin()
    best_point = (steps[lowest], losses[lowest])
    ax.scatter([best_point[0]], [best_point[1]], s=150, c=curve.get_color(),
               marker='*', edgecolors='black', linewidths=1, zorder=5)
    ax.annotate(f'{best_point[1]:.3f}', best_point,
                textcoords='offset points', xytext=(5, 5), fontsize=8)

ax.set_title('Validation Loss over Epochs', fontsize=14, fontweight='bold')
ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('Loss', fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend(loc='best', fontsize=10)
plt.tight_layout()
plt.show()


In [None]:
# Plot 2: Validation AUC over epochs
# One validation-AUC curve per run, with a star on each run's highest value.
fig, ax = plt.subplots(figsize=(9, 5))

for rd in run_data:
    df = rd['df']
    name = rd['name']

    if 'val/auc' not in df.columns or 'step' not in df.columns:
        continue

    epochs = df['step'].values
    val_auc = df['val/auc'].values

    # Plot the line
    line, = ax.plot(epochs, val_auc, marker='o', markersize=4, label=name)

    # Highlight best (maximum) AUC. Rows are only guaranteed to have a
    # val/loss, so val/auc may contain NaN; a plain argmax would then pick
    # the first NaN. idxmax skips missing values, and the reset index makes
    # its result a position into `epochs` / `val_auc`.
    auc_series = df['val/auc'].reset_index(drop=True)
    if auc_series.notna().any():
        best_idx = auc_series.idxmax()
        best_epoch = epochs[best_idx]
        best_val = val_auc[best_idx]
        ax.scatter([best_epoch], [best_val], s=150, c=line.get_color(),
                   marker='*', edgecolors='black', linewidths=1, zorder=5)
        ax.annotate(f'{best_val:.4f}', (best_epoch, best_val),
                    textcoords='offset points', xytext=(5, -10), fontsize=8)

ax.set_title('Validation AUC over Epochs', fontsize=14, fontweight='bold')
ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('AUC', fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend(loc='best', fontsize=10)
plt.tight_layout()
plt.show()


In [None]:
# Plot 3: Validation Accuracy over epochs
# One validation-accuracy curve per run, with a star on each run's highest value.
fig, ax = plt.subplots(figsize=(9, 5))

for rd in run_data:
    df = rd['df']
    name = rd['name']

    if 'val/acc' not in df.columns or 'step' not in df.columns:
        continue

    epochs = df['step'].values
    val_acc = df['val/acc'].values

    # Plot the line
    line, = ax.plot(epochs, val_acc, marker='o', markersize=4, label=name)

    # Highlight best (maximum) accuracy. Rows are only guaranteed to have a
    # val/loss, so val/acc may contain NaN; a plain argmax would then pick
    # the first NaN. idxmax skips missing values, and the reset index makes
    # its result a position into `epochs` / `val_acc`.
    acc_series = df['val/acc'].reset_index(drop=True)
    if acc_series.notna().any():
        best_idx = acc_series.idxmax()
        best_epoch = epochs[best_idx]
        best_val = val_acc[best_idx]
        ax.scatter([best_epoch], [best_val], s=150, c=line.get_color(),
                   marker='*', edgecolors='black', linewidths=1, zorder=5)
        ax.annotate(f'{best_val:.3f}', (best_epoch, best_val),
                    textcoords='offset points', xytext=(5, -10), fontsize=8)

ax.set_title('Validation Accuracy over Epochs', fontsize=14, fontweight='bold')
ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('Accuracy', fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend(loc='best', fontsize=10)
plt.tight_layout()
plt.show()


In [None]:
# Plot 4: Test AUC comparison (ViT vs baselines)
# Horizontal bar chart of final test AUC, one bar per ViT run and per baseline.
model_names = []
test_aucs = []
colors = []

# ViT runs: only those whose summary carries a float test AUC get a bar.
for rd in run_data:
    test_auc = (rd['summary'] or {}).get('last_test_auc')
    if isinstance(test_auc, float):
        model_names.append(f"ViT: {rd['name']}")
        test_aucs.append(test_auc)
        colors.append('steelblue')

baseline_color_map = {
    'log_reg': 'coral',
    'logistic_regression': 'coral',
    'xgboost': 'forestgreen',
    'random_forest': 'purple',
}
# Legend text for every colour a bar can take; gray is the fallback for
# model types that are not in baseline_color_map.
color_labels = {
    'steelblue': 'ViT',
    'coral': 'Logistic Regression',
    'forestgreen': 'XGBoost',
    'purple': 'Random Forest',
    'gray': 'Other baseline',
}
for bl in baselines:
    test_auc = (bl.get('test') or {}).get('auc')
    if isinstance(test_auc, float):
        model_names.append(bl['name'])
        test_aucs.append(test_auc)
        colors.append(baseline_color_map.get(bl.get('model_type', ''), 'gray'))

if not test_aucs:
    print('No test AUC values available to plot.')
else:
    # Order the bars by descending AUC (stable, so ties keep insertion order).
    ranked = sorted(zip(test_aucs, model_names, colors), key=lambda item: item[0], reverse=True)
    test_aucs = [item[0] for item in ranked]
    model_names = [item[1] for item in ranked]
    colors = [item[2] for item in ranked]

    # Create the figure only when there is something to draw.
    fig, ax = plt.subplots(figsize=(12, 6))
    bars = ax.barh(range(len(model_names)), test_aucs, color=colors, edgecolor='black', linewidth=0.5)
    for bar, auc in zip(bars, test_aucs):
        # Value label just past the end of each bar.
        ax.text(auc + 0.005, bar.get_y() + bar.get_height() / 2, f'{auc:.4f}',
                va='center', ha='left', fontsize=9)

    ax.set_yticks(range(len(model_names)))
    ax.set_yticklabels(model_names, fontsize=10)
    ax.set_xlabel('Test AUC', fontsize=12)
    ax.set_title('Test AUC Comparison: ViT vs Baselines', fontsize=14, fontweight='bold')
    # Leave headroom on the right for the value labels.
    ax.set_xlim(0, max(test_aucs) * 1.1)
    ax.grid(True, alpha=0.3, axis='x')

    from matplotlib.patches import Patch
    # List exactly the model families that appear in the chart.
    legend_elements = [
        Patch(facecolor=color, edgecolor='black', label=label)
        for color, label in color_labels.items()
        if color in colors
    ]
    ax.legend(handles=legend_elements, loc='lower right', fontsize=10)

    plt.tight_layout()
    plt.show()


In [None]:
# Plot 5: Test Accuracy comparison (ViT vs baselines)
# Horizontal bar chart of final test accuracy, one bar per ViT run and per baseline.
model_names = []
test_accs = []
colors = []

# ViT runs: only those whose summary carries a float test accuracy get a bar.
for rd in run_data:
    test_acc = (rd['summary'] or {}).get('last_test_acc')
    if isinstance(test_acc, float):
        model_names.append(f"ViT: {rd['name']}")
        test_accs.append(test_acc)
        colors.append('steelblue')

baseline_color_map = {
    'log_reg': 'coral',
    'logistic_regression': 'coral',
    'xgboost': 'forestgreen',
    'random_forest': 'purple',
}
# Legend text for every colour a bar can take; gray is the fallback for
# model types that are not in baseline_color_map.
color_labels = {
    'steelblue': 'ViT',
    'coral': 'Logistic Regression',
    'forestgreen': 'XGBoost',
    'purple': 'Random Forest',
    'gray': 'Other baseline',
}
for bl in baselines:
    test_acc = (bl.get('test') or {}).get('acc')
    if isinstance(test_acc, float):
        model_names.append(bl['name'])
        test_accs.append(test_acc)
        colors.append(baseline_color_map.get(bl.get('model_type', ''), 'gray'))

if not test_accs:
    print('No test accuracy values available to plot.')
else:
    # Order the bars by descending accuracy (stable, so ties keep insertion order).
    ranked = sorted(zip(test_accs, model_names, colors), key=lambda item: item[0], reverse=True)
    test_accs = [item[0] for item in ranked]
    model_names = [item[1] for item in ranked]
    colors = [item[2] for item in ranked]

    # Create the figure only when there is something to draw.
    fig, ax = plt.subplots(figsize=(12, 6))
    bars = ax.barh(range(len(model_names)), test_accs, color=colors, edgecolor='black', linewidth=0.5)
    for bar, acc in zip(bars, test_accs):
        # Value label just past the end of each bar.
        ax.text(acc + 0.005, bar.get_y() + bar.get_height() / 2, f'{acc:.4f}',
                va='center', ha='left', fontsize=9)

    ax.set_yticks(range(len(model_names)))
    ax.set_yticklabels(model_names, fontsize=10)
    ax.set_xlabel('Test Accuracy', fontsize=12)
    ax.set_title('Test Accuracy Comparison: ViT vs Baselines', fontsize=14, fontweight='bold')
    # Leave headroom on the right for the value labels.
    ax.set_xlim(0, max(test_accs) * 1.1)
    ax.grid(True, alpha=0.3, axis='x')

    from matplotlib.patches import Patch
    # List exactly the model families that appear in the chart.
    legend_elements = [
        Patch(facecolor=color, edgecolor='black', label=label)
        for color, label in color_labels.items()
        if color in colors
    ]
    ax.legend(handles=legend_elements, loc='lower right', fontsize=10)

    plt.tight_layout()
    plt.show()


In [None]:
# Summary table with baselines
# One row per ViT run and per baseline, sorted by test AUC (best first).
summary_columns = [
    'Model', 'Run', 'Epochs', 'Best Val AUC', 'Best AUC Epoch', 'Test AUC', 'Test Acc',
]
summary_data = []

for rd in run_data:
    df = rd['df']
    summary = rd['summary'] or {}

    row = {
        'Model': 'ViT',
        'Run': rd['name'],
        'Epochs': len(df),
        'Best Val AUC': summary.get('best_val_auc'),
        'Best AUC Epoch': summary.get('best_val_auc_step'),
        'Test AUC': summary.get('last_test_auc'),
        'Test Acc': summary.get('last_test_acc'),
    }
    summary_data.append(row)

for bl in baselines:
    # Baselines have no training epochs, hence the '-' placeholders.
    row = {
        'Model': bl.get('model_type', 'baseline'),
        'Run': bl['name'],
        'Epochs': '-',
        'Best Val AUC': (bl.get('val') or {}).get('auc'),
        'Best AUC Epoch': '-',
        'Test AUC': (bl.get('test') or {}).get('auc'),
        'Test Acc': (bl.get('test') or {}).get('acc'),
    }
    summary_data.append(row)

# Passing the column list keeps the frame well-formed (and the sort below
# working) even when there are no rows at all.
summary_df = pd.DataFrame(summary_data, columns=summary_columns)
# Rows whose test AUC is not a float get a key of -1.
summary_df['_sort_key'] = summary_df['Test AUC'].apply(lambda x: x if isinstance(x, float) else -1)
summary_df = summary_df.sort_values('_sort_key', ascending=False).drop(columns=['_sort_key'])

display(summary_df)


In [None]:
# Plot 6: HP tuning trial AUCs (validation vs test)
# Grouped bars per HP tuning run: validation AUC next to test AUC.
if not hptune_data:
    print('No HP tuning runs to plot.')
else:
    import numpy as np
    import math

    def short_run_id(run_id: str) -> str:
        """Return the last 8 characters of a run ID ('unknown' if empty)."""
        if not run_id:
            return 'unknown'
        return run_id[-8:] if len(run_id) > 8 else run_id

    def to_float_or_nan(value):
        """Coerce a metric to float; None and non-numeric values become NaN."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return math.nan

    def add_value_labels(bars, values, y_pad=0.01, ax=None):
        """Write each bar's value just above it, skipping missing values.

        ``ax`` defaults to the current axes when not given.
        """
        if ax is None:
            ax = plt.gca()
        for bar, val in zip(bars, values):
            val_f = to_float_or_nan(val)
            if math.isnan(val_f):
                continue
            ax.text(
                bar.get_x() + bar.get_width() / 2,
                bar.get_height() + y_pad,
                f"{val_f:.3f}",
                ha='center',
                va='bottom',
                fontsize=8,
                rotation=0,
                color='black',
                bbox=dict(facecolor='white', edgecolor='none', alpha=0.7, pad=1.2),
                clip_on=False,
            )

    labels = [short_run_id(row['run_id']) for row in hptune_data]
    # A run may report only one of the two AUCs. Bar heights must be numeric,
    # so a missing value is passed as NaN instead of None.
    val_aucs = [to_float_or_nan(row.get('val_auc')) for row in hptune_data]
    test_aucs = [to_float_or_nan(row.get('test_auc')) for row in hptune_data]

    x = np.arange(len(labels))
    width = 0.38

    # Widen the figure with the number of runs so tick labels stay readable.
    fig, ax = plt.subplots(figsize=(max(10, len(labels) * 0.55), 6))
    val_bars = ax.bar(x - width / 2, val_aucs, width, label='Val AUC', color='steelblue')
    test_bars = ax.bar(x + width / 2, test_aucs, width, label='Test AUC', color='darkorange')

    # Leave headroom above the tallest bar for its value label.
    valid_vals = [v for v in (val_aucs + test_aucs) if not math.isnan(v)]
    y_max = max(valid_vals) if valid_vals else 1.0
    ax.set_ylim(0, min(1.2, y_max + 0.08))

    add_value_labels(val_bars, val_aucs, ax=ax)
    add_value_labels(test_bars, test_aucs, ax=ax)

    ax.set_xticks(x)
    ax.set_xticklabels(labels, rotation=45, ha='right', fontsize=8)
    ax.set_ylabel('AUC')
    ax.set_title('HP Tuning Trials: Validation vs Test AUC')
    ax.grid(True, axis='y', alpha=0.3)
    ax.legend()
    plt.tight_layout()
    plt.show()


In [None]:
# Plot 7: HP tuning gap (val - test) and scatter
# Two figures over the runs that report BOTH AUCs: a bar chart of the
# val-minus-test gap per run, and a val-vs-test scatter with a y = x guide.
if not hptune_data:
    print('No HP tuning runs to plot.')
else:
    import numpy as np
    import math

    def short_run_id(run_id: str) -> str:
        """Return the last 8 characters of a run ID ('unknown' if empty)."""
        if not run_id:
            return 'unknown'
        return run_id[-8:] if len(run_id) > 8 else run_id

    labels = [short_run_id(row['run_id']) for row in hptune_data]
    val_aucs = [row.get('val_auc') for row in hptune_data]
    test_aucs = [row.get('test_auc') for row in hptune_data]

    # Keep only runs where both AUCs are real numbers; a gap needs both.
    rows = []
    for label, val, test in zip(labels, val_aucs, test_aucs):
        if not isinstance(val, (int, float)) or not isinstance(test, (int, float)):
            continue
        if math.isnan(val) or math.isnan(test):
            continue
        rows.append((label, float(val), float(test)))

    if not rows:
        print('No valid HP runs with val/test AUC to plot.')
    else:
        labels_f, val_f, test_f = zip(*rows)
        # Positive gap: validation AUC is higher than test AUC.
        gaps = [v - t for v, t in zip(val_f, test_f)]

        x = np.arange(len(labels_f))
        fig, ax = plt.subplots(figsize=(max(10, len(labels_f) * 0.55), 5))
        ax.bar(x, gaps, color='slategray')
        ax.axhline(0, color='black', linewidth=0.8)
        ax.set_xticks(x)
        ax.set_xticklabels(labels_f, rotation=45, ha='right', fontsize=8)
        ax.set_ylabel('Val AUC - Test AUC')
        ax.set_title('HP Tuning Trials: Generalization Gap')
        ax.grid(True, axis='y', alpha=0.3)
        plt.tight_layout()

        fig, ax = plt.subplots(figsize=(6, 6))
        ax.scatter(val_f, test_f, color='teal', alpha=0.8)
        # Diagonal guide: points below it have test AUC < validation AUC.
        ax.plot([0, 1], [0, 1], '--', color='gray')
        ax.set_xlim(0, 1.0)
        ax.set_ylim(0, 1.0)
        ax.set_xlabel('Validation AUC')
        ax.set_ylabel('Test AUC')
        ax.set_title('HP Tuning Trials: Val vs Test AUC')
        ax.grid(True, alpha=0.3)
        plt.tight_layout()

        # Show both figures explicitly, as the earlier plot cells do.
        plt.show()


In [None]:
# Plot 8: HP tuning validation AUC over epochs
# One validation-AUC curve per HP tuning run that logged per-epoch metrics.
if not hptune_runs:
    print('No HP tuning runs to plot.')
else:
    def short_run_id(run_id: str) -> str:
        """Return the last 8 characters of a run ID ('unknown' if empty)."""
        if not run_id:
            return 'unknown'
        return run_id[-8:] if len(run_id) > 8 else run_id

    # Keep only runs that have both the x (step) and y (val/auc) columns.
    hp_run_data = []
    for run in hptune_runs:
        df = extract_metrics_df(run)
        if df.empty or 'val/auc' not in df.columns or 'step' not in df.columns:
            continue
        hp_run_data.append({
            'run_id': run.get('run_id') or 'unknown',
            'df': df,
        })

    if not hp_run_data:
        print('No HP tuning runs with per-epoch val AUC.')
    else:
        fig, ax = plt.subplots(figsize=(10, 6))
        for rd in hp_run_data:
            df = rd['df']
            epochs = df['step'].values
            val_auc = df['val/auc'].values
            ax.plot(epochs, val_auc, marker='o', markersize=3, label=short_run_id(rd['run_id']))

        ax.set_xlabel('Epoch')
        ax.set_ylabel('Val AUC')
        ax.set_title('HP Tuning: Validation AUC over Epochs')
        ax.grid(True, alpha=0.3)
        # Legend outside the axes: there can be many trials.
        ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', fontsize=8)
        plt.tight_layout()
        # Show the figure explicitly, as the earlier plot cells do.
        plt.show()


In [None]:
# Plot 9: HP tuning validation loss over epochs
# One validation-loss curve per HP tuning run that logged per-epoch metrics.
if not hptune_runs:
    print('No HP tuning runs to plot.')
else:
    # Defined here as in the other HP tuning cells, so this cell does not
    # depend on one of them having been run first.
    def short_run_id(run_id: str) -> str:
        """Return the last 8 characters of a run ID ('unknown' if empty)."""
        if not run_id:
            return 'unknown'
        return run_id[-8:] if len(run_id) > 8 else run_id

    # Keep only runs that have both the x (step) and y (val/loss) columns.
    hp_run_data = []
    for run in hptune_runs:
        df = extract_metrics_df(run)
        if df.empty or 'val/loss' not in df.columns or 'step' not in df.columns:
            continue
        hp_run_data.append({
            'run_id': run.get('run_id') or 'unknown',
            'df': df,
        })

    if not hp_run_data:
        print('No HP tuning runs with per-epoch val loss.')
    else:
        fig, ax = plt.subplots(figsize=(10, 6))
        for rd in hp_run_data:
            df = rd['df']
            epochs = df['step'].values
            val_loss = df['val/loss'].values
            ax.plot(epochs, val_loss, marker='o', markersize=3, label=short_run_id(rd['run_id']))

        ax.set_xlabel('Epoch')
        ax.set_ylabel('Val Loss')
        ax.set_title('HP Tuning: Validation Loss over Epochs')
        ax.grid(True, alpha=0.3)
        # Legend outside the axes: there can be many trials.
        ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', fontsize=8)
        plt.tight_layout()
        # Show the figure explicitly, as the earlier plot cells do.
        plt.show()
