# Evaluation Results Comparison

This notebook loads evaluation results from checkpoint directories and compares:
- **Original TRM** (embedding-based)
- **ETRM variants** (encoder-based: F1-F4)

Evaluation uses 32 held-out test puzzle groups, with majority voting across augmentations.

**Important**: This comparison includes a training-duration analysis (steps and data passes per checkpoint) so that the models are compared fairly.

In [None]:
import sys
from pathlib import Path
import json
import re

sys.path.insert(0, str(Path.cwd()))

import pandas as pd
import numpy as np

from figure_utils import (
    fetch_final_runs,
    dataframe_to_markdown,
    save_table,
)

# Set both pandas display options to None so wide result tables are not
# cut off at a fixed number of columns.
for _display_option in ("display.max_columns", "display.width"):
    pd.set_option(_display_option, None)

# Training-set sizes in samples; each experiment's "dataset_samples" entry
# points at one of these and is the denominator of the data-passes figure.
EMB_DATASET_SAMPLES = 3_633_410  # embedding dataset (original TRM)
ENC_DATASET_SAMPLES = 2_373_921  # encoder dataset (ETRM)

## 1. Experiment Configurations

In [None]:
CHECKPOINT_ROOT = Path("../../checkpoints")

# Checkpoint directories: one for the original TRM run, one parent for all ETRM runs
_TRM_RUN_DIR = CHECKPOINT_ROOT / "Arc1concept-aug-1000-ACT-torch" / "pretrain_att_arc1concept_4"
_ETRM_RUNS_DIR = CHECKPOINT_ROOT / "etrm-final"


def _experiment(path, encoder_type, description, batch_size, dataset_samples, config_epochs):
    """Bundle one experiment's settings into the dict layout read by the later cells."""
    return {
        "path": path,
        "encoder_type": encoder_type,
        "description": description,
        "batch_size": batch_size,
        "dataset_samples": dataset_samples,
        "config_epochs": config_epochs,
    }


# Display name -> settings. Later cells iterate this dict in insertion order,
# so the order here is the row order of the tables.
EXPERIMENTS = {
    "TRM (Original)": _experiment(
        _TRM_RUN_DIR,
        "embedding",
        "Original TRM with learned puzzle embeddings",
        768,
        EMB_DATASET_SAMPLES,
        100000,
    ),
    "F1: Standard": _experiment(
        _ETRM_RUNS_DIR / "F1_standard",
        "standard (2L)",
        "Standard transformer encoder, 2 layers",
        256,
        ENC_DATASET_SAMPLES,
        50000,
    ),
    "F2: Hybrid VAE": _experiment(
        _ETRM_RUNS_DIR / "F2_hybrid_var",
        "hybrid_var (4L)",
        "Hybrid variational encoder, 4 layers",
        128,
        ENC_DATASET_SAMPLES,
        25000,
    ),
    "F3: ETRMTRM": _experiment(
        _ETRM_RUNS_DIR / "F3_etrmtrm",
        "trm_style (recurrent)",
        "TRM-style recurrent encoder",
        256,
        ENC_DATASET_SAMPLES,
        25000,
    ),
    "F4: LPN VAE": _experiment(
        _ETRM_RUNS_DIR / "F4_lpn_var",
        "lpn_var (2L)",
        "LPN-style variational encoder",
        256,
        ENC_DATASET_SAMPLES,
        25000,
    ),
}

print(f"Configured {len(EXPERIMENTS)} experiments")

## 2. Load Evaluation Results

In [None]:
def find_eval_files(exp_path, pattern="eval_results_groups_32*.json"):
    """Collect the paths under `exp_path` that match the glob `pattern`.

    Returns a list in whatever order `exp_path.glob` yields; no sorting is
    applied here. The list is empty when nothing matches.
    """
    return [match for match in exp_path.glob(pattern)]


def extract_step_from_filename(filename):
    """Extract the step number from a file name like 'eval_results_groups_32_step_174240.json'.

    Args:
        filename: A file name or full path (str or Path).

    Returns:
        The integer following the first 'step_' in the file name, or None
        when the file name contains no 'step_<digits>'.
    """
    # Search only the final path component: a parent directory such as
    # 'step_100/' must not be mistaken for the step of the file itself.
    match = re.search(r'step_(\d+)', Path(str(filename)).name)
    return int(match.group(1)) if match else None


def load_eval_results(eval_path):
    """Read one evaluation-results JSON file.

    Returns the parsed JSON content, or None when `eval_path` does not exist.
    If the parsed content has no 'step' key, one is added holding the step
    number parsed from the file name (None when the name carries no step).
    """
    if eval_path.exists():
        with open(eval_path) as handle:
            payload = json.load(handle)

        # Fall back to the step encoded in the file name when the JSON lacks one
        if 'step' not in payload:
            payload['step'] = extract_step_from_filename(eval_path)

        return payload

    return None


# Discover and load all eval results.
# eval_results maps exp_name -> list of {'step', 'file', 'results'} dicts,
# ordered by ascending training step.
eval_results = {}

for exp_name, config in EXPERIMENTS.items():
    eval_files = find_eval_files(config["path"])

    if not eval_files:
        print(f"[--] {exp_name}: No eval results")
        continue

    exp_results = []
    for eval_file in eval_files:
        result = load_eval_results(eval_file)
        if not result:
            continue

        # Prefer the step encoded in the file name; otherwise use the 'step'
        # entry of the loaded JSON.
        step = extract_step_from_filename(eval_file)
        if step is None:
            step = result.get('step')
        metrics = result.get('results')

        # The glob can match files that carry no step or no metrics; skip them
        # with a message instead of failing later on formatting/arithmetic.
        if not isinstance(step, int) or not isinstance(metrics, dict):
            print(f"[!!] {exp_name}: skipping {eval_file.name} (no step or no 'results' entry)")
            continue

        exp_results.append({
            'step': step,
            'file': eval_file.name,
            'results': metrics,
        })

    # Sort numerically by step: a plain path sort is lexicographic and would
    # place e.g. step_174240 before step_87120.
    exp_results.sort(key=lambda entry: (entry['step'], entry['file']))

    for entry in exp_results:
        # NaN (not 0) when the metric is absent, so the log never shows a fake 0%
        pass1 = entry['results'].get('ARC/pass@1', np.nan) * 100
        print(f"[OK] {exp_name}: step={entry['step']:,}, pass@1={pass1:.2f}%")

    if exp_results:
        eval_results[exp_name] = exp_results

print(f"\nLoaded results for {len(eval_results)}/{len(EXPERIMENTS)} experiments")

## 3. Training Duration Analysis

**Critical for fair comparison**: Calculate the actual training duration of each model in data passes (steps × batch size / dataset size).

In [None]:
# Training duration per evaluated checkpoint. An experiment with no eval
# results contributes a single placeholder row with NaN measurements.
training_info = []

for exp_name, config in EXPERIMENTS.items():
    batch_size = config["batch_size"]

    if exp_name in eval_results:
        for checkpoint in eval_results[exp_name]:
            n_steps = checkpoint['step']
            n_samples = n_steps * batch_size
            training_info.append({
                "Model": exp_name,
                "Batch Size": batch_size,
                "Steps": n_steps,
                "Samples Seen": n_samples,
                # samples consumed, expressed as multiples of the training set
                "Data Passes": n_samples / config["dataset_samples"],
            })
    else:
        training_info.append({
            "Model": exp_name,
            "Batch Size": batch_size,
            "Steps": np.nan,
            "Samples Seen": np.nan,
            "Data Passes": np.nan,
        })

df_training = pd.DataFrame(training_info)

banner = "=" * 80
print(banner)
print("TRAINING DURATION COMPARISON")
print(banner)
print("\nData Passes = how many times the model saw each training sample on average")
print()
df_training

In [None]:
# Per-model listing of evaluated checkpoints and their data passes
rule = "=" * 80
print(rule)
print("STEP COMPARISON FOR FAIR ANALYSIS")
print(rule)

# Walk the experiments in configuration order and list each one's checkpoints
for exp_name in EXPERIMENTS:
    exp_data = df_training.loc[df_training['Model'] == exp_name]
    if exp_data['Steps'].notna().any():
        print(f"\n{exp_name}:")
        for n_steps, n_passes in zip(exp_data['Steps'], exp_data['Data Passes']):
            print(f"  Step {n_steps:,.0f} → {n_passes:.1f} data passes")
    else:
        # Only placeholder (NaN) rows exist for this experiment
        print(f"\n{exp_name}: No checkpoints")

## 4. Fetch Training Metrics from W&B

In [None]:
# Pull the final runs from W&B; their train accuracy is reported next to the eval results
df_wandb = fetch_final_runs()

# W&B display name -> experiment name used as key in EXPERIMENTS
wandb_mapping = {
    "F1_standard": "F1: Standard",
    "F2_hybrid_var": "F2: Hybrid VAE",
    "F3_etrmtrm": "F3: ETRMTRM",
    "F4_lpn_var": "F4: LPN VAE",
}

# Keep the train accuracy of every run that maps to one of our experiments.
# NOTE(review): the value is printed and tabulated as a percentage without
# rescaling - confirm that 'train_exact_acc' is already in percent.
train_acc = {}
for _, run in df_wandb.iterrows():
    mapped_name = wandb_mapping.get(run['display_name'])
    if mapped_name is not None:
        train_acc[mapped_name] = run['train_exact_acc']

# The original TRM has no W&B run here; use the paper's figure (nearly 100%)
train_acc["TRM (Original)"] = 99.9

print("Training accuracies:")
for name, acc in sorted(train_acc.items()):
    print(f"  {name}: {acc:.1f}%")

## 5. Build Main Results Table

In [None]:
# Build comprehensive results table (one row per checkpoint).
# Experiments without eval results get one placeholder row of NaNs.


def _as_percent(metrics, key):
    """Return metrics[key] scaled to percent, or NaN when the metric is absent.

    A missing metric must not be reported as 0: that would read as a real
    0.00% score in the table instead of "no data".
    """
    value = metrics.get(key)
    return np.nan if value is None else value * 100


rows = []

for exp_name, config in EXPERIMENTS.items():
    # A single None stands in for "no checkpoint" so both cases share one row layout
    checkpoints = eval_results.get(exp_name) or [None]

    for checkpoint in checkpoints:
        if checkpoint is None:
            step, data_passes, metrics = np.nan, np.nan, {}
        else:
            step = checkpoint['step']
            # samples seen (steps x batch size) relative to the training-set size
            data_passes = step * config["batch_size"] / config["dataset_samples"]
            metrics = checkpoint['results']

        rows.append({
            "Model": exp_name,
            "Encoder": config["encoder_type"],
            "Steps": step,
            "Data Passes": data_passes,
            "Train EM%": train_acc.get(exp_name, np.nan),
            "Pass@1%": _as_percent(metrics, "ARC/pass@1"),
            "Pass@2%": _as_percent(metrics, "ARC/pass@2"),
            "Pass@5%": _as_percent(metrics, "ARC/pass@5"),
        })

df_results = pd.DataFrame(rows)
df_results

## 6. Paper-Ready Results Table

In [None]:
# Paper-ready table: same rows as df_results, numbers rendered as strings


def _dash_if_missing(spec):
    """Build a formatter applying format `spec`, with missing values shown as "-"."""
    def _format(value):
        return "-" if pd.isna(value) else format(value, spec)
    return _format


# Column -> format spec; the key order is also the column order of the table
_column_formats = {
    "Model": None,
    "Encoder": None,
    "Steps": ",.0f",
    "Data Passes": ".1f",
    "Train EM%": ".2f",
    "Pass@1%": ".2f",
    "Pass@2%": ".2f",
    "Pass@5%": ".2f",
}

paper_table = df_results[list(_column_formats)].copy()

for col, spec in _column_formats.items():
    if spec is None:
        continue  # text columns are kept as they are
    paper_table[col] = paper_table[col].apply(_dash_if_missing(spec))

print("\n" + "="*80)
print("MAIN RESULTS TABLE (all checkpoints)")
print("="*80)
print(dataframe_to_markdown(paper_table))

## 7. Export Tables

In [None]:
# Render the table first, then embed it in the exported markdown document
table_markdown = dataframe_to_markdown(paper_table)
table_name = "eval_results_comparison"

results_content = f"""# Evaluation Results Comparison

Evaluation on 32 held-out test puzzle groups with majority voting across augmentations.

## All Results (Multiple Checkpoints)

{table_markdown}

### Column Definitions

- **Steps**: Training steps (gradient updates)
- **Data Passes**: Number of times the model saw each training sample (steps × batch_size / dataset_size)
- **Train EM%**: Exact match accuracy on training puzzles
- **Pass@k%**: Test accuracy with k attempts, using majority voting across augmented versions

### Notes

- Evaluation uses 32 held-out puzzle groups (true generalization test)
- Original TRM uses puzzle-specific learned embeddings (cannot generalize to new puzzles)
- ETRM variants use encoder networks that can potentially generalize to unseen puzzles
- TRM has two checkpoints to enable fair comparison at different training stages
"""

save_table(results_content, table_name)
# NOTE(review): the path below is a fixed message; where save_table actually
# writes is not visible in this notebook - confirm they agree.
print("\nSaved to docs/project-report/tables/eval_results_comparison.md")

## 8. Summary

In [None]:
print("\n" + "="*80)
print("SUMMARY")
print("="*80)

# Count unique experiments with results
exps_with_results = len(eval_results)
total_checkpoints = sum(len(v) for v in eval_results.values())
print(f"\nExperiments with results: {exps_with_results}/{len(EXPERIMENTS)}")
print(f"Total checkpoints evaluated: {total_checkpoints}")

# Find best result. Only checkpoints that actually report ARC/pass@1 compete,
# so a missing metric is never mistaken for a 0% score.
candidates = []
for exp_name, checkpoints in eval_results.items():
    for cp in checkpoints:
        raw_pass1 = cp['results'].get('ARC/pass@1')
        if raw_pass1 is None or pd.isna(raw_pass1):
            continue
        candidates.append((raw_pass1 * 100, exp_name, cp['step']))

if candidates:
    # max() keeps the first of equal scores, i.e. the earliest in iteration order
    best_pass1, best_model, best_step = max(candidates, key=lambda item: item[0])
    print(f"\nBest Pass@1: {best_model} @ step {best_step:,} ({best_pass1:.2f}%)")
else:
    # Nothing to rank: say so instead of printing an empty model name at step 0
    best_pass1, best_model, best_step = 0, "", 0
    print("\nBest Pass@1: n/a (no checkpoint reports ARC/pass@1)")

# Step comparison for fairness
print("\n" + "-"*40)
print("FAIR COMPARISON BY STEPS")
print("-"*40)
print("\nSimilar step counts allow fair comparison of gradient updates:")
etrm_steps = [(name, cp['step']) for name, cps in eval_results.items()
              if name != "TRM (Original)" for cp in cps]
for name, step in etrm_steps:
    print(f"  {name}: {step:,} steps")