# ETRM Results Tables

This notebook generates tables for the Experiments section:
1. **Semi-Final Architecture Search**: Narrative summary organized by design dimension
2. **Final Training Results**: Main results table comparing architectures

Tables are exported as markdown for easy LaTeX conversion.

In [1]:
import sys
from pathlib import Path

# Put the current working directory (Path.cwd(), not its parent) at the front
# of sys.path — presumably so the local `figure_utils` module imported below
# resolves when the notebook is launched from its own folder; verify.
sys.path.insert(0, str(Path.cwd()))

import pandas as pd
import numpy as np
from figure_utils import (
    fetch_semifinal_runs,
    fetch_final_runs,
    dataframe_to_markdown,
    save_table,
    format_number,
)

# Lift the column cap and let pandas choose the output width automatically,
# so wide run tables are shown in full.
for _display_option in ('display.max_columns', 'display.width'):
    pd.set_option(_display_option, None)

## 1. Fetch Data from W&B

In [2]:
# Load the semi-final runs and report how many were returned
df_semifinal = fetch_semifinal_runs()
n_semifinal_runs = len(df_semifinal)
print(f"Semi-final: {n_semifinal_runs} runs")

# Load the final runs and report how many were returned
df_final = fetch_final_runs()
n_final_runs = len(df_final)
print(f"Final: {n_final_runs} runs")

[34m[1mwandb[0m: Currently logged in as: [33mbdsaglam[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Semi-final: 17 runs
Final: 4 runs


In [3]:
# Show the semi-final columns that the tables below are built from
semifinal_preview_cols = [
    'display_name',
    'encoder_type',
    'encoder_layers',
    'halt_explore_prob',
    'kl_weight',
    'train_exact_acc',
    'arc_pass1',
    'state',
]
df_semifinal.loc[:, semifinal_preview_cols]

Unnamed: 0,display_name,encoder_type,encoder_layers,halt_explore_prob,kl_weight,train_exact_acc,arc_pass1,state
0,LPN4_deep_var,lpn_variational,8,0.5,0.0001,0.0,0.0,finished
1,LPN3_deep_std,lpn_standard,8,0.5,0.0001,0.0,0.0,finished
2,LPN2_paper_var,lpn_var,2,0.5,0.0001,12.109375,0.75,finished
3,LPN1_paper_std,lpn,2,0.5,0.0001,2.734375,0.0,killed
4,SF3_hybrid_var_kl_2,hybrid_variational,4,0.5,0.01,5.46875,0.0,finished
5,SF3_hybrid_var_kl_3,hybrid_variational,4,0.5,0.001,4.6875,0.0,finished
6,SF3_hybrid_var,hybrid_variational,4,0.5,0.0001,0.78125,0.0,finished
7,SF3_hybrid_var_baseline,hybrid_variational,4,0.5,0.0,2.34375,0.0,finished
8,SF5_standard_baseline,standard,2,0.5,0.0,43.359375,1.0,finished
9,SF1_hybrid_std_baseline,hybrid_standard,4,0.5,0.0,37.5,0.5,finished


In [4]:
# Show the final-run columns that the results table is built from
final_preview_cols = [
    'display_name',
    'encoder_type',
    'train_exact_acc',
    'arc_pass1',
    'arc_pass2',
    'arc_pass5',
    'state',
    'steps',
]
df_final.loc[:, final_preview_cols]

Unnamed: 0,display_name,encoder_type,train_exact_acc,arc_pass1,arc_pass2,arc_pass5,state,steps
0,F4_lpn_var,lpn_var,23.828125,0,0.0,0.0,running,1777
1,F3_etrmtrm,standard,51.171875,0,0.25,0.25,finished,87310
2,F2_hybrid_var,hybrid_variational,40.625,0,0.0,0.0,finished,174240
3,F1_standard,standard,78.90625,0,0.5,0.5,finished,174622


## 2. Semi-Final Results: Architecture Search

Organize results by design dimension to show systematic exploration.

### 2.1 Encoder Architecture Comparison

In [None]:
# Keep finished runs that used the baseline exploration probability (0.5)
df_arch = df_semifinal[
    (df_semifinal['state'] == 'finished') &
    (df_semifinal['halt_explore_prob'] == 0.5)
].copy()

# Pick the best run (highest train exact-match accuracy) per encoder type.
#
# This deliberately avoids `groupby(...).apply(lambda g: g.loc[...])`:
#   * that pattern depends on the grouping column being handed to `apply`,
#     which pandas 2.2 deprecates; once it is excluded, the
#     `reset_index(drop=True)` throws the group key away and the
#     'encoder_type' selection below raises KeyError;
#   * stacking one row Series per group upcasts numeric columns to object.
# Selecting rows by `idxmax` label keeps the original dtypes and columns.
#
# Rows without an accuracy are dropped first so a group made only of NaNs
# cannot produce a NaN label, and the index is reset so every label is unique.
arch_candidates = df_arch.dropna(subset=['train_exact_acc']).reset_index(drop=True)
best_row_labels = arch_candidates.groupby('encoder_type')['train_exact_acc'].idxmax()
arch_comparison = arch_candidates.loc[best_row_labels].reset_index(drop=True)

# Select columns for table
arch_table = arch_comparison[[
    'encoder_type', 'encoder_layers', 'train_exact_acc', 'arc_pass1', 'arc_pass2', 'arc_pass5', 'grad_encoder'
]].copy()

# Sort while the original column names are still in place
arch_table = arch_table.sort_values('train_exact_acc', ascending=False)

arch_table.columns = ['Encoder', 'Layers', 'Train EM%', 'Pass@1%', 'Pass@2%', 'Pass@5%', 'Grad Norm']

print("\n### Encoder Architecture Comparison (explore=0.5)")
print(dataframe_to_markdown(arch_table))

### 2.2 Exploration Probability Effect

In [None]:
# Finished runs whose encoder type is 'standard' or 'hybrid_standard'
finished_mask = df_semifinal['state'] == 'finished'
encoder_mask = df_semifinal['encoder_type'].isin(['standard', 'hybrid_standard'])
df_explore = df_semifinal[finished_mask & encoder_mask].copy()

# Source column -> table header, listed in display order
explore_headers = {
    'display_name': 'Run',
    'encoder_type': 'Encoder',
    'halt_explore_prob': 'Explore Prob',
    'train_exact_acc': 'Train EM%',
    'arc_pass1': 'Pass@1%',
    'arc_pass2': 'Pass@2%',
    'arc_pass5': 'Pass@5%',
}

# Sorting happens on the source names, so it must precede the rename
explore_table = (
    df_explore[list(explore_headers)]
    .sort_values(['encoder_type', 'halt_explore_prob'])
    .rename(columns=explore_headers)
)

print("\n### Exploration Probability Effect")
print(dataframe_to_markdown(explore_table))

### 2.3 KL Weight Variations (Variational Encoders)

In [None]:
# Finished runs whose encoder type contains "var" (case-insensitive)
kl_finished_mask = df_semifinal['state'] == 'finished'
kl_variational_mask = df_semifinal['encoder_type'].str.contains('var', case=False, na=False)
df_kl = df_semifinal[kl_finished_mask & kl_variational_mask].copy()

if len(df_kl) == 0:
    print("No variational encoder runs found")
else:
    # Source column -> table header, listed in display order
    kl_headers = {
        'display_name': 'Run',
        'encoder_type': 'Encoder',
        'kl_weight': 'KL Weight',
        'train_exact_acc': 'Train EM%',
        'arc_pass1': 'Pass@1%',
        'arc_pass2': 'Pass@2%',
        'arc_pass5': 'Pass@5%',
        'encoder_var': 'Enc Var',
    }

    # Best training accuracy first; sort on source names, then rename
    kl_table = (
        df_kl[list(kl_headers)]
        .sort_values('train_exact_acc', ascending=False)
        .rename(columns=kl_headers)
    )

    print("\n### KL Weight Variations (Variational Encoders)")
    print(dataframe_to_markdown(kl_table))

### 2.4 LPN Architecture Variants

In [None]:
# Finished runs whose display name contains "LPN" (case-insensitive)
lpn_finished_mask = df_semifinal['state'] == 'finished'
lpn_name_mask = df_semifinal['display_name'].str.contains('LPN', case=False, na=False)
df_lpn = df_semifinal[lpn_finished_mask & lpn_name_mask].copy()

if len(df_lpn) == 0:
    print("No LPN runs found")
else:
    # Source column -> table header, listed in display order
    lpn_headers = {
        'display_name': 'Run',
        'encoder_layers': 'Layers',
        'train_exact_acc': 'Train EM%',
        'arc_pass1': 'Pass@1%',
        'arc_pass2': 'Pass@2%',
        'arc_pass5': 'Pass@5%',
        'num_params': 'Params',
        'grad_encoder': 'Grad Norm',
    }

    lpn_table = (
        df_lpn[list(lpn_headers)]
        # Best training accuracy first (sorted on the source column name)
        .sort_values('train_exact_acc', ascending=False)
        # Parameter count rendered in millions, "-" when missing
        .assign(
            num_params=lambda tbl: tbl['num_params'].apply(
                lambda n: "-" if pd.isna(n) else f"{n/1e6:.1f}M"
            )
        )
        .rename(columns=lpn_headers)
    )

    print("\n### LPN Architecture Variants")
    print(dataframe_to_markdown(lpn_table))

## 3. Final Training Results

In [None]:
# Source column -> table header, listed in display order
final_headers = {
    'display_name': 'Experiment',
    'encoder_type': 'Encoder',
    'num_params': 'Params',
    'steps': 'Steps',
    'train_exact_acc': 'Train EM%',
    'arc_pass1': 'Pass@1%',
    'arc_pass2': 'Pass@2%',
    'arc_pass5': 'Pass@5%',
    'state': 'Status',
}

final_table = (
    df_final[list(final_headers)]
    # Best training accuracy first (sorted on the source column name)
    .sort_values('train_exact_acc', ascending=False)
    # Parameters in millions and steps in thousands; "-" when missing
    .assign(
        num_params=lambda tbl: tbl['num_params'].apply(
            lambda n: "-" if pd.isna(n) else f"{n/1e6:.1f}M"
        ),
        steps=lambda tbl: tbl['steps'].apply(
            lambda s: "-" if pd.isna(s) else f"{s/1000:.0f}k"
        ),
    )
    .rename(columns=final_headers)
)

print("\n### Final Training Results (50k epochs)")
print(dataframe_to_markdown(final_table))

## 4. Export Tables

In [None]:
# Markdown template for the semi-final (architecture search) export.
# The {placeholders} are filled with the tables built in section 2.
semifinal_content = """# Preliminary Experiments: Architecture Search

Semi-final experiments (1000 epochs) on full training set with 32 test puzzle evaluation.

## Encoder Architecture Comparison

Best performer per architecture type with baseline exploration probability (0.5).

{arch_table}

## Exploration Probability Effect

Effect of exploration probability on standard and hybrid encoders.

{explore_table}

## KL Weight Variations (Variational Encoders)

Testing different KL regularization weights for variational bottleneck.

{kl_table}

## LPN Architecture Variants

Paper-exact LPN implementations with different configurations.

{lpn_table}

### Column Definitions

- **Train EM%**: Exact match accuracy on training puzzles
- **Pass@1%**: Test accuracy with 1 attempt (majority voting across augmentations)
- **Pass@2%**: Test accuracy with best of 2 attempts
- **Pass@5%**: Test accuracy with best of 5 attempts
"""

# Render each table, falling back to a short note when it was never built.
# `kl_table` / `lpn_table` are only assigned when their source frames are
# non-empty, so those frames are re-checked here in case a table from an
# earlier execution is still in the kernel.
# The KL fallback matches the message printed in section 2.3: that section
# filters on variational encoder type only and applies no "KL > 0" condition.
semifinal_md = semifinal_content.format(
    arch_table=dataframe_to_markdown(arch_table) if 'arch_table' in globals() else "No data",
    explore_table=dataframe_to_markdown(explore_table) if 'explore_table' in globals() else "No data",
    kl_table=dataframe_to_markdown(kl_table) if 'kl_table' in globals() and len(df_kl) > 0 else "No variational encoder runs found",
    lpn_table=dataframe_to_markdown(lpn_table) if 'lpn_table' in globals() and len(df_lpn) > 0 else "No LPN runs",
)

save_table(semifinal_md, "semifinal_exploration")

In [None]:
# Markdown template for the final-results export; {final_table} is filled below
final_content = """# Final Training Results

Full training (50k epochs) on ~560 puzzle groups, evaluated on 32 held-out test puzzles.

## Main Results

{final_table}

### Notes

- **Train EM%**: Exact match accuracy on training set
- **Pass@1%**: ARC pass@1 accuracy with voting across augmented versions
- **Pass@2%**: Best of 2 attempts accuracy
- **Pass@5%**: Best of 5 attempts accuracy
- All experiments use pretrained TRM decoder
- Evaluation on 32 held-out puzzle groups (true generalization test)
"""

# Render the table first, then drop it into the template and save
final_table_md = dataframe_to_markdown(final_table)
final_md = final_content.format(final_table=final_table_md)

save_table(final_md, "final_results")

## 5. Summary Statistics

In [None]:
def _count_state(runs, state):
    """Return how many rows of `runs` have the given value in the 'state' column."""
    return int((runs['state'] == state).sum())


def _print_best_metrics(runs):
    """Print the column maxima for train exact-match and pass@1/2/5 of `runs`."""
    print(f"  - Best train EM: {runs['train_exact_acc'].max():.1f}%")
    for k in (1, 2, 5):
        pass_col = f"arc_pass{k}"
        print(f"  - Best test pass@{k}: {runs[pass_col].max():.2f}%")


banner = "=" * 60
print(banner)
print("SUMMARY")
print(banner)

# Semi-final runs: total, finished count, best metrics
print(f"\nSemi-final experiments: {len(df_semifinal)} runs")
print(f"  - Finished: {_count_state(df_semifinal, 'finished')}")
_print_best_metrics(df_semifinal)

# Final runs: total, finished / running counts, best metrics
print(f"\nFinal experiments: {len(df_final)} runs")
print(f"  - Finished: {_count_state(df_final, 'finished')}")
print(f"  - Running: {_count_state(df_final, 'running')}")
_print_best_metrics(df_final)