# NODE Baseline Evaluation

This notebook runs comprehensive baseline experiments to establish performance benchmarks for Neural Oblivious Decision Ensembles (NODE) before implementing enhancements.

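## Experiments to Run:
- **Classification**: EPSILON (run in this notebook); HIGGS and A9A are planned follow-ups (see "Next Steps")
- **Regression**: YEAR dataset (run in this notebook)
- **Model Configs**: Shallow (1 layer, run in this notebook); Medium (2 layers) and Deep (8 layers) are planned follow-ups (see "Next Steps")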

## Expected Results:
- Baseline performance metrics for comparison
- Training time and convergence analysis
- Model size and memory usage patterns


In [None]:
# Setup and imports
%load_ext autoreload
%autoreload 2

import os, sys, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import json

# Add lib to path
sys.path.insert(0, '..')
import lib
from qhoptim.pyt import QHAdam

# Seed the global NumPy and PyTorch RNGs so that baseline numbers can be
# reproduced after "Restart Kernel -> Run All". 1337 matches the random_state
# passed to lib.Dataset in the data-loading cells.
# NOTE(review): presumably lib.iterate_minibatches shuffles via the global
# NumPy RNG and model initialization uses the global torch RNG -- confirm in lib.
SEED = 1337
np.random.seed(SEED)
torch.manual_seed(SEED)

# Set device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
print(f"PyTorch version: {torch.__version__}")

# Results storage: one entry per experiment (e.g. 'EPSILON_shallow'), each a
# dict of metrics written by the evaluation cells and read by the summary and
# export cells at the end of the notebook.
results = {}


Using device: cpu
PyTorch version: 2.8.0+cpu


## 1. EPSILON Dataset - Classification Baseline


In [None]:
# EPSILON Dataset - Shallow NODE
print("Loading EPSILON dataset...")
data = lib.Dataset("EPSILON", random_state=1337, quantile_transform=True, quantile_noise=1e-3)

# Number of input columns of the training matrix.
num_features = data.X_train.shape[1]
# Count distinct labels with vectorized np.unique instead of building a Python
# set, which would hash every label as a separate Python object.
num_classes = len(np.unique(data.y_train))

print(f"Features: {num_features}, Classes: {num_classes}")
print(f"Train: {len(data.X_train)}, Valid: {len(data.X_valid)}, Test: {len(data.X_test)}")


Loading EPSILON dataset...
Downloading https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/epsilon_normalized.bz2 > ./data\EPSILON\epsilon_normalized.bz2


  1%|▌                                                    | 39931904/3871424303 [01:16<48:16, 1322859.36it/s]

In [None]:
# Build the shallow NODE classifier; the experiment name is timestamped so
# repeated runs get distinct names.
experiment_name = 'epsilon_shallow_{}'.format(int(time.time()))

# Single DenseBlock layer (layer_dim=2048, depth=6) with one more output
# channel per tree than there are classes.
node_block = lib.DenseBlock(
    num_features,
    layer_dim=2048,
    num_layers=1,
    tree_dim=num_classes + 1,
    flatten_output=False,
    depth=6,
    choice_function=lib.entmax15,
    bin_function=lib.entmoid15,
)
# Keep the first num_classes channels and average over the second-to-last axis
# (presumably the tree axis -- confirm against lib.DenseBlock's output layout).
class_head = lib.Lambda(lambda x: x[..., :num_classes].mean(dim=-2))
model = nn.Sequential(node_block, class_head).to(device)

# Data-aware initialization: one gradient-free forward pass over the first
# 2000 training rows.
with torch.no_grad():
    model(torch.as_tensor(data.X_train[:2000], device=device))

# Wrap for multi-GPU execution only when more than one CUDA device is visible.
model = nn.DataParallel(model) if torch.cuda.device_count() > 1 else model

print("Model created: {}".format(experiment_name))


In [None]:
# Train shallow model
# The trainer wraps the model with a cross-entropy loss and the QHAdam optimizer.
# warm_start=False and n_last_checkpoints=5 are passed through to lib.Trainer
# (presumably: start from scratch, and average/keep the 5 most recent
# checkpoints -- confirm in lib.Trainer).
trainer = lib.Trainer(
    model=model, loss_function=F.cross_entropy,
    experiment_name=experiment_name,
    warm_start=False,
    Optimizer=QHAdam,
    optimizer_params=dict(nus=(0.7, 1.0), betas=(0.95, 0.998)),
    verbose=True,
    n_last_checkpoints=5
)

# Training loop
# loss_history gets one entry per training step; err_history gets one entry
# per validation pass (every report_frequency steps).
loss_history, err_history = [], []
# Starting "best" value; any validation error below 1.0 replaces it.
best_val_err = 1.0
best_step = 0
# Stop once this many steps have passed since the best validation error.
early_stopping_rounds = 10000
# Validate / checkpoint / log every this many steps.
report_frequency = 100

print("Starting training...")
start_time = time.time()

# epochs=inf: the iterator never ends by itself, so the early-stopping `break`
# below is the only exit from this loop.
for batch in lib.iterate_minibatches(data.X_train, data.y_train, batch_size=1024, 
                                    shuffle=True, epochs=float('inf')):
    metrics = trainer.train_on_batch(*batch, device=device)
    loss_history.append(metrics['loss'])

    if trainer.step % report_frequency == 0:
        # Save the current weights, write the averaged checkpoint under tag
        # 'avg', then load 'avg' so validation runs on the averaged weights.
        trainer.save_checkpoint()
        trainer.average_checkpoints(out_tag='avg')
        trainer.load_checkpoint(tag='avg')
        err = trainer.evaluate_classification_error(
            data.X_valid, data.y_valid, device=device, batch_size=1024)
        
        if err < best_val_err:
            best_val_err = err
            best_step = trainer.step
            # Saved while 'avg' is loaded, so 'best' stores the averaged weights.
            trainer.save_checkpoint(tag='best')
        
        err_history.append(err)
        # Untagged load: presumably restores the checkpoint saved at the top of
        # this branch so training continues from the non-averaged weights --
        # confirm in lib.Trainer.
        trainer.load_checkpoint()
        trainer.remove_old_temp_checkpoints()
            
        print(f"Step {trainer.step}: Loss={metrics['loss']:.5f}, Val Error={err:.5f}")
        
    # Checked on every step; best_step only changes inside the report branch.
    if trainer.step > best_step + early_stopping_rounds:
        print(f'Early stopping: No improvement for {early_stopping_rounds} steps')
        print(f"Best step: {best_step}, Best Val Error: {best_val_err:.5f}")
        break

# Wall-clock time for the whole loop, including checkpointing and validation.
training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f}s")


In [None]:
# Score the checkpoint with the best validation error on the held-out test split.
trainer.load_checkpoint(tag='best')
test_error = trainer.evaluate_classification_error(
    data.X_test, data.y_test, device=device, batch_size=1024)
test_auc = trainer.evaluate_auc(
    data.X_test, data.y_test, device=device, batch_size=512)
test_logloss = trainer.evaluate_logloss(
    data.X_test, data.y_test, device=device, batch_size=512)

print("\nEPSILON Shallow Results:")
print("Test Error Rate: {:.5f}".format(test_error))
print("Test AUC: {:.5f}".format(test_auc))
print("Test LogLoss: {:.5f}".format(test_logloss))
print("Training Time: {:.2f}s".format(training_time))

# Record this run so the summary and export cells can pick it up.
results['EPSILON_shallow'] = dict(
    dataset='EPSILON',
    model='shallow',
    test_error=test_error,
    test_auc=test_auc,
    test_logloss=test_logloss,
    training_time=training_time,
    best_step=best_step,
    best_val_error=best_val_err,
)


## 2. YEAR Dataset - Regression Baseline


In [None]:
# Load YEAR for the shallow regression baseline.
print("Loading YEAR dataset...")
data_year = lib.Dataset("YEAR", random_state=1337, quantile_transform=True, quantile_noise=1e-3)

in_features = data_year.X_train.shape[1]

# Standardize the targets with statistics taken from the training split only;
# mu and std are kept so results can be mapped back to the original scale.
mu = data_year.y_train.mean()
std = data_year.y_train.std()


def normalize(x):
    """Shift/scale `x` by the training-target mean and std, cast to float32."""
    return ((x - mu) / std).astype(np.float32)


data_year.y_train = normalize(data_year.y_train)
data_year.y_valid = normalize(data_year.y_valid)
data_year.y_test = normalize(data_year.y_test)

print("Features: {}".format(in_features))
print("Target mean: {:.5f}, std: {:.5f}".format(mu, std))
print("Train: {}, Valid: {}, Test: {}".format(
    len(data_year.X_train), len(data_year.y_valid), len(data_year.y_test)))


In [None]:
# Build the shallow NODE regressor; the experiment name is timestamped so
# repeated runs get distinct names.
experiment_name_year = 'year_shallow_{}'.format(int(time.time()))

# Single DenseBlock layer with tree_dim=3 and depth=6.
year_block = lib.DenseBlock(
    in_features, 2048,
    num_layers=1,
    tree_dim=3,
    depth=6,
    flatten_output=False,
    choice_function=lib.entmax15,
    bin_function=lib.entmoid15,
)
# Take channel 0 of the last axis and average over the remaining last axis
# (presumably the tree axis -- confirm against lib.DenseBlock's output layout).
year_head = lib.Lambda(lambda x: x[..., 0].mean(dim=-1))
model_year = nn.Sequential(year_block, year_head).to(device)

# Data-aware initialization: one gradient-free forward pass over the first
# 1000 training rows.
with torch.no_grad():
    model_year(torch.as_tensor(data_year.X_train[:1000], device=device))

# Wrap for multi-GPU execution only when more than one CUDA device is visible.
model_year = nn.DataParallel(model_year) if torch.cuda.device_count() > 1 else model_year

print("Model created: {}".format(experiment_name_year))


In [None]:
# Train regression model
# The trainer wraps the regression model with an MSE loss and the QHAdam
# optimizer. warm_start=False and n_last_checkpoints=5 are passed through to
# lib.Trainer (presumably: start from scratch, and average/keep the 5 most
# recent checkpoints -- confirm in lib.Trainer).
trainer_year = lib.Trainer(
    model=model_year, loss_function=F.mse_loss,
    experiment_name=experiment_name_year,
    warm_start=False,
    Optimizer=QHAdam,
    optimizer_params=dict(nus=(0.7, 1.0), betas=(0.95, 0.998)),
    verbose=True,
    n_last_checkpoints=5
)

# Training loop
# loss_history_year gets one entry per training step; mse_history_year gets
# one entry per validation pass (every report_frequency steps).
loss_history_year, mse_history_year = [], []
# Any finite validation MSE replaces the initial "best".
best_mse = float('inf')
best_step_mse = 0
# NOTE: these two names are shared with the EPSILON training cell; this cell
# rebinds them (the patience here is 5000 steps rather than 10000).
early_stopping_rounds = 5000
report_frequency = 100

print("Starting training...")
start_time_year = time.time()

# epochs=inf: the iterator never ends by itself, so the early-stopping `break`
# below is the only exit from this loop.
for batch in lib.iterate_minibatches(data_year.X_train, data_year.y_train, batch_size=1024, 
                                    shuffle=True, epochs=float('inf')):
    metrics = trainer_year.train_on_batch(*batch, device=device)
    loss_history_year.append(metrics['loss'])

    if trainer_year.step % report_frequency == 0:
        # Save the current weights, write the averaged checkpoint under tag
        # 'avg', then load 'avg' so validation runs on the averaged weights.
        trainer_year.save_checkpoint()
        trainer_year.average_checkpoints(out_tag='avg')
        trainer_year.load_checkpoint(tag='avg')
        # MSE is measured against the standardized validation targets.
        mse = trainer_year.evaluate_mse(
            data_year.X_valid, data_year.y_valid, device=device, batch_size=16384)
        
        if mse < best_mse:
            best_mse = mse
            best_step_mse = trainer_year.step
            # Saved while 'avg' is loaded, so 'best_mse' stores the averaged weights.
            trainer_year.save_checkpoint(tag='best_mse')
        
        mse_history_year.append(mse)
        # Untagged load: presumably restores the checkpoint saved at the top of
        # this branch so training continues from the non-averaged weights --
        # confirm in lib.Trainer.
        trainer_year.load_checkpoint()
        trainer_year.remove_old_temp_checkpoints()
            
        print(f"Step {trainer_year.step}: Loss={metrics['loss']:.5f}, Val MSE={mse:.5f}")
        
    # Checked on every step; best_step_mse only changes inside the report branch.
    if trainer_year.step > best_step_mse + early_stopping_rounds:
        print(f'Early stopping: No improvement for {early_stopping_rounds} steps')
        print(f"Best step: {best_step_mse}, Best Val MSE: {best_mse:.5f}")
        break

# Wall-clock time for the whole loop, including checkpointing and validation.
training_time_year = time.time() - start_time_year
print(f"Training completed in {training_time_year:.2f}s")


In [None]:
# Score the checkpoint with the best validation MSE on the held-out test split.
trainer_year.load_checkpoint(tag='best_mse')
test_mse = trainer_year.evaluate_mse(
    data_year.X_test, data_year.y_test, device=device)

# The MSE above is on standardized targets; multiplying by std**2 before the
# square root gives the RMSE in the targets' original units.
test_rmse_original = np.sqrt(test_mse * std ** 2)

print("\nYEAR Shallow Results:")
print("Test MSE (normalized): {:.5f}".format(test_mse))
print("Test RMSE (original scale): {:.5f}".format(test_rmse_original))
print("Training Time: {:.2f}s".format(training_time_year))

# Record this run so the summary and export cells can pick it up.
results['YEAR_shallow'] = dict(
    dataset='YEAR',
    model='shallow',
    test_mse=test_mse,
    test_rmse_original=test_rmse_original,
    training_time=training_time_year,
    best_step=best_step_mse,
    best_val_mse=best_mse,
)


## 3. Results Summary and Comparison


In [None]:
# Print a plain-text report with one section per stored experiment.
banner = "=" * 60
print("\n" + banner)
print("BASELINE RESULTS SUMMARY")
print(banner)

for run_name, run in results.items():
    # Fields shared by classification and regression runs.
    print("\n{}:".format(run_name))
    print("  Dataset: {}".format(run['dataset']))
    print("  Model: {}".format(run['model']))
    print("  Training Time: {:.2f}s".format(run['training_time']))
    print("  Best Step: {}".format(run['best_step']))

    # Entries without 'test_error' are treated as regression runs.
    if 'test_error' not in run:
        print("  Test MSE: {:.5f}".format(run['test_mse']))
        print("  Test RMSE (original): {:.5f}".format(run['test_rmse_original']))
        continue

    print("  Test Error: {:.5f}".format(run['test_error']))
    print("  Test AUC: {:.5f}".format(run['test_auc']))
    print("  Test LogLoss: {:.5f}".format(run['test_logloss']))


In [None]:
# Save results to JSON
def _json_default(obj):
    """Fallback encoder for values the json module cannot serialize natively.

    Metrics in `results` may be NumPy scalars (e.g. the value returned by
    np.sqrt) or other scalar wrappers. With `default=str` those were written
    as quoted strings, so the JSON file lost its numeric types. This hook
    unwraps them to native Python values instead:

    * objects exposing `.item()` that hold a single value -> that value
    * objects exposing `.tolist()` (multi-element arrays)  -> nested lists
    * anything else -> `str(obj)`, the previous behavior
    """
    to_scalar = getattr(obj, 'item', None)
    if callable(to_scalar):
        try:
            return to_scalar()
        except (TypeError, ValueError, RuntimeError):
            # Not a single-element container; try the list conversion below.
            pass
    to_list = getattr(obj, 'tolist', None)
    if callable(to_list):
        return to_list()
    return str(obj)


with open('baseline_results.json', 'w') as f:
    json.dump(results, f, indent=2, default=_json_default)

print("\nResults saved to baseline_results.json")

# Create comparison DataFrame: one row per experiment. Classification and
# regression runs contribute different metric columns, so cells for metrics
# that do not apply to a run are left as NaN by pandas.
rows = []
for key, result in results.items():
    row = {
        'Dataset': result['dataset'],
        'Model': result['model'],
        'Training_Time_s': result['training_time'],
        'Best_Step': result['best_step']
    }

    # Entries carrying 'test_error' are classification runs; the rest are
    # treated as regression runs.
    if 'test_error' in result:
        row.update({
            'Test_Error': result['test_error'],
            'Test_AUC': result['test_auc'],
            'Test_LogLoss': result['test_logloss']
        })
    else:
        row['Test_MSE'] = result['test_mse']
        row['Test_RMSE_Original'] = result['test_rmse_original']

    rows.append(row)

df_results = pd.DataFrame(rows)
df_results.to_csv('baseline_comparison.csv', index=False)

print("\nComparison table:")
print(df_results.to_string(index=False))
print("\nComparison table saved to baseline_comparison.csv")


## 4. Next Steps

With baseline results established, you can now:

1. **Implement Enhancements**:
   - Focal Loss for imbalanced datasets
   - Learning rate scheduling (warmup + cosine decay)
   - Advanced optimizers (RAdam + Lookahead)
   - Regularization (Dropout + BatchNorm)

2. **Run Additional Baselines**:
   - Deep NODE (8 layers) on same datasets
   - Medium NODE (2 layers) for comparison
   - Other datasets (HIGGS, A9A)

3. **Compare with GBDT**:
   - Run XGBoost and CatBoost on same datasets
   - Establish fair comparison baselines

4. **Analyze Results**:
   - Training efficiency patterns
   - Convergence behavior
   - Memory usage analysis
