In [143]:
import pandas as pd
import numpy as np

import torch
import torch.optim as optim
import torch.nn as nn


from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error # Using MSE for Ridge simplicity
from sklearn.model_selection import ParameterGrid

import torch
from torch.utils.data import TensorDataset, DataLoader

In [144]:
# --- 1. Data Loading Function ---

def load_data(suffix):
    """
    Loads features (X), targets (y), and weights (wts) CSV files for a given prefix.

    Args:
        suffix (string): suffix for csv file.

    Returns:
        tuple: A tuple containing three pandas DataFrames (X_df, y_df, wts_df).
    """
    X = pd.read_csv(f'X_{suffix}.csv')
    y = pd.read_csv(f'y_{suffix}.csv')
    wts = pd.read_csv(f'wts_{suffix}.csv')
    return X, y, wts

In [145]:
# --- 2. Dataset and DataLoader Creation Function ---

def create_pytorch_datasets_loaders(X_train_df, 
                                    y_train_df, 
                                    wts_train_df,
                                    X_val_df, 
                                    y_val_df, 
                                    wts_val_df,
                                    batch_size):
    """
    Converts pandas DataFrames into PyTorch Tensors, creates TensorDatasets,
    and returns DataLoaders for training and validation.

    Args:
        X_train_df (pd.DataFrame): Training features.
        y_train_df (pd.DataFrame): Training targets.
        wts_train_df (pd.DataFrame): Training weights ('P(C)' column expected).
        X_val_df (pd.DataFrame): Validation features.
        y_val_df (pd.DataFrame): Validation targets.
        wts_val_df (pd.DataFrame): Validation weights ('P(C)' column expected).
        batch_size (int): The batch size for the DataLoaders.

    Returns:
        tuple: A tuple containing (train_loader, val_loader).
    """
    # --- Convert DataFrames to Tensors ---
    # Use .values to get NumPy arrays, then convert to PyTorch tensors
    X_train_tensor = torch.tensor(X_train_df.values, dtype=torch.float32) 
    y_train_tensor = torch.tensor(y_train_df.values, dtype=torch.float32)
    # Ensure weights tensor is [N, 1] for potential broadcasting in loss
    wts_train_tensor = torch.tensor(wts_train_df[['P(C)']].values, dtype=torch.float32)

    X_val_tensor = torch.tensor(X_val_df.values, dtype=torch.float32)
    y_val_tensor = torch.tensor(y_val_df.values, dtype=torch.float32)
    wts_val_tensor = torch.tensor(wts_val_df[['P(C)']].values, dtype=torch.float32)

    # --- Create TensorDatasets ---
    # Combines tensors along the first dimension. Each sample will be retrieved as a tuple.
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor, wts_train_tensor)
    val_dataset = TensorDataset(X_val_tensor, y_val_tensor, wts_val_tensor)

    #set the manual seed for reproducibility
    torch.manual_seed(42)

    # --- Create DataLoaders ---
    # Handles batching, shuffling, and parallel loading
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=True  # Shuffle training data each epoch
    )
    val_loader = DataLoader(
        dataset=val_dataset,
        batch_size=batch_size, # Can use larger batch for validation if memory allows
        shuffle=False # No need to shuffle validation data
    )

    print(f"Created DataLoaders - Train batches: {len(train_loader)}, Val batches: {len(val_loader)}")
    return train_loader, val_loader

In [146]:
# --- 3. Weighted cross-entropy loss function ---

def weighted_cross_entropy_loss(outputs, targets, weights):
    """
    Compute weighted cross entropy loss.

    Args:
        outputs (torch.Tensor): Model predictions (batch_size, n_classes)
        targets (torch.Tensor): True probability distributions (batch_size, n_classes)
        weights (torch.Tensor): Sample weights P(C) (batch_size,)

    Returns:
        torch.Tensor: Expected value of cross entropy loss
                        computed as Σ(P(C) * Loss(C)) / Σ(P(C))
    """
    # Add small epsilon to avoid log(0)
    eps = 1e-7
    outputs = torch.clamp(outputs, 
                          min=eps)
    
    # Compute cross entropy loss for each sample
    sample_losses = -torch.sum(targets * torch.log(outputs), dim=1)
    
    # Weight each sample loss by P(C)
    weighted_losses = sample_losses * weights
    
    # Return expected value (weighted average)
    return weighted_losses.sum() / weights.sum()

In [147]:
# --- 4. PyTorch Model Training Function with Early Stopping ---

def train_pytorch_model(model, 
                        train_loader, 
                        val_loader, 
                        loss_fn, 
                        optimizer, 
                        device,
                        max_epochs=200, 
                        patience=15, 
                        verbose=False):
    """
    Trains a PyTorch model with early stopping based on validation loss.

    Args:
        model (torch.nn.Module): The PyTorch model to train.
        train_loader (DataLoader): DataLoader for the training set.
        val_loader (DataLoader): DataLoader for the validation set.
        loss_fn (callable): The loss function. Should accept (outputs, targets, weights).
        optimizer (torch.optim.Optimizer): The optimizer.
        device (torch.device): The device to perform computations on.
        max_epochs (int): Maximum number of epochs to train.
        patience (int): Number of epochs to wait for improvement before stopping.
        verbose (bool): If True, prints loss per epoch. Defaults to False.

    Returns:
        float: The best validation loss achieved during training.
    """
    best_validation_loss = float('inf')
    epochs_without_improvement = 0
    best_epoch = 0

    if not val_loader: # Handle case with no validation data, though unlikely in CV
        print("Warning: No validation loader provided. Cannot perform early stopping.")
        # Optionally train for max_epochs without validation, but not recommended for tuning
        return float('inf') # Or handle differently

    for epoch in range(max_epochs):
        # --- Training Phase ---
        model.train()  # Set model to training mode (enables dropout, batch norm updates)
        train_loss_epoch = 0.0
        for batch_idx, (features, targets, weights) in enumerate(train_loader):
            # Data already on the correct device from DataLoader preparation if done right
            features, targets, weights = features.to(device), targets.to(device), weights.to(device)

            # 1. Forward pass
            outputs = model(features)

            # 2. Calculate loss
            loss = loss_fn(outputs, targets, weights)

            # 3. Backward pass and optimization
            optimizer.zero_grad() # Clear previous gradients
            loss.backward()       # Compute gradients
            optimizer.step()      # Update weights

            train_loss_epoch += loss.item() # Accumulate batch loss

        avg_train_loss = train_loss_epoch / len(train_loader)

        # --- Validation Phase ---
        model.eval()  # Set model to evaluation mode (disables dropout, uses running stats for batch norm)
        validation_loss_epoch = 0.0
        with torch.no_grad():  # Disable gradient calculations for efficiency
            for features, targets, weights in val_loader:
                features, targets, weights = features.to(device), targets.to(device), weights.to(device)
                outputs = model(features)
                loss = loss_fn(outputs, targets, weights)
                validation_loss_epoch += loss.item()

        avg_validation_loss = validation_loss_epoch / len(val_loader)

        if verbose:
            print(f"Epoch {epoch+1}/{max_epochs} - Train Loss: {avg_train_loss:.6f}, Val Loss: {avg_validation_loss:.6f}")

        # --- Early Stopping Check ---
        if avg_validation_loss < best_validation_loss:
            best_validation_loss = avg_validation_loss
            epochs_without_improvement = 0
            best_epoch = epoch + 1
            # Optional: Save the best model state here if needed later
            # torch.save(model.state_dict(), 'best_model_fold_X.pth')
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement >= patience:
            if verbose:
                print(f"Early stopping triggered after {epoch+1} epochs. Best val loss {best_validation_loss:.6f} at epoch {best_epoch}.")
            break # Exit the training loop


    return best_validation_loss

In [None]:
# --- Configuration ---

FOLD_DEFINITIONS = [
    {'train_suffix': '2008_2012', 'val_suffix': '2016'},
    {'train_suffix': '2008_2016', 'val_suffix': '2012'},
    {'train_suffix': '2012_2016', 'val_suffix': '2008'},
]

BATCH_SIZE = 50
MAX_EPOCHS = 200 # Max epochs for early stopping
PATIENCE = 15    # Patience for early stopping
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu") #set device


#make dictionary to hold results for each model
results = {}
models = {}

In [166]:
# --- Get Input/Output Dimensions Once ---
temp_X, temp_y, _ = load_data(FOLD_DEFINITIONS[0]['train_suffix'])
INPUT_DIM = temp_X.shape[1]
OUTPUT_DIM = temp_y.shape[1]
del temp_X, temp_y
# ---

In [149]:
# --- 1. Ridge Regression Model Builder (Wrapper) ---

def build_ridge_model(alpha):
  """
  Builds (instantiates) a scikit-learn Ridge regression model.

  Args:
      alpha (float): Regularization strength (alpha >= 0). Larger values
                     specify stronger regularization. Corresponds to L2 penalty.

  Returns:
      sklearn.linear_model.Ridge: An instance of the Ridge model, ready to be fit.
  """
  # Instantiate the Ridge model with the specified alpha parameter
  model = Ridge(alpha=alpha)
  return model

# --- 1. Cross-Validation Function for Ridge Regression (Modified) ---

def cross_val_ridge(fold_definitions=FOLD_DEFINITIONS):
    """
    Performs 3-fold CV hyperparameter tuning for Ridge Regression.

    Args:
        data_dir (str): Directory containing the data CSV files.
        fold_definitions (list): List defining train/val prefixes for each fold.

    Returns:
        tuple:
            - pd.DataFrame: DataFrame with columns ['alpha', 'mean_cv_score'], sorted by score.
            - sklearn.linear_model.Ridge: Untrained Ridge model instance with the best alpha.
    """
    print("--- Starting Cross-Validation for Ridge Regression ---")
    alphas = [0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
    results_list = []
    best_score = float('inf')
    best_alpha = None

    for alpha in alphas:
        print(f"  Testing alpha={alpha}:")
        fold_validation_scores = []

        for i, fold in enumerate(fold_definitions):
            X_train_pd, y_train_pd, wts_train_pd = load_data(fold['train_suffix'])
            X_val_pd, y_val_pd, wts_val_pd = load_data(fold['val_suffix'])

            model = build_ridge_model(alpha=alpha) # Use the builder
            model.fit(X_train_pd.values, y_train_pd.values)
            y_pred_val = model.predict(X_val_pd.values)
            validation_score = mean_squared_error(y_val_pd.values, y_pred_val)
            fold_validation_scores.append(validation_score)

        avg_validation_score = np.mean(fold_validation_scores)
        print(f"  Avg Val Score (MSE) for config {alpha}: {avg_validation_score:.6f}")
        results_list.append({'alpha': alpha, 'mean_cv_score': avg_validation_score})

        # Track best configuration
        if avg_validation_score < best_score:
            best_score = avg_validation_score
            best_alpha = alpha

    # Compile results
    results_df = pd.DataFrame(results_list)
    results_df = results_df.sort_values(by='mean_cv_score', ascending=True)

    # Build the best model instance (untrained)
    print(f"\nBest Ridge alpha found: {best_alpha} with score {best_score:.6f}")
    best_model_instance = build_ridge_model(alpha=best_alpha)

    print("--- Finished Cross-Validation for Ridge Regression ---")
    return results_df, best_model_instance

# get results and model for ridge cross-validation
results['ridge'], models['ridge'] = cross_val_ridge()

# display ridge results
results['ridge']

--- Starting Cross-Validation for Ridge Regression ---
  Testing alpha=0.01:
  Avg Val Score (MSE) for config 0.01: 0.001830
  Testing alpha=0.1:
  Avg Val Score (MSE) for config 0.1: 0.001830
  Testing alpha=1.0:
  Avg Val Score (MSE) for config 1.0: 0.001830
  Testing alpha=10.0:
  Avg Val Score (MSE) for config 10.0: 0.001830
  Testing alpha=100.0:
  Avg Val Score (MSE) for config 100.0: 0.001831
  Testing alpha=1000.0:
  Avg Val Score (MSE) for config 1000.0: 0.001853

Best Ridge alpha found: 10.0 with score 0.001830
--- Finished Cross-Validation for Ridge Regression ---


Unnamed: 0,alpha,mean_cv_score
3,10.0,0.00183
2,1.0,0.00183
1,0.1,0.00183
0,0.01,0.00183
4,100.0,0.001831
5,1000.0,0.001853


In [150]:
# --- 2. Softmax Regression Model Builder ---

def build_softmax_model(input_dim, output_dim):
    """
    Builds a Softmax Regression model (Linear layer + Softmax).

    Args:
        input_dim (int): Number of input features.
        output_dim (int): Number of output classes (probabilities).

    Returns:
        torch.nn.Module: An instance of the SoftmaxRegression model.
    """
    class SoftmaxRegression(nn.Module):
        def __init__(self, input_dim, output_dim):
            super().__init__()
            self.linear = nn.Linear(input_dim, output_dim)
            # Apply Softmax to get probability outputs.
            # Note: If your custom 'weighted_cross_entropy_loss' expects raw logits
            # (like nn.CrossEntropyLoss), remove the Softmax layer here and
            # apply it separately if needed after prediction.
            self.softmax = nn.Softmax(dim=1) # Apply softmax across the class dimension

        def forward(self, x):
            """Forward pass."""
            logits = self.linear(x)
            probabilities = self.softmax(logits)
            return probabilities

    model = SoftmaxRegression(input_dim, output_dim)
    return model

# --- 2. Cross-Validation Function for Softmax Regression (Modified) ---

def cross_val_softmax(fold_definitions=FOLD_DEFINITIONS,
                       batch_size=BATCH_SIZE, 
                       device=DEVICE,
                       max_epochs=MAX_EPOCHS, 
                       patience=PATIENCE):
    """
    Performs 3-fold CV hyperparameter tuning for Softmax Regression (PyTorch).

    Returns:
        tuple:
            - pd.DataFrame: Results DF sorted by score.
            - torch.nn.Module: Untrained Softmax model instance with the best structural params (dims).
                                Training hyperparams (LR, WD) are not part of the structure.
    """
    print("--- Starting Cross-Validation for Softmax Regression ---")
    param_grid = {
        'learning_rate': [1e-2, 1e-3, 1e-4],
        'weight_decay': [0, 1e-5, 1e-4]
    }
    results_list = []
    best_score = float('inf')
    best_config = None # Store the best config dict
    optimizer_choice = optim.AdamW

    # --- Get Input/Output Dimensions Once ---
    # Load data from the first fold just to determine dimensions
    temp_X, temp_y, _ = load_data(fold_definitions[0]['train_suffix'])
    input_dim = temp_X.shape[1]
    output_dim = temp_y.shape[1]
    del temp_X, temp_y # Free memory
    # ---

    for config in ParameterGrid(param_grid):
        print(f"  Testing config: {config}")
        fold_validation_scores = []

        for i, fold in enumerate(fold_definitions):
            X_train_pd, y_train_pd, wts_train_pd = load_data(fold['train_suffix'])
            X_val_pd, y_val_pd, wts_val_pd = load_data(fold['val_suffix'])

            train_loader, val_loader = create_pytorch_datasets_loaders(X_train_pd, 
                                                                       y_train_pd, 
                                                                       wts_train_pd, 
                                                                       X_val_pd, 
                                                                       y_val_pd, 
                                                                       wts_val_pd,
                                                                       batch_size)

            model = build_softmax_model(input_dim=input_dim, output_dim=output_dim).to(device)
            optimizer = optimizer_choice(model.parameters(), 
                                         lr=config['learning_rate'], 
                                         weight_decay=config['weight_decay'])

            validation_score = train_pytorch_model(model, 
                                                   train_loader, 
                                                   val_loader, 
                                                   weighted_cross_entropy_loss, 
                                                   optimizer,
                                                   device, 
                                                   max_epochs, 
                                                   patience, 
                                                   verbose=True)
            fold_validation_scores.append(validation_score)

        avg_validation_score = np.mean(fold_validation_scores)
        print(f"  Avg Val Score for config {config}: {avg_validation_score:.6f}")
        current_result = {**config, 'mean_cv_score': avg_validation_score}
        results_list.append(current_result)

        # Track best configuration (dictionary)
        if avg_validation_score < best_score:
            best_score = avg_validation_score
            best_config = config # Keep the config dict

    # Compile results
    results_df = pd.DataFrame(results_list)
    results_df = results_df.sort_values(by='mean_cv_score', ascending=True)

    # Build the best model instance (untrained)
    # Note: Softmax structure only depends on dims, not LR or WD.
    print(f"\nBest Softmax config found: {best_config} with score {best_score:.6f}")
    best_model_instance = build_softmax_model(input_dim=input_dim, output_dim=output_dim)

    print("--- Finished Cross-Validation for Softmax Regression ---")
    return results_df, best_model_instance

# get results and model for softmax cross-validation
results['softmax'], models['softmax'] = cross_val_softmax()

# display softmax results
results['softmax']

--- Starting Cross-Validation for Softmax Regression ---
  Testing config: {'learning_rate': 0.01, 'weight_decay': 0}
Created DataLoaders - Train batches: 124, Val batches: 62
Epoch 1/200 - Train Loss: 46.905209, Val Loss: 44.169698
Epoch 2/200 - Train Loss: 42.642436, Val Loss: 43.582655
Epoch 3/200 - Train Loss: 42.104975, Val Loss: 43.865024
Epoch 4/200 - Train Loss: 41.904845, Val Loss: 43.856537
Epoch 5/200 - Train Loss: 41.904641, Val Loss: 43.841016
Epoch 6/200 - Train Loss: 41.847481, Val Loss: 43.869766
Epoch 7/200 - Train Loss: 41.826627, Val Loss: 43.911656
Epoch 8/200 - Train Loss: 41.741646, Val Loss: 44.101544
Epoch 9/200 - Train Loss: 41.865080, Val Loss: 44.020007
Epoch 10/200 - Train Loss: 41.831663, Val Loss: 44.052717
Epoch 11/200 - Train Loss: 41.781452, Val Loss: 44.506106
Epoch 12/200 - Train Loss: 41.906978, Val Loss: 43.954545
Epoch 13/200 - Train Loss: 41.861042, Val Loss: 44.058443
Epoch 14/200 - Train Loss: 41.803274, Val Loss: 44.028854
Epoch 15/200 - Train 

Unnamed: 0,learning_rate,weight_decay,mean_cv_score
6,0.0001,0.0,42.293211
7,0.0001,1e-05,42.293211
8,0.0001,0.0001,42.293211
3,0.001,0.0,42.2999
4,0.001,1e-05,42.2999
5,0.001,0.0001,42.299922
0,0.01,0.0,42.520074
1,0.01,1e-05,42.520079
2,0.01,0.0001,42.520112


In [155]:
# --- 3. 1-Hidden-Layer MLP Model Builder ---

def build_nn1_model(input_dim, 
                    output_dim, 
                    n_hidden, 
                    dropout_rate):
    """
    Builds a 1-Hidden-Layer MLP with ReLU activation and Dropout.

    Architecture: Linear -> ReLU -> Dropout -> Linear -> Softmax

    Args:
        input_dim (int): Number of input features.
        output_dim (int): Number of output classes (probabilities).
        n_hidden (int): Number of neurons in the hidden layer.
        dropout_rate (float): Dropout probability.

    Returns:
        torch.nn.Module: An instance of the NN1Layer model.
    """
    class NN1Layer(nn.Module):
        def __init__(self, input_dim, output_dim, n_hidden, dropout_rate):
            super().__init__()
            self.layer_1 = nn.Linear(input_dim, n_hidden)
            self.relu = nn.ReLU()
            self.dropout = nn.Dropout(dropout_rate)
            self.layer_2 = nn.Linear(n_hidden, output_dim)
            # Apply Softmax to get probability outputs (see note in build_softmax_model)
            self.softmax = nn.Softmax(dim=1)

        def forward(self, x):
            """Forward pass."""
            x = self.layer_1(x)
            x = self.relu(x)
            x = self.dropout(x) # Apply dropout after activation
            logits = self.layer_2(x)
            probabilities = self.softmax(logits)
            return probabilities

    model = NN1Layer(input_dim, output_dim, n_hidden, dropout_rate)
    return model

# --- 3. Cross-Validation Function for 1-Hidden-Layer MLP (Modified) ---

def cross_val_mlp1(fold_definitions=FOLD_DEFINITIONS,
                   batch_size=BATCH_SIZE, 
                   device=DEVICE,
                   max_epochs=MAX_EPOCHS, 
                   patience=PATIENCE):
    """
    Performs 3-fold CV hyperparameter tuning for a 1-Hidden-Layer MLP (PyTorch).

    Returns:
        tuple:
            - pd.DataFrame: Results DF sorted by score.
            - torch.nn.Module: Untrained MLP-1 model instance with the best hyperparameters.
    """
    print("--- Starting Cross-Validation for 1-Layer MLP ---")
    param_grid = {
        'n_hidden': [16, 32, 64, 128],
        'dropout_rate': [0.1, 0.3, 0.5],
        'learning_rate': [1e-2, 1e-3, 1e-4]
    }
    results_list = []
    best_score = float('inf')
    best_config = None
    optimizer_choice = optim.AdamW

    # --- Get Input/Output Dimensions Once ---
    temp_X, temp_y, _ = load_data(fold_definitions[0]['train_suffix'])
    input_dim = temp_X.shape[1]
    output_dim = temp_y.shape[1]
    del temp_X, temp_y
    # ---

    for config in ParameterGrid(param_grid):
        print(f"  Testing config: {config}")
        fold_validation_scores = []

        for i, fold in enumerate(fold_definitions):
            X_train_pd, y_train_pd, wts_train_pd = load_data(fold['train_suffix'])
            X_val_pd, y_val_pd, wts_val_pd = load_data(fold['val_suffix'])
            train_loader, val_loader = create_pytorch_datasets_loaders(X_train_pd, 
                                                                       y_train_pd, 
                                                                       wts_train_pd, 
                                                                       X_val_pd, 
                                                                       y_val_pd, 
                                                                       wts_val_pd,
                                                                       batch_size)

            model = build_nn1_model(input_dim=input_dim, 
                                    output_dim=output_dim,
                                    n_hidden=config['n_hidden'], 
                                    dropout_rate=config['dropout_rate']
            ).to(device)
            optimizer = optimizer_choice(model.parameters(), lr=config['learning_rate'])

            validation_score = train_pytorch_model(model, 
                                                   train_loader, 
                                                   val_loader, 
                                                   weighted_cross_entropy_loss, 
                                                   optimizer,
                                                   device, 
                                                   max_epochs, 
                                                   patience, 
                                                   verbose=False
            )
            fold_validation_scores.append(validation_score)

        avg_validation_score = np.mean(fold_validation_scores)
        print(f"  Avg Val Score for config {config}: {avg_validation_score:.6f}")
        current_result = {**config, 'mean_cv_score': avg_validation_score}
        results_list.append(current_result)

        if avg_validation_score < best_score:
            best_score = avg_validation_score
            best_config = config

    results_df = pd.DataFrame(results_list)
    results_df = results_df.sort_values(by='mean_cv_score', ascending=True)

    # Build the best model instance (untrained)
    print(f"\nBest 1-Layer MLP config found: {best_config} with score {best_score:.6f}")
    best_model_instance = build_nn1_model(
        input_dim=input_dim, output_dim=output_dim,
        n_hidden=best_config['n_hidden'], dropout_rate=best_config['dropout_rate']
    )

    print("--- Finished Cross-Validation for 1-Layer MLP ---")
    return results_df, best_model_instance

In [156]:
results['mlp1'], models['mlp1'] = cross_val_mlp1()
results['mlp1']

--- Starting Cross-Validation for 1-Layer MLP ---
  Testing config: {'dropout_rate': 0.1, 'learning_rate': 0.01, 'n_hidden': 16}
Created DataLoaders - Train batches: 124, Val batches: 62
Created DataLoaders - Train batches: 124, Val batches: 62
Created DataLoaders - Train batches: 124, Val batches: 62
  Avg Val Score for config {'dropout_rate': 0.1, 'learning_rate': 0.01, 'n_hidden': 16}: 42.318940
  Testing config: {'dropout_rate': 0.1, 'learning_rate': 0.01, 'n_hidden': 32}
Created DataLoaders - Train batches: 124, Val batches: 62
Created DataLoaders - Train batches: 124, Val batches: 62
Created DataLoaders - Train batches: 124, Val batches: 62
  Avg Val Score for config {'dropout_rate': 0.1, 'learning_rate': 0.01, 'n_hidden': 32}: 42.303454
  Testing config: {'dropout_rate': 0.1, 'learning_rate': 0.01, 'n_hidden': 64}
Created DataLoaders - Train batches: 124, Val batches: 62
Created DataLoaders - Train batches: 124, Val batches: 62
Created DataLoaders - Train batches: 124, Val batch

Unnamed: 0,dropout_rate,learning_rate,n_hidden,mean_cv_score
6,0.1,0.001,64,42.275062
7,0.1,0.001,128,42.277826
19,0.3,0.001,128,42.281945
18,0.3,0.001,64,42.289017
31,0.5,0.001,128,42.293218
5,0.1,0.001,32,42.300045
13,0.3,0.01,32,42.30187
1,0.1,0.01,32,42.303454
23,0.3,0.0001,128,42.311532
10,0.1,0.0001,64,42.311978


In [157]:
models['mlp1']

NN1Layer(
  (layer_1): Linear(in_features=115, out_features=64, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (layer_2): Linear(in_features=64, out_features=4, bias=True)
  (softmax): Softmax(dim=1)
)

In [163]:
torch.save(models['mlp1'],'untrained_mlp1.pkl')

AttributeError: Can't pickle local object 'build_nn1_model.<locals>.NN1Layer'

In [158]:
models['mlp1'].state_dict()

OrderedDict([('layer_1.weight',
              tensor([[-0.0038, -0.0037,  0.0708,  ..., -0.0315,  0.0550,  0.0361],
                      [ 0.0793, -0.0667,  0.0253,  ..., -0.0650,  0.0144,  0.0389],
                      [ 0.0157,  0.0540, -0.0340,  ...,  0.0567, -0.0621, -0.0442],
                      ...,
                      [ 0.0810, -0.0652, -0.0578,  ...,  0.0658, -0.0269,  0.0615],
                      [ 0.0535, -0.0166, -0.0897,  ...,  0.0227,  0.0661,  0.0740],
                      [-0.0154, -0.0583,  0.0861,  ...,  0.0496,  0.0276,  0.0243]])),
             ('layer_1.bias',
              tensor([-0.0345, -0.0910,  0.0128, -0.0161, -0.0071,  0.0089, -0.0545, -0.0581,
                      -0.0271,  0.0362, -0.0584,  0.0869, -0.0918, -0.0579,  0.0267, -0.0907,
                      -0.0518,  0.0481, -0.0632,  0.0299, -0.0497, -0.0441,  0.0162,  0.0090,
                       0.0846,  0.0194,  0.0162, -0.0484,  0.0810, -0.0012,  0.0558, -0.0746,
                       0.048

In [164]:
import torch
import joblib # For saving scikit-learn models
import os

# --- Assume necessary helper functions and configs ---
# build_ridge_model, build_softmax_model, build_nn1_model, build_nn2_model
# DEVICE (needed for loading PyTorch models)
# --- End Assume ---


# --- 1. Save Untrained Model Object Function ---

def save_untrained_model(model, filename):
    """
    Saves the entire UNTRAINED model object (including architecture).
    Uses torch.save for PyTorch models and joblib.dump for scikit-learn models.

    Args:
        model (torch.nn.Module or sklearn estimator): The untrained model instance
                 (e.g., returned by the cross_val_* functions).
        filename (str): The name for the saved file (e.g., 'untrained_best_mlp1.pkl').
                        Using '.pkl' extension is common for pickled objects.
    """
    save_path = os.path.join(".", filename)
    if isinstance(model, torch.nn.Module):
        print(f"Saving entire PyTorch model object to: {save_path}")
        # Note: This saves the whole object via pickle. Less portable than state_dict.
        torch.save(model, save_path)
        print("PyTorch model object saved successfully.")
    elif hasattr(model, 'predict'): # Basic check for sklearn-like model
        print(f"Saving scikit-learn model object using joblib to: {save_path}")
        joblib.dump(model, save_path)
        print("Scikit-learn model object saved successfully.")
    else:
        print(f"Warning: Model type not recognized for saving: {type(model)}")

In [165]:
save_untrained_model(models['mlp1'], 'untrained_best_mlp1.pkl')

Saving entire PyTorch model object to: ./untrained_best_mlp1.pkl


AttributeError: Can't pickle local object 'build_nn1_model.<locals>.NN1Layer'

In [None]:



# --- 2. Load Untrained Model Object Function ---

def load_untrained_model(filename, device):
    """
    Loads an entire UNTRAINED model object saved using save_untrained_model.

    Args:
        filename (str): The file containing the saved model object.
        device (torch.device): The device to load PyTorch models onto ('cpu' or 'cuda').
                               Ignored for scikit-learn models.

    Returns:
        The loaded untrained model instance (torch.nn.Module or sklearn estimator).
    """
    load_path = os.path.join(".", filename)
    if not os.path.exists(load_path):
        raise FileNotFoundError(f"Model file not found at: {load_path}")

    print(f"Loading model object from: {load_path}")
    # Try loading with torch first, assume PyTorch model
    try:
        # map_location moves the model's tensors to the specified device during loading
        model = torch.load(load_path, map_location=device)
        if isinstance(model, torch.nn.Module):
             # Ensure model is fully on device (sometimes needed for internal buffers)
            model.to(device)
            print(f"PyTorch model loaded successfully onto {device}.")
            return model
        else:
            # If torch.load worked but it's not an nn.Module, maybe it was sklearn saved with torch? Unlikely.
            # Try joblib next.
            pass
    except Exception as torch_exception:
        # If torch.load fails, assume it might be a scikit-learn model saved with joblib
        print(f"torch.load failed ({torch_exception}), attempting joblib.load...")
        try:
            model = joblib.load(load_path)
            if hasattr(model, 'predict'):
                print("Scikit-learn model loaded successfully.")
                return model
            else:
                raise TypeError("Loaded object is not a recognized model type.")
        except Exception as joblib_exception:
            print(f"joblib.load also failed ({joblib_exception}).")
            raise IOError(f"Could not load model from {load_path} using torch or joblib.") from joblib_exception

    # If we somehow loaded with torch but it wasn't nn.Module, and joblib wasn't tried/failed
    raise TypeError("Could not determine model type during loading.")


# --- Workflow Integration ---

# 1. After running CV:
#    ridge_cv_results_df, best_ridge_instance = cross_val_ridge()
#    mlp1_cv_results_df, best_mlp1_instance = cross_val_mlp1()
#    # ... etc.

# 2. Determine the overall best model type and instance
#    (e.g., best_overall_model_instance = best_mlp1_instance; best_overall_model_type = 'MLP1')

# 3. Save the UNTRAINED best model instance
#    save_untrained_model(best_overall_model_instance, f"UNTRAINED_best_{best_overall_model_type}.pkl")

# --- Student Workflow (in a separate script/notebook) ---

# 1. Load the provided untrained model file
#    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#    loaded_untrained_model = load_untrained_model("UNTRAINED_best_MLP1.pkl", DEVICE)
#    best_overall_model_type = 'MLP1' # Student needs to know this

# 2. Get the corresponding best *training* hyperparameters (students would need these, maybe from the CV results df)
#    # Example: Assuming students have access to the best row of the results df
#    best_hyperparams_row = {'learning_rate': 1e-3, 'dropout_rate': 0.3, 'n_hidden': 64, 'mean_cv_score': 0.123} # Load this row
#    training_config = {
#        'learning_rate': best_hyperparams_row['learning_rate'],
#        'weight_decay': best_hyperparams_row.get('weight_decay', 0) # Handle potential absence
#    }

# 3. Define final training epochs
#    FINAL_EPOCHS = 100

# 4. Call the final training function (using the function from the previous response)
#    trained_model = train_final_model(
#        model_instance=loaded_untrained_model, # Pass the loaded untrained model
#        model_type=best_overall_model_type,
#        best_training_config=training_config,
#        data_dir=".", # Students need data directory
#        device=DEVICE,
#        batch_size=BATCH_SIZE, # Students need batch size
#        final_train_epochs=FINAL_EPOCHS
#    )

# 5. Students can now use the 'trained_model' for testing or further analysis.

In [None]:
# save the softmax model to file
save_untrained_model(models['softmax'], 'untrained_softmax.pkl')

Saving model state dictionary to: ./best_softmax.pth
Model saved successfully.


In [161]:
# save the mlp1 model to file
save_model(models['mlp1'], 'best_mlp1.pth')

Saving model state dictionary to: ./best_mlp1.pth
Model saved successfully.


In [None]:
#load the softmax model from file
softmax_loaded = load_model(build_softmax_model, 
                             input_dim, 
                             output_dim, 
                             {}, # No structural params needed for softmax
                             'best_softmax.pth', 
                             device)

In [None]:
import torch
import torch.optim as optim
import os
import pandas as pd

# --- Assume necessary helper functions and configs are defined ---
# load_data, create_pytorch_datasets_loaders, weighted_cross_entropy_loss
# build_softmax_model, build_nn1_model, build_nn2_model
# BATCH_SIZE, DEVICE
# --- End Assume ---

# --- 1. Save Model Function ---

def save_model(model, filename):
    """
    Saves the state dictionary of a PyTorch model to a file.

    Args:
        model (torch.nn.Module): The PyTorch model to save.
        filename (str): The name of the file to save the model to (e.g., 'best_mlp1.pth').
                        It will be saved in the current working directory.
    """
    save_path = os.path.join(".", filename) # Save in the current directory
    print(f"Saving model state dictionary to: {save_path}")
    torch.save(model.state_dict(), save_path)
    print("Model saved successfully.")


# --- 2. Load Model Function ---

def load_model(model_builder, input_dim, output_dim, structural_config, filename, device):
    """
    Loads a model's state dictionary into a newly built model instance.

    Args:
        model_builder (callable): The function used to build the model
                                  (e.g., build_nn1_model).
        input_dim (int): The number of input features for the model.
        output_dim (int): The number of output classes for the model.
        structural_config (dict): Dictionary containing the structural hyperparameters
                                 needed by the model_builder (e.g., {'n_hidden': 64, 'dropout_rate': 0.3}).
                                 Should NOT include training params like learning_rate.
        filename (str): The name of the file containing the saved state dictionary.
        device (torch.device): The device to load the model onto ('cpu' or 'cuda').

    Returns:
        torch.nn.Module: The model instance with loaded weights.
    """
    load_path = os.path.join(".", filename)
    if not os.path.exists(load_path):
        raise FileNotFoundError(f"Model file not found at: {load_path}")

    print(f"Building model structure using: {structural_config}")
    # Build the model structure first, passing necessary dimensions and structural params
    model = model_builder(input_dim=input_dim, output_dim=output_dim, **structural_config)

    print(f"Loading model state dictionary from: {load_path}")
    # Load the saved state dictionary. map_location ensures it loads to the specified device.
    state_dict = torch.load(load_path, map_location=device)
    model.load_state_dict(state_dict)

    # Move the model to the specified device (redundant if map_location worked, but good practice)
    model.to(device)
    print("Model loaded successfully.")
    return model


# --- 3. Train Final Model Function ---

def train_final_model(model, training_config, data_dir, device,
                      batch_size, final_train_epochs,
                      train_data_prefix="2008_2012_2016"):
    """
    Trains a given model instance on the combined training+validation dataset.

    Args:
        model (torch.nn.Module): The model instance to train (should be already built,
                                 possibly loaded).
        training_config (dict): Dictionary containing the optimal *training* hyperparameters
                                (e.g., {'learning_rate': 1e-3, 'weight_decay': 1e-5}).
                                Should NOT include structural params like n_hidden.
        data_dir (str): Directory containing the data CSV files.
        device (torch.device): The device to train on.
        batch_size (int): Batch size for training.
        final_train_epochs (int): The fixed number of epochs to train the final model for.
                                  (Could be based on median epochs from CV, or a fixed value).
        train_data_prefix (str): The prefix for the combined training data files.

    Returns:
        torch.nn.Module: The trained model instance.
    """
    print(f"\n--- Starting Final Model Training ---")
    print(f"Training configuration: {training_config}")
    print(f"Training for {final_train_epochs} epochs.")

    # 1. Load Combined Training Data
    X_train_pd, y_train_pd, wts_train_pd = load_data(train_data_prefix, data_dir)

    # 2. Create DataLoader for the combined data
    # We only need a training loader here. Using create_pytorch_datasets_loaders structure
    # but only using the training part.
    X_train_tensor = torch.tensor(X_train_pd.values, dtype=torch.float32).to(device)
    y_train_tensor = torch.tensor(y_train_pd.values, dtype=torch.float32).to(device)
    wts_train_tensor = torch.tensor(wts_train_pd[['P(C)']].values, dtype=torch.float32).to(device)
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor, wts_train_tensor)
    final_train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    print(f"Loaded final training data: {len(train_dataset)} samples, {len(final_train_loader)} batches.")

    # 3. Setup Optimizer
    # Assuming AdamW for NNs based on previous choices. Ridge is handled separately.
    # Softmax might use AdamW too if weight decay was chosen.
    optimizer_choice = optim.AdamW
    optimizer = optimizer_choice(
        model.parameters(),
        lr=training_config['learning_rate'],
        weight_decay=training_config.get('weight_decay', 0) # Use weight_decay if present
    )

    # 4. Training Loop (Fixed Epochs, No Validation/Early Stopping)
    model.to(device) # Ensure model is on the right device
    model.train()    # Set model to training mode

    for epoch in range(final_train_epochs):
        train_loss_epoch = 0.0
        for batch_idx, (features, targets, weights) in enumerate(final_train_loader):
            # Data is already on the correct device

            # Forward pass
            outputs = model(features)
            # Calculate loss (using the same weighted loss as in CV)
            loss = weighted_cross_entropy_loss(outputs, targets, weights)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss_epoch += loss.item()

        avg_train_loss = train_loss_epoch / len(final_train_loader)
        # Print progress periodically
        if (epoch + 1) % 10 == 0 or epoch == final_train_epochs - 1:
             print(f"Epoch {epoch+1}/{final_train_epochs} - Training Loss: {avg_train_loss:.6f}")

    print("--- Finished Final Model Training ---")
    return model # Return the trained model

# --- Example Workflow Integration (Conceptual) ---

# # Assuming you have run CV and obtained:
# # best_mlp1_config = {'n_hidden': 64, 'dropout_rate': 0.3, 'learning_rate': 0.001}
# # best_mlp1_model_structure = build_nn1_model(input_dim=115, output_dim=4, n_hidden=64, dropout_rate=0.3)

# # 1. Save the best untrained structure (optional, but good practice)
# save_model(best_mlp1_model_structure, "untrained_best_mlp1.pth")

# # 2. Prepare configs for loading and training
# structural_params = {k: v for k, v in best_mlp1_config.items() if k in ['n_hidden', 'dropout_rate']} # Or shared_hidden_size
# training_params = {k: v for k, v in best_mlp1_config.items() if k in ['learning_rate', 'weight_decay']}

# # Determine input/output dims (e.g., from final training data)
# final_input_dim = 115 # Replace with actual value
# final_output_dim = 4  # Replace with actual value

# # 3. Load the model structure
# loaded_model = load_model(
#     model_builder=build_nn1_model, # Pass the correct builder function
#     input_dim=final_input_dim,
#     output_dim=final_output_dim,
#     structural_config=structural_params,
#     filename="untrained_best_mlp1.pth", # Name used in save_model
#     device=DEVICE
# )

# # 4. Train the loaded model on the combined data
# # Choose number of epochs (e.g., a fixed value like 100, or based on CV results)
# FINAL_EPOCHS = 100
# trained_final_model = train_final_model(
#     model=loaded_model,
#     training_config=training_params,
#     data_dir=".", # Your data directory
#     device=DEVICE,
#     batch_size=BATCH_SIZE,
#     final_train_epochs=FINAL_EPOCHS
# )

# # 5. Save the *trained* final model before testing
# save_model(trained_final_model, "TRAINED_final_best_mlp1.pth")

In [None]:
# --- 4. 2-Hidden-Layer MLP Model Builder ---

def build_nn2_model(input_dim, output_dim, shared_hidden_size, dropout_rate):
    """
    Builds a 2-Hidden-Layer MLP with ReLU activation and Dropout.
    Uses the same size for both hidden layers.

    Architecture: Linear -> ReLU -> Dropout -> Linear -> ReLU -> Dropout -> Linear -> Softmax

    Args:
        input_dim (int): Number of input features.
        output_dim (int): Number of output classes (probabilities).
        shared_hidden_size (int): Number of neurons in *each* hidden layer.
        dropout_rate (float): Dropout probability (applied after each ReLU).

    Returns:
        torch.nn.Module: An instance of the NN2Layer model.
    """
    class NN2Layer(nn.Module):
        def __init__(self, input_dim, output_dim, shared_hidden_size, dropout_rate):
            super().__init__()
            self.layer_1 = nn.Linear(input_dim, shared_hidden_size)
            self.relu1 = nn.ReLU()
            self.dropout1 = nn.Dropout(dropout_rate)
            self.layer_2 = nn.Linear(shared_hidden_size, shared_hidden_size)
            self.relu2 = nn.ReLU()
            self.dropout2 = nn.Dropout(dropout_rate)
            self.layer_3 = nn.Linear(shared_hidden_size, output_dim)
            # Apply Softmax to get probability outputs (see note in build_softmax_model)
            self.softmax = nn.Softmax(dim=1)

        def forward(self, x):
            """Forward pass."""
            x = self.layer_1(x)
            x = self.relu1(x)
            x = self.dropout1(x)
            x = self.layer_2(x)
            x = self.relu2(x)
            x = self.dropout2(x)
            logits = self.layer_3(x)
            probabilities = self.softmax(logits)
            return probabilities

    model = NN2Layer(input_dim, output_dim, shared_hidden_size, dropout_rate)
    return model

# --- Example Instantiation (for checking) ---
# input_features = 115
# output_classes = 4
#
# softmax_model = build_softmax_model(input_features, output_classes)
# print("Softmax Model:\n", softmax_model)
#
# nn1_model = build_nn1_model(input_features, output_classes, n_hidden=64, dropout_rate=0.3)
# print("\nNN 1-Layer Model:\n", nn1_model)
#
# nn2_model = build_nn2_model(input_features, output_classes, shared_hidden_size=32, dropout_rate=0.5)
# print("\nNN 2-Layer Model:\n", nn2_model)

# --- 4. Cross-Validation Function for 2-Hidden-Layer MLP (Modified) ---

def cross_val_mlp2(fold_definitions=FOLD_DEFINITIONS,
                   batch_size=BATCH_SIZE, device=DEVICE,
                   max_epochs=MAX_EPOCHS, patience=PATIENCE):
    """
    Performs 3-fold CV hyperparameter tuning for a 2-Hidden-Layer MLP (PyTorch).

    Returns:
        tuple:
            - pd.DataFrame: Results DF sorted by score.
            - torch.nn.Module: Untrained MLP-2 model instance with the best hyperparameters.
    """
    print("--- Starting Cross-Validation for 2-Layer MLP ---")
    param_grid = {
        'shared_hidden_size': [16, 32, 64],
        'dropout_rate': [0.1, 0.3, 0.5],
        'learning_rate': [1e-2, 1e-3, 1e-4]
    }
    results_list = []
    best_score = float('inf')
    best_config = None
    optimizer_choice = optim.AdamW

    # --- Get Input/Output Dimensions Once ---
    temp_X, temp_y, _ = load_data(fold_definitions[0]['train_suffix'])
    input_dim = temp_X.shape[1]
    output_dim = temp_y.shape[1]
    del temp_X, temp_y
    # ---

    for config in ParameterGrid(param_grid):
        print(f"  Testing config: {config}")
        fold_validation_scores = []

        for i, fold in enumerate(fold_definitions):
            X_train_pd, y_train_pd, wts_train_pd = load_data(fold['train_suffix'])
            X_val_pd, y_val_pd, wts_val_pd = load_data(fold['val_suffix'])
            train_loader, val_loader = create_pytorch_datasets_loaders(
                X_train_pd, y_train_pd, wts_train_pd, X_val_pd, y_val_pd, wts_val_pd,
                batch_size
            )

            model = build_nn2_model(
                input_dim=input_dim, output_dim=output_dim,
                shared_hidden_size=config['shared_hidden_size'], dropout_rate=config['dropout_rate']
            ).to(device)
            optimizer = optimizer_choice(model.parameters(), lr=config['learning_rate'])

            validation_score = train_pytorch_model(
                model, train_loader, val_loader, weighted_cross_entropy_loss, optimizer,
                device, max_epochs, patience, verbose=False
            )
            fold_validation_scores.append(validation_score)

        avg_validation_score = np.mean(fold_validation_scores)
        print(f"  Avg Val Score for config {config}: {avg_validation_score:.6f}")
        current_result = {**config, 'mean_cv_score': avg_validation_score}
        results_list.append(current_result)

        if avg_validation_score < best_score:
            best_score = avg_validation_score
            best_config = config

    results_df = pd.DataFrame(results_list)
    results_df = results_df.sort_values(by='mean_cv_score', ascending=True)

    # Build the best model instance (untrained)
    print(f"\nBest 2-Layer MLP config found: {best_config} with score {best_score:.6f}")
    best_model_instance = build_nn2_model(
        input_dim=input_dim, output_dim=output_dim,
        shared_hidden_size=best_config['shared_hidden_size'], dropout_rate=best_config['dropout_rate']
    )

    print("--- Finished Cross-Validation for 2-Layer MLP ---")
    return results_df, best_model_instance

In [None]:
# --- Assume these are defined elsewhere ---
# FOLD_DEFINITIONS = [...]
# BATCH_SIZE = 50
# DEVICE = torch.device(...)
# MAX_EPOCHS = 300
# PATIENCE = 15
# def load_data(prefix): ...
# def create_pytorch_datasets_loaders(...): ...
# def train_pytorch_model(...): ...
# def weighted_cross_entropy_loss(...): ...
# def build_softmax_model(**config): ...
# def build_nn1_model(**config): ...
# def build_nn2_model(**config): ...
# --- End Assume ---