In [4]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from torch import tensor, float32, cat, mean, std
from torch.nn import Linear, ReLU, Sigmoid, BatchNorm1d, Module, MSELoss, BCELoss
from torch.optim import Adam, SGD, RMSprop
from torch.utils.data import random_split, Dataset, DataLoader, TensorDataset
from torch.nn.init import xavier_normal_
from sklearn.model_selection import train_test_split

In [5]:
class CSVDataset(Dataset):
    """
    A PyTorch Dataset built from a CSV file of basketball game results.

    Each game is expanded into two rows (one per team perspective), raw shooting
    counts are replaced by percentages, every row receives a recency-based sample
    weight, and the feature matrix is standardised column by column.

    Attributes:
        team_id (Series): ``CurrentTeamID`` of every row.
        X_dataframe (DataFrame): Un-normalised feature columns.
        y_dataframe (Series): Binary outcome labels (1 = current team won, 0 = lost).
        column_names (Index): Names of the feature columns, in tensor column order.
        X (Tensor): Standardised float32 feature tensor.
        y (Tensor): float32 label tensor of shape (n_rows, 1).
        sample_weights (Tensor): float32 sample weights of shape (n_rows, 1).
        mean (Tensor): Per-feature mean used for standardisation.
        std (Tensor): Per-feature standard deviation used for standardisation
            (exact zeros are replaced by 1e-8).
    """

    # Box-score stat suffixes that exist with both a winner ("W") and a loser ("L")
    # prefix in the raw columns renamed by `flip_data`.
    _STAT_SUFFIXES = ('FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR',
                      'Ast', 'TO', 'Stl', 'Blk', 'PF')

    @staticmethod
    def _perspective_columns(current_prefix, opponent_prefix):
        """
        Builds the column-rename mapping for one team perspective.

        Args:
            current_prefix (str): Raw prefix ('W' or 'L') of the team treated as current.
            opponent_prefix (str): Raw prefix ('W' or 'L') of the other team.

        Returns:
            dict: Raw column name -> 'CurrentTeam*' / 'OpponentTeam*' column name.
        """
        mapping = {
            f'{current_prefix}TeamID': 'CurrentTeamID',
            f'{opponent_prefix}TeamID': 'OpponentTeamID',
            f'{current_prefix}Score': 'CurrentTeam_Score',
            f'{opponent_prefix}Score': 'OpponentTeam_Score',
        }
        for suffix in CSVDataset._STAT_SUFFIXES:
            mapping[f'{current_prefix}{suffix}'] = f'CurrentTeam_{suffix}'
            mapping[f'{opponent_prefix}{suffix}'] = f'OpponentTeam_{suffix}'
        return mapping

    @staticmethod
    def _safe_ratio(made, attempted):
        """
        Element-wise ``made / attempted`` with infinities turned into NaN.

        A non-zero count divided by zero attempts yields +/-inf, which ``fillna``
        does not touch; mapping it to NaN lets the caller's ``fillna(0)`` handle
        it the same way as the 0/0 case.
        """
        return (made / attempted).replace([np.inf, -np.inf], np.nan)

    @staticmethod
    def flip_data(df):
        """
        Expands every game into two rows, one per team perspective.

        The first half of the result treats the winner as the "current" team
        (``Result`` = 1); the second half treats the loser as the current team
        (``Result`` = 0). ``W*``/``L*`` columns are renamed to
        ``CurrentTeam*``/``OpponentTeam*`` accordingly, and ``WLoc`` is converted
        into ``CurrentTeam_Loc`` (1 = home, -1 = away, 0 = neutral, from the
        current team's point of view).

        Args:
            df (DataFrame): Raw match data; must contain a ``WLoc`` column.
                The input frame is not modified.

        Returns:
            DataFrame: Winner-perspective rows followed by loser-perspective rows,
            without the ``WLoc`` column.
        """
        # `rename` returns a new frame, so the caller's data stays untouched.
        win_df = df.rename(columns=CSVDataset._perspective_columns('W', 'L'))
        win_df['Result'] = 1
        win_df['CurrentTeam_Loc'] = win_df['WLoc'].map({'H': 1, 'A': -1, 'N': 0})

        lose_df = df.rename(columns=CSVDataset._perspective_columns('L', 'W'))
        lose_df['Result'] = 0
        # WLoc describes the winner's venue, so the sign is inverted for the loser.
        lose_df['CurrentTeam_Loc'] = lose_df['WLoc'].map({'H': -1, 'A': 1, 'N': 0})

        combined = pd.concat([win_df, lose_df], ignore_index=True)
        return combined.drop(columns=['WLoc'])

    @staticmethod
    def calculate_percents(df):
        """
        Replaces raw shooting/rebound counts with derived per-game metrics.

        Adds field-goal, three-point and free-throw percentages plus total
        rebounds for both the current and the opponent team, drops the raw
        columns they were derived from, and fills every remaining missing value
        in the frame with zero (this includes percentages whose attempt count
        was zero).

        Args:
            df (DataFrame): Flipped dataset with raw ``CurrentTeam_*`` /
                ``OpponentTeam_*`` statistics. The input frame is not modified.

        Returns:
            DataFrame: New frame with the derived metrics appended as the last
            columns and the raw count columns removed.
        """
        # Work on a copy so the caller's frame does not silently gain columns.
        df = df.copy()
        ratio = CSVDataset._safe_ratio

        for side in ('CurrentTeam', 'OpponentTeam'):
            df[f'{side}_FG_Pct'] = ratio(df[f'{side}_FGM'], df[f'{side}_FGA'])
            df[f'{side}_3P_Pct'] = ratio(df[f'{side}_FGM3'], df[f'{side}_FGA3'])
            df[f'{side}_FT_Pct'] = ratio(df[f'{side}_FTM'], df[f'{side}_FTA'])
            df[f'{side}_Reb'] = df[f'{side}_OR'] + df[f'{side}_DR']

        redundant = [
            f'{side}_{stat}'
            for side in ('CurrentTeam', 'OpponentTeam')
            for stat in ('FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR')
        ]
        return df.drop(columns=redundant).fillna(0)

    def __init__(self, path, lambda_decay=0.5):
        """
        Loads the CSV at `path` and prepares features, labels and sample weights.

        Steps: read the file, flip it into two rows per game, attach a sample
        weight of ``exp(-lambda_decay * (latest_season - Season))`` normalised so
        that all weights sum to 1, derive percentage features, and standardise
        the feature matrix with its own per-column mean and standard deviation.

        Args:
            path (str): Path to the CSV file containing match data.
            lambda_decay (float, optional): Decay rate for sample weighting based
                on season recency. Defaults to 0.5.
        """
        df = pd.read_csv(path)
        most_recent_season = df['Season'].max()
        df = self.flip_data(df)

        df['SampleWeight'] = np.exp(-lambda_decay * (most_recent_season - df['Season']))
        df['SampleWeight'] = df['SampleWeight'] / df['SampleWeight'].sum()

        df = self.calculate_percents(df)
        df = df.drop(columns=['Season'])

        self.team_id = df['CurrentTeamID']
        # Identifiers, scores, the label and the weight are not model inputs.
        self.X_dataframe = df.drop(columns=[
            'Result', 'DayNum', 'CurrentTeamID', 'OpponentTeamID',
            'CurrentTeam_Score', 'OpponentTeam_Score', 'SampleWeight',
        ])
        self.y_dataframe = df['Result']

        self.column_names = self.X_dataframe.columns

        self.X = tensor(self.X_dataframe.values, dtype=float32)
        self.y = tensor(self.y_dataframe.values, dtype=float32).reshape(-1, 1)
        self.sample_weights = tensor(df['SampleWeight'].values, dtype=float32).reshape(-1, 1)

        self.mean = mean(self.X, dim=0)
        self.std = std(self.X, dim=0)
        # Constant columns have zero spread; avoid dividing by zero below.
        self.std[self.std == 0] = 1e-8
        self.X = (self.X - self.mean) / self.std

    def __len__(self):
        """
        Returns the number of samples in the dataset.

        Returns:
            int: Total number of rows (two per game in the source file).
        """
        return len(self.X)

    def __getitem__(self, idx):
        """
        Retrieves a specific sample from the dataset.

        Args:
            idx (int): Index of the sample to retrieve.

        Returns:
            tuple: ``(X[idx], y[idx], sample_weights[idx])`` — the standardised
            feature tensor, the label tensor and the sample-weight tensor.
        """
        return self.X[idx], self.y[idx], self.sample_weights[idx]

    def get_column_names(self):
        """
        Retrieves the names of the feature columns in the dataset.

        Returns:
            Index: Feature column names, in the same order as the columns of `X`.
        """
        return self.column_names

    def get_example_data(self):
        """
        Returns a small subset of the dataset for inspection.

        Returns:
            DataFrame: The first five rows of the un-normalised feature frame.
        """
        return self.X_dataframe[:5]
        

In [6]:
import torch
import torch.nn as nn
from torch.nn import Linear, BatchNorm1d, LeakyReLU, Dropout
from torch.nn.init import xavier_normal_, kaiming_uniform_

class MLP(nn.Module):
    def __init__(self, n_inputs, dropout_value=0.2, hidden_units=(20, 10, 8), relu_slope=0.01):
        """
        Initializes a multi-layer perceptron with configurable hidden layers.

        One block of Linear -> BatchNorm1d -> LeakyReLU -> Dropout is created per
        entry of `hidden_units`. Hidden Linear weights use Kaiming-uniform
        initialisation for a leaky ReLU with the configured negative slope. The
        final layer is a single Linear unit with Xavier-normal weights and no
        activation, so the model emits raw scores (logits).

        An empty `hidden_units` is allowed and yields a plain linear model
        (inputs connected directly to the output unit).

        Args:
            n_inputs (int): Number of input features.
            dropout_value (float, optional): Dropout rate applied after each hidden layer. Defaults to 0.2.
            hidden_units (sequence of int, optional): Number of neurons in each hidden layer.
                Defaults to (20, 10, 8). The sequence is copied, so later changes to the
                caller's object do not affect the model.
            relu_slope (float, optional): Negative slope for the LeakyReLU activation function. Defaults to 0.01.
        """
        super().__init__()
        # Private copy: `forward` iterates over this, so it must stay in sync with
        # the layers registered below even if the caller mutates their list.
        self.hidden_units = list(hidden_units)
        self.model_params = nn.ModuleDict()

        in_dim = n_inputs
        for i, hidden_dim in enumerate(self.hidden_units):
            layer = Linear(in_dim, hidden_dim)
            # `a` is the negative slope of the activation that follows this layer.
            kaiming_uniform_(layer.weight, a=relu_slope, nonlinearity='leaky_relu')

            self.model_params[f'hidden{i}'] = layer
            self.model_params[f'bn{i}'] = BatchNorm1d(hidden_dim)
            self.model_params[f'act{i}'] = LeakyReLU(relu_slope)
            self.model_params[f'drop{i}'] = Dropout(dropout_value)
            in_dim = hidden_dim

        # `in_dim` is the width of the last hidden layer, or `n_inputs` when there is none.
        self.output = Linear(in_dim, 1)
        xavier_normal_(self.output.weight)

    def forward(self, X):
        """
        Defines the forward pass of the MLP model.

        `X` goes through every hidden block in order (linear transformation,
        batch normalization, LeakyReLU, dropout) and then through the output
        layer. No sigmoid is applied here.

        Args:
            X (Tensor): Input tensor of shape (batch_size, n_inputs).

        Returns:
            Tensor: Output tensor of shape (batch_size, 1) holding raw scores.
        """
        for i in range(len(self.hidden_units)):
            for part in ('hidden', 'bn', 'act', 'drop'):
                X = self.model_params[f'{part}{i}'](X)
        return self.output(X)

In [7]:
def prepare_data(path, test_size=0.2, batch_size=64, get_data_headers = True):
    """
    Loads a dataset from a CSV file, splits it into training and test sets, and returns PyTorch DataLoaders.

    The file is read through `CSVDataset`; its feature, label and sample-weight
    tensors are split together with `train_test_split` (fixed `random_state=42`,
    shuffled) so the three stay row-aligned. Each side is wrapped in a
    `TensorDataset` and served through a `DataLoader`; only the training loader
    shuffles.

    NOTE(review): `CSVDataset` standardises the features with statistics of the
    whole file before the split, and it emits two mirrored rows per game that can
    land on opposite sides of the split — the test loss is therefore likely
    optimistic. Confirm whether a per-game / per-season split is wanted.

    Args:
        path (str): Path to the CSV file containing the dataset.
        test_size (float, optional): Proportion of the dataset to be used as the test set. Defaults to 0.2.
        batch_size (int, optional): Number of samples per batch for DataLoader. Defaults to 64.
        get_data_headers (bool, optional): If True, prints the column names of the dataset. Defaults to True.

    Returns:
        tuple: 
            - train_loader (DataLoader): DataLoader for the training set.
            - test_loader (DataLoader): DataLoader for the test set.
            - num_features (int): Number of features in the dataset.
    """

    data = CSVDataset(path)
    if get_data_headers:
        print(data.get_column_names())

    # Splitting all three arrays in one call keeps rows, labels and weights aligned.
    X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
        data.X, data.y, data.sample_weights, train_size=(1 - test_size), shuffle=True, random_state=42
    )

    def as_float_tensor(values):
        # `as_tensor` accepts both tensors and NumPy arrays and only copies when a
        # dtype conversion is needed, unlike re-wrapping with `torch.tensor`.
        return torch.as_tensor(values, dtype=torch.float32)

    train_dataset = TensorDataset(*(as_float_tensor(part) for part in (X_train, y_train, w_train)))
    test_dataset = TensorDataset(*(as_float_tensor(part) for part in (X_test, y_test, w_test)))

    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

    return train_loader, test_loader, len(data.get_column_names())

In [8]:
import time

def train_model(train_loader, test_loader, 
                n_inputs,  
                dropout_value=0.2,
                hidden_units=(20, 10, 8, 1),
                relu_slope=0.01,
                learning_rate=0.001, 
                epochs=50, 
                optimizer_type='adam', 
                loss_fn='bce', 
                beta_1=0.9, 
                beta_2=0.999, 
                verbose=True,
                weight_decay = 1e-5):
    """
    Trains a multi-layer perceptron (MLP) model using PyTorch, with configurable hyperparameters.

    An `MLP` is built from the given architecture settings and trained on
    `train_loader` for `epochs` epochs, on CUDA when available and on CPU
    otherwise. Every batch must provide `(features, labels, sample_weights)`;
    the per-sample loss is multiplied by the sample weight and averaged over the
    batch. After each epoch the same weighted loss is evaluated on `test_loader`
    with the model in eval mode.

    Args:
        train_loader (DataLoader): PyTorch DataLoader providing the training data batches.
        test_loader (DataLoader): PyTorch DataLoader providing the test data batches.
        n_inputs (int): Number of input features in the dataset.
        dropout_value (float, optional): Dropout rate applied to hidden layers. Defaults to 0.2.
        hidden_units (sequence of int, optional): Number of neurons in each hidden layer.
            Defaults to (20, 10, 8, 1).
        relu_slope (float, optional): Negative slope for the LeakyReLU activation function. Defaults to 0.01.
        learning_rate (float, optional): Learning rate for the optimizer. Defaults to 0.001.
        epochs (int, optional): Number of training epochs. Defaults to 50.
        optimizer_type (str, optional): Optimizer choice ('adam', 'sgd', or 'rmsprop'). Defaults to 'adam'.
        loss_fn (str, optional): 'bce' selects `BCEWithLogitsLoss` (the model output is treated
            as a logit); any other value selects `MSELoss`. Defaults to 'bce'.
        beta_1 (float, optional): First beta for Adam, `momentum` for SGD, `alpha` for RMSprop. Defaults to 0.9.
        beta_2 (float, optional): Second beta for Adam; unused by the other optimizers. Defaults to 0.999.
        verbose (bool, optional): If True, prints training progress and loss at each epoch. Defaults to True.
        weight_decay (float, optional): Weight decay passed to the optimizer. Defaults to 1e-5.

    Raises:
        ValueError: If an invalid optimizer type is provided.

    Returns:
        tuple: The trained model and a history dict with the keys `train_loss`,
        `test_loss`, `epoch_time`, `weight_norms` (one entry per epoch) and
        `best_test_loss` (lowest test loss seen).
    """
    # Only the selected optimizer is instantiated, so hyperparameters that are
    # invalid for an unselected optimizer cannot abort the run.
    optimizer_factories = {
        'adam': lambda params: Adam(params, lr=learning_rate, betas=(beta_1, beta_2), weight_decay=weight_decay),
        'sgd': lambda params: SGD(params, lr=learning_rate, momentum=beta_1, weight_decay=weight_decay),
        'rmsprop': lambda params: RMSprop(params, lr=learning_rate, alpha=beta_1, weight_decay=weight_decay),
    }
    # Validate before any model is built or moved to a device.
    if optimizer_type not in optimizer_factories:
        raise ValueError("Invalid optimizer. Choose 'adam', 'sgd', or 'rmsprop'.")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = MLP(n_inputs=n_inputs, dropout_value=dropout_value, hidden_units=hidden_units, relu_slope=relu_slope)
    model.to(device)

    # reduction="none" keeps one loss value per sample so it can be weighted below.
    if loss_fn == "bce":
        criterion = nn.BCEWithLogitsLoss(reduction="none")
    else:
        criterion = nn.MSELoss(reduction="none")

    optimizer = optimizer_factories[optimizer_type](model.parameters())

    def weighted_batch_loss(batch_X, batch_y, batch_w):
        """Sample-weighted mean loss of one batch, computed on `device`."""
        batch_X, batch_y, batch_w = batch_X.to(device), batch_y.to(device), batch_w.to(device)
        per_sample = criterion(model(batch_X), batch_y)
        return (per_sample * batch_w.view(-1, 1)).mean()

    history = {
        "train_loss": [],
        "test_loss": [],
        "epoch_time": [],
        "best_test_loss": float('inf'),  # Lowest test loss seen so far
        "weight_norms": []
    }

    for epoch in range(epochs):
        start_time = time.time()

        model.train()
        train_loss = 0.0
        for batch_X, batch_y, batch_w in train_loader:
            optimizer.zero_grad()
            final_loss = weighted_batch_loss(batch_X, batch_y, batch_w)
            final_loss.backward()
            optimizer.step()
            train_loss += final_loss.item()

        avg_train_loss = train_loss / len(train_loader)
        history["train_loss"].append(avg_train_loss)

        # Eval mode switches dropout off and makes batch norm use running statistics.
        model.eval()
        test_loss = 0.0
        with torch.no_grad():
            for batch_X, batch_y, batch_w in test_loader:
                test_loss += weighted_batch_loss(batch_X, batch_y, batch_w).item()

        avg_test_loss = test_loss / len(test_loader)
        history["test_loss"].append(avg_test_loss)

        if avg_test_loss < history["best_test_loss"]:
            history["best_test_loss"] = avg_test_loss

        # Debugging aid: norm of the first parameter tensor, to see that weights move.
        weight_norm = torch.norm(next(model.parameters())).item()
        history["weight_norms"].append(weight_norm)

        epoch_time = time.time() - start_time
        history["epoch_time"].append(epoch_time)

        if verbose:
            print(f"Epoch [{epoch+1}/{epochs}] "
                  f"- Train Loss: {avg_train_loss:.6f} "
                  f"- Test Loss: {avg_test_loss:.6f} "
                  f"- Weight Norm: {weight_norm:.6f} "
                  f"- Time: {epoch_time:.2f}s")

    return model, history

In [9]:
def compute_aggs(dataset, output_file_path='input_data.csv'):
    """
    Builds sample-weighted per-team feature averages and writes them to a CSV file.

    The rows of `dataset.X_dataframe` are grouped by `dataset.team_id`; within
    each group every feature column is averaged using `dataset.sample_weights`
    as weights. From those averages two fixed-order float32 vectors are taken
    per team: the ten `CurrentTeam_*` features and the nine `OpponentTeam_*`
    features. One CSV row per team is written, with the current-side values
    under `curr_<column>` and the opponent-side values under `opp_<column>`.

    Args:
        dataset (CSVDataset): Object exposing `team_id`, `X_dataframe` and `sample_weights`.
        output_file_path (str, optional): Destination of the CSV file.
            Defaults to 'input_data.csv'.

    Returns:
        tuple:
            - current_aggs (dict): `TeamID` (int) -> float32 array of the ten
            averaged `CurrentTeam_*` features.
            - opponent_aggs (dict): `TeamID` (int) -> float32 array of the nine
            averaged `OpponentTeam_*` features.
    """
    # Fixed layouts of the two per-team vectors.
    current_order = ['CurrentTeam_Ast', 'CurrentTeam_TO', 'CurrentTeam_Stl', 
                    'CurrentTeam_Blk', 'CurrentTeam_PF', 'CurrentTeam_Loc', 
                    'CurrentTeam_FG_Pct', 'CurrentTeam_3P_Pct', 
                    'CurrentTeam_FT_Pct', 'CurrentTeam_Reb']
    opponent_order = ['OpponentTeam_Ast', 'OpponentTeam_TO', 'OpponentTeam_Stl', 
                    'OpponentTeam_Blk', 'OpponentTeam_PF', 'OpponentTeam_FG_Pct', 
                    'OpponentTeam_3P_Pct', 'OpponentTeam_FT_Pct', 
                    'OpponentTeam_Reb']

    raw_ids = dataset.team_id
    id_series = raw_ids if isinstance(raw_ids, pd.Series) else pd.Series(raw_ids)

    # Flatten the weights to 1-D regardless of whether they arrive as a tensor.
    raw_weights = dataset.sample_weights
    if isinstance(raw_weights, torch.Tensor):
        flat_weights = raw_weights.cpu().detach().numpy().flatten()
    else:
        flat_weights = np.array(raw_weights).flatten()

    # Work on a copy so the dataset's own frame keeps its original columns.
    table = dataset.X_dataframe.copy()
    table['TeamID'] = id_series.values
    table['SampleWeight'] = flat_weights

    helper_columns = ('TeamID', 'SampleWeight')
    stat_columns = [name for name in table.columns if name not in helper_columns]

    # Weighted mean of every feature, per team.
    per_team = {
        team: {
            name: np.average(rows[name].values, weights=rows['SampleWeight'].values)
            for name in stat_columns
        }
        for team, rows in table.groupby('TeamID')
    }

    current_aggs = {}
    opponent_aggs = {}
    records = []
    for team, averages in per_team.items():
        team_key = int(team)
        current_vec = np.array([averages[name] for name in current_order], dtype=np.float32)
        opponent_vec = np.array([averages[name] for name in opponent_order], dtype=np.float32)

        current_aggs[team_key] = current_vec
        opponent_aggs[team_key] = opponent_vec

        record = {'TeamID': team_key}
        record.update({'curr_' + name: value for name, value in zip(current_order, current_vec)})
        record.update({'opp_' + name: value for name, value in zip(opponent_order, opponent_vec)})
        records.append(record)

    pd.DataFrame(records).to_csv(output_file_path, index=False)
    print(f"Aggregated features written to {output_file_path}")

    return current_aggs, opponent_aggs

In [10]:
def create_input_data_csv(dataset, output_file_path='input_data.csv'):
    """
    Writes the per-team aggregate CSV and returns its contents as a DataFrame.

    The aggregation and the file write are delegated to `compute_aggs`; the file
    it produced is then read back from disk, so the returned frame reflects
    exactly what was stored.

    Layout of the CSV (one row per team):
      - TeamID: Unique identifier for the team.
      - curr_CurrentTeam_Ast, curr_CurrentTeam_TO, curr_CurrentTeam_Stl, curr_CurrentTeam_Blk,
        curr_CurrentTeam_PF, curr_CurrentTeam_Loc, curr_CurrentTeam_FG_Pct, curr_CurrentTeam_3P_Pct,
        curr_CurrentTeam_FT_Pct, curr_CurrentTeam_Reb
      - opp_OpponentTeam_Ast, opp_OpponentTeam_TO, opp_OpponentTeam_Stl, opp_OpponentTeam_Blk,
        opp_OpponentTeam_PF, opp_OpponentTeam_FG_Pct, opp_OpponentTeam_3P_Pct, opp_OpponentTeam_FT_Pct,
        opp_OpponentTeam_Reb

    Args:
        dataset (CSVDataset): An instance of the `CSVDataset` class containing match feature data.
        output_file_path (str, optional): Where the aggregated team statistics are saved. Defaults to 'input_data.csv'.

    Returns:
        DataFrame: The aggregated team statistics as read back from `output_file_path`.
    """
    # The returned dictionaries are not needed here; only the file matters.
    compute_aggs(dataset, output_file_path=output_file_path)

    team_table = pd.read_csv(output_file_path)
    print(f"Input data CSV read from {output_file_path}")
    return team_table

In [11]:
def predict_with_aggs(submission_key, model, current_aggs, opponent_aggs, dataset, device='cpu'):
    """
    Generates a probability prediction for a match using aggregated team features.

    The submission key is parsed into team A and team B, team A's current-side
    aggregate vector and team B's opponent-side aggregate vector are looked up,
    and a feature vector is assembled **by column name** in the column order of
    `dataset.X_dataframe` (the order the model was trained on). The vector is
    standardised with `dataset.mean` / `dataset.std`, passed through the model,
    and the resulting logit is converted with a sigmoid.

    `NumOT` is not known in advance and is set to 0. A team missing from a
    dictionary falls back to an all-zero vector.

    NOTE(review): `opponent_aggs[teamB]` holds the averaged `OpponentTeam_*`
    columns of rows where team B is the *current* team, i.e. what B's opponents
    recorded against B, not B's own statistics — confirm this is the intended
    input for the opponent side.

    Args:
        submission_key (str): A string in the format "year_teamidA_teamidB" representing the match-up.
        model (torch.nn.Module): The trained PyTorch model used for prediction; expected to
            already live on `device`.
        current_aggs (dict): `TeamID` -> array of the 10 current-side aggregates, ordered
            Ast, TO, Stl, Blk, PF, Loc, FG_Pct, 3P_Pct, FT_Pct, Reb.
        opponent_aggs (dict): `TeamID` -> array of the 9 opponent-side aggregates, ordered
            Ast, TO, Stl, Blk, PF, FG_Pct, 3P_Pct, FT_Pct, Reb.
        dataset (CSVDataset): The dataset instance containing feature normalization parameters.
        device (str, optional): The device ('cpu' or 'cuda') on which the prediction should be performed. Defaults to 'cpu'.

    Raises:
        ValueError: If the submission key format is incorrect.
        ValueError: If an aggregate vector has an unexpected length, or the dataset's
            feature columns do not match the features this function can provide.

    Returns:
        float: The predicted probability that team A wins against team B.
    """
    parts = submission_key.split('_')
    if len(parts) != 3:
        raise ValueError("Submission key must be in the format 'year_teamidA_teamidB'.")
    _, teamA_str, teamB_str = parts
    teamA = int(teamA_str)
    teamB = int(teamB_str)

    # Layout of the vectors stored in the aggregate dictionaries.
    current_names = ['CurrentTeam_Ast', 'CurrentTeam_TO', 'CurrentTeam_Stl',
                     'CurrentTeam_Blk', 'CurrentTeam_PF', 'CurrentTeam_Loc',
                     'CurrentTeam_FG_Pct', 'CurrentTeam_3P_Pct',
                     'CurrentTeam_FT_Pct', 'CurrentTeam_Reb']
    opponent_names = ['OpponentTeam_Ast', 'OpponentTeam_TO', 'OpponentTeam_Stl',
                      'OpponentTeam_Blk', 'OpponentTeam_PF', 'OpponentTeam_FG_Pct',
                      'OpponentTeam_3P_Pct', 'OpponentTeam_FT_Pct',
                      'OpponentTeam_Reb']

    # If a team does not have history, default to zeros.
    current_features = current_aggs.get(teamA, np.zeros(len(current_names), dtype=np.float32))
    opponent_features = opponent_aggs.get(teamB, np.zeros(len(opponent_names), dtype=np.float32))
    if len(current_features) != len(current_names) or len(opponent_features) != len(opponent_names):
        raise ValueError(
            f"Aggregate vectors have lengths {len(current_features)} and {len(opponent_features)}; "
            f"expected {len(current_names)} and {len(opponent_names)}."
        )

    # Game-level feature: overtime count is unknown before the game, use 0.
    values_by_name = {'NumOT': 0.0}
    values_by_name.update(zip(current_names, current_features))
    values_by_name.update(zip(opponent_names, opponent_features))

    # The training frame does not keep current and opponent features in two
    # contiguous runs, so the vector must be laid out by name, not by position.
    training_columns = list(dataset.X_dataframe.columns)
    unavailable = [name for name in training_columns if name not in values_by_name]
    unused = [name for name in values_by_name if name not in training_columns]
    if unavailable or unused:
        raise ValueError(
            f"Submission features do not match the training columns "
            f"(no value for {unavailable}; not a training column: {unused})."
        )
    submission_features = np.array([values_by_name[name] for name in training_columns], dtype=np.float32)

    # Normalize using the training mean and std (stored as torch tensors).
    submission_tensor = torch.tensor(submission_features, dtype=torch.float32).to(device)
    submission_tensor = (submission_tensor - dataset.mean.to(device)) / dataset.std.to(device)
    submission_tensor = submission_tensor.unsqueeze(0)  # Add batch dimension.

    model.eval()
    with torch.no_grad():
        logits = model(submission_tensor)
        prob = torch.sigmoid(logits).item()  # Convert logits to probability.
    return prob

In [12]:
def predict(submission_file_path, model, dataset_input, lambda_decay=0.5, device='cpu'):
    """
    Updates a Kaggle-style submission file by computing match predictions using a trained model.

    The submission file must have an "ID" column whose values are submission
    keys formatted as "year_teamidA_teamidB". Per-team aggregates are computed
    with `compute_aggs`, every key is scored with `predict_with_aggs`, and the
    probabilities are stored in the "Pred" column. The result is written back
    to `submission_file_path`, overwriting the file that was read.

    Args:
        submission_file_path (str): Path to the CSV file containing the match submission entries.
        model (torch.nn.Module): The trained PyTorch model used for making predictions.
            It is moved to `device` before scoring.
        dataset_input (str or CSVDataset): Either a file path to a CSV dataset or an instance of `CSVDataset`.
        lambda_decay (float, optional): Decay factor for sample weighting; only used when
            `dataset_input` is a path and a `CSVDataset` has to be built here. Defaults to 0.5.
        device (str, optional): The device ('cpu' or 'cuda') on which the model should run. Defaults to 'cpu'.

    Returns:
        None: The function updates the submission file in-place and writes the modified file to disk.

    Side Effects:
        - Reads the dataset from a file if `dataset_input` is a string.
        - Computes aggregated team statistics and stores them in a CSV file (`input_data.csv`).
        - Moves `model` to `device`.
        - Reads the submission file, updates the "Pred" column with probabilities, and writes the modified file.
    """
    output_file_path = submission_file_path

    # If dataset_input is a string, create a CSVDataset object from it.
    if isinstance(dataset_input, str):
        dataset = CSVDataset(dataset_input, lambda_decay=lambda_decay)
    else:
        dataset = dataset_input

    # Compute the aggregate dictionaries (current and opponent) and create the CSV file.
    current_aggs, opponent_aggs = compute_aggs(dataset, output_file_path='input_data.csv')

    sub_df = pd.read_csv(submission_file_path)

    # Inputs are created on `device` for every key, so the model has to be there
    # too (e.g. a model trained on CUDA that is scored with device='cpu').
    model.to(device)

    sub_df['Pred'] = [
        predict_with_aggs(submission_key, model, current_aggs, opponent_aggs, dataset, device=device)
        for submission_key in sub_df['ID']
    ]
    sub_df.to_csv(output_file_path, index=False)
    print(f"Submission file written to {output_file_path}")

In [15]:
# Folder holding the Kaggle "March Machine Learning Mania 2025" CSV files.
# Adjust this one line when running on another machine.
DATA_DIR = "/Users/advaithvecham/Studying Stuff/PoC/march-machine-learning-mania-2025"
TOURNEY_RESULTS_CSV = f"{DATA_DIR}/MNCAATourneyDetailedResults.csv"

train_loader, test_loader, n_inputs = prepare_data(TOURNEY_RESULTS_CSV, test_size=0.2, batch_size=64, get_data_headers=True)

# `n_inputs` comes straight from prepare_data, so no batch has to be drawn from
# the shuffled loader just to re-derive the feature count.
model, loss_history = train_model(train_loader, test_loader, 
                n_inputs,  
                dropout_value=0.2,
                hidden_units=[20, 10, 8, 1],
                relu_slope=0.1,
                learning_rate=0.001, 
                epochs=120, 
                optimizer_type='adam', 
                loss_fn ='bce', 
                beta_1=0.9, 
                beta_2=0.999, 
                verbose=True)

# The per-epoch values were already printed by train_model (verbose=True) and
# remain available in `loss_history`; show only the headline number here.
print(f"Best test loss: {loss_history['best_test_loss']:.6f}")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/advaithvecham/Studying Stuff/PoC/march-machine-learning-mania-2025/MNCAATourneyDetailedResults.csv'

In [14]:
# Learning curves: one point per epoch, as recorded by train_model.
fig, ax = plt.subplots()
ax.plot(loss_history['train_loss'], label='train')
ax.plot(loss_history['test_loss'], label='test')
ax.set(title='Loss per epoch', xlabel='Epoch', ylabel='Sample-weighted loss')
ax.legend();


NameError: name 'loss_history' is not defined

In [28]:
# Rebuild the dataset so its feature mean/std and per-team aggregates are available for scoring.
dataset = CSVDataset(
    "/Users/advaithvecham/Studying Stuff/PoC/march-machine-learning-mania-2025/MNCAATourneyDetailedResults.csv",
    lambda_decay=0.5,
)

# Fills the "Pred" column of the Stage 2 sample submission and overwrites that file.
# To score Stage 1 instead, pass the SampleSubmissionStage1.csv file from the same folder.
predict(
    "/Users/advaithvecham/Studying Stuff/PoC/march-machine-learning-mania-2025/SampleSubmissionStage2.csv",
    model,
    dataset,
    lambda_decay=0.5,
    device='cpu',
)

Aggregated features written to input_data.csv
Submission file written to /Users/advaithvecham/Studying Stuff/PoC/march-machine-learning-mania-2025/SampleSubmissionStage2.csv
