In [None]:
import numpy as np
from numpy import vstack
from pandas import read_csv, concat
import pandas as pd
import torch
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torch import nn
from torch import Tensor
from torch.nn import Linear, ReLU, Sigmoid, BatchNorm1d
from torch.nn import Module
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.nn import BCELoss
from torch.nn.init import xavier_normal_
from torch.nn.init import kaiming_uniform_

pd.set_option('display.max_columns', None)  
pd.set_option('display.width', None)  
pd.set_option('display.max_colwidth', None)  

In [165]:
class CSVDataset(Dataset):
    @staticmethod
    def flip_data(df):
        win_df = df.copy()
        lose_df = df.copy()        
        win_df = win_df.rename(columns={
                'WTeamID': 'CurrentTeamID',
                'LTeamID': 'OpponentTeamID',
                'WScore': 'CurrentTeam_Score',
                'LScore': 'OpponentTeam_Score',
                'WFGM': 'CurrentTeam_FGM',
                'WFGA': 'CurrentTeam_FGA',
                'WFGM': 'CurrentTeam_FGM',
                'WFGA': 'CurrentTeam_FGA',
                'WFGM3': 'CurrentTeam_FGM3',
                'WFGA3': 'CurrentTeam_FGA3',
                'WFTM': 'CurrentTeam_FTM',
                'WFTA': 'CurrentTeam_FTA',
                'WOR': 'CurrentTeam_OR',
                'WDR': 'CurrentTeam_DR',
                'WAst': 'CurrentTeam_Ast',
                'WTO': 'CurrentTeam_TO',
                'WStl': 'CurrentTeam_Stl',
                'WBlk': 'CurrentTeam_Blk',
                'WPF': 'CurrentTeam_PF',
                'LFGM': 'OpponentTeam_FGM',
                'LFGA': 'OpponentTeam_FGA',
                'LFGM3': 'OpponentTeam_FGM3',
                'LFGA3': 'OpponentTeam_FGA3',
                'LFTM': 'OpponentTeam_FTM',
                'LFTA': 'OpponentTeam_FTA',
                'LOR': 'OpponentTeam_OR',
                'LDR': 'OpponentTeam_DR',
                'LAst': 'OpponentTeam_Ast',
                'LTO': 'OpponentTeam_TO',
                'LStl': 'OpponentTeam_Stl',
                'LBlk': 'OpponentTeam_Blk',
                'LPF': 'OpponentTeam_PF',
                'WScore': 'CurrentTeam_Score',
                'LScore': 'OpponentTeam_Score',
        })
        win_df['Result'] = 1
        win_df['CurrentTeam_Loc'] = win_df['WLoc'].map({'H': 1, 'A': -1, 'N': 0})

        lose_df = lose_df.rename(columns={
                'LTeamID': 'CurrentTeamID',
                'WTeamID': 'OpponentTeamID',
                'LScore': 'CurrentTeam_Score',
                'WScore': 'OpponentTeam_Score',
                'LFGM': 'CurrentTeam_FGM',
                'LFGA': 'CurrentTeam_FGA',
                'LFGM': 'CurrentTeam_FGM',
                'LFGA': 'CurrentTeam_FGA',
                'LFGM3': 'CurrentTeam_FGM3',
                'LFGA3': 'CurrentTeam_FGA3',
                'LFTM': 'CurrentTeam_FTM',
                'LFTA': 'CurrentTeam_FTA',
                'LOR': 'CurrentTeam_OR',
                'LDR': 'CurrentTeam_DR',
                'LAst': 'CurrentTeam_Ast',
                'LTO': 'CurrentTeam_TO',
                'LStl': 'CurrentTeam_Stl',
                'LBlk': 'CurrentTeam_Blk',
                'LPF': 'CurrentTeam_PF',
                'WFGM': 'OpponentTeam_FGM',
                'WFGA': 'OpponentTeam_FGA',
                'WFGM3': 'OpponentTeam_FGM3',
                'WFGA3': 'OpponentTeam_FGA3',
                'WFTM': 'OpponentTeam_FTM',
                'WFTA': 'OpponentTeam_FTA',
                'WOR': 'OpponentTeam_OR',
                'WDR': 'OpponentTeam_DR',
                'WAst': 'OpponentTeam_Ast',
                'WTO': 'OpponentTeam_TO',
                'WStl': 'OpponentTeam_Stl',
                'WBlk': 'OpponentTeam_Blk',
                'WPF': 'OpponentTeam_PF',
                'LScore': 'CurrentTeam_Score',
                'WScore': 'OpponentTeam_Score',
        })
        lose_df['Result'] = 0
        lose_df['CurrentTeam_Loc'] = lose_df['WLoc'].map({'H': -1, 'A': 1, 'N': 0})

        combined = pd.concat([win_df, lose_df], ignore_index=True)
        return combined.drop(columns=['WLoc'])

    @staticmethod
    def calculate_percents(df):
        # Current team percentages
        df['CurrentTeam_FG_Pct'] = df['CurrentTeam_FGM'] / df['CurrentTeam_FGA']
        df['CurrentTeam_3P_Pct'] = df['CurrentTeam_FGM3'] / df['CurrentTeam_FGA3']
        df['CurrentTeam_FT_Pct'] = df['CurrentTeam_FTM'] / df['CurrentTeam_FTA']
        df['CurrentTeam_Reb'] = df['CurrentTeam_OR'] + df['CurrentTeam_DR']

        # Opponent team percentages
        df['OpponentTeam_FG_Pct'] = df['OpponentTeam_FGM'] / df['OpponentTeam_FGA']
        df['OpponentTeam_3P_Pct'] = df['OpponentTeam_FGM3'] / df['OpponentTeam_FGA3']
        df['OpponentTeam_FT_Pct'] = df['OpponentTeam_FTM'] / df['OpponentTeam_FTA']
        df['OpponentTeam_Reb'] = df['OpponentTeam_OR'] + df['OpponentTeam_DR']

        # Remove redundant columns
        redundant = [
            'CurrentTeam_FGM', 'CurrentTeam_FGA', 'CurrentTeam_FGM3', 'CurrentTeam_FGA3',
            'CurrentTeam_FTM', 'CurrentTeam_FTA', 'CurrentTeam_OR', 'CurrentTeam_DR',
            'OpponentTeam_FGM', 'OpponentTeam_FGA', 'OpponentTeam_FGM3', 'OpponentTeam_FGA3',
            'OpponentTeam_FTM', 'OpponentTeam_FTA', 'OpponentTeam_OR', 'OpponentTeam_DR'
        ]
        return df.drop(columns=redundant).fillna(0)
    
    def __init__(self, path):
        df = pd.read_csv(path)
        new_order = ['Season', 'DayNum', 'WTeamID','LTeamID', 'WScore', 'LScore', 'WLoc', 'NumOT', 
                    'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR','WAst', 
                    'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3','LFTM', 
                    'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF']
        df = df[new_order]

        most_recent_season = df['Season'].max()
        df['delta_time'] = most_recent_season - df['Season']
        lambda_decay = 0.5  
        df['weight'] = np.exp(-lambda_decay * df['delta_time'])

        df = self.flip_data(df)
        df = self.calculate_percents(df)
        df = df.drop(columns=['Season', 'delta_time', 'weight'], errors='ignore')
        self.X = df.drop(columns=['Result']).values.astype(np.float32)
        self.y = df['Result'].values.astype(np.float32).reshape(-1, 1)
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
    def get_splits(self, test_ratio=0.2):
        test_size = int(len(self) * test_ratio)
        return random_split(self, [len(self)-test_size, test_size])

In [149]:
class MLP(nn.Module):    
    def __init__(self, n_inputs):
            super(MLP, self).__init__()
            
            # Layer 1
            self.hidden1 = Linear(n_inputs, 20)
            kaiming_uniform_(self.hidden1.weight, nonlinearity='relu')
            self.bn1 = BatchNorm1d(20)
            self.act1 = ReLU()
            
            # Layer 2
            self.hidden2 = Linear(20, 10)
            kaiming_uniform_(self.hidden2.weight, nonlinearity='relu')
            self.bn2 = BatchNorm1d(10)
            self.act2 = ReLU()
            
            # Layer 3
            self.hidden3 = Linear(10, 8)
            kaiming_uniform_(self.hidden3.weight, nonlinearity='relu')
            self.bn3 = BatchNorm1d(8)
            self.act3 = ReLU()
            
            # Output Layer
            self.output = Linear(8, 1)
            xavier_normal_(self.output.weight)
            self.act4 = Sigmoid()

    def forward(self, X):
        X = self.act1(self.bn1(self.hidden1(X)))
        X = self.act2(self.bn2(self.hidden2(X)))
        X = self.act3(self.bn3(self.hidden3(X)))
        return self.act4(self.output(X))

In [150]:
def prepare_data(path, test_size=0.2, batch_size=64):
    # Create dataset and split
    dataset = CSVDataset(path)
    train_size = int((1 - test_size) * len(dataset))
    test_size = len(dataset) - train_size
    train_data, test_data = random_split(dataset, [train_size, test_size])
    
    # Create dataloaders
    train_dl = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    test_dl = DataLoader(test_data, batch_size=batch_size)
    return train_dl, test_dl


In [151]:
def train_model(train_dl, model, epochs=100, lr=0.01, validation_dl=None, device='cpu'):
    model = model.to(device)
    criterion = BCELoss()
    optimizer = Adam(model.parameters(), lr=lr, weight_decay=1e-4)
    scheduler = ReduceLROnPlateau(optimizer, 'min', patience=3, verbose=True)
    
    history = {'train_loss': [], 'val_loss': [], 'lr': []}
    best_val_loss = float('inf')

    for epoch in range(epochs):
        # Training Phase
        model.train()
        train_loss = 0.0
        train_iter = tqdm(train_dl, desc=f'Epoch {epoch+1}/{epochs} [Train]', leave=False)
        
        for inputs, targets in train_iter:
            inputs = inputs.to(device).float()
            targets = targets.to(device).float().view(-1, 1)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            
            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            
            optimizer.step()
            train_loss += loss.item() * inputs.size(0)
            train_iter.set_postfix(loss=loss.item())

        # Validation Phase
        val_loss = 0.0
        if validation_dl:
            model.eval()
            val_iter = tqdm(validation_dl, desc=f'Epoch {epoch+1}/{epochs} [Val]', leave=False)
            with torch.no_grad():
                for inputs, targets in val_iter:
                    inputs = inputs.to(device).float()
                    targets = targets.to(device).float().view(-1, 1)
                    outputs = model(inputs)
                    val_loss += criterion(outputs, targets).item() * inputs.size(0)

            val_loss /= len(validation_dl.dataset)
            history['val_loss'].append(val_loss)
            scheduler.step(val_loss)
            
            # Save best model
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                torch.save(model.state_dict(), 'best_model.pth')

        # Update history
        train_loss /= len(train_dl.dataset)
        history['train_loss'].append(train_loss)
        history['lr'].append(optimizer.param_groups[0]['lr'])
        
        # Epoch summary
        log = f'Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.4f}'
        if validation_dl:
            log += f' | Val Loss: {val_loss:.4f} | LR: {history["lr"][-1]:.2e}'
        print(log)

    print('Training complete')
    return model, history

def evaluate_model(test_dl, model, device='cpu'):
    model = model.to(device).eval()
    predictions, actuals = [], []
    
    with torch.no_grad():
        for inputs, targets in tqdm(test_dl, desc='Evaluating'):
            inputs = inputs.to(device).float()
            targets = targets.to(device).float()
            
            outputs = model(inputs)
            preds = torch.round(outputs).cpu().numpy()
            
            predictions.extend(preds.ravel().tolist())
            actuals.extend(targets.cpu().numpy().ravel().tolist())
    
    return {
        'accuracy': accuracy_score(actuals, predictions),
        'predictions': np.array(predictions),
        'actuals': np.array(actuals)
    }

def predict(row, model, device='cpu'):
    model = model.to(device).eval()
    with torch.no_grad():
        tensor = torch.as_tensor(row, dtype=torch.float32).to(device)
        prediction = model(tensor.unsqueeze(0))
        return torch.sigmoid(prediction).cpu().numpy().item()

In [152]:
if __name__ == '__main__':
    path = "/Users/advaithvecham/Studying Stuff/PoC/march-machine-learning-mania-2025/MRegularSeasonDetailedResults.csv"
    train_dl, test_dl = prepare_data(path)
    print(f"Training samples: {len(train_dl.dataset)}, Test samples: {len(test_dl.dataset)}")
    
    model = MLP(34)  # Match your actual input feature count
    train_model(train_dl, model)
    
    acc = evaluate_model(test_dl, model)
    print(f'Accuracy: {acc:.3f}')


Training samples: 186756, Test samples: 46690


                                                             

RuntimeError: mat1 and mat2 shapes cannot be multiplied (64x25 and 34x20)

In [164]:
# Display all rows
pd.set_option('display.max_rows', None)

# Display all columns
pd.set_option('display.max_columns', None) 

df = pd.read_csv("march-machine-learning-mania-2025/MNCAATourneyDetailedResults.csv")
new_order = ['Season', 'DayNum', 'WTeamID','LTeamID', 'WScore', 'LScore', 'WLoc',
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
       'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF']
df = df[new_order]

       # Define a function to flip the data
def flip_data(df):
       # Create the winning team's perspective (Y = 1)
       win_df = df.copy()
       win_df['CurrentTeamID'] = win_df['WTeamID']
       win_df['OpponentTeamID'] = win_df['LTeamID']
       win_df['Result'] = 1  # Win
       win_df['CurrentTeam_Loc'] = win_df['WLoc']

       # Create the losing team's perspective (Y = 0)
       lose_df = df.copy()
       lose_df['CurrentTeamID'] = lose_df['LTeamID']
       lose_df['OpponentTeamID'] = lose_df['WTeamID']
       lose_df['Result'] = 0  # Loss
       
       # Flip the location for the losing team's perspective
       lose_df['CurrentTeam_Loc'] = lose_df['WLoc'].map({'H': -1, 'A': 1, 'N':0})

       # Rename columns for the winning team's perspective
       win_df = win_df.rename(columns={
              'WFGM': 'CurrentTeam_FGM',
              'WFGA': 'CurrentTeam_FGA',
              'WFGM3': 'CurrentTeam_FGM3',
              'WFGA3': 'CurrentTeam_FGA3',
              'WFTM': 'CurrentTeam_FTM',
              'WFTA': 'CurrentTeam_FTA',
              'WOR': 'CurrentTeam_OR',
              'WDR': 'CurrentTeam_DR',
              'WAst': 'CurrentTeam_Ast',
              'WTO': 'CurrentTeam_TO',
              'WStl': 'CurrentTeam_Stl',
              'WBlk': 'CurrentTeam_Blk',
              'WPF': 'CurrentTeam_PF',
              'LFGM': 'OpponentTeam_FGM',
              'LFGA': 'OpponentTeam_FGA',
              'LFGM3': 'OpponentTeam_FGM3',
              'LFGA3': 'OpponentTeam_FGA3',
              'LFTM': 'OpponentTeam_FTM',
              'LFTA': 'OpponentTeam_FTA',
              'LOR': 'OpponentTeam_OR',
              'LDR': 'OpponentTeam_DR',
              'LAst': 'OpponentTeam_Ast',
              'LTO': 'OpponentTeam_TO',
              'LStl': 'OpponentTeam_Stl',
              'LBlk': 'OpponentTeam_Blk',
              'LPF': 'OpponentTeam_PF',
              'WScore': 'CurrentTeam_Score',
              'LScore': 'OpponentTeam_Score',
       })

       # Rename columns for the losing team's perspective
       lose_df = lose_df.rename(columns={
              'LFGM': 'CurrentTeam_FGM',
              'LFGA': 'CurrentTeam_FGA',
              'LFGM3': 'CurrentTeam_FGM3',
              'LFGA3': 'CurrentTeam_FGA3',
              'LFTM': 'CurrentTeam_FTM',
              'LFTA': 'CurrentTeam_FTA',
              'LOR': 'CurrentTeam_OR',
              'LDR': 'CurrentTeam_DR',
              'LAst': 'CurrentTeam_Ast',
              'LTO': 'CurrentTeam_TO',
              'LStl': 'CurrentTeam_Stl',
              'LBlk': 'CurrentTeam_Blk',
              'LPF': 'CurrentTeam_PF',
              'WFGM': 'OpponentTeam_FGM',
              'WFGA': 'OpponentTeam_FGA',
              'WFGM3': 'OpponentTeam_FGM3',
              'WFGA3': 'OpponentTeam_FGA3',
              'WFTM': 'OpponentTeam_FTM',
              'WFTA': 'OpponentTeam_FTA',
              'WOR': 'OpponentTeam_OR',
              'WDR': 'OpponentTeam_DR',
              'WAst': 'OpponentTeam_Ast',
              'WTO': 'OpponentTeam_TO',
              'WStl': 'OpponentTeam_Stl',
              'WBlk': 'OpponentTeam_Blk',
              'WPF': 'OpponentTeam_PF',
              'LScore': 'CurrentTeam_Score',
              'WScore': 'OpponentTeam_Score',
       })

       # Combine the two DataFrames
       flipped_df = pd.concat([win_df, lose_df], ignore_index=True)

       # Drop the original 'W' and 'L' columns
       flipped_df = flipped_df.drop(columns=['WLoc', 'WTeamID', 'LTeamID'])

       return flipped_df

def calculate_percents(df):
       # Calculate percentages for the current team
       df['CurrentTeam_FG_Pct'] = df['CurrentTeam_FGM'] / df['CurrentTeam_FGA']  # Field goal percentage
       df['CurrentTeam_3P_Pct'] = df['CurrentTeam_FGM3'] / df['CurrentTeam_FGA3']  # 3-point percentage
       df['CurrentTeam_FT_Pct'] = df['CurrentTeam_FTM'] / df['CurrentTeam_FTA']  # Free throw percentage
       df['CurrentTeam_Reb'] = df['CurrentTeam_OR'] + df['CurrentTeam_DR']  # Total rebounds

       # Calculate percentages for the opponent team
       df['OpponentTeam_FG_Pct'] = df['OpponentTeam_FGM'] / df['OpponentTeam_FGA']  # Field goal percentage
       df['OpponentTeam_3P_Pct'] = df['OpponentTeam_FGM3'] / df['OpponentTeam_FGA3']  # 3-point percentage
       df['OpponentTeam_FT_Pct'] = df['OpponentTeam_FTM'] / df['OpponentTeam_FTA']  # Free throw percentage
       df['OpponentTeam_Reb'] = df['OpponentTeam_OR'] + df['OpponentTeam_DR']  # Total rebounds

       # Drop redundant raw count columns
       redundant_columns = [
              'CurrentTeam_FGM', 'CurrentTeam_FGA', 'CurrentTeam_FGM3', 'CurrentTeam_FGA3',
              'CurrentTeam_FTM', 'CurrentTeam_FTA', 'CurrentTeam_OR', 'CurrentTeam_DR',
              'OpponentTeam_FGM', 'OpponentTeam_FGA', 'OpponentTeam_FGM3', 'OpponentTeam_FGA3',
              'OpponentTeam_FTM', 'OpponentTeam_FTA', 'OpponentTeam_OR', 'OpponentTeam_DR'
       ]
       df = df.drop(columns=redundant_columns)

       # Handle division by zero (replace NaN with 0)
       df = df.fillna(0)
       
       return df

# Calculate delta_time
most_recent_season = df['Season'].max()
df['delta_time'] = most_recent_season - df['Season']

# Calculate weights using exponential decay
lambda_decay = 0.5  # Tune this hyperparameter
df['weight'] = np.exp(-lambda_decay * df['delta_time'])

# Use the weights as sample weights
sample_weights = df['weight'].values

df = flip_data(df)

# df.pop('weight')
# df.pop('delta_time')
# df.pop('Season')
df = calculate_percents(df)

# df.columns
print(len(df.columns))

29


In [None]:
# class MLP(Module):
#     # define model elements
#     def __init__(self, n_inputs):
#         super(MLP, self).__init__()
#         self.layer = Linear(n_inputs, 1)
#         self.activation = Sigmoid()
 
#     # forward propagate input
#     def forward(self, X):
#         X = self.layer(X)
#         X = self.activation(X)
#         return X

NameError: name 'Module' is not defined