In [57]:
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from itertools import combinations
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import brier_score_loss
from sklearn.metrics import accuracy_score
from pytorch_tabnet.tab_model import TabNetClassifier

In [58]:
# Project Paths
# Define the root project directory and construct paths to relevant subdirectories
project_root = os.path.abspath(os.path.join(os.path.dirname(os.getcwd())))

In [59]:
basics_dir_path = os.path.join(project_root, 'data', 'section_1_basics')
team_box_scores_dir_path = os.path.join(project_root, 'data', 'section_2_team_box_scores')
geography_dir_path = os.path.join(project_root, 'data', 'section_3_geography')
public_rankings_dir_path = os.path.join(project_root, 'data', 'section_4_public_rankings')
supplements_dir_path = os.path.join(project_root, 'data', 'section_5_supplements')

In [60]:
# Modeling Pipeline
# Load regular season results for a given gender ('M' or 'W')
def load_regular_season_results(gender, season):
    filename = f"{gender}RegularSeasonCompactResults.csv"
    reg_res = pd.read_csv(os.path.join(basics_dir_path, filename))
    return reg_res[reg_res['Season'] < season]

In [61]:
# Load tournament seed data and extract numerical seed value
def load_seed_data(gender, season):
    filename = f"{gender}NCAATourneySeeds.csv"
    seeds_df = pd.read_csv(os.path.join(basics_dir_path, filename))
    seeds_df = seeds_df[seeds_df['Season'] < season]
    return seeds_df

In [75]:
# Load Tournament Data
def load_tourney_data(gender, season):
    filename = f"{gender}NCAATourneyCompactResults.csv"
    tourney_df = pd.read_csv(os.path.join(basics_dir_path, filename))
    return tourney_df[tourney_df['Season'] < season]

In [76]:
#Load Regular Season Data
def create_matchups(tourney_df):
    rows = []
    for _, row in tourney_df.iterrows():
        team1 = min(row['WTeamID'], row['LTeamID'])
        team2 = max(row['WTeamID'], row['LTeamID'])
        team1won = 1 if team1 == row['WTeamID'] else 0
        rows.append({
            'season': row['Season'],
            'team1ID': team1,
            'team2ID': team2,
            'team1won': team1won
        })
    return pd.DataFrame(rows)

In [77]:
# Aggregate team's regular season stats into one df
def aggregate_team_stats(df):
    w_stats = df.groupby(['Season', 'WTeamID']).agg(
        w_score_avg=('WScore', 'mean'),
        w_games=('WScore', 'count')
    ).rename_axis(['Season', 'TeamID']).reset_index()
    
    l_stats = df.groupby(['Season', 'LTeamID']).agg(
        l_score_avg=('LScore', 'mean'),
        l_games=('LScore', 'count')
    ).rename_axis(['Season', 'TeamID']).reset_index()

    stats = pd.merge(w_stats, l_stats, on=['Season', 'TeamID'], how='outer').fillna(0)
    stats['games'] = stats['w_games'] + stats['l_games']
    stats['win_rate'] = stats['w_games'] / stats['games']
    stats['avg_score'] = (stats['w_score_avg'] * stats['w_games'] + stats['l_score_avg'] * stats['l_games']) / stats['games']
    return stats[['Season', 'TeamID', 'avg_score', 'games', 'win_rate']]

In [78]:
# Generate all valid matchups for a given season
def generate_matchups(team_df, season):
    team_ids = team_df['TeamID'].unique()
    pairs = [(a, b) for a, b in combinations(team_ids, 2)]
    return pd.DataFrame({
        'Season': season,
        'team1ID': [min(a, b) for a, b in pairs],
        'team2ID': [max(a, b) for a, b in pairs]
    })

In [79]:
#Set up Training Data Matrix
def prep_training_data(gender):
    regular_season_df = load_regular_season_results('M', 2025)
    seeds_df = load_seed_data('M', 2025)
    tourney_df = load_tourney_data('M', 2025)
    
    #Create ALL Valid Tournament Matchups Per Season
    matchups_df = create_matchups(tourney_df)
    team_stats = aggregate_team_stats(regular_season_df)

    seeds_df['Seed'] = seeds_df['Seed'].str.extract('(\d+)').astype(int)
    matchups_df = matchups_df.rename(columns={'season': 'Season'})
    team_stats = team_stats.rename(columns={'season': 'Season'})
    seeds_df = seeds_df.rename(columns={'Season': 'Season'})

    # Rename for merging
    team1_seeds = seeds_df.rename(columns={'Season':'Season', 'TeamID': 'team1ID', 'Seed': 'team1_seed'})
    team2_seeds = seeds_df.rename(columns={'Seaspn':'Season','TeamID': 'team2ID', 'Seed': 'team2_seed'})

    # Merge all df's into one
    merged_df = matchups_df \
        .merge(team1_seeds[['Season', 'team1ID', 'team1_seed']], on=['Season', 'team1ID'], how='left') \
        .merge(team2_seeds[['Season', 'team2ID', 'team2_seed']], on=['Season', 'team2ID'], how='left') \
        .merge(team_stats.rename(columns={'TeamID': 'team1ID', 'avg_score': 'team1_avg_score', 'games': 'team1_games', 'win_rate': 'team1_win_rate'}), on=['Season', 'team1ID'], how='left') \
        .merge(team_stats.rename(columns={'TeamID': 'team2ID', 'avg_score': 'team2_avg_score', 'games': 'team2_games', 'win_rate': 'team2_win_rate'}), on=['Season', 'team2ID'], how='left')

    # Feature Engineering Seed Differences and Score Differences
    merged_df['seed_diff'] = merged_df['team1_seed'] - merged_df['team2_seed']
    merged_df['score_diff'] = merged_df['team1_avg_score'] - merged_df['team2_avg_score']

    cols = [
        'team1_seed', 'team1_avg_score', 'team1_games', 'team1_win_rate',
        'team2_seed', 'team2_avg_score', 'team2_games', 'team2_win_rate',
        'seed_diff', 'score_diff',
        'team1won'
    ]
    merged_df = merged_df[cols]
    return merged_df

In [80]:
def split_data(X, y):
    X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
    )

    #Split Training into train and valid
    X_train, X_valid, y_train, y_valid = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_valid_scaled = scaler.transform(X_valid)
    X_test_scaled = scaler.transform(X_test)
    return scaler, X_train_scaled, y_train, X_valid_scaled, y_valid, X_test_scaled, y_test

In [81]:
def create_and_train_model(X_train_scaled, y_train, X_valid_scaled, y_valid):
    model = TabNetClassifier()
    model.fit(X_train=X_train_scaled, y_train=y_train, eval_set=[(X_valid_scaled, y_valid)])
    return model

In [83]:
def model_stats(model, X_test_scaled, y_test, gender):
    pred = model.predict(X_test_scaled)
    probs = model.predict_proba(X_test_scaled)[:, 1]

    print(f"{gender} Model Accuracy:", accuracy_score(y_test, pred))
    brier = brier_score_loss(y_test, probs)
    print(f"{gender} Model Brier Score: {brier:.4f}")

In [84]:
#Same as prepping training data except without 2025 tourney data because that is what were predicting
def create_predicting_features(gender, matchups):
    regular_season_df = load_regular_season_results(gender, 2026)
    regular_season_df = regular_season_df[regular_season_df['Season'] == 2025]
    
    seeds_df = load_seed_data(gender, 2026)
    seeds_df = seeds_df[seeds_df['Season'] == 2025]

    team_stats = aggregate_team_stats(regular_season_df)

    seeds_df['Seed'] = seeds_df['Seed'].str.extract('(\d+)').astype(int)
    
    matchups = matchups.rename(columns={'season': 'Season'})
    team_stats = team_stats.rename(columns={'season': 'Season'})
    seeds_df = seeds_df.rename(columns={'Season': 'Season'})

    team1_seeds = seeds_df.rename(columns={'Season':'Season', 'TeamID': 'team1ID', 'Seed': 'team1_seed'})
    team2_seeds = seeds_df.rename(columns={'Season':'Season','TeamID': 'team2ID', 'Seed': 'team2_seed'})

    merged_df = matchups \
        .merge(team1_seeds[['Season', 'team1ID', 'team1_seed']], on=['Season', 'team1ID'], how='left') \
        .merge(team2_seeds[['Season', 'team2ID', 'team2_seed']], on=['Season', 'team2ID'], how='left') \
        .merge(team_stats.rename(columns={'TeamID': 'team1ID', 'avg_score': 'team1_avg_score', 'games': 'team1_games', 'win_rate': 'team1_win_rate'}), on=['Season', 'team1ID'], how='left') \
        .merge(team_stats.rename(columns={'TeamID': 'team2ID', 'avg_score': 'team2_avg_score', 'games': 'team2_games', 'win_rate': 'team2_win_rate'}), on=['Season', 'team2ID'], how='left').fillna(17)

    merged_df['seed_diff'] = merged_df['team1_seed'] - merged_df['team2_seed']
    merged_df['score_diff'] = merged_df['team1_avg_score'] - merged_df['team2_avg_score']

    cols = [
        'team1_seed', 'team1_avg_score', 'team1_games', 'team1_win_rate',
        'team2_seed', 'team2_avg_score', 'team2_games', 'team2_win_rate',
        'seed_diff', 'score_diff'
    ]
    merged_df = merged_df[cols]
    return merged_df

In [85]:
def evaluate_model():
    m_df = prep_training_data('M')
    cols = [
        'team1_seed', 'team1_avg_score', 'team1_games', 'team1_win_rate',
        'team2_seed', 'team2_avg_score', 'team2_games', 'team2_win_rate',
        'seed_diff', 'score_diff'
    ]
    m_X = m_df[cols]
    m_y = m_df['team1won']
    m_scaler, m_X_train_scaled, m_y_train, m_X_valid_scaled, m_y_valid, m_X_test_scaled, m_y_test = split_data(m_X, m_y)
    m_model = create_and_train_model(m_X_train_scaled, m_y_train, m_X_valid_scaled, m_y_valid)
    model_stats(m_model, m_X_test_scaled, m_y_test, 'M')
    m_teams = pd.read_csv(os.path.join(basics_dir_path, "MTeams.csv"))
    m_teams = m_teams[m_teams['LastD1Season']==2025]
    m_matchups = generate_matchups(m_teams, 2025)
    m_pred_df = create_predicting_features('M', m_matchups)
    m_pred_scaled = m_scaler.transform(m_pred_df)
    m_probs = m_model.predict_proba(m_pred_scaled)[:,1]
    m_matchups['ID'] = m_matchups.apply(lambda row: f"2025_{int(row.team1ID)}_{int(row.team2ID)}", axis=1)
    m_matchups['Pred'] = m_probs
    men_submission = m_matchups[['ID', 'Pred']]

    w_df = prep_training_data('W')
    w_X = w_df[cols]
    w_y = w_df['team1won']
    w_scaler, w_X_train_scaled, w_y_train, w_X_valid_scaled, w_y_valid, w_X_test_scaled, w_y_test = split_data(w_X, w_y)
    w_model = create_and_train_model(w_X_train_scaled, w_y_train, w_X_valid_scaled, w_y_valid)
    model_stats(w_model, w_X_test_scaled, w_y_test, 'W')
    w_teams = pd.read_csv(os.path.join(basics_dir_path, "WTeams.csv"))
    w_matchups = generate_matchups(w_teams, 2025)
    w_pred_df = create_predicting_features('W', w_matchups)
    w_pred_scaled = w_scaler.transform(w_pred_df)
    w_probs = w_model.predict_proba(w_pred_scaled)[:,1]
    w_matchups['ID'] = w_matchups.apply(lambda row: f"2025_{int(row.team1ID)}_{int(row.team2ID)}", axis=1)
    w_matchups['Pred'] = w_probs
    women_submission = w_matchups[['ID', 'Pred']]

    # Save combined predictions to submission file
    submission = pd.concat([men_submission, women_submission])
    submission.to_csv(f"submission.csv", index=False)
    print(f"Submission file saved as submission.csv")
    return submission
evaluate_model()



epoch 0  | loss: 0.78113 | val_0_auc: 0.62406 |  0:00:00s
epoch 1  | loss: 0.66906 | val_0_auc: 0.71222 |  0:00:00s
epoch 2  | loss: 0.62379 | val_0_auc: 0.74955 |  0:00:00s
epoch 3  | loss: 0.58548 | val_0_auc: 0.74707 |  0:00:00s
epoch 4  | loss: 0.59038 | val_0_auc: 0.75001 |  0:00:00s
epoch 5  | loss: 0.57772 | val_0_auc: 0.75571 |  0:00:00s
epoch 6  | loss: 0.57748 | val_0_auc: 0.76476 |  0:00:00s
epoch 7  | loss: 0.56846 | val_0_auc: 0.77875 |  0:00:00s
epoch 8  | loss: 0.56951 | val_0_auc: 0.78133 |  0:00:00s
epoch 9  | loss: 0.56612 | val_0_auc: 0.78224 |  0:00:00s
epoch 10 | loss: 0.56302 | val_0_auc: 0.77794 |  0:00:00s
epoch 11 | loss: 0.55702 | val_0_auc: 0.77993 |  0:00:00s
epoch 12 | loss: 0.56734 | val_0_auc: 0.78309 |  0:00:00s
epoch 13 | loss: 0.54017 | val_0_auc: 0.78328 |  0:00:00s
epoch 14 | loss: 0.5442  | val_0_auc: 0.78468 |  0:00:00s
epoch 15 | loss: 0.56055 | val_0_auc: 0.78895 |  0:00:00s
epoch 16 | loss: 0.54599 | val_0_auc: 0.79029 |  0:00:00s
epoch 17 | los



epoch 0  | loss: 0.78113 | val_0_auc: 0.62406 |  0:00:00s
epoch 1  | loss: 0.66906 | val_0_auc: 0.71222 |  0:00:00s
epoch 2  | loss: 0.62379 | val_0_auc: 0.74955 |  0:00:00s
epoch 3  | loss: 0.58548 | val_0_auc: 0.74707 |  0:00:00s
epoch 4  | loss: 0.59038 | val_0_auc: 0.75001 |  0:00:00s
epoch 5  | loss: 0.57772 | val_0_auc: 0.75571 |  0:00:00s
epoch 6  | loss: 0.57748 | val_0_auc: 0.76476 |  0:00:00s
epoch 7  | loss: 0.56846 | val_0_auc: 0.77875 |  0:00:00s
epoch 8  | loss: 0.56951 | val_0_auc: 0.78133 |  0:00:00s
epoch 9  | loss: 0.56612 | val_0_auc: 0.78224 |  0:00:00s
epoch 10 | loss: 0.56302 | val_0_auc: 0.77794 |  0:00:00s
epoch 11 | loss: 0.55702 | val_0_auc: 0.77993 |  0:00:00s
epoch 12 | loss: 0.56734 | val_0_auc: 0.78309 |  0:00:00s
epoch 13 | loss: 0.54017 | val_0_auc: 0.78328 |  0:00:00s
epoch 14 | loss: 0.5442  | val_0_auc: 0.78468 |  0:00:00s
epoch 15 | loss: 0.56055 | val_0_auc: 0.78895 |  0:00:00s
epoch 16 | loss: 0.54599 | val_0_auc: 0.79029 |  0:00:00s
epoch 17 | los



Submission file saved as submission.csv


Unnamed: 0,ID,Pred
0,2025_1101_1102,0.447608
1,2025_1101_1103,0.828813
2,2025_1101_1104,0.004510
3,2025_1101_1105,0.376583
4,2025_1101_1106,0.322020
...,...,...
71248,2025_3477_3479,0.243521
71249,2025_3477_3480,0.361354
71250,2025_3478_3479,0.355252
71251,2025_3478_3480,0.355252


Brier Score for Kaggle Submission: 0.14814