In [193]:
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from itertools import combinations
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import brier_score_loss
from sklearn.metrics import accuracy_score
from pytorch_tabnet.tab_model import TabNetClassifier

In [194]:
# Project Paths
# The project root is taken to be the parent of the current working directory
# (assumes the notebook is launched from a direct sub-folder of the project — TODO confirm).
# Paths to the individual data sub-directories are derived from it.
project_root = os.path.abspath(os.path.dirname(os.getcwd()))

In [195]:
# Every data section lives in its own sub-folder of <project_root>/data.
_data_root = os.path.join(project_root, 'data')
basics_dir_path = os.path.join(_data_root, 'section_1_basics')
team_box_scores_dir_path = os.path.join(_data_root, 'section_2_team_box_scores')
geography_dir_path = os.path.join(_data_root, 'section_3_geography')
public_rankings_dir_path = os.path.join(_data_root, 'section_4_public_rankings')
supplements_dir_path = os.path.join(_data_root, 'section_5_supplements')

In [196]:
# Data Completeness Check
# Check for missing values and duplicates in all files within a directory
def data_completeness(directory_path):
    """Print a missing-value and duplicate-row report for each CSV file in a directory.

    Only regular files whose name ends in ``.csv`` (case-insensitive) are read;
    sub-directories and other files are skipped so that stray non-CSV files do
    not crash ``pd.read_csv``. Files are processed in sorted name order so the
    report is reproducible. Nothing is printed for a file that has neither
    missing values nor duplicate rows.

    Parameters
    ----------
    directory_path : str
        Directory whose CSV files should be checked.

    Returns
    -------
    None
        The report is written to stdout.
    """
    # os.listdir returns entries in arbitrary order; sort for stable output.
    for file in sorted(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, file)
        if not os.path.isfile(file_path) or not file.lower().endswith('.csv'):
            continue

        # Treat empty strings as missing values, without mutating in place.
        df = pd.read_csv(file_path).replace("", np.nan)
        num_duplicates = df.duplicated().sum()
        missing_val_count_by_column = df.isnull().sum()

        if missing_val_count_by_column.any():
            print(f'Missing values in {file}:\n{missing_val_count_by_column[missing_val_count_by_column > 0]}')
        if num_duplicates > 0:
            print(f'Duplicates in {file}: {num_duplicates}')

In [197]:
# Run data validation on all relevant directories, one data section at a time
for section_dir_path in (
    basics_dir_path,
    team_box_scores_dir_path,
    geography_dir_path,
    public_rankings_dir_path,
    supplements_dir_path,
):
    data_completeness(section_dir_path)

In [198]:
# Modeling Pipeline
# Load regular season results for a given gender ('M' or 'W')
def load_regular_season_results(gender):
    """Read ``<gender>RegularSeasonCompactResults.csv`` from the basics data directory.

    Parameters
    ----------
    gender : str
        File-name prefix, 'M' or 'W'.

    Returns
    -------
    pandas.DataFrame
        The CSV contents exactly as parsed by ``pd.read_csv``.
    """
    results_path = os.path.join(basics_dir_path, f"{gender}RegularSeasonCompactResults.csv")
    return pd.read_csv(results_path)

In [199]:
# Load tournament seed data and extract numerical seed value
def load_seed_data(gender):
    """Load ``<gender>NCAATourneySeeds.csv`` and reduce ``Seed`` to its numeric part.

    The first run of digits in each raw ``Seed`` string is extracted and cast
    to ``int``; any surrounding non-digit characters are discarded.

    Parameters
    ----------
    gender : str
        File-name prefix, 'M' or 'W'.

    Returns
    -------
    pandas.DataFrame
        The seeds table with ``Seed`` replaced by an integer column.

    Raises
    ------
    ValueError
        If a ``Seed`` value contains no digits (the resulting NaN cannot be
        cast to ``int``).
    """
    filename = f"{gender}NCAATourneySeeds.csv"
    seeds_df = pd.read_csv(os.path.join(basics_dir_path, filename))
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    # expand=False returns a Series instead of a one-column DataFrame.
    seeds_df['Seed'] = seeds_df['Seed'].str.extract(r'(\d+)', expand=False).astype(int)
    return seeds_df

In [200]:
# Generate all valid matchups for a given season
def generate_matchups(team_df, season):
    """Build every unordered pairing of the distinct ``TeamID`` values in ``team_df``.

    Parameters
    ----------
    team_df : pandas.DataFrame
        Must contain a ``TeamID`` column.
    season : int
        Value written to the ``Season`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``Season``, ``Team1ID``, ``Team2ID``; within each row the
        smaller id is ``Team1ID`` and the larger is ``Team2ID``.
    """
    lower_ids = []
    higher_ids = []
    for pairing in combinations(team_df['TeamID'].unique(), 2):
        lower_ids.append(min(pairing))
        higher_ids.append(max(pairing))

    return pd.DataFrame({
        'Season': season,
        'Team1ID': lower_ids,
        'Team2ID': higher_ids
    })

In [201]:
# Aggregate average points and win rate statistics for each team
def build_team_stats(df):
    """Compute per-season, per-team scoring averages and win rate.

    Every game contributes two rows: one from the winner's perspective and one
    from the loser's. The rows are then averaged per ``(Season, TeamID)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Game results with columns ``Season``, ``WTeamID``, ``WScore``,
        ``LTeamID`` and ``LScore``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Season``, ``TeamID``, ``avg_points_for``,
        ``avg_points_against`` and ``win_pct``.
    """
    # Winner's perspective: own score is WScore, opponent's is LScore.
    winner_rows = (
        df[['Season', 'WTeamID', 'WScore', 'LScore']]
        .rename(columns={'WTeamID': 'TeamID', 'WScore': 'PointsFor', 'LScore': 'PointsAgainst'})
        .assign(Win=1)
    )
    # Loser's perspective: the score columns swap roles.
    loser_rows = (
        df[['Season', 'LTeamID', 'LScore', 'WScore']]
        .rename(columns={'LTeamID': 'TeamID', 'LScore': 'PointsFor', 'WScore': 'PointsAgainst'})
        .assign(Win=0)
    )

    per_team_games = pd.concat([winner_rows, loser_rows])
    grouped = per_team_games.groupby(['Season', 'TeamID'])
    return grouped.agg(
        avg_points_for=('PointsFor', 'mean'),
        avg_points_against=('PointsAgainst', 'mean'),
        win_pct=('Win', 'mean')
    ).reset_index()

In [202]:
def prep_training_data(gender):
    """Build the labelled feature matrix used to train the matchup model.

    Each regular-season game becomes one row. Season-level team statistics and
    tournament seeds are attached for both teams, and the two teams' feature
    columns are swapped on a random half of the rows so that the label is not
    always 1.

    NOTE(review): the team statistics are aggregated over the same games that
    are being labelled here, so each row's features include its own outcome —
    confirm this is intended.

    Parameters
    ----------
    gender : str
        File-name prefix, 'M' or 'W'.

    Returns
    -------
    X : pandas.DataFrame
        Eight feature columns (``Team{1,2}_avg_points_for``,
        ``Team{1,2}_avg_points_against``, ``Team{1,2}_win_pct``,
        ``Team{1,2}_Seed``); missing values are filled with -1.
    y : pandas.Series
        ``Team1Won``: 1 when the team in the ``Team1_*`` columns won, else 0.
    team_stats : pandas.DataFrame
        Output of ``build_team_stats`` for the loaded results.
    seeds_df : pandas.DataFrame
        Output of ``load_seed_data`` for the same gender.
    """
    df = load_regular_season_results(gender)
    team_stats = build_team_stats(df)
    seeds_df = load_seed_data(gender)

    # Create matchup rows from real games; the winner starts out as Team1.
    matchups = df[['Season', 'WTeamID', 'LTeamID']].copy()
    matchups['Team1ID'] = matchups['WTeamID']
    matchups['Team2ID'] = matchups['LTeamID']

    # Merge in features for both teams
    for i in [1, 2]:
        matchups = matchups.merge(
            team_stats,
            how='left',
            left_on=['Season', f'Team{i}ID'],
            right_on=['Season', 'TeamID']
        ).merge(
            seeds_df,
            how='left',
            left_on=['Season', f'Team{i}ID'],
            right_on=['Season', 'TeamID']
        ).rename(columns={
            'avg_points_for': f'Team{i}_avg_points_for',
            'avg_points_against': f'Team{i}_avg_points_against',
            'win_pct': f'Team{i}_win_pct',
            'Seed': f'Team{i}_Seed'
        }).drop(columns=['TeamID_x', 'TeamID_y'])  # both merges add a 'TeamID' key column

    # Randomize order of teams to avoid label bias.
    # A local RandomState yields the same draws as np.random.seed(42) followed by
    # np.random.rand, without reseeding the process-wide NumPy RNG.
    rng = np.random.RandomState(42)
    swap_mask = rng.rand(len(matchups)) < 0.5
    for col in ['avg_points_for', 'avg_points_against', 'win_pct', 'Seed']:
        team1_col, team2_col = f'Team1_{col}', f'Team2_{col}'
        # .values strips the column labels so the assignment is positional (a true swap).
        matchups.loc[swap_mask, [team1_col, team2_col]] = matchups.loc[swap_mask, [team2_col, team1_col]].values
    # Only the feature columns are swapped (Team1ID/Team2ID are not), so the
    # label is 1 exactly on the rows left unswapped.
    matchups['Team1Won'] = (~swap_mask).astype(int)

    # Select the model features; splitting into train/valid/test happens in split_data.
    feature_cols = [
        'Team1_avg_points_for', 'Team1_avg_points_against', 'Team1_win_pct', 'Team1_Seed',
        'Team2_avg_points_for', 'Team2_avg_points_against', 'Team2_win_pct', 'Team2_Seed']

    # -1 marks missing values (left merges leave NaN where no stats/seed row matched).
    X = matchups[feature_cols].fillna(-1)
    y = matchups['Team1Won']
    return X, y, team_stats, seeds_df

In [203]:
def split_data(X, y):
    """Split into train/validation/test sets and standardize the features.

    20% of the rows are held out as the test set; 25% of the remaining rows
    become the validation set. The scaler is fitted on the training rows only
    and then applied to all three splits.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split`` / ``StandardScaler``.
    y : labels aligned with ``X``.

    Returns
    -------
    tuple
        ``(scaler, X_train_scaled, y_train, X_valid_scaled, y_valid,
        X_test_scaled, y_test)``.
    """
    # First cut: hold out the test set.
    X_remaining, X_test, y_remaining, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    # Second cut: carve the validation set out of what is left.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_remaining, y_remaining, test_size=0.25, random_state=42
    )

    # Fit on training data only so validation/test statistics do not leak in.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled, X_valid_scaled, X_test_scaled = (
        scaler.transform(part) for part in (X_train, X_valid, X_test)
    )
    return scaler, X_train_scaled, y_train, X_valid_scaled, y_valid, X_test_scaled, y_test

In [204]:
def create_and_train_model(X_train_scaled, y_train, X_valid_scaled, y_valid):
    """Fit a ``TabNetClassifier`` with default settings.

    The validation split is passed to ``fit`` as its only ``eval_set`` entry.

    Returns
    -------
    TabNetClassifier
        The fitted classifier.
    """
    validation_sets = [(X_valid_scaled, y_valid)]
    classifier = TabNetClassifier()
    classifier.fit(X_train=X_train_scaled, y_train=y_train, eval_set=validation_sets)
    return classifier

In [205]:
def model_stats(model, X_test_scaled, y_test, gender):
    """Print test-set accuracy and Brier score for a fitted model.

    Parameters
    ----------
    model : object exposing ``predict`` and ``predict_proba``.
    X_test_scaled : scaled test features.
    y_test : true labels for the test rows.
    gender : str
        Label used as the prefix of the printed lines.
    """
    predicted_labels = model.predict(X_test_scaled)
    # Column 1 of predict_proba is used as the probability of the positive class.
    positive_probs = model.predict_proba(X_test_scaled)[:, 1]

    accuracy = accuracy_score(y_test, predicted_labels)
    print(f"{gender} Model Accuracy:", accuracy)
    brier = brier_score_loss(y_test, positive_probs)
    print(f"{gender} Model Brier Score: {brier:.4f}")

In [206]:
# Create features and predict win probabilities for generated matchups
def create_features_and_predict(matchups, team_stats, seeds_df, model, scaler):
    """Attach team features to candidate matchups and predict Team1's win probability.

    Parameters
    ----------
    matchups : pandas.DataFrame
        Columns ``Season``, ``Team1ID`` and ``Team2ID``. Not modified.
    team_stats : pandas.DataFrame
        Per ``(Season, TeamID)`` columns ``avg_points_for``,
        ``avg_points_against`` and ``win_pct``.
    seeds_df : pandas.DataFrame
        Per ``(Season, TeamID)`` column ``Seed``.
    model : object exposing ``predict_proba``; column 1 of its output is used.
    scaler : object exposing ``transform``; applied to the features before prediction.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID`` (``"<Season>_<Team1ID>_<Team2ID>"``) and ``Pred``.
    """
    for i in [1, 2]:
        matchups = matchups.merge(
            team_stats,
            how='left',
            left_on=['Season', f'Team{i}ID'],
            right_on=['Season', 'TeamID']
        ).merge(
            seeds_df,
            how='left',
            left_on=['Season', f'Team{i}ID'],
            right_on=['Season', 'TeamID']
        ).rename(columns={
            'avg_points_for': f'Team{i}_avg_points_for',
            'avg_points_against': f'Team{i}_avg_points_against',
            'win_pct': f'Team{i}_win_pct',
            'Seed': f'Team{i}_Seed'
        }).drop(columns=['TeamID_x', 'TeamID_y'])  # both merges add a 'TeamID' key column

    # -1 marks missing values (left merges leave NaN where no stats/seed row matched).
    features = matchups[[
        'Team1_avg_points_for', 'Team1_avg_points_against', 'Team1_win_pct', 'Team1_Seed',
        'Team2_avg_points_for', 'Team2_avg_points_against', 'Team2_win_pct', 'Team2_Seed']].fillna(-1)

    features_scaled = scaler.transform(features)
    probs = model.predict_proba(features_scaled)[:, 1]

    # Build the ID from the row's own Season rather than a hard-coded year, so the
    # prefix always matches the season the matchups were generated for.
    # Vectorized string concatenation replaces a row-by-row apply.
    matchups['ID'] = (
        matchups['Season'].astype(int).astype(str)
        + '_' + matchups['Team1ID'].astype(int).astype(str)
        + '_' + matchups['Team2ID'].astype(int).astype(str)
    )
    matchups['Pred'] = probs
    return matchups[['ID', 'Pred']]

In [207]:
# Main pipeline function to train both models and generate the submission file
def create_submission():
    """Train one model per gender, predict all 2025 matchups and write ``submission.csv``.

    The men's ('M') pipeline runs first and the women's ('W') second; their
    predictions are concatenated in that order and saved to ``submission.csv``
    in the current working directory.
    """
    season = 2025
    predictions = [_train_and_predict_for_gender(gender, season) for gender in ('M', 'W')]

    # Save combined predictions to submission file
    submission = pd.concat(predictions)
    submission.to_csv("submission.csv", index=False)
    print("Submission file saved as submission.csv")


def _train_and_predict_for_gender(gender, season):
    """Run the train / evaluate / predict pipeline for a single gender and return its predictions."""
    X, y, team_stats, seeds_df = prep_training_data(gender)
    scaler, X_train_scaled, y_train, X_valid_scaled, y_valid, X_test_scaled, y_test = split_data(X, y)
    model = create_and_train_model(X_train_scaled, y_train, X_valid_scaled, y_valid)
    model_stats(model, X_test_scaled, y_test, gender)

    # Predict every pairing of the teams listed in <gender>Teams.csv.
    teams = pd.read_csv(os.path.join(basics_dir_path, f"{gender}Teams.csv"))
    matchups = generate_matchups(teams, season)
    return create_features_and_predict(matchups, team_stats, seeds_df, model, scaler)

In [208]:
# Run the full pipeline: trains the 'M' and 'W' models, prints their test metrics
# and writes submission.csv.
create_submission()



epoch 0  | loss: 0.55229 | val_0_auc: 0.81531 |  0:00:04s
epoch 1  | loss: 0.52422 | val_0_auc: 0.81566 |  0:00:10s
epoch 2  | loss: 0.52333 | val_0_auc: 0.81627 |  0:00:15s
epoch 3  | loss: 0.5223  | val_0_auc: 0.81595 |  0:00:20s
epoch 4  | loss: 0.52162 | val_0_auc: 0.81453 |  0:00:25s
epoch 5  | loss: 0.52182 | val_0_auc: 0.81785 |  0:00:30s
epoch 6  | loss: 0.51997 | val_0_auc: 0.81775 |  0:00:35s
epoch 7  | loss: 0.51961 | val_0_auc: 0.81758 |  0:00:39s
epoch 8  | loss: 0.51951 | val_0_auc: 0.81817 |  0:00:44s
epoch 9  | loss: 0.51911 | val_0_auc: 0.81866 |  0:00:49s
epoch 10 | loss: 0.51838 | val_0_auc: 0.81853 |  0:00:54s
epoch 11 | loss: 0.51875 | val_0_auc: 0.81903 |  0:00:59s
epoch 12 | loss: 0.51893 | val_0_auc: 0.81909 |  0:01:04s
epoch 13 | loss: 0.51788 | val_0_auc: 0.81867 |  0:01:09s
epoch 14 | loss: 0.51784 | val_0_auc: 0.81961 |  0:01:14s
epoch 15 | loss: 0.51725 | val_0_auc: 0.81939 |  0:01:19s
epoch 16 | loss: 0.5179  | val_0_auc: 0.81916 |  0:01:24s
epoch 17 | los



M Model Accuracy: 0.7438708339812368
M Model Brier Score: 0.1711




epoch 0  | loss: 0.52935 | val_0_auc: 0.83873 |  0:00:03s
epoch 1  | loss: 0.48814 | val_0_auc: 0.84863 |  0:00:07s
epoch 2  | loss: 0.48158 | val_0_auc: 0.84888 |  0:00:11s
epoch 3  | loss: 0.47916 | val_0_auc: 0.85137 |  0:00:15s
epoch 4  | loss: 0.47799 | val_0_auc: 0.85236 |  0:00:18s
epoch 5  | loss: 0.47736 | val_0_auc: 0.85191 |  0:00:22s
epoch 6  | loss: 0.47591 | val_0_auc: 0.85274 |  0:00:25s
epoch 7  | loss: 0.47495 | val_0_auc: 0.8533  |  0:00:29s
epoch 8  | loss: 0.47524 | val_0_auc: 0.85072 |  0:00:32s
epoch 9  | loss: 0.47485 | val_0_auc: 0.853   |  0:00:35s
epoch 10 | loss: 0.47419 | val_0_auc: 0.85324 |  0:00:39s
epoch 11 | loss: 0.47418 | val_0_auc: 0.85385 |  0:00:42s
epoch 12 | loss: 0.47258 | val_0_auc: 0.85412 |  0:00:46s
epoch 13 | loss: 0.47415 | val_0_auc: 0.85319 |  0:00:49s
epoch 14 | loss: 0.47297 | val_0_auc: 0.85409 |  0:00:53s
epoch 15 | loss: 0.47309 | val_0_auc: 0.8536  |  0:00:56s
epoch 16 | loss: 0.47247 | val_0_auc: 0.85403 |  0:01:00s
epoch 17 | los



W Model Accuracy: 0.7679705174049478
W Model Brier Score: 0.1554
Submission file saved as submission.csv


Actual Kaggle Submission Brier Score: 0.13163