In [8]:
"""CREDIT TO PRUTHA ANNADATE, https://www.kaggle.com/code/pruthaannadate/catboost-march-machine-learning-mania-25"""

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations

from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import brier_score_loss, f1_score, accuracy_score, auc, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold

from xgboost import XGBClassifier

from keras.layers import Dense
from keras.models import Sequential

# Helper functions

In [9]:
def load_df_dict(path='march-machine-learning-mania-2025/'):
    """Load every CSV file found directly inside ``path`` into a dict.

    Parameters
    ----------
    path : str
        Directory containing the competition CSV files. A trailing
        separator is optional.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Maps the file name up to its first ``.`` (e.g. ``'MTeams'`` for
        ``MTeams.csv``) to the parsed DataFrame.
    """
    data_dict = {}

    # sorted() makes the load order independent of the filesystem's listing order.
    for fname in sorted(os.listdir(path)):
        full_path = os.path.join(path, fname)

        # Skip sub-directories (e.g. .ipynb_checkpoints) and non-CSV files,
        # which would otherwise be handed to read_csv.
        if not os.path.isfile(full_path) or not fname.lower().endswith('.csv'):
            continue

        data_dict[fname.split('.')[0]] = pd.read_csv(full_path)

    return data_dict

In [122]:
# fix feature set logic so you dont need to pass both W and L, append at the end
# add models iteratively 

def bracketology(dataset, feature_set=['WTeamID', 'LTeamID'], model_name='catboost'):
    """Cross-validate one classifier on ``dataset`` and refit it on every row.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain every column named in ``feature_set`` plus a ``result``
        column used as the target.
    feature_set : list of str
        Columns used as model inputs.
    model_name : {'catboost', 'xgb', 'lr'}
        Which estimator to build.

    Returns
    -------
    dict
        ``model_name``, ``model`` (a fresh estimator fitted on all rows), and
        the fold-averaged ``auc``, ``accuracy`` and ``brier_score``.

    Raises
    ------
    KeyError
        If ``model_name`` is not one of the supported names.
    """
    # Factories rather than instances: only the requested estimator is built,
    # and every fit starts from a fresh, unfitted object.
    model_directory = {
        'catboost': lambda: CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.1, loss_function='Logloss', verbose=200),
        'xgb': lambda: XGBClassifier(enable_categorical=True),
        'lr': lambda: LogisticRegression(),
    }
    if model_name not in model_directory:
        raise KeyError(
            f"Unknown model_name {model_name!r}; expected one of {sorted(model_directory)}"
        )
    make_model = model_directory[model_name]

    # list(...) copies the caller's (possibly shared default) list before indexing.
    X = dataset[list(feature_set)]
    y = dataset['result']

    # Four unshuffled stratified folds, so the split is deterministic.
    k_strat = StratifiedKFold(n_splits=4, shuffle=False)

    # Named fold_* so the imported sklearn.metrics.auc is not shadowed.
    fold_auc = []
    fold_accuracy = []
    fold_brier = []

    for train, test in k_strat.split(X, y):
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y.values[train], y.values[test]

        fold_model = make_model()
        fold_model.fit(X_train, y_train)

        y_preds = fold_model.predict(X_test)
        y_probs = fold_model.predict_proba(X_test)

        # Column 1 of predict_proba is the probability of class 1.
        fold_auc.append(roc_auc_score(y_test, y_probs[:, 1]))
        fold_accuracy.append(accuracy_score(y_test, y_preds))
        fold_brier.append(brier_score_loss(y_test, y_probs[:, 1]))

    # The returned estimator is trained on every row, not on a single fold.
    model = make_model()
    model.fit(X, y)

    return {
        'model_name': model_name,
        'model': model,
        'auc': np.average(fold_auc),
        'accuracy': np.average(fold_accuracy),
        'brier_score': np.average(fold_brier),
    }
            

# Pull in basic stats

In [11]:
# Load every competition CSV, then stack the men's and women's tournament
# results (and team tables) into single frames.
dfs = load_df_dict()
results = pd.concat([dfs['MNCAATourneyDetailedResults'], dfs['WNCAATourneyDetailedResults']])
teams = pd.concat([dfs['MTeams'], dfs['WTeams']])

# Split each game into its winner-side columns, loser-side columns, and the
# remaining box-score columns.
results_W = results[['Season', 'DayNum', 'WTeamID', 'WScore']]
results_L = results[['Season', 'DayNum', 'LTeamID', 'LScore']]
stats = results.drop(['Season', 'DayNum', 'WTeamID', 'WScore', 'Season', 'DayNum', 'LTeamID', 'LScore'], axis=1)

# Attach team metadata to each side, prefixing the metadata columns with W / L.
full_W = pd.merge(left=results_W, right=teams, left_on='WTeamID', right_on='TeamID', how='left')
full_W = full_W.rename(columns={'TeamName':'WTeamName', 'FirstD1Season':'WFirstD1Season', 'LastD1Season':'WLastD1Season'}).drop('TeamID', axis=1)
full_L = pd.merge(left=results_L, right=teams, left_on='LTeamID', right_on='TeamID', how='left')
full_L = full_L.rename(columns={'TeamName':'LTeamName', 'FirstD1Season':'LFirstD1Season', 'LastD1Season':'LLastD1Season'}).drop('TeamID', axis=1).drop(['Season', 'DayNum'],axis=1)

# Glue the three pieces back together column-wise. stats.reset_index() adds an
# 'index' column, which is dropped together with 'WLoc'.
# NOTE(review): axis=1 concat aligns on index labels; this assumes both merge
# outputs and stats.reset_index() carry the same 0..n-1 index - confirm.
df = pd.concat([full_W, full_L, stats.reset_index()],axis=1).drop(['index', 'WLoc'], axis=1)

# Build a mirrored copy of every game with the W and L columns swapped, so each
# game contributes one row labelled result=1 and one labelled result=0.
# NOTE(review): the author was unsure about duplicating rows this way.
new_df = pd.DataFrame({})
# Every column whose name contains 'W' or 'L', sorted in reverse order.
WL_cols = np.sort([col for col in df.columns if 'W' in col or 'L' in col]).tolist()[::-1]

# NOTE(review): reversed_WL_cols is not referenced again in the visible cells.
reversed_WL_cols = [i for i in WL_cols if i[0] == 'L'] + [i for i in WL_cols if i[0] == 'W']
# Columns that are not side-specific.
other_cols = np.setdiff1d(df.columns, WL_cols)

# Strip the 'W' from names containing 'W' to get base names, then rebuild the
# matching [W<name>, L<name>] column pairs.
raw_names = [col.strip("W") for col in WL_cols if 'W' in col]
WL_pairs = [[f'W{col}', f'L{col}'] for col in raw_names]

# pair[::-1] reverses each pair, so new_df's W column receives df's L column
# and vice versa.
for pair in WL_pairs:
    new_df[pair] = df[pair[::-1]]

# Original orientation is labelled 1, the swapped copy 0.
df['result'] = 1
new_df['result'] = 0

# Copy the side-independent columns and restore df's column order.
new_df[other_cols] = df[other_cols]
new_df = new_df[df.columns]

# Originals first, swapped copies second. No ignore_index is passed, so index
# labels repeat across the two halves.
master_df = pd.concat([df, new_df])

# Since we'll have to predict on combinations, we need to aggregate some stats to generate static features

Unnamed: 0,Season,DayNum,WTeamID,WScore,WTeamName,WFirstD1Season,WLastD1Season,LTeamID,LScore,LTeamName,...,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,result
0,2003,134,1421,92,UNC Asheville,1987.0,2025.0,1411,84,TX Southern,...,14,31,17,28,16,15,5,0,22,1
1,2003,136,1112,80,Arizona,1985.0,2025.0,1436,51,Vermont,...,7,7,8,26,12,17,10,3,15,1
2,2003,136,1113,84,Arizona St,1985.0,2025.0,1272,71,Memphis,...,14,21,20,22,11,12,2,5,18,1
3,2003,136,1141,79,C Michigan,1985.0,2025.0,1166,73,Creighton,...,12,17,14,17,20,21,6,6,21,1
4,2003,136,1143,76,California,1985.0,2025.0,1301,74,NC State,...,15,20,10,26,16,14,5,8,19,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2271,2024,147,3425,73,USC,,,3163,80,Connecticut,...,17,27,5,30,17,12,6,5,21,0
2272,2024,147,3261,87,LSU,,,3234,94,Iowa,...,17,22,3,29,16,11,6,3,15,0
2273,2024,151,3163,69,Connecticut,,,3234,71,Iowa,...,10,14,9,23,12,16,7,1,9,0
2274,2024,151,3301,59,NC State,,,3376,78,South Carolina,...,4,4,10,34,18,15,10,6,16,0


In [20]:
# Scratch check: every master_df row involving team 1112 (on either side),
# in chronological order.
test_case = master_df.loc[
    (master_df['WTeamID'] == 1112) | (master_df['LTeamID'] == 1112)
].sort_values(by=['Season', 'DayNum'])

In [21]:
# Collapse to one row per (Season, DayNum). GroupBy.first() takes the first
# non-null value of each column within a group.
test_case_deduped = test_case.groupby(by=['Season', 'DayNum'], as_index=False).first()

In [69]:
# De-duplicate first: collapse the team's rows to one per (Season, DayNum)
# before any counting is done.
def agg_team_stats(team_id, master_df):
    """Aggregate all-time stats for one team from ``master_df``.

    Parameters
    ----------
    team_id : int
        Team to summarise; matched against ``WTeamID`` and ``LTeamID``.
    master_df : pandas.DataFrame
        Game rows with at least ``Season``, ``DayNum``, ``WTeamID``,
        ``LTeamID``, ``WTeamName``, ``LTeamName``, ``WScore`` and ``LScore``.

    Returns
    -------
    dict
        ``TeamID``, ``TeamName``, ``alltime_W``, ``alltime_L``, ``total_pts``,
        ``ppg``, ``longest_run``, ``shortest_run`` and ``count_yrs_tourney``.

    Raises
    ------
    IndexError
        If ``team_id`` does not appear in ``master_df`` at all.
    """
    involved = (master_df.WTeamID == team_id) | (master_df.LTeamID == team_id)
    team_df = master_df[involved].sort_values(['Season', 'DayNum'])

    # GroupBy.first() keeps the first non-null value per column in each group.
    # NOTE(review): if master_df holds both an original row and a W/L-swapped
    # copy of a game, the win/loss counts below depend on the original row
    # coming first within its group - confirm.
    team_df = team_df.groupby(['Season', 'DayNum'], as_index=False).first()

    W_df = team_df[team_df['WTeamID'] == team_id]
    L_df = team_df[team_df['LTeamID'] == team_id]

    if W_df.empty and L_df.empty:
        raise IndexError(f"team_id {team_id!r} does not appear in master_df")

    # Take the name from whichever side has rows. The previous try/except
    # fell back to the loss rows only, so a team with wins but no losses raised.
    first_names = W_df['WTeamName'].head(1).tolist() + L_df['LTeamName'].head(1).tolist()
    known_names = [name for name in first_names if pd.notna(name)]
    team_name = min(known_names) if known_names else None

    alltime_W = W_df.shape[0]
    alltime_L = L_df.shape[0]

    # The team's own points: WScore in games it won, LScore in games it lost.
    own_scores = W_df['WScore'].tolist() + L_df['LScore'].tolist()
    total_pts = np.sum(own_scores)
    ppg = np.average(own_scores)

    # Most wins in a single season; NaN when the team has no wins.
    longest_run = W_df.groupby('Season').agg({'DayNum': 'count'}).reset_index()['DayNum'].max()
    # NOTE(review): this is (fewest losses in a season) - 1, not the fewest
    # wins in a season; NaN when the team has no losses. Confirm the intent.
    shortest_run = L_df.groupby('Season').agg({'DayNum': 'count'}).reset_index()['DayNum'].min() - 1

    # Number of distinct seasons in which the team has at least one row.
    count_yrs_tourney = len(np.unique(W_df['Season'].tolist() + L_df['Season'].tolist()))

    return {
        'TeamID': team_id,
        'TeamName': team_name,
        'alltime_W': alltime_W,
        'alltime_L': alltime_L,
        'total_pts': total_pts,
        'ppg': ppg,
        'longest_run': longest_run,
        'shortest_run': shortest_run,
        'count_yrs_tourney': count_yrs_tourney,
    }

In [47]:
# Sorted array of every team ID that appears on either side of a game.
unique_teams = np.unique(
    [*master_df['WTeamID'].tolist(), *master_df['LTeamID'].tolist()]
)

In [105]:
# One single-row stats frame per team, stacked into one table ordered by TeamID.
agg_dfs = [
    pd.DataFrame(agg_team_stats(team, master_df), index=[0])
    for team in unique_teams
]

all_stats_agg = pd.concat(agg_dfs).reset_index(drop=True).sort_values('TeamID')

In [114]:
# Attach per-team aggregates twice: first keyed on the W-side team, then on the
# L-side team. Aggregate columns that collide in the second merge get pandas'
# default _x (first merge) / _y (second merge) suffixes.
test1 = master_df.merge(all_stats_agg, how='left', left_on='WTeamID', right_on='TeamID')
test2 = test1.merge(all_stats_agg, how='left', left_on='LTeamID', right_on='TeamID')

In [120]:
pip freeze | grep xgb

xgboost==2.1.4
Note: you may need to restart the kernel to use updated packages.


In [121]:
# Scratch cell: builds an XGBClassifier with enable_categorical=True.
# NOTE: `model` is rebound to a CatBoostClassifier in the Kaggle reference
# section further down this notebook.
model = XGBClassifier(enable_categorical=True)

In [116]:
test2

Unnamed: 0,Season,DayNum,WTeamID,WScore,WTeamName,WFirstD1Season,WLastD1Season,LTeamID,LScore,LTeamName,...,count_yrs_tourney_x,TeamID_y,TeamName_y,alltime_W_y,alltime_L_y,total_pts_y,ppg_y,longest_run_y,shortest_run_y,count_yrs_tourney_y
0,2003,134,1421,92,UNC Asheville,1987.0,2025.0,1411,84,TX Southern,...,5,1411,TX Southern,3,8,755,68.636364,1.0,1,8
1,2003,136,1112,80,Arizona,1985.0,2025.0,1436,51,Vermont,...,17,1436,Vermont,2,10,728,60.666667,1.0,1,10
2,2003,136,1113,84,Arizona St,1985.0,2025.0,1272,71,Memphis,...,6,1272,Memphis,17,12,2083,71.827586,5.0,1,12
3,2003,136,1141,79,C Michigan,1985.0,2025.0,1166,73,Creighton,...,1,1166,Creighton,11,12,1596,69.391304,3.0,1,12
4,2003,136,1143,76,California,1985.0,2025.0,1301,74,NC State,...,7,1301,NC State,13,11,1644,68.500000,4.0,1,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4547,2024,147,3425,73,USC,,,3163,80,Connecticut,...,3,3163,Connecticut,65,9,6002,81.108108,6.0,1,14
4548,2024,147,3261,87,LSU,,,3234,94,Iowa,...,10,3234,Iowa,21,12,2532,76.727273,5.0,1,12
4549,2024,151,3163,69,Connecticut,,,3234,71,Iowa,...,14,3234,Iowa,21,12,2532,76.727273,5.0,1,12
4550,2024,151,3301,59,NC State,,,3376,78,South Carolina,...,9,3376,South Carolina,42,9,3737,73.274510,6.0,1,12


In [86]:
# TODO: THIS NEEDS TO GET DONE
# For each team, attach that team's aggregate stats to the rows where it sits
# in the W slot (`one`) and to the rows where it sits in the L slot (`two`).
# Each output row therefore carries aggregates for only one of its two teams.

fin = []

# Loop variable is `team_id`, not `id`, so the builtin id() is not rebound
# at notebook scope for every later cell.
for team_id in unique_teams:
    test_agg = pd.DataFrame(agg_team_stats(team_id, master_df), index=[0])
    test_deagg = master_df[(master_df['WTeamID'] == team_id) | (master_df['LTeamID'] == team_id)].sort_values('WTeamName')

    # test_agg has a single row for this team, so each inner merge keeps only
    # the rows where the team is on the keyed side.
    one = pd.merge(left=test_deagg, right=test_agg, left_on='WTeamID', right_on='TeamID', how='inner')
    two = pd.merge(left=test_deagg, right=test_agg, left_on='LTeamID', right_on='TeamID', how='inner')

    fin.append(pd.concat([one, two]))

test_master = pd.concat(fin)

In [150]:
# Columns fed to the model: both team IDs, each side's FirstD1Season, the
# season and the number of overtimes.
feature_set = [
    'WTeamID', 'LTeamID',
    'WFirstD1Season', 'LFirstD1Season',
    'Season', 'NumOT',
]

# Cross-validate and fit the XGBoost variant on the per-team-joined frame.
default = bracketology(test_master, feature_set, 'xgb')

In [147]:
len(unique_teams)

493

In [139]:
# Keep a handle on the estimator stored under the 'model' key of the
# bracketology result.
model_to_save = default['model']

In [154]:
''.startswith('L')

False

In [None]:
def forecast(model, unique_teams=unique_teams, master_df=master_df, feature_set=feature_set):
    """Unfinished stub for scoring every pairing of ``unique_teams``.

    As written, this iterates over all 2-team combinations and computes some
    per-matchup intermediates, but it never calls ``model``, never fills
    ``X_predict``, and returns ``None``.

    The defaults for ``unique_teams``, ``master_df`` and ``feature_set`` are
    bound once, when this ``def`` executes; rebinding those notebook variables
    later does not change the defaults unless this cell is re-run.
    """
    
    matchups = list(combinations(unique_teams, 2))
    
    for matchup in matchups:
        
        # TODO: never populated.
        X_predict = []

        # Lower ID is the "left" team, higher ID the "right" team.
        left_team = min(matchup)
        right_team = max(matchup)
        # TODO: `year` is unused; the matchup string below hard-codes 2025.
        year = 2025

        # NOTE(review): both lookups filter on 'LTeamID' - confirm whether one
        # of them was meant to use 'WTeamID'. Neither frame is used yet.
        left_df = master_df[master_df['LTeamID'] == left_team]
        right_df = master_df[master_df['LTeamID'] == right_team]

        # Features whose name starts with 'W' or 'L' vs. the remaining ones.
        # Both are loop-invariant and currently unused.
        chiral_features = [i for i in feature_set if i.startswith('W') or i.startswith('L')]
        agg_features = np.setdiff1d(feature_set, chiral_features)
        
        # go left then right
        # TODO: placeholder loop - no per-feature work is implemented yet.
        for feaure in feature_set:
            pass
        
        formatted_matchup = f"2025_{left_team}_{right_team}"
        # TODO: Ellipsis placeholder; no prediction is computed.
        prediction = ...

# Kaggle reference

In [6]:
# Directory holding the competition CSV files (relative to the notebook).
input_folder = r"march-machine-learning-mania-2025/"


# CSV files read by load_csv_files below, split by men's / women's data.
mens_files = ["MNCAATourneyCompactResults.csv", "MRegularSeasonCompactResults.csv", "MTeams.csv"]
womens_files = ["WNCAATourneyCompactResults.csv", "WRegularSeasonCompactResults.csv", "WTeams.csv"]

# Filled in by load_csv_files: file name without extension -> DataFrame.
dataframes = {}

# Read each listed CSV from input_folder into the module-level `dataframes` dict
def load_csv_files(file_list):
    """Load ``file_list`` into ``dataframes``, keyed by extension-less file name.

    A file that fails to load is reported with ``print`` and skipped; the
    remaining files are still attempted.
    """
    for file in file_list:
        file_path = os.path.join(input_folder, file)
        key, _extension = os.path.splitext(file)
        try:
            frame = pd.read_csv(file_path)
        except Exception as e:
            print(f"Error loading {file}: {e}")
        else:
            dataframes[key] = frame

# Load the men's and women's files into `dataframes`. A file that fails to
# load is only printed, so a missing key surfaces later as a KeyError.
load_csv_files(mens_files)
load_csv_files(womens_files)

# Preprocess - the author uses a very minimal feature set so it is very easy to follow
def preprocess_results(df):
    """Return each game twice: once as played, once with the team IDs swapped.

    Only ``Season``, ``WTeamID`` and ``LTeamID`` are kept. Rows in the original
    orientation get ``Result`` = 1; the swapped copies get ``Result`` = 0 and
    are appended after the originals with a fresh 0..n-1 index.
    """
    as_played = df.loc[:, ['Season', 'WTeamID', 'LTeamID']].copy()
    as_played['Result'] = 1  # Win label

    mirrored = as_played.copy()
    # to_numpy() drops column labels so the values are assigned positionally.
    mirrored[['WTeamID', 'LTeamID']] = as_played[['LTeamID', 'WTeamID']].to_numpy()
    mirrored['Result'] = 0  # Loss label

    return pd.concat([as_played, mirrored], ignore_index=True)

# Turn each tournament-results table into mirrored win/loss rows.
men_results = preprocess_results(dataframes['MNCAATourneyCompactResults'])
women_results = preprocess_results(dataframes['WNCAATourneyCompactResults'])

# Merge men's and women's datasets
all_results = pd.concat([men_results, women_results], ignore_index=True)

# Train model
X = all_results[['Season', 'WTeamID', 'LTeamID']]
y = all_results['Result']
# NOTE(review): all_results holds every game twice (as played and mirrored), so
# a random row-level split can put one orientation in train and the other in
# test; the held-out score below may be optimistic - confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model is trained here (on the 80% training split only) - predictions are later
# requested for combinations of *all* possible team pairings.
model = CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.1, loss_function='Logloss', verbose=200)
model.fit(X_train, y_train)

# Evaluate model with the Brier score on the held-out split, using the
# predicted probability of class 1.
preds = model.predict_proba(X_test)[:, 1]
print(f"Brier Score: {brier_score_loss(y_test, preds)}")

# Generate submission: every unordered pair drawn from the combined men's and
# women's team IDs.
all_teams = np.concatenate((dataframes['MTeams']['TeamID'].values, dataframes['WTeams']['TeamID'].values))
pairings = list(combinations(all_teams, 2))

# Load required matchup IDs; create_submission keeps only pairings whose ID is
# in this set.
required_ids_df = pd.read_csv(os.path.join(input_folder, "SampleSubmissionStage2.csv"))
required_ids = set(required_ids_df['ID'])

# Build the submission table (ID, Pred) for the required matchups.
def create_submission(pairings, season=2025, max_rows=131407):
    """Score every required matchup among ``pairings`` in one batched call.

    Parameters
    ----------
    pairings : iterable of (team1, team2)
        Candidate team-ID pairs, in any order within a pair.
    season : int
        Season used in the matchup ID and as the ``Season`` feature.
    max_rows : int
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID`` (``"{season}_{low}_{high}"``) and ``Pred`` (the
        module-level ``model``'s class-1 probability with the lower ID in the
        ``WTeamID`` slot), one row per pairing whose ID is in the module-level
        ``required_ids``.
    """
    matchup_ids = []
    low_ids = []
    high_ids = []

    for (team1, team2) in pairings:
        low_id, high_id = min(team1, team2), max(team1, team2)
        matchup_id = f"{season}_{low_id}_{high_id}"
        if matchup_id in required_ids:
            matchup_ids.append(matchup_id)
            low_ids.append(low_id)
            high_ids.append(high_id)

    if matchup_ids:
        # One predict_proba call for all rows instead of one call (and one
        # single-row DataFrame) per matchup.
        input_data = pd.DataFrame({
            'Season': [season] * len(matchup_ids),
            'WTeamID': low_ids,
            'LTeamID': high_ids,
        })
        preds = model.predict_proba(input_data)[:, 1]
    else:
        # Nothing to score; skip the model call on an empty frame.
        preds = []

    submission_df = pd.DataFrame(list(zip(matchup_ids, preds)), columns=["ID", "Pred"])
    print(f"Submission file has {submission_df.shape[0]} rows.")
    return submission_df


0:	learn: 0.6918448	total: 1.21ms	remaining: 1.21s
200:	learn: 0.5864456	total: 223ms	remaining: 886ms
400:	learn: 0.5174959	total: 451ms	remaining: 674ms
600:	learn: 0.4707636	total: 680ms	remaining: 452ms
800:	learn: 0.4331983	total: 908ms	remaining: 226ms
999:	learn: 0.4016661	total: 1.13s	remaining: 0us
Brier Score: 0.2188024186822397
