In [51]:
"""CREDIT TO PRUTHA ANNADATE, https://www.kaggle.com/code/pruthaannadate/catboost-march-machine-learning-mania-25"""

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations

from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import brier_score_loss, f1_score, accuracy_score, auc, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold

from xgboost import XGBClassifier

# Helper functions

In [11]:
def load_df_dict(path='march-machine-learning-mania-2025/'):
    """Load every CSV file found directly inside ``path``.

    Parameters
    ----------
    path : str
        Directory to scan. A trailing path separator is optional.

    Returns
    -------
    dict
        Maps each file name without its ``.csv`` extension to the
        ``pandas.DataFrame`` read from that file.

    Notes
    -----
    Only regular files whose extension is ``.csv`` (case-insensitive) are
    read. Other directory entries such as hidden files, sub-directories or
    non-CSV files are skipped instead of being handed to ``pd.read_csv``.
    """
    data_dict = {}

    # sorted() makes the load order (and thus dict order) deterministic.
    for fname in sorted(os.listdir(path)):
        full_path = os.path.join(path, fname)
        stem, ext = os.path.splitext(fname)

        if ext.lower() != '.csv' or not os.path.isfile(full_path):
            continue

        data_dict[stem] = pd.read_csv(full_path)

    return data_dict

In [73]:
# TODO: fix feature set logic so you don't need to pass both W and L, append at the end
# TODO: add models iteratively

def bracketology(dataset, feature_set=['WTeamID', 'LTeamID'], model_name='catboost'):
    """Cross-validate a classifier on ``dataset`` and refit it on every row.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain every column named in ``feature_set`` plus a ``'result'``
        column that is used as the target.
    feature_set : sequence of str
        Column names used as model inputs. The sequence is copied, never
        mutated, so the list default is safe.
    model_name : str
        ``'catboost'`` or ``'lr'``.

    Returns
    -------
    dict
        ``'model_name'``, ``'model'`` (an estimator fitted on all of
        ``dataset``), and the fold-averaged ``'auc'``, ``'accuracy'`` and
        ``'brier_score'`` from a 4-fold stratified split.

    Raises
    ------
    KeyError
        If ``model_name`` is not a known model.
    """
    # Factories instead of instances: only the requested estimator is ever
    # constructed, and each fit starts from a fresh, unfitted object.
    model_directory = {
        'catboost': lambda: CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.1, loss_function='Logloss', verbose=200),
        'lr': lambda: LogisticRegression(),
    }

    if model_name not in model_directory:
        raise KeyError(
            f"Unknown model_name {model_name!r}; expected one of {sorted(model_directory)}"
        )
    build_model = model_directory[model_name]

    X = dataset[list(feature_set)]
    y = dataset['result']

    k_strat = StratifiedKFold(n_splits=4, shuffle=False)

    # Named *_scores so the imported sklearn.metrics.auc is not shadowed.
    auc_scores = []
    accuracy_scores = []
    brier_scores = []

    for train_idx, test_idx in k_strat.split(X, y):

        # Positional indexing: the frame's index labels may contain duplicates.
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.values[train_idx], y.values[test_idx]

        fold_model = build_model()
        fold_model.fit(X_train, y_train)

        y_preds = fold_model.predict(X_test)
        positive_probs = fold_model.predict_proba(X_test)[:, 1]

        auc_scores.append(roc_auc_score(y_test, positive_probs))
        accuracy_scores.append(accuracy_score(y_test, y_preds))
        brier_scores.append(brier_score_loss(y_test, positive_probs))

    # Final estimator handed back to the caller is trained on all rows.
    model = build_model()
    model.fit(X, y)

    return {
        'model_name': model_name,
        'model': model,
        'auc': np.average(auc_scores),
        'accuracy': np.average(accuracy_scores),
        'brier_score': np.average(brier_scores),
    }
            

# Pull in basic stats

In [74]:
dfs = load_df_dict()

# Stack the men's and women's tables so both leagues go through one pipeline.
results = pd.concat([dfs['MNCAATourneyDetailedResults'], dfs['WNCAATourneyDetailedResults']])
teams = pd.concat([dfs['MTeams'], dfs['WTeams']])

results_W = results[['Season', 'DayNum', 'WTeamID', 'WScore']]
results_L = results[['Season', 'DayNum', 'LTeamID', 'LScore']]
stats = results.drop(columns=['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore'])

# Attach team metadata to each side; the merged frames get a fresh 0..n-1 index.
full_W = pd.merge(left=results_W, right=teams, left_on='WTeamID', right_on='TeamID', how='left')
full_W = full_W.rename(
    columns={'TeamName': 'WTeamName', 'FirstD1Season': 'WFirstD1Season', 'LastD1Season': 'WLastD1Season'}
).drop(columns='TeamID')
full_L = pd.merge(left=results_L, right=teams, left_on='LTeamID', right_on='TeamID', how='left')
full_L = full_L.rename(
    columns={'TeamName': 'LTeamName', 'FirstD1Season': 'LFirstD1Season', 'LastD1Season': 'LLastD1Season'}
).drop(columns=['TeamID', 'Season', 'DayNum'])

# `results` carries repeated index labels after the concat above, so the
# stats are re-indexed positionally before being placed beside full_W/full_L.
df = pd.concat([full_W, full_L, stats.reset_index(drop=True)], axis=1).drop(columns='WLoc')

new_df = pd.DataFrame({})

# A stat is "paired" when both its W-prefixed and L-prefixed columns exist.
# Matching on the prefix (not on a substring) and slicing the first character
# (not str.strip, which also trims trailing characters) keeps unrelated
# columns that merely contain a 'W' or 'L' out of the swap.
raw_names = sorted(
    (col[1:] for col in df.columns if col.startswith('W') and f'L{col[1:]}' in df.columns),
    reverse=True,
)
WL_pairs = [[f'W{col}', f'L{col}'] for col in raw_names]
WL_cols = [w_col for w_col, _ in WL_pairs] + [l_col for _, l_col in WL_pairs]

reversed_WL_cols = [i for i in WL_cols if i[0] == 'L'] + [i for i in WL_cols if i[0] == 'W']
other_cols = np.setdiff1d(df.columns, WL_cols)

# Mirror every game: the loser's values go into the W* columns and vice versa.
for w_col, l_col in WL_pairs:
    new_df[w_col] = df[l_col]
    new_df[l_col] = df[w_col]

# Original orientation is labelled 1, mirrored orientation 0.
df['result'] = 1
new_df['result'] = 0

new_df[other_cols] = df[other_cols]
new_df = new_df[df.columns]

# Index labels repeat across the two halves; use positional access downstream.
master_df = pd.concat([df, new_df])

# Since we'll have to predict on combinations, we need to aggregate some stats to generate static features

In [79]:
# Every distinct team ID that appears on either side of a game in master_df.
unique_teams = np.unique(master_df[['WTeamID', 'LTeamID']].to_numpy())

493

In [78]:
# Team IDs first, then the extra columns tried in this run.
feats = ['WTeamID', 'LTeamID', 'NumOT', 'Season']
bracketology(dataset=master_df, feature_set=feats)

0:	learn: 0.6924851	total: 1.67ms	remaining: 1.67s
200:	learn: 0.5401447	total: 205ms	remaining: 816ms
400:	learn: 0.4512124	total: 368ms	remaining: 549ms
600:	learn: 0.3866578	total: 527ms	remaining: 350ms
800:	learn: 0.3416352	total: 685ms	remaining: 170ms
999:	learn: 0.3024140	total: 844ms	remaining: 0us
0:	learn: 0.6924321	total: 812us	remaining: 812ms
200:	learn: 0.5345464	total: 159ms	remaining: 631ms
400:	learn: 0.4457989	total: 319ms	remaining: 477ms
600:	learn: 0.3845183	total: 477ms	remaining: 317ms
800:	learn: 0.3350167	total: 634ms	remaining: 158ms
999:	learn: 0.2976283	total: 790ms	remaining: 0us
0:	learn: 0.6921007	total: 708us	remaining: 708ms
200:	learn: 0.5515049	total: 155ms	remaining: 615ms
400:	learn: 0.4610932	total: 315ms	remaining: 470ms
600:	learn: 0.4005175	total: 471ms	remaining: 313ms
800:	learn: 0.3521907	total: 628ms	remaining: 156ms
999:	learn: 0.3132305	total: 783ms	remaining: 0us
0:	learn: 0.6922426	total: 647us	remaining: 646ms
200:	learn: 0.5508886	tot

{'model_name': 'catboost',
 'model': <catboost.core.CatBoostClassifier at 0x349992ad0>,
 'auc': 0.6571591389945052,
 'accuracy': 0.6166520210896309,
 'brier_score': 0.25390726854978035}

# Kaggle work

In [2]:
# Directory that holds the competition CSVs
input_folder = r"march-machine-learning-mania-2025/"

# CSVs needed for each league
mens_files = [
    "MNCAATourneyCompactResults.csv",
    "MRegularSeasonCompactResults.csv",
    "MTeams.csv",
]
womens_files = [
    "WNCAATourneyCompactResults.csv",
    "WRegularSeasonCompactResults.csv",
    "WTeams.csv",
]

# Filled by load_csv_files: file name without extension -> DataFrame
dataframes = {}


def load_csv_files(file_list):
    """Read each CSV named in ``file_list`` from ``input_folder`` into ``dataframes``.

    Each frame is stored under the file name with its extension removed.
    A file that cannot be read is reported with ``print`` and skipped, so
    one bad file does not stop the remaining loads.
    """
    for csv_name in file_list:
        csv_path = os.path.join(input_folder, csv_name)
        stem, _extension = os.path.splitext(csv_name)
        try:
            frame = pd.read_csv(csv_path)
        except Exception as e:
            print(f"Error loading {csv_name}: {e}")
        else:
            dataframes[stem] = frame


# Men's files first, then women's
for file_group in (mens_files, womens_files):
    load_csv_files(file_group)


In [28]:
# Preprocess - the author uses a very minimal feature set so it is very easy to follow
def preprocess_results(df):
    """Turn a results table into a symmetric, labelled matchup table.

    Keeps only ``Season``, ``WTeamID`` and ``LTeamID``. Each game appears
    twice in the output: once as recorded with ``Result == 1``, and once with
    the two team IDs swapped and ``Result == 0``. The recorded rows come
    first, followed by the swapped rows, on a fresh 0..n-1 index.
    """
    wins = df[['Season', 'WTeamID', 'LTeamID']].assign(Result=1)

    # Take both ID columns as one array so the swap reads the original values.
    swapped_ids = wins[['LTeamID', 'WTeamID']].to_numpy()
    losses = wins.assign(
        WTeamID=swapped_ids[:, 0],
        LTeamID=swapped_ids[:, 1],
        Result=0,
    )

    return pd.concat([wins, losses], ignore_index=True)

# Build the symmetric win/loss tables from the compact tourney results
men_results = preprocess_results(dataframes['MNCAATourneyCompactResults'])
women_results = preprocess_results(dataframes['WNCAATourneyCompactResults'])

# Alternative training set built from the compact results (label column 'Result'):
# all_results = pd.concat([men_results, women_results], ignore_index=True)
all_results = master_df

# master_df names its label column 'result' (lower case), while
# preprocess_results emits 'Result'. Resolve whichever one is present so this
# cell works with either training set instead of raising KeyError.
label_col = 'result' if 'result' in all_results.columns else 'Result'

# Train/test split
# NOTE(review): both training sets contain every game in both orientations, so
# a random row split can put a game and its mirror on opposite sides — confirm
# whether that leakage is acceptable for this score.
X = all_results[['Season', 'WTeamID', 'LTeamID']]
y = all_results[label_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

NameError: name 'dataframes' is not defined

In [53]:
# Fit on the training split. Predictions are later requested for *every*
# possible pairing of teams, not only the games that were actually played.
model = CatBoostClassifier(
    iterations=1000,
    depth=6,
    learning_rate=0.1,
    loss_function='Logloss',
    verbose=200,
)
model.fit(X_train, y_train)

# Hold-out evaluation: Brier score of the predicted probability for class 1
preds = model.predict_proba(X_test)[:, 1]
brier = brier_score_loss(y_test, preds)
print(f"Brier Score: {brier}")

# Candidate matchups for the submission: all unordered pairs of team IDs
all_teams = np.concatenate([dfs[table]['TeamID'].values for table in ('MTeams', 'WTeams')])
pairings = [*combinations(all_teams, 2)]

0:	learn: 0.6920418	total: 2.55ms	remaining: 2.55s
200:	learn: 0.5508519	total: 163ms	remaining: 649ms
400:	learn: 0.4606697	total: 320ms	remaining: 479ms
600:	learn: 0.3985509	total: 477ms	remaining: 317ms
800:	learn: 0.3513631	total: 634ms	remaining: 157ms
999:	learn: 0.3133479	total: 791ms	remaining: 0us
Brier Score: 0.2418853706566385


In [43]:
# Sanity check: number of unordered team pairs held in `pairings`
len(pairings)

286903

In [None]:
# Matchup IDs listed in the sample submission; only these are predicted
sample_submission_path = os.path.join(input_folder, "SampleSubmissionStage2.csv")
required_ids_df = pd.read_csv(sample_submission_path)
required_ids = set(required_ids_df['ID'].tolist())

# Submit to kaggle?
def create_submission(pairings, season=2025, max_rows=131407):
    """Predict every required matchup and return the submission table.

    Parameters
    ----------
    pairings : iterable of (team1, team2)
        Candidate matchups, in any order within a pair.
    season : int
        Season used in the matchup ID and as the ``Season`` feature.
    max_rows : int
        Accepted for backward compatibility; the function does not use it.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID`` (``"<season>_<lowID>_<highID>"``) and ``Pred`` (the
        model's class-1 probability with the lower ID in the ``WTeamID``
        slot). Pairs whose ID is not in the global ``required_ids`` are
        skipped. Rows follow the order of ``pairings``.

    Notes
    -----
    Reads the module-level ``model`` and ``required_ids``. All kept matchups
    are scored with a single ``predict_proba`` call rather than one call per
    row.
    """
    matchup_ids = []
    low_ids = []
    high_ids = []

    for team1, team2 in pairings:
        low, high = min(team1, team2), max(team1, team2)
        matchup_id = f"{season}_{low}_{high}"
        if matchup_id in required_ids:
            matchup_ids.append(matchup_id)
            low_ids.append(low)
            high_ids.append(high)

    if matchup_ids:
        # Same column names and order as the single-row frames used before.
        input_data = pd.DataFrame({
            'Season': [season] * len(matchup_ids),
            'WTeamID': low_ids,
            'LTeamID': high_ids,
        })
        preds = model.predict_proba(input_data)[:, 1]
    else:
        # Nothing to score; avoid calling the model on an empty frame.
        preds = []

    submission_df = pd.DataFrame({"ID": matchup_ids, "Pred": preds}, columns=["ID", "Pred"])
    print(f"Submission file has {submission_df.shape[0]} rows.")
    return submission_df

# Build the submission table. The original author appears to save it locally
# and upload by hand; the write below is left disabled, so the message that
# follows is printed without a file being written by this cell.
submission_df = create_submission(pairings)
# submission_df.to_csv("submission.csv", index=False)
print("Submission file created successfully.")

# Visualization: histogram (with KDE overlay) of the predicted probabilities
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(submission_df['Pred'], bins=50, kde=True, ax=ax)
ax.set_xlabel("Predicted Probability")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Predicted Probabilities")
plt.show()