In [None]:
%pip install pandas --quiet
%pip install optuna --quiet
%pip install tqdm --quiet
%pip install scikit-learn --quiet
%pip install lightgbm --quiet
%pip install tqdm --quiet

In [None]:
from itertools import chain
from sklearn.model_selection import cross_val_score, train_test_split
from scipy.stats import linregress
from tqdm import tqdm
import glob
import lightgbm as lgb
import numpy as np
import optuna as op
import os
import pandas as pd
import warnings

op.logging.set_verbosity(op.logging.WARNING)
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [None]:
# Kaggle input folders holding the competition CSVs and the 538 team ratings.
DATA_DIRS = ['/kaggle/input/march-machine-learning-mania-2024', '/kaggle/input/ncaa-men-538-team-ratings', '/kaggle/input/ncaa-women-538-team-ratings']
# Every CSV found directly inside DATA_DIRS, keyed by its file name up to the first '.'.
ALL_DATA = {}
for data_dir in DATA_DIRS:
    for csv_path in glob.glob(data_dir + '/*.csv'):
        table_name = os.path.basename(csv_path).split('.')[0]
        ALL_DATA[table_name] = pd.read_csv(csv_path, encoding='cp1252')
# Re-key the 538 tables so they can be looked up with the same 'M' / 'W' code as the other tables.
ALL_DATA['538ratingsM'] = ALL_DATA.pop('538ratingsMen')
ALL_DATA['538ratingsW'] = ALL_DATA.pop('538ratingsWomen')

In [None]:
def device():
    """Return the LightGBM device type to use: ``'gpu'`` or ``'cpu'``.

    ``'gpu'`` is returned when TensorFlow reports at least one local device whose
    ``device_type`` is ``'GPU'``; otherwise ``'cpu'``.

    The answer is computed once and cached on the function object, because this
    function is called for every hyper-parameter trial and enumerating devices
    through TensorFlow is expensive. If TensorFlow cannot be imported the
    function falls back to ``'cpu'`` instead of raising.
    """
    cached = getattr(device, '_cached', None)
    if cached is not None:
        return cached
    try:
        # Imported lazily so the notebook still works where TensorFlow is absent.
        from tensorflow.python.client import device_lib
        has_gpu = any(d.device_type == 'GPU' for d in device_lib.list_local_devices())
    except ImportError:
        has_gpu = False
    device._cached = 'gpu' if has_gpu else 'cpu'
    return device._cached

print("Device: {}".format(device()))

# Build DF

## Feature Engineering

In [None]:
def based_n_gender_build_results(gender):
    """Stack the tournament and regular-season compact results for one gender.

    gender: prefix of the table names in ALL_DATA (the notebook uses 'M' and 'W').
    Returns the two frames concatenated, tournament rows first; the original row
    labels of each frame are kept, so the resulting index is not unique.
    """
    frames = [
        ALL_DATA[gender + suffix]
        for suffix in ('NCAATourneyCompactResults', 'RegularSeasonCompactResults')
    ]
    return pd.concat(frames)

results_m = based_n_gender_build_results('M')
results_w = based_n_gender_build_results('W')
display(results_m)
display(results_w)

In [None]:
def gender_based_teams_building(gender):
    """Return the '<gender>Teams' table indexed by TeamID, without the TeamName column.

    The table stored in ALL_DATA is not modified.
    """
    raw_teams = ALL_DATA["{}Teams".format(gender)]
    return raw_teams.drop(columns='TeamName').set_index('TeamID')

teams_men = gender_based_teams_building('M')
teams_women = gender_based_teams_building('W') # FIXME: Maybe useless since there is no data aside from TeamName.
display(teams_men)
display(teams_women)

In [None]:
def calculate_elo_data(teams, data, initial_rating=2000, k=140, alpha=None):
    """Replay the games in ``data`` in row order and record pre-game Elo-style ratings.

    teams: iterable of team ids; each one starts at ``initial_rating``.
    data: frame with WTeamID, LTeamID, WScore and LScore columns; rows are
        processed in the order given, so the caller decides the chronology.
    initial_rating: starting rating, also used as the scale divisor in the
        expected-score formula.
    k: update step size.
    alpha: when truthy, each update is multiplied by (WScore - LScore) / alpha;
        otherwise the multiplier stays 1.

    Returns two lists aligned with the rows of ``data``: the winner's and the
    loser's rating *before* each game. A loser's rating is never left below 1.
    """
    ratings = dict.fromkeys(teams, initial_rating)
    winner_pre, loser_pre = [], []
    mov = 1
    games = zip(data.WTeamID, data.LTeamID, data.WScore, data.LScore)
    for w_id, l_id, w_pts, l_pts in tqdm(games, total=len(data)):
        w_rating, l_rating = ratings[w_id], ratings[l_id]
        winner_pre.append(w_rating)
        loser_pre.append(l_rating)
        # Expected scores are computed from the pre-game ratings of both sides.
        expected_w = 1 / (1 + 10 ** ((l_rating - w_rating) / initial_rating))
        expected_l = 1 / (1 + 10 ** ((w_rating - l_rating) / initial_rating))
        if alpha:
            mov = (w_pts - l_pts) / alpha
        ratings[w_id] += k * mov * (1 - expected_w)
        ratings[l_id] += k * mov * (0 - expected_l)
        # Floor the loser's rating at 1.
        if ratings[l_id] < 1:
            ratings[l_id] = 1

    return winner_pre, loser_pre

def create_elo_data_teams(teams, data, initial_rating=2000, k=140, alpha=None):
    """Summarise each team's pre-game ratings per season.

    The ratings come from ``calculate_elo_data`` (same parameters). ``data`` must
    also carry Season, DayNum and a ``tourney`` flag column; only rows whose flag
    is 0 contribute to the summary.

    Returns one row per (TeamID, Season) with Rating_Mean, Rating_Median,
    Rating_Std, Rating_Min, Rating_Max, Rating_Last and Rating_Trend (slope of a
    linear fit of the rating against the game's position within the season).
    """
    winner_ratings, loser_ratings = calculate_elo_data(teams, data, initial_rating, k, alpha)

    # One row per team per game: the winner rows stacked on top of the loser rows.
    rating_df = pd.DataFrame({
        'Season': np.concatenate([data.Season, data.Season]),
        'DayNum': np.concatenate([data.DayNum, data.DayNum]),
        'TeamID': np.concatenate([data.WTeamID, data.LTeamID]),
        'Rating': np.concatenate([winner_ratings, loser_ratings]),
        'Tourney': np.concatenate([data.tourney, data.tourney]),
    })

    # Chronological order per team so that 'last' and the trend are meaningful.
    rating_df = rating_df.sort_values(['TeamID', 'Season', 'DayNum'])
    rating_df = rating_df[rating_df['Tourney'] == 0]

    grouped = rating_df.groupby(['TeamID', 'Season'])
    results = grouped['Rating'].agg(
        Rating_Mean='mean',
        Rating_Median='median',
        Rating_Std='std',
        Rating_Min='min',
        Rating_Max='max',
        Rating_Last='last',
    )
    results['Rating_Trend'] = grouped.apply(lambda x: linregress(range(len(x)), x['Rating']).slope, include_groups=False)
    return results.reset_index()

In [None]:
def based_on_gender_build_elo(gender, results, teams):
    """Build per-team Elo summary features, averaged over seasons.

    gender: prefix of the table names in ALL_DATA ('M' or 'W' in this notebook).
    results: tournament results followed by regular-season results, i.e. the
        frame returned by ``based_n_gender_build_results`` for the same gender.
    teams: frame indexed by TeamID.

    Returns a frame indexed by TeamID holding the mean over seasons of the
    columns produced by ``create_elo_data_teams``.

    The first ``len(<gender>NCAATourneyCompactResults)`` rows of ``results`` are
    flagged as tournament games *by position*. ``results`` is a plain concat, so
    its row labels restart at 0 for the regular-season part; a label-based slice
    (``.loc[n:]``) therefore selects the wrong rows.
    """
    n_tourney = len(ALL_DATA[gender + 'NCAATourneyCompactResults'])
    tourneys = results.copy()
    # 1 for the leading tournament rows, 0 for the regular-season rows after them.
    tourneys['tourney'] = [int(pos < n_tourney) for pos in range(len(tourneys))]
    tourneys = tourneys.sort_values(['Season', 'DayNum'])
    elo = create_elo_data_teams(teams.reset_index().TeamID, tourneys)
    return elo.drop('Season', axis=1).groupby('TeamID').mean()


elo_men = based_on_gender_build_elo('M', results_m, teams_men)
elo_women = based_on_gender_build_elo('W', results_w, teams_women)
display(elo_men)
display(elo_women)

In [None]:
def winner(ids):
    """Return 1 if the team is the game's winner, else 0.

    ids: a 3-item sequence (team id, winner id, loser id).
    """
    team_id, winner_id, _loser_id = ids
    if team_id == winner_id:
        return 1
    return 0

def opponent(x):
    """Return the id of the other team in the game.

    x: a 3-item sequence (win flag of the team, winner id, loser id). When the
    team won, its opponent is the loser; otherwise it is the winner.
    """
    won, winner_id, loser_id = x
    return loser_id if won else winner_id

def score_difference(x):
    """Return the game's point margin from the team's own perspective.

    x: a 3-item sequence (win flag of the team, winner score, loser score).
    The result is positive when the team won and negative when it lost. The
    previous version negated the win flag first and so returned the opposite
    sign.
    """
    won, winner_score, loser_score = x
    margin = winner_score - loser_score
    return margin if won else -margin

def build_season_results_data(df):
    """Aggregate game results into one row per (TeamID, OTeamID) pair.

    df: compact results with Season, DayNum, WTeamID, WScore, LTeamID, LScore,
        WLoc and NumOT columns. It is not modified (the function works on a copy;
        previously a TeamID column leaked into the caller's frame).

    Every game contributes two rows, one per participant. Returns a frame
    indexed by (TeamID, OTeamID) with:
      Games     - number of games between the two teams
      ScoreDiff - sum of ``score_difference`` over those games
      Home      - number of those games TeamID played at home
      WinRatio  - share of those games won by TeamID
    """
    season_results = df.copy()
    # Duplicate every game: one row for the winner, one for the loser.
    season_results['TeamID'] = season_results[['WTeamID', 'LTeamID']].values.tolist()
    season_results = season_results.explode('TeamID')
    season_results['Win'] = season_results[['TeamID', 'WTeamID', 'LTeamID']].apply(winner, axis=1)
    season_results['Defeat'] = 1 - season_results['Win']
    season_results['Games'] = season_results['Win'] + season_results['Defeat']
    season_results['ScoreDiff'] = season_results[['Win', 'WScore', 'LScore']].apply(score_difference, axis=1)
    season_results['OTeamID'] = season_results[['Win', 'WTeamID', 'LTeamID']].apply(opponent, axis=1)
    # WLoc describes the *winner's* venue: 'H' = winner at home, 'A' = winner away.
    # The row's team was at home if it won at 'H' or lost at 'A'.
    won = season_results['Win'] == 1
    venue = season_results['WLoc'].str[0]
    season_results['Home'] = ((won & (venue == 'H')) | (~won & (venue == 'A'))).astype(int)
    season_results = season_results.drop(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc', 'NumOT'], axis=1)
    season_results = season_results.groupby(by=['TeamID', 'OTeamID']).sum()
    season_results['WinRatio'] = season_results['Win'] / season_results['Games']
    season_results = season_results.drop(['Win', 'Defeat'], axis=1)
    return season_results

season_results_men = build_season_results_data(results_m)
season_results_women = build_season_results_data(results_w)

display(season_results_men)
display(season_results_women)

In [None]:
def build_rpi(results):
    """Compute an RPI-style strength value for every (TeamID, OTeamID) pair.

    results: frame with a 'WinRatio' column that can be grouped by 'TeamID' and
        whose ``reset_index()`` exposes 'TeamID' and 'OTeamID' columns
        (presumably the output of ``build_season_results_data`` - confirm).

    Returns a frame indexed by (TeamID, OTeamID) with a single 'RPI' column:
        RPI = 0.25 * WP_T + 0.50 * WP_O + 0.25 * WP_OO
    where WP_T is the team's mean win percentage, WP_O the opponent's, and
    WP_OO the opponent's average opponent win percentage. The merges use
    pandas' default join type.
    """
    # Mean WinRatio per team (over its per-opponent rows), scaled to a percentage.
    win_pct = results.copy()[['WinRatio']]
    win_pct = win_pct.groupby('TeamID').mean()
    win_pct['WP'] = win_pct['WinRatio'] * 100
    win_pct = win_pct.drop('WinRatio', axis=1)
    rpi = results.copy().reset_index()
    # Attach the team's own WP, then the opponent's WP; the second merge collides
    # on 'WP', which the suffixes turn into WP_T (team) and WP_O (opponent).
    rpi = pd.merge(rpi, win_pct, on=['TeamID'])
    rpi = pd.merge(rpi, win_pct, left_on=['OTeamID'], right_on=['TeamID'], suffixes=('_T', '_O'))
    # Per team: the average WP of the opponents it faced ...
    wp_oo = rpi[['TeamID', 'WP_O']].groupby('TeamID').mean()
    wp_oo = wp_oo.rename(columns={'WP_O': 'WP_OO'})
    # ... looked up through OTeamID, so each row receives its *opponent's* value.
    rpi = pd.merge(rpi, wp_oo, left_on=['OTeamID'], right_on=['TeamID'])
    rpi['RPI'] = (rpi['WP_T'] * 0.25) + (rpi['WP_O'] * 0.50) + (rpi['WP_OO'] * 0.25)
    return rpi[['TeamID', 'OTeamID', 'RPI']].set_index(['TeamID', 'OTeamID'])

rpi_men = build_rpi(season_results_men)
rpi_women = build_rpi(season_results_women)

display(rpi_men)
display(rpi_women)

In [None]:
def clean_seeds(seed):
    """Convert a seed label to its integer seed number.

    The first character is dropped; if more than two characters remain, the
    trailing one is dropped as well (e.g. 'W01' -> 1, 'X16a' -> 16).
    """
    digits = seed[1:]
    return int(digits[:-1] if len(digits) > 2 else digits)

def based_on_gender_build_seeds(gender):
    """Return each team's mean tournament seed over all seasons.

    gender: prefix of the '<gender>NCAATourneySeeds' table in ALL_DATA.
    Returns a frame indexed by TeamID with a numeric 'Seed' column.

    The table is copied before its 'Seed' column is converted. Converting in
    place would overwrite the labels stored in ALL_DATA, and a second call (for
    example re-running the cell) would then fail inside ``clean_seeds``.
    """
    seeds = ALL_DATA["{}NCAATourneySeeds".format(gender)].copy()
    seeds['Seed'] = seeds['Seed'].apply(clean_seeds)
    return seeds.drop('Season', axis=1).groupby(by='TeamID').mean()

seeds_men = based_on_gender_build_seeds('M')
seeds_women = based_on_gender_build_seeds('W')
display(seeds_men)
display(seeds_women)

In [None]:
def gender_based_build_rankings(gender):
    """Return each team's mean ordinal rank over every system, day and season.

    gender: prefix of the '<gender>MasseyOrdinals_thruSeason2024_day128' table
        in ALL_DATA.
    Returns a frame indexed by TeamID; SystemName, RankingDayNum and Season are
    not part of the result.
    """
    ordinals = ALL_DATA["{}MasseyOrdinals_thruSeason2024_day128".format(gender)]
    per_team = ordinals.drop(columns=['SystemName', 'RankingDayNum', 'Season'])
    return per_team.groupby('TeamID').mean()

rankings_m = gender_based_build_rankings('M')
rankings_m

In [None]:
def gender_based_build_rating(gender):
    """Return each team's mean '538rating' from the '538ratings<gender>' table.

    Returns a frame indexed by TeamID with the single column '538rating'.
    """
    ratings_table = ALL_DATA["538ratings{}".format(gender)]
    return ratings_table.groupby('TeamID')[['538rating']].mean()

rating_m = gender_based_build_rating('M')
rating_w = gender_based_build_rating('W')
display(rating_m)
display(rating_w)

In [None]:
def for_season_results_build_history(season_results, seeds, teams, elo, rpi, rating, rankings=None):
    """Combine the individual feature frames into one row per (TeamID, OTeamID).

    season_results: per-pair results; joined on 'TeamID' / 'OTeamID'
        (presumably the output of ``build_season_results_data`` - confirm).
    seeds, teams, elo, rating: per-team frames joined on 'TeamID'; ``seeds`` must
        provide 'Seed' and ``rating`` must provide '538rating'.
    rpi: per-pair frame with an 'RPI' column.
    rankings: optional per-team frame with an 'OrdinalRank' column; when given,
        a 'RankingsDiff' column is added.

    Team-side values are attached with ``DataFrame.join``; opponent-side values
    with ``pd.merge`` using its default join type. Each team/opponent pair of
    columns is reduced to a single '<name>Diff' column (team minus opponent).

    Returns the combined frame indexed by (TeamID, OTeamID) with missing values
    replaced by 0.
    """
    # Team-side features, looked up by TeamID (RPI by the full pair).
    history = season_results.join(teams, on='TeamID').join(seeds, on='TeamID').join(elo, on='TeamID').join(rpi, on=['TeamID', 'OTeamID']).join(rating, on=['TeamID'])
    history = history.reset_index()
    # rpi with TeamID/OTeamID swapped gives the RPI of the reversed pair; the
    # clash on 'RPI' becomes RPI_T (team) / RPI_O (opponent).
    history = pd.merge(history, rpi.reset_index().rename(columns={'TeamID': 'OTeamID', 'OTeamID': 'TeamID'}), on=['TeamID', 'OTeamID'], suffixes=('_T', '_O'))
    # Opponent's seed and 538 rating, matched through OTeamID; the columns already
    # joined above receive the '_T' suffix, the new ones '_O'.
    history = pd.merge(history, seeds, left_on='OTeamID', right_on='TeamID', suffixes=('_T', '_O'))
    history = pd.merge(history, rating, left_on='OTeamID', right_on='TeamID', suffixes=('_T', '_O'))
    history['RPIDiff'] = history['RPI_T'] - history['RPI_O']
    history['SeedDiff'] = history['Seed_T'] - history['Seed_O']
    history['538ratingDiff'] = history['538rating_T'] - history['538rating_O']
    # Only the differences are kept as features.
    history = history.drop(['538rating_T', '538rating_O', 'RPI_T', 'RPI_O', 'Seed_T', 'Seed_O'], axis=1)
    if rankings is not None:
        # Same team/opponent pattern for the ordinal rankings.
        history = history.join(rankings, on='TeamID')
        history = pd.merge(history, rankings, left_on='OTeamID', right_on='TeamID', suffixes=('_T', '_O'))
        history['RankingsDiff'] = history['OrdinalRank_T'] - history['OrdinalRank_O']
        history = history.drop(['OrdinalRank_T', 'OrdinalRank_O'], axis=1)
    return history.set_index(['TeamID', 'OTeamID']).fillna(0)

history_men = for_season_results_build_history(season_results_men, seeds_men, teams_men, elo_men, rpi_men, rating_m, rankings_m)
history_women = for_season_results_build_history(season_results_women, seeds_women, teams_women, elo_women, rpi_women, rating_w)
display(history_men)
display(history_women)

In [None]:
def build_avg_history(history):
    """Collapse the per-pair history to one row per TeamID.

    'Games' and 'Home' are summed over a team's opponents; every other column
    is averaged.
    """
    summed_columns = ('Games', 'Home')
    how = {
        col: ('sum' if col in summed_columns else 'mean')
        for col in history.columns
    }
    return history.groupby('TeamID').agg(how)

avg_men = build_avg_history(history_men)
avg_women = build_avg_history(history_women)
display(avg_men)
display(avg_women)

In [None]:
def based_on_gender_build_matchups(gender):
    """Return every ordered pair of distinct teams from the '<gender>Teams' table.

    The result has no columns; the pairs live in its (TeamID, OTeamID) index.
    """
    team_ids = ALL_DATA["{}Teams".format(gender)][['TeamID']]
    # Cross join of the id column with itself; pandas names the two sides _x / _y.
    pairs = team_ids.merge(team_ids, how='cross')
    pairs = pairs.rename(columns={'TeamID_x': 'TeamID', 'TeamID_y': 'OTeamID'})
    distinct = pairs.loc[pairs['TeamID'].ne(pairs['OTeamID'])]
    return distinct.set_index(['TeamID', 'OTeamID'])

matchups_men = based_on_gender_build_matchups('M')
matchups_women = based_on_gender_build_matchups('W')
display(matchups_men)
display(matchups_women)

In [None]:
def build_dataframe(history, matchups, avg):
    """Attach the history features to every matchup and fill the gaps.

    history: per-pair features keyed by TeamID / OTeamID.
    matchups: the pairs to keep (left side of the merge).
    avg: fallback values passed to ``fillna``; whatever is still missing
        afterwards becomes 0.

    When a 'FirstD1Season' column is present, it and 'LastD1Season' are cast to
    int.
    """
    df = matchups.merge(history, on=['TeamID', 'OTeamID'], how='left')
    df = df.fillna(avg).fillna(0)
    if 'FirstD1Season' in df.columns:
        df = df.astype({'FirstD1Season': int, 'LastD1Season': int})
    return df

df_men = build_dataframe(history_men, matchups_men, avg_men)
df_women = build_dataframe(history_women, matchups_women, avg_women)
display(df_men)
display(df_women)

## Feature analysis

In [None]:
corr_men = df_men.corr()
corr_men.style.background_gradient(cmap='coolwarm')

In [None]:
corr_women = df_women.corr()
corr_women.style.background_gradient(cmap='coolwarm')

In [None]:
# Correlation of every column with the WinRatio target, largest first.
corr_men = df_men.corr()['WinRatio'].sort_values(ascending=False)
corr_women = df_women.corr()['WinRatio'].sort_values(ascending=False)
# Keep only the columns whose absolute correlation with the target exceeds 0.1.
high_corr_men = corr_men[corr_men.abs() > 0.1]
high_corr_women = corr_women[corr_women.abs() > 0.1]
display(high_corr_men)
display(high_corr_women)

# Training

In [None]:
def score_the_dataset(lgbm_params, X, y):
    """Cross-validate an LGBMRegressor and return a value to minimise.

    The value is the standard deviation of the fold scores minus their mean, so
    a high and stable cross-validation score gives a low result.
    """
    fold_scores = cross_val_score(lgb.LGBMRegressor(**lgbm_params), X, y)
    return fold_scores.std() - fold_scores.mean()

def objective(trial, X, y):
    """Optuna objective: sample LightGBM parameters from ``trial`` and score them.

    Returns the value of ``score_the_dataset`` for the sampled parameters.
    """
    # Parameters sampled by the trial.
    search_space = {
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'num_leaves': trial.suggest_int('num_leaves', 5, 31),
        'n_estimators': trial.suggest_int('n_estimators', 1, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 300),
    }
    # Settings that are the same for every trial.
    fixed = {
        'device_type': device(),
        'verbose': -1,
    }
    return score_the_dataset({**search_space, **fixed}, X, y)

def study(X, y):
    """Run a 100-trial Optuna study on (X, y) and return the best parameters.

    The study is created with Optuna's defaults and the trials are run with
    ``n_jobs=-1`` and a progress bar.
    """
    def run_trial(trial):
        return objective(trial, X, y)

    tuning = op.create_study()
    tuning.optimize(run_trial, n_trials=100, n_jobs=-1, show_progress_bar=True)
    return tuning.best_params

In [None]:
def build_x_y_both_genders(df):
    """Split ``df`` into features and target.

    Returns ``(X, y)`` where ``y`` is the 'WinRatio' column and ``X`` holds all
    other columns in their original order. Raises ValueError when 'WinRatio' is
    not a column of ``df``.
    """
    target = 'WinRatio'
    columns = list(df.columns)
    cut = columns.index(target)
    features = columns[:cut] + columns[cut + 1:]
    return df[features], df[target]

X_men, y_men = build_x_y_both_genders(df_men)
X_women, y_women = build_x_y_both_genders(df_women)

params_men = study(X_men, y_men)
params_women = study(X_women, y_women)

In [None]:
def accuracy(X, y, params):
    """Fit an LGBMRegressor on a random 70 % split and print its scores.

    Prints the model's ``score`` on the held-out 30 % and then on the training
    part. NOTE(review): for scikit-learn style regressors ``score`` is R^2, not
    a classification accuracy, despite the wording of the messages - confirm.
    The split is not seeded, so the numbers change between runs.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    model = lgb.LGBMRegressor(**params)
    model.fit(X_train, y_train)

    held_out_score = model.score(X_test, y_test)
    print('LightGBM Model accuracy score: {0:0.4f}'.format(held_out_score))
    train_score = model.score(X_train, y_train)
    print('LightGBM Model accuracy score [train]: {0:0.4f}'.format(train_score))
    
accuracy(X_men, y_men, params_men)
accuracy(X_women, y_women, params_women)

# Prediction

In [None]:
def build_wins_for_prediction(X, y, params):
    """Fit an LGBMRegressor on (X, y) and return its predictions for X.

    X: feature frame; it is not modified.
    y: target values.
    params: keyword arguments for ``lgb.LGBMRegressor``.

    Returns a new frame with the single column 'WinRatio' and the same index as
    ``X``. Previously the predictions were written into ``X`` itself, so the
    caller's feature matrix gained a 'WinRatio' column and a re-run trained on
    the model's own earlier predictions.
    """
    reg = lgb.LGBMRegressor(**params)
    reg.fit(X, y)
    return pd.DataFrame({'WinRatio': reg.predict(X)}, index=X.index)

wins_men = build_wins_for_prediction(X_men, y_men, params_men)
wins_women = build_wins_for_prediction(X_women, y_women, params_women)

display(wins_men)
display(wins_women)

In [None]:
def build_slots_based_gender(gender):
    """Return the latest season's slots whose name contains 'R'.

    gender: prefix of the '<gender>NCAATourneySlots' table in ALL_DATA.
    """
    all_slots = ALL_DATA["{}NCAATourneySlots".format(gender)]
    latest_season = all_slots['Season'].max()
    latest_only = all_slots.loc[all_slots['Season'] == latest_season]
    return latest_only.loc[latest_only['Slot'].str.contains('R')]

slots_men = build_slots_based_gender('M')
slots_women = build_slots_based_gender('W')

display(slots_men)
display(slots_women)

In [None]:
def build_seeds_2024_for_both_genders():
    """Split the '2024_tourney_seeds' table by its 'Tournament' column.

    Returns ``(men, women)``: the rows whose Tournament is 'M' and 'W'.
    """
    seeds_2024 = ALL_DATA['2024_tourney_seeds']
    men, women = (
        seeds_2024[seeds_2024['Tournament'] == code]
        for code in ('M', 'W')
    )
    return men, women

seeds_2024_m, seeds_2024_w = build_seeds_2024_for_both_genders()

display(seeds_2024_m)
display(seeds_2024_w)

In [None]:
def prepare_data_for_simulation(seeds):
    """Build the two lookup tables used by the simulation.

    seeds: frame with 'Seed' and 'TeamID' columns.
    Returns ``(seed -> team id, team id -> seed)``.
    """
    seed_to_team = seeds.set_index('Seed').TeamID.to_dict()
    team_to_seed = {}
    for seed_label, team_id in seed_to_team.items():
        team_to_seed[team_id] = seed_label
    return seed_to_team, team_to_seed

def simulate_the_data(round_slots, seeds, inverted_seeds, wins):
    """Simulate one bracket by drawing a winner for every slot in order.

    round_slots: frame with Slot, StrongSeed and WeakSeed columns. A later slot
        may name an earlier slot as one of its seeds, so rows must be ordered so
        that a slot appears before it is referenced.
    seeds: mapping seed label -> team id. It is not modified; slot winners are
        tracked in a private copy so one bracket cannot leak into the next.
    inverted_seeds: mapping team id -> seed label.
    wins: frame indexed by (TeamID, OTeamID) with a 'WinRatio' column, read as
        the probability that TeamID beats OTeamID.

    Returns ``(winning seed labels, slots)``, aligned with ``round_slots``.

    'WinRatio' comes from a regressor and can fall slightly outside [0, 1];
    it is clamped to that range because ``np.random.choice`` rejects invalid
    probabilities.
    """
    occupants = dict(seeds)
    winning_seeds = []
    played_slots = []

    for slot, strong, weak in zip(round_slots.Slot, round_slots.StrongSeed, round_slots.WeakSeed):
        team_1, team_2 = occupants[strong], occupants[weak]
        raw_prob = wins.loc[team_1, team_2].WinRatio
        team_1_prob = min(max(float(raw_prob), 0.0), 1.0)
        victor = np.random.choice([team_1, team_2], p=[team_1_prob, 1 - team_1_prob])
        # The winner occupies this slot for the later rounds.
        occupants[slot] = victor
        winning_seeds.append(inverted_seeds[victor])
        played_slots.append(slot)

    return winning_seeds, played_slots


def run_simulations(seeds, round_slots, wins, brackets):
    """Simulate ``brackets`` brackets and collect the results in one frame.

    seeds: frame passed to ``prepare_data_for_simulation``.
    round_slots, wins: forwarded to ``simulate_the_data`` for every bracket.
    brackets: number of brackets to simulate; they are numbered from 1.

    Returns a frame with the columns Bracket, Slot and Team, one row per slot
    per bracket.
    """
    seed_dict, inverted_seed_dict = prepare_data_for_simulation(seeds)
    columns = {'Bracket': [], 'Slot': [], 'Team': []}

    for bracket_id in tqdm(range(1, brackets + 1)):
        teams, slots = simulate_the_data(round_slots, seed_dict, inverted_seed_dict, wins)
        columns['Bracket'] += [bracket_id] * len(teams)
        columns['Slot'] += slots
        columns['Team'] += teams

    return pd.DataFrame(columns)

In [None]:
num_brackets = 100000
# Seed NumPy's global RNG so the bracket draws (np.random.choice inside
# simulate_the_data), and therefore the submission, are reproducible when the
# notebook is restarted and run from the top.
SIMULATION_SEED = 2024
np.random.seed(SIMULATION_SEED)
result_men = run_simulations(seeds_2024_m, slots_men, wins_men, num_brackets)
result_men.insert(0, 'Tournament', 'M')
result_women = run_simulations(seeds_2024_w, slots_women, wins_women, num_brackets)
result_women.insert(0, 'Tournament', 'W')

In [None]:
submission = pd.concat([result_men, result_women])
submission.reset_index(inplace=True, drop=True)
submission.index.names = ['RowId']

submission

In [None]:
# Write the row index as the first CSV column, labelled 'RowId'. `submission`
# itself is left unchanged, so re-running this cell produces the same file
# instead of adding one more index column on every run.
submission.to_csv('submission.csv', index=True, index_label='RowId')