In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import datetime, timedelta
from IPython.display import clear_output
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from numpy import mean, std

In [2]:
teams_file = f'/march-machine-learning-mania-2024/MTeams.csv'
teams_df = pd.read_csv(teams_file)

tourn_file = f'/march-machine-learning-mania-2024/MNCAATourneyCompactResults.csv'
tourn_df = pd.read_csv(tourn_file)

kenpom_file = f'/march-machine-learning-mania-2024/KenPom_Barttorvik.csv'
kenpom_df = pd.read_csv(kenpom_file)

seed_file = f'/march-machine-learning-mania-2024/MNCAATourneySeeds.csv'
seed_df = pd.read_csv(seed_file)

spelling_file = f'/march-machine-learning-mania-2024/MTeamSpellings.csv'
spelling_df = pd.read_csv(spelling_file, encoding='latin-1')

tourn24_file = f'/march-machine-learning-mania-2024/Tournament_Simulation.csv'
tourn24_df = pd.read_csv(tourn24_file)

In [3]:
start_data_season = 2008
season = 2024

In [4]:
def kenpom_team(spelling_df,kenpom_df):
    kenpom_df['TEAM'] = kenpom_df['TEAM'].str.lower()
    spelling_df['TeamNameSpelling'] = spelling_df['TeamNameSpelling'].str.lower()
    merge_df = pd.merge(kenpom_df,spelling_df,left_on='TEAM',right_on='TeamNameSpelling',how='left')
    merge_df.drop(columns=['CONF', 'CONF ID', 'QUAD NO', 'QUAD ID', 'ROUND', 'TEAM NO', 'TEAM ID', 'TeamNameSpelling'], inplace=True)
    
    return merge_df

In [5]:
def kenpom_opp_team(spelling_df,kenpom_df):
    kenpom_df['TEAM'] = kenpom_df['TEAM'].str.lower()
    spelling_df['TeamNameSpelling'] = spelling_df['TeamNameSpelling'].str.lower()
    merge_df = pd.merge(kenpom_df,spelling_df,left_on='TEAM',right_on='TeamNameSpelling',how='left')
    merge_df.drop(columns=['CONF', 'CONF ID', 'QUAD NO', 'QUAD ID', 'ROUND', 'TEAM NO', 'TEAM ID', 'TeamNameSpelling'], inplace=True)
    merge_df.columns = ['Opp_' + col for col in merge_df.columns]
    
    return merge_df

In [6]:
kenpom_id_df = kenpom_team(spelling_df,kenpom_df)
kenpom_id_df.head()

Unnamed: 0,YEAR,TEAM,SEED,K TEMPO,K TEMPO RANK,KADJ T,KADJ T RANK,K OFF,KO RANK,KADJ O,...,AVG HGT RANK,EFF HGT RANK,EXP RANK,TALENT RANK,FT% RANK,OP FT% RANK,PPPO RANK,PPPD RANK,ELITE SOS RANK,TeamID
0,2024,akron,14,66.7747,274,65.8933,268,107.841,122,107.009,...,238,199,19,176,164,47,122,51,249,1103
1,2024,alabama,4,74.1625,7,72.6461,9,121.712,2,125.601,...,33,8,156,106,10,314,2,263,7,1104
2,2024,arizona,2,73.376,11,71.8379,16,117.653,10,121.125,...,50,37,196,7,195,134,8,14,47,1112
3,2024,auburn,4,70.9629,54,69.7887,58,117.364,14,120.579,...,86,76,127,69,59,284,12,8,69,1120
4,2024,baylor,3,66.8428,270,65.6032,281,117.262,15,122.49,...,31,22,304,34,97,254,15,155,1,1124


In [7]:
def get_game_stats(season, team_a, team_b, kenpom):

    stats_a_df = kenpom[(kenpom['YEAR'] == season) & (kenpom['TeamID'] == team_a)]
    stats_b_df = kenpom[(kenpom['YEAR'] == season) & (kenpom['TeamID'] == team_b)]
    combined_df = pd.merge(stats_a_df, stats_b_df, on='YEAR', suffixes=('_a', '_b'))
    
    return combined_df

In [8]:
def build_dataset(start_data_season, kenpom_id_df, tourn_df, end_season):
    data = []

    for season in range(start_data_season + 1, end_season + 1):
        tourney_games = tourn_df[tourn_df['Season'] == season].reset_index(drop=True)
        for idx, g in tourney_games.iterrows():

            clear_output(wait=True)
            print(f"{season}: {idx}/{len(tourney_games)}")

            team_a = min([g['WTeamID'], g['LTeamID']])
            team_b = max([g['WTeamID'], g['LTeamID']])

            if team_a == g['WTeamID']:
                winner = 'A'
                team_a_score = g['WScore']
                team_b_score = g['LScore']
            else:
                winner = 'B'
                team_a_score = g['LScore']
                team_b_score = g['WScore']

            print(f"{team_a} x {team_b}")

            game_stats = get_game_stats(season, team_a, team_b, kenpom_id_df)
            game_stats['Winner'] = winner
            game_stats['Day'] = g['DayNum']
            
            data.append(game_stats)

    data_df = pd.concat(data, ignore_index=True)
    display(data_df)
    
    return data_df


In [9]:
data_df = build_dataset(2007, kenpom_id_df, tourn_df, end_season = 2023)

2023: 66/67
1163 x 1361


Unnamed: 0,YEAR,TEAM_a,SEED_a,K TEMPO_a,K TEMPO RANK_a,KADJ T_a,KADJ T RANK_a,K OFF_a,KO RANK_a,KADJ O_a,...,EXP RANK_b,TALENT RANK_b,FT% RANK_b,OP FT% RANK_b,PPPO RANK_b,PPPD RANK_b,ELITE SOS RANK_b,TeamID_b,Winner,Day
0,2008,coppin st,16,63.8026,289,62.2413,263,90.6896,325,92.4179,...,252,199,152,239,185,65,302,1291,B,134
1,2008,belmont,15,71.4309,39,68.1091,49,109.9000,41,109.4360,...,299,1,159,58,13,20,9,1181,B,136
2,2008,kansas,1,68.8789,106,66.8546,88,117.5300,1,121.4330,...,57,271,239,313,48,136,206,1340,A,136
3,2008,kansas st,11,72.2437,26,69.2125,29,108.2930,58,113.3830,...,339,34,187,166,133,37,2,1425,A,136
4,2008,kentucky,11,65.9148,203,61.9035,276,102.8670,153,106.8490,...,126,47,138,107,42,17,31,1266,B,136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
990,2023,miami fl,5,68.9799,128,68.8527,105,114.5390,8,118.2330,...,125,16,53,229,50,20,8,1400,A,146
991,2023,creighton,6,69.4467,104,67.9808,149,109.5920,47,114.6210,...,10,84,118,286,103,25,65,1361,B,146
992,2023,connecticut,4,68.5693,154,66.6901,210,114.2280,9,119.4370,...,155,63,17,316,8,211,79,1274,A,152
993,2023,florida atlantic,9,68.7232,140,68.4356,126,112.9470,15,114.2750,...,10,84,118,286,103,25,65,1361,B,152


In [10]:
def get_models():
    models = dict()
    
    for i in range(8, 21):
        rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=i)
        model = DecisionTreeClassifier()
        models[str(i)] = Pipeline(steps=[('s',rfe),('m',model)])
    return models

In [11]:
X_select = data_df[data_df['YEAR'].isin([2008, 2009])]
y_select = data_df[data_df['YEAR'].isin([2008, 2009])]
X_select_drop = X_select.drop(['YEAR','TEAM_a', 'TeamID_a','TEAM_b', 'TeamID_b','Winner','Day'], axis=1)
y_select_drop = y_select['Winner']

In [12]:
model = get_models()

results=[]
names = []
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
for name, model in model.items():
    scores = cross_val_score(model, X_select_drop, y_select_drop, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>8 0.707 (0.116)
>9 0.714 (0.129)
>10 0.704 (0.117)
>11 0.704 (0.121)
>12 0.699 (0.116)
>13 0.712 (0.115)
>14 0.709 (0.116)
>15 0.707 (0.117)
>16 0.709 (0.122)
>17 0.712 (0.124)
>18 0.704 (0.131)
>19 0.709 (0.118)
>20 0.704 (0.112)


In [13]:
X = data_df.drop(['YEAR','TEAM_a', 'TeamID_a','TEAM_b', 'TeamID_b','Winner','Day'], axis=1)
y = data_df['Winner']


rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=9)
rfe.fit(X, y)

RFE(estimator=DecisionTreeClassifier(), n_features_to_select=9)

In [14]:
selected_features = rfe.support_
selected_feature_names = [feature_name for feature_name, is_selected in zip(X.columns, selected_features) if is_selected]
selected_feature_names.append('SEED_a')
selected_feature_names.append('SEED_b')

In [15]:
X1 = data_df[selected_feature_names]
y1 = data_df['Winner']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.3, random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

accuracy = rf_model.score(X_test, y_test)
print("Accuracy with selected features:", accuracy)

Accuracy with selected features: 0.7123745819397993


In [18]:
trained_model = rf_model

In [19]:
def bracket(tourn24_df):

    combined_rows = []

    # Iterate over the DataFrame skipping every other row
    for i in range(1, len(tourn24_df), 2):
        combined_row = pd.concat([tourn24_df.iloc[i-1], tourn24_df.iloc[i]], axis=0)
        combined_rows.append(combined_row)

    # Concatenate the combined rows into a new DataFrame
    combined_df = pd.concat(combined_rows, axis=1).T
    combined_df.columns = [f'Column_{i}' for i in range(len(combined_df.columns))]
    columns_to_drop = ['Column_0', 'Column_1', 'Column_2','Column_3', 'Column_7','Column_8','Column_9','Column_10','Column_11','Column_14','Column_15']
    combined_df = combined_df.drop(columns=columns_to_drop)
    combined_df = combined_df.rename(columns={'Column_4':'TeamName','Column_5':'TeamSeed', 'Column_6':'Round',
                                             'Column_12':'OppTeamName','Column_13':'OppTeamSeed'})
  
    
    return combined_df

In [20]:
def bracket_team_id(spelling_df,bracket_df):
    bracket_df['TeamName'] = bracket_df['TeamName'].str.lower()
    bracket_df['OppTeamName'] = bracket_df['OppTeamName'].str.lower()
    spelling_df['TeamNameSpelling'] = spelling_df['TeamNameSpelling'].str.lower()
    merge_df = pd.merge(bracket_df,spelling_df,left_on='TeamName',right_on='TeamNameSpelling',how='left')
    merge_df2 = pd.merge(merge_df,spelling_df,left_on='OppTeamName',right_on='TeamNameSpelling',how='left')
    merge_df2 = merge_df2.drop(columns=['TeamNameSpelling_x','TeamNameSpelling_y'])
    merge_df2 = merge_df2.rename(columns={'TeamID_x': 'TeamID', 'TeamID_y': 'OppTeamID'})
    
    column_to_move = merge_df2['TeamID']  
    merge_df2 = merge_df2.drop(columns=['TeamID'])  
    merge_df2.insert(1, 'TeamID', column_to_move)
    
    column_to_move2 = merge_df2['OppTeamID']  
    merge_df2 = merge_df2.drop(columns=['OppTeamID'])  
    merge_df2.insert(5, 'OppTeamID', column_to_move)
   
    return merge_df2

In [21]:
def get_pred_stats(team_a,team_b,kenpom):
    
    stats_a_df = kenpom[(kenpom['YEAR'] == 2024) & (kenpom['TeamID'] == team_a)]
    stats_b_df = kenpom[(kenpom['YEAR'] == 2024) & (kenpom['TeamID'] == team_b)]
    combined_df = pd.merge(stats_a_df, stats_b_df, on='YEAR', suffixes=('_a', '_b'))[selected_feature_names]
    
    return combined_df

In [22]:
def build_bracket(kenpom_id_df, bracket_df):
    data = []
    
    for _, g in bracket_df.iterrows():
        clear_output(wait=True)

        team_a = g['TeamID']
        team_b = g['OppTeamID']
        game_stats = get_pred_stats(team_a, team_b, kenpom_id_df)         
        data.append(game_stats)

    data_df = pd.concat(data, ignore_index=True)
    
    return data_df

In [23]:
def next_round(bracket_result):
    bracket_next_round = []
    
    for idx, row in bracket_result.iterrows():
        clear_output(wait=True)
        
        if row['winner'] == 'A':
            new_row = {'TeamName': row['TeamName'], 'TeamID': row['TeamID'], 'TeamSeed': row['TeamSeed']}
        else:
            new_row = {'TeamName': row['OppTeamName'], 'TeamID': row['OppTeamID'], 'TeamSeed': row['OppTeamSeed']}
        
        bracket_next_round.append(new_row)
        
    data_df = pd.DataFrame(bracket_next_round)

    return data_df

In [24]:
def bracket_next_round(tourn_df):
    combined_rows = []

    # Iterate over the DataFrame skipping every other row
    for i in range(1, len(tourn_df), 2):
        combined_row = pd.concat([tourn_df.iloc[i-1], tourn_df.iloc[i]], axis=0)
        combined_rows.append(combined_row)
        
    combined_df = pd.concat(combined_rows, axis=1).T
    combined_df.columns = [f'Column_{i}' for i in range(len(combined_df.columns))]
    combined_df = combined_df.rename(columns={'Column_0':'TeamName','Column_1':'TeamID','Column_2':'TeamSeed',
                                             'Column_3':'OppTeamName','Column_4':'OppTeamID', 'Column_5':'OppTeamSeed'})
    combined_df['winner']=0
        
    return combined_df

In [25]:
bracket_df = bracket(tourn24_df)
bracket_round1 = bracket_team_id(spelling_df,bracket_df)
bracket_structure = bracket_round1
bracket_structure.head()

Unnamed: 0,TeamName,TeamID,TeamSeed,Round,OppTeamName,OppTeamID,OppTeamSeed
0,connecticut,1163,1,1,stetson,1163,16
1,florida atlantic,1194,8,1,northwestern,1194,9
2,san diego st.,1361,5,1,uab,1361,12
3,auburn,1120,4,1,yale,1120,13
4,byu,1140,6,1,duquesne,1140,11


In [26]:
def simulate_bracket(kenpom_id_df, trained_model, bracket_structure):

    bracket_results = pd.DataFrame()
    current_bracket = bracket_structure.copy()

    for i in range(1,7):
        current_round_data = build_bracket(kenpom_id_df, current_bracket)
        predictions = trained_model.predict(current_round_data)
        current_bracket['winner'] = predictions
        if i < 6:
            current_bracket['Round'] = i
            bracket_results = bracket_results.append(current_bracket, ignore_index=True)
            next_round_name = next_round(current_bracket)
            current_bracket = bracket_next_round(next_round_name)
        
        if i == 6 :
            current_bracket['Round'] = i
            bracket_results = bracket_results.append(current_bracket, ignore_index=True)
    

    return bracket_results


In [27]:
pd.set_option('display.max_rows', None)

In [28]:
pred_result = simulate_bracket(kenpom_id_df, trained_model, bracket_structure)
pred_result

  bracket_results = bracket_results.append(current_bracket, ignore_index=True)


Unnamed: 0,TeamName,TeamID,TeamSeed,Round,OppTeamName,OppTeamID,OppTeamSeed,winner
0,connecticut,1163,1,1,stetson,1163,16,B
1,florida atlantic,1194,8,1,northwestern,1194,9,B
2,san diego st.,1361,5,1,uab,1361,12,B
3,auburn,1120,4,1,yale,1120,13,B
4,byu,1140,6,1,duquesne,1140,11,B
5,illinois,1228,3,1,morehead st.,1228,14,A
6,washington st.,1450,7,1,drake,1450,10,A
7,iowa st.,1235,2,1,south dakota st.,1235,15,B
8,north carolina,1314,1,1,howard,1314,16,A
9,mississippi st.,1280,8,1,michigan st.,1280,9,B
