# Kaggle March Madness Competition

In [1]:
from pathlib import Path
import os
import pandas as pd

In [2]:
# set path to data directories
raw_data = Path('../march-machine-learning-mania-2023/')
data_path = Path('../data/')

## Load Relevant Raw Data

In [3]:
# teams
m_teams = pd.read_csv(raw_data/'MTeams.csv')
w_teams = pd.read_csv(raw_data/'WTeams.csv')

# seasons
m_seasons = pd.read_csv(raw_data/'MSeasons.csv')
w_seasons = pd.read_csv(raw_data/'WSeasons.csv')

# NCAA tournament seeds
m_seeds = pd.read_csv(raw_data/'MNCAATourneySeeds.csv')
w_seeds = pd.read_csv(raw_data/'WNCAATourneySeeds.csv')

# compact regular season results
m_season_compact = pd.read_csv(raw_data/'MRegularSeasonCompactResults.csv')
w_season_compact = pd.read_csv(raw_data/'WRegularSeasonCompactResults.csv')

# compact tournament results
m_ncaa_compact = pd.read_csv(raw_data/'MNCAATourneyCompactResults.csv')
w_ncaa_compact = pd.read_csv(raw_data/'WNCAATourneyCompactResults.csv')

# sample submission file
samp_submission = pd.read_csv(raw_data/'SampleSubmission2023.csv')

## Preprocess Data

In [4]:
# concat mens and womens datasets
teams = pd.concat([m_teams, w_teams], axis=0)
seasons = pd.concat([m_seasons, w_seasons], axis=0)
seeds = pd.concat([m_seeds, w_seeds], axis=0)
season_results = pd.concat([m_season_compact, w_season_compact], axis=0)
ncaa_results = pd.concat([m_ncaa_compact, w_ncaa_compact])

In [5]:
# add winning team seeds to ncaa_results
ncaa_results = ncaa_results.merge(seeds, how='left', left_on=['WTeamID', 'Season'], right_on=['TeamID', 'Season'])
ncaa_results.rename(columns={'Seed': 'WTeamSeed'}, inplace=True)
ncaa_results.drop('TeamID', axis=1, inplace=True)

# add losing team seeds
ncaa_results = ncaa_results.merge(seeds, how='left', left_on=['LTeamID', 'Season'], right_on=['TeamID', 'Season'])
ncaa_results.rename(columns={'Seed':'LTeamSeed'}, inplace=True)
ncaa_results.drop('TeamID', axis=1, inplace=True)

In [6]:
# rename season columns
seasons.columns = ['Season', 'SeasonStart', 'RegionW', 'RegionX', 'RegionY', 'RegionZ']

# add to ncaa_results
ncaa_results = ncaa_results.merge(seasons, how='left', on='Season')

In [7]:
# identify team regions based on seed
ncaa_results['WTeamRegion'] = ncaa_results['WTeamSeed'].apply(lambda x: x[0])
ncaa_results['LTeamRegion'] = ncaa_results['LTeamSeed'].apply(lambda x: x[0])

In [8]:
# create mapping dict for regions
ncaa_results['mapping_dict'] = ncaa_results.apply(lambda x: {'W': x['RegionW'], 
                                                             'X': x['RegionX'],
                                                             'Y': x['RegionY'],
                                                             'Z': x['RegionZ']}, axis=1)

In [9]:
# replace region codes with the proper region name
ncaa_results['WTeamRegion'] = ncaa_results.apply(lambda x: x['mapping_dict'][x['WTeamRegion']], axis=1)
ncaa_results['LTeamRegion'] = ncaa_results.apply(lambda x: x['mapping_dict'][x['LTeamRegion']], axis=1)

In [10]:
# drop unnecessary columns
ncaa_results.drop(['RegionW', 'RegionX', 'RegionY', 'RegionZ', 'mapping_dict'], axis=1, inplace=True)

In [11]:
# winning team details 
ncaa_results = ncaa_results.merge(teams, how='left', left_on='WTeamID', right_on='TeamID')
ncaa_results.rename(columns={'TeamName': 'WTeamName',
                             'FirstD1Season': 'WTeamFirstD1Season',
                             'LastD1Season': 'WTeamLastD1Season'}, inplace=True)
ncaa_results.drop('TeamID', axis=1, inplace=True)

# losing team details 
ncaa_results = ncaa_results.merge(teams, how='left', left_on='LTeamID', right_on='TeamID')
ncaa_results.rename(columns={'TeamName':'LTeamName',
                             'FirstD1Season': 'LTeamFirstD1Season',
                             'LastD1Season': 'LTeamLastD1Season'}, inplace=True)
ncaa_results.drop('TeamID', axis=1, inplace=True)

In [12]:
# add columns 
season_results['WTeamScoreDiff'] = season_results['WScore'] - season_results['LScore']
season_results['LTeamScoreDiff'] = season_results['LScore'] - season_results['WScore']

In [13]:
# regular season games won
grouped = season_results.groupby(['Season', 'WTeamID']).count().reset_index()
grouped = grouped[['Season', 'WTeamID', 'WScore']]
grouped.columns = ['Season', 'TeamID', 'GamesWon']

# concat into ncaa_results on WTeam
ncaa_results = ncaa_results.merge(grouped, how='left', left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])
ncaa_results.drop('TeamID', axis=1, inplace=True)
ncaa_results.rename(columns={'GamesWon':'WTeamGamesWon'}, inplace=True)

# concat into ncaa_results on LTeam
ncaa_results = ncaa_results.merge(grouped, how='left', left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'])
ncaa_results.drop('TeamID', axis=1, inplace=True)
ncaa_results.rename(columns={'GamesWon':'LTeamGamesWon'}, inplace=True)

In [14]:
# regular season games lost
grouped = season_results.groupby(['Season', 'LTeamID']).count().reset_index()
grouped = grouped[['Season', 'LTeamID', 'LScore']]
grouped.columns = ['Season', 'TeamID', 'GamesLost']

# concat into ncaa_results on WTeam
ncaa_results = ncaa_results.merge(grouped, how='left', left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])
ncaa_results.drop('TeamID', axis=1, inplace=True)
ncaa_results.rename(columns={'GamesLost':'WTeamGamesLost'}, inplace=True)

# concat into ncaa_results on LTeam
ncaa_results = ncaa_results.merge(grouped, how='left', left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'])
ncaa_results.drop('TeamID', axis=1, inplace=True)
ncaa_results.rename(columns={'GamesLost':'LTeamGamesLost'}, inplace=True)

In [15]:
# average regular season point differential
grouped_win = season_results.groupby(['Season', 'WTeamID']).mean(numeric_only=True).reset_index()
grouped_lose = season_results.groupby(['Season', 'LTeamID']).mean(numeric_only=True).reset_index()

# Win score diff
grouped_win = grouped_win[['Season', 'WTeamID', 'WTeamScoreDiff']]
grouped_win.columns = ['Season', 'TeamID', 'MeanWinScoreDiff']

# Lose score diff
grouped_lose = grouped_lose[['Season', 'LTeamID', 'LTeamScoreDiff']]
grouped_lose.columns = ['Season', 'TeamID', 'MeanLoseScoreDiff']

In [16]:
# concat winning team meanwinscorediff
ncaa_results = ncaa_results.merge(grouped_win, how='left', left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])
ncaa_results.drop('TeamID', axis=1, inplace=True)
ncaa_results.rename(columns={'MeanWinScoreDiff':'WTeamMeanWinDiff'}, inplace=True)

# concat losing team meanwinscorediff
ncaa_results = ncaa_results.merge(grouped_win, how='left', left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'])
ncaa_results.drop('TeamID', axis=1, inplace=True)
ncaa_results.rename(columns={'MeanWinScoreDiff':'LTeamMeanWinDiff'}, inplace=True)

In [17]:
# concat winning team mean lose score diff
ncaa_results = ncaa_results.merge(grouped_lose, how='left', left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])
ncaa_results.drop('TeamID', axis=1, inplace=True)
ncaa_results.rename(columns={'MeanLoseScoreDiff':'WTeamMeanLoseDiff'}, inplace=True)

# concat losing team mean lose score diff
ncaa_results = ncaa_results.merge(grouped_lose, how='left', left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'])
ncaa_results.drop('TeamID', axis=1, inplace=True)
ncaa_results.rename(columns={'MeanLoseScoreDiff':'LTeamMeanLoseDiff'}, inplace=True)

In [18]:
ncaa_results.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WTeamSeed,LTeamSeed,...,LTeamFirstD1Season,LTeamLastD1Season,WTeamGamesWon,LTeamGamesWon,WTeamGamesLost,LTeamGamesLost,WTeamMeanWinDiff,LTeamMeanWinDiff,WTeamMeanLoseDiff,LTeamMeanLoseDiff
0,1985,136,1116,63,1234,54,N,0,X09,X08,...,1985.0,2023.0,21,20,12.0,10.0,10.333333,18.45,-8.083333,-5.5
1,1985,136,1120,59,1345,58,N,0,Z11,Z06,...,1985.0,2023.0,18,17,11.0,8.0,11.833333,12.529412,-9.636364,-14.75
2,1985,136,1207,68,1250,43,N,0,W01,W16,...,1985.0,2023.0,25,11,2.0,18.0,17.04,6.0,-1.5,-10.833333
3,1985,136,1229,58,1425,55,N,0,Y09,Y08,...,1985.0,2023.0,20,19,7.0,9.0,11.35,9.421053,-9.428571,-8.111111
4,1985,136,1242,49,1325,38,N,0,Z03,Z14,...,1985.0,2023.0,23,20,7.0,7.0,10.043478,8.8,-8.857143,-7.571429


In [19]:
ncaa_results.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WTeamSeed,LTeamSeed,...,LTeamFirstD1Season,LTeamLastD1Season,WTeamGamesWon,LTeamGamesWon,WTeamGamesLost,LTeamGamesLost,WTeamMeanWinDiff,LTeamMeanWinDiff,WTeamMeanLoseDiff,LTeamMeanLoseDiff
0,1985,136,1116,63,1234,54,N,0,X09,X08,...,1985.0,2023.0,21,20,12.0,10.0,10.333333,18.45,-8.083333,-5.5
1,1985,136,1120,59,1345,58,N,0,Z11,Z06,...,1985.0,2023.0,18,17,11.0,8.0,11.833333,12.529412,-9.636364,-14.75
2,1985,136,1207,68,1250,43,N,0,W01,W16,...,1985.0,2023.0,25,11,2.0,18.0,17.04,6.0,-1.5,-10.833333
3,1985,136,1229,58,1425,55,N,0,Y09,Y08,...,1985.0,2023.0,20,19,7.0,9.0,11.35,9.421053,-9.428571,-8.111111
4,1985,136,1242,49,1325,38,N,0,Z03,Z14,...,1985.0,2023.0,23,20,7.0,7.0,10.043478,8.8,-8.857143,-7.571429


In [20]:
list(ncaa_results.columns)

['Season',
 'DayNum',
 'WTeamID',
 'WScore',
 'LTeamID',
 'LScore',
 'WLoc',
 'NumOT',
 'WTeamSeed',
 'LTeamSeed',
 'SeasonStart',
 'WTeamRegion',
 'LTeamRegion',
 'WTeamName',
 'WTeamFirstD1Season',
 'WTeamLastD1Season',
 'LTeamName',
 'LTeamFirstD1Season',
 'LTeamLastD1Season',
 'WTeamGamesWon',
 'LTeamGamesWon',
 'WTeamGamesLost',
 'LTeamGamesLost',
 'WTeamMeanWinDiff',
 'LTeamMeanWinDiff',
 'WTeamMeanLoseDiff',
 'LTeamMeanLoseDiff']

In [21]:
# drop unnecessary columns
ncaa_results = ncaa_results.drop(['WLoc', 'NumOT', 'SeasonStart', 'WTeamRegion', 'LTeamRegion',
                                  'WTeamFirstD1Season', 'WTeamLastD1Season',
                                  'LTeamFirstD1Season', 'LTeamLastD1Season'], axis=1)

In [22]:
# create copy and drop additional columns
preprocessed = ncaa_results.copy() 
preprocessed = preprocessed.drop(['WScore', 'LScore', 'WTeamName', 'LTeamName', 'DayNum'], axis=1)

In [23]:
# pull out seed numbers
preprocessed['WTeamSeed'] = preprocessed['WTeamSeed'].apply(lambda x: x[1:])
preprocessed['LTeamSeed'] = preprocessed['LTeamSeed'].apply(lambda x: x[1:])

In [24]:
# preview sample submission
samp_submission.head()

Unnamed: 0,ID,Pred
0,2023_1101_1102,0.5
1,2023_1101_1103,0.5
2,2023_1101_1104,0.5
3,2023_1101_1105,0.5
4,2023_1101_1106,0.5


In [25]:
# add columns to preprocessed per the above
preprocessed['ID'] = preprocessed.apply(lambda x: f'{x.Season}_{x.WTeamID}_{x.LTeamID}', axis=1)

# team listed first will always be the winning team
preprocessed['Pred'] = 1

In [30]:
# rename columns
preprocessed.columns = ['season', 'team_a', 'team_b', 
                        'seed_a', 'seed_b', 
                        'reg_season_wins_a', 'reg_season_wins_b',
                        'reg_season_losses_a', 'reg_season_losses_b',
                        'mean_win_diff_a', 'mean_win_diff_b',
                        'mean_lose_diff_a', 'mean_lose_diff_b',
                        'ID', 'Pred']

## Prepare for Modeling

In [31]:
preprocessed.head()

Unnamed: 0,season,team_a,team_b,seed_a,seed_b,reg_season_wins_a,reg_season_wins_b,reg_season_losses_a,reg_season_losses_b,mean_win_diff_a,mean_win_diff_b,mean_lose_diff_a,mean_lose_diff_b,ID,Pred
0,1985,1116,1234,9,8,21,20,12.0,10.0,10.333333,18.45,-8.083333,-5.5,1985_1116_1234,1
1,1985,1120,1345,11,6,18,17,11.0,8.0,11.833333,12.529412,-9.636364,-14.75,1985_1120_1345,1
2,1985,1207,1250,1,16,25,11,2.0,18.0,17.04,6.0,-1.5,-10.833333,1985_1207_1250,1
3,1985,1229,1425,9,8,20,19,7.0,9.0,11.35,9.421053,-9.428571,-8.111111,1985_1229_1425,1
4,1985,1242,1325,3,14,23,20,7.0,7.0,10.043478,8.8,-8.857143,-7.571429,1985_1242_1325,1


In [None]:
len(preprocessed.columns)

14

In [None]:
# we have 14 features