# Kaggle March Madness Competition

In [1]:
from pathlib import Path
import os
import pandas as pd

In [2]:
# Input locations, expressed relative to the notebook's working directory.
data_path = Path('../data/')  # secondary data directory
raw_data = Path('../march-machine-learning-mania-2023/')  # raw competition CSV files

## Load Relevant Raw Data

In [3]:
def _read_raw(filename):
    """Read one CSV file from the `raw_data` directory into a DataFrame."""
    return pd.read_csv(raw_data / filename)


# Each table exists once for the men's files (M prefix) and once for the women's files (W prefix).

# teams
m_teams = _read_raw('MTeams.csv')
w_teams = _read_raw('WTeams.csv')

# seasons
m_seasons = _read_raw('MSeasons.csv')
w_seasons = _read_raw('WSeasons.csv')

# NCAA tournament seeds
m_seeds = _read_raw('MNCAATourneySeeds.csv')
w_seeds = _read_raw('WNCAATourneySeeds.csv')

# compact regular season results
m_season_compact = _read_raw('MRegularSeasonCompactResults.csv')
w_season_compact = _read_raw('WRegularSeasonCompactResults.csv')

# compact tournament results
m_ncaa_compact = _read_raw('MNCAATourneyCompactResults.csv')
w_ncaa_compact = _read_raw('WNCAATourneyCompactResults.csv')

# sample submission file
samp_submission = _read_raw('SampleSubmission2023.csv')

## Preprocess Data

In [4]:
def _stack_leagues(mens, womens):
    """Stack a men's table on top of the matching women's table.

    Index labels are carried over from the inputs unchanged, so the result
    can contain repeated labels.
    """
    return pd.concat([mens, womens], axis=0)


# combine the men's and women's datasets into one table per kind
teams = _stack_leagues(m_teams, w_teams)
# NOTE(review): if both season files cover the same years, `seasons` holds two
# rows per Season (one per league) -- confirm before joining on Season alone.
seasons = _stack_leagues(m_seasons, w_seasons)
seeds = _stack_leagues(m_seeds, w_seeds)
season_results = _stack_leagues(m_season_compact, w_season_compact)
ncaa_results = _stack_leagues(m_ncaa_compact, w_ncaa_compact)

In [5]:
def _attach_seed(results, team_col, seed_col):
    """Left-join `seeds` onto `results` for the team in `team_col`.

    The join matches (team_col, Season) against (TeamID, Season) in `seeds`.
    The joined `Seed` column is renamed to `seed_col`, and the helper
    `TeamID` column brought in by the join is removed again.
    """
    return (
        results
        .merge(seeds, how='left', left_on=[team_col, 'Season'], right_on=['TeamID', 'Season'])
        .rename(columns={'Seed': seed_col})
        .drop('TeamID', axis=1)
    )


# seed of the winning team, then seed of the losing team
ncaa_results = _attach_seed(ncaa_results, 'WTeamID', 'WTeamSeed')
ncaa_results = _attach_seed(ncaa_results, 'LTeamID', 'LTeamSeed')

In [6]:
# rename season columns
seasons.columns = ['Season', 'SeasonStart', 'RegionW', 'RegionX', 'RegionY', 'RegionZ']

# `seasons` is the men's and women's season tables stacked together, so the
# same Season can appear twice with different region names. Joining on Season
# alone would then emit each affected game twice, once per league's regions.
# Build a lookup keyed on (Season, league) so every game matches exactly one row.
season_lookup = pd.concat(
    [
        m_seasons.set_axis(seasons.columns, axis=1).assign(_IsWomens=False),
        w_seasons.set_axis(seasons.columns, axis=1).assign(_IsWomens=True),
    ],
    ignore_index=True,
)

# A game counts as a women's game when its winner is listed in the women's
# teams table. This assumes men's and women's TeamIDs never overlap, which the
# TeamID-based joins elsewhere in this notebook already rely on.
is_womens_game = ncaa_results['WTeamID'].isin(w_teams['TeamID'])

# add to ncaa_results; `validate` raises if the lookup is not unique per key,
# and the temporary league flag is dropped so the resulting columns are unchanged
ncaa_results = (
    ncaa_results
    .assign(_IsWomens=is_womens_game)
    .merge(season_lookup, how='left', on=['Season', '_IsWomens'], validate='many_to_one')
    .drop(columns='_IsWomens')
)

In [7]:
# identify team regions based on seed
# The seed columns were filled by left merges, so a team without a seed row
# would be NaN here. Stop with a clear message rather than an opaque TypeError
# from indexing a float.
seed_cols = ['WTeamSeed', 'LTeamSeed']
unseeded = ncaa_results[seed_cols].isna().any(axis=1)
if unseeded.any():
    raise ValueError(
        f'{int(unseeded.sum())} tournament game(s) have a team with no seed; '
        'cannot derive region codes'
    )

# the first character of a seed (e.g. the "X" in "X09") is the region code
ncaa_results['WTeamRegion'] = ncaa_results['WTeamSeed'].str[0]
ncaa_results['LTeamRegion'] = ncaa_results['LTeamSeed'].str[0]

In [8]:
# (region code, column holding that region's name) pairs, in the order W, X, Y, Z
_REGION_COLUMNS = (
    ('W', 'RegionW'),
    ('X', 'RegionX'),
    ('Y', 'RegionY'),
    ('Z', 'RegionZ'),
)


def _region_lookup(row):
    """Build the code -> region name dict for one row of `ncaa_results`."""
    return {code: row[column] for code, column in _REGION_COLUMNS}


# store a per-row lookup dict so the region codes can be translated to names
ncaa_results['mapping_dict'] = ncaa_results.apply(_region_lookup, axis=1)

In [9]:
def _region_name_from(code_col):
    """Return a row function that translates the code in `code_col` via the row's `mapping_dict`."""
    def _translate(row):
        return row['mapping_dict'][row[code_col]]
    return _translate


# swap each one-letter region code for the region name stored in that row's lookup
ncaa_results['WTeamRegion'] = ncaa_results.apply(_region_name_from('WTeamRegion'), axis=1)
ncaa_results['LTeamRegion'] = ncaa_results.apply(_region_name_from('LTeamRegion'), axis=1)

In [10]:
# the per-season region columns and the lookup dict are no longer needed
# once the team region names have been filled in
helper_columns = ['RegionW', 'RegionX', 'RegionY', 'RegionZ', 'mapping_dict']
ncaa_results = ncaa_results.drop(helper_columns, axis=1)

In [14]:
def _attach_team_details(results, id_col, renames):
    """Left-join `teams` onto `results` for the team in `id_col`.

    The joined columns are renamed according to `renames`, and the helper
    `TeamID` column brought in by the join is removed again.
    """
    return (
        results
        .merge(teams, how='left', left_on=id_col, right_on='TeamID')
        .rename(columns=renames)
        .drop('TeamID', axis=1)
    )


# winning team details
ncaa_results = _attach_team_details(
    ncaa_results,
    'WTeamID',
    {
        'TeamName': 'WTeamName',
        'FirstD1Season': 'WTeamFirstD1Season',
        'LastD1Season': 'WTeamLastD1Season',
    },
)

# losing team details
ncaa_results = _attach_team_details(
    ncaa_results,
    'LTeamID',
    {
        'TeamName': 'LTeamName',
        'FirstD1Season': 'LTeamFirstD1Season',
        'LastD1Season': 'LTeamLastD1Season',
    },
)

In [15]:
# preview the first rows of the tournament results after the seed, region and team merges
ncaa_results.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WTeamSeed,LTeamSeed,SeasonStart,WTeamRegion,LTeamRegion,WTeamName,WTeamFirstD1Season,WTeamLastD1Season,LTeamName,LTeamFirstD1Season,LTeamLastD1Season
0,1985,136,1116,63,1234,54,N,0,X09,X08,10/29/1984,West,West,Arkansas,1985.0,2023.0,Iowa,1985.0,2023.0
1,1985,136,1120,59,1345,58,N,0,Z11,Z06,10/29/1984,Southeast,Southeast,Auburn,1985.0,2023.0,Purdue,1985.0,2023.0
2,1985,136,1207,68,1250,43,N,0,W01,W16,10/29/1984,East,East,Georgetown,1985.0,2023.0,Lehigh,1985.0,2023.0
3,1985,136,1229,58,1425,55,N,0,Y09,Y08,10/29/1984,Midwest,Midwest,Illinois St,1985.0,2023.0,USC,1985.0,2023.0
4,1985,136,1242,49,1325,38,N,0,Z03,Z14,10/29/1984,Southeast,Southeast,Kansas,1985.0,2023.0,Ohio,1985.0,2023.0
