# March Madness Monte Carlo Simulation

Use historical game results, tournament seeds, and conference data to predict the outcomes of the 2023 March Madness Men's and Women's Basketball Championships.

## Configure Notebook and Download Competition Items

In [1]:
import os
from pathlib import Path

# Kaggle API token (the JSON text of kaggle.json). It is read from the
# KAGGLE_JSON environment variable so the secret is never stored in the
# notebook. Previously `creds` was used without ever being defined, which
# raised NameError on a fresh kernel whenever the credentials file was missing.
creds = os.environ.get('KAGGLE_JSON', '')

cred_path = Path('~/.kaggle/kaggle.json').expanduser()
# Install the token only when one was supplied and no credentials file exists
# yet; an empty string is not a valid token, so nothing is written in that case.
if creds and not cred_path.exists():
    cred_path.parent.mkdir(parents=True, exist_ok=True)
    cred_path.write_text(creds)
    cred_path.chmod(0o600)  # restrict the token file to owner read/write

In [2]:
from pathlib import Path
import os

iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    path = Path('../input/march-machine-learning-mania-2023')
    !pip install -Uqq fastai
else:
    import zipfile,kaggle
    path = Path('march-machine-learning-mania-2023')
    if not path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)

## Load and Preview Provided Data

In [3]:
import pandas as pd

# Every competition CSV is read from `path` into its own DataFrame.

# general datasets
cities = pd.read_csv(path/'Cities.csv')
conferences = pd.read_csv(path/'Conferences.csv')
samp_submission = pd.read_csv(path/'SampleSubmission2023.csv')

# men's datasets
m_conf_tourney_games = pd.read_csv(path/'MConferenceTourneyGames.csv')
m_game_cities = pd.read_csv(path/'MGameCities.csv')
m_massey_ordinals = pd.read_csv(path/'MMasseyOrdinals_thru_Season2023_Day128.csv')
m_tourney_compact_results = pd.read_csv(path/'MNCAATourneyCompactResults.csv')
m_tourney_detailed_results = pd.read_csv(path/'MNCAATourneyDetailedResults.csv')
m_tourney_seed_round_slots = pd.read_csv(path/'MNCAATourneySeedRoundSlots.csv')
m_tourney_seeds = pd.read_csv(path/'MNCAATourneySeeds.csv')
m_tourney_slots = pd.read_csv(path/'MNCAATourneySlots.csv')
m_reg_season_compact_results = pd.read_csv(path/'MRegularSeasonCompactResults.csv')
# Fixed: this frame was previously loaded from 'MSeasons.csv'. It is later
# concatenated with the women's detailed results, so it must come from the
# matching detailed-results file (mirrors w_reg_season_detailed_results below).
m_reg_season_detailed_results = pd.read_csv(path/'MRegularSeasonDetailedResults.csv')
# season table, loaded under its own name to mirror w_seasons below
m_seasons = pd.read_csv(path/'MSeasons.csv')
m_secondary_tourney_compact_results = pd.read_csv(path/'MSecondaryTourneyCompactResults.csv')
m_secondary_tourney_teams = pd.read_csv(path/'MSecondaryTourneyTeams.csv')
m_team_coaches = pd.read_csv(path/'MTeamCoaches.csv')
m_team_conferences = pd.read_csv(path/'MTeamConferences.csv')
m_teams = pd.read_csv(path/'MTeams.csv')
# the spellings file is read as latin-1 rather than pandas' default UTF-8
m_team_spellings = pd.read_csv(path/'MTeamSpellings.csv', encoding='latin-1')

# women's datasets
w_game_cities = pd.read_csv(path/'WGameCities.csv')
w_tourney_compact_results = pd.read_csv(path/'WNCAATourneyCompactResults.csv')
w_tourney_detailed_results = pd.read_csv(path/'WNCAATourneyDetailedResults.csv')
w_tourney_seeds = pd.read_csv(path/'WNCAATourneySeeds.csv')
w_tourney_slots = pd.read_csv(path/'WNCAATourneySlots.csv')
w_reg_season_compact_results = pd.read_csv(path/'WRegularSeasonCompactResults.csv')
w_reg_season_detailed_results = pd.read_csv(path/'WRegularSeasonDetailedResults.csv')
w_seasons = pd.read_csv(path/'WSeasons.csv')
w_team_conferences = pd.read_csv(path/'WTeamConferences.csv')
w_teams = pd.read_csv(path/'WTeams.csv')
# the spellings file is read as latin-1 rather than pandas' default UTF-8
w_team_spellings = pd.read_csv(path/'WTeamSpellings.csv', encoding='latin-1')

In [4]:
# preview the first five rows of the men's teams table
m_teams.head(n=5)

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2023
1,1102,Air Force,1985,2023
2,1103,Akron,1985,2023
3,1104,Alabama,1985,2023
4,1105,Alabama A&M,2000,2023


## Process Overview

Use machine learning to generate predictions for submission to the [Kaggle 2023 March Madness Competition](https://www.kaggle.com/competitions/march-machine-learning-mania-2023).

Process steps:
1. Evaluate the data and build a classifier that predicts the probability that a team wins a head-to-head matchup
2. Use the model within a Monte Carlo simulation to simulate the tournament


### Sample Submission Format

In [5]:
# preview the first five rows of the sample submission file
samp_submission.head(n=5)

Unnamed: 0,ID,Pred
0,2023_1101_1102,0.5
1,2023_1101_1103,0.5
2,2023_1101_1104,0.5
3,2023_1101_1105,0.5
4,2023_1101_1106,0.5


Note:
- `ID` has the form `Season_Team1ID_Team2ID` (for example, `2023_1101_1102`).
- `Pred` is the predicted probability that the first team listed in `ID` wins the matchup.

## Initial Data Preparation
### Tournament Datasets

#### Concatenate Men's and Women's Tournament Results

In [6]:
# tag each tournament results frame with the competition it came from
m_tourney_compact_results['gender'] = 'men'
m_tourney_detailed_results['gender'] = 'men'
w_tourney_compact_results['gender'] = 'women'
w_tourney_detailed_results['gender'] = 'women'

# stack the men's rows on top of the women's rows (row-wise is pd.concat's default)
tourney_compact = pd.concat([m_tourney_compact_results, w_tourney_compact_results])
tourney_detailed = pd.concat([m_tourney_detailed_results, w_tourney_detailed_results])

#### Add Conference Detail

In [7]:
# stack the men's and women's team-conference tables into a single lookup
team_conferences = pd.concat((m_team_conferences, w_team_conferences), axis='index')

In [8]:
# work on a deep copy so the merges below leave tourney_compact untouched
df = tourney_compact.copy(deep=True)

In [9]:
# look up the winning team's conference for that season and store it as WTeamConf;
# the helper TeamID key brought in by the merge is dropped afterwards
df = (
    df.merge(team_conferences, how='left', left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])
    .rename(columns={'ConfAbbrev': 'WTeamConf'})
    .drop(columns='TeamID')
)

# same lookup for the losing team, stored as LTeamConf
df = (
    df.merge(team_conferences, how='left', left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'])
    .rename(columns={'ConfAbbrev': 'LTeamConf'})
    .drop(columns='TeamID')
)

#### Concat City Detail

In [10]:
# stack the men's and women's game-city tables into one frame
game_cities = pd.concat((m_game_cities, w_game_cities), axis='index')

In [11]:
# left-join the game_cities columns onto each game, matched on season, day and both team ids
df = pd.merge(df, game_cities, how='left', on=['Season', 'DayNum', 'WTeamID', 'LTeamID'])

#### Concat Tourney Seed Detail

In [12]:
# stack the men's and women's tournament seed tables into one frame
tourney_seeds = pd.concat((m_tourney_seeds, w_tourney_seeds), axis='index')

In [13]:
# left-join the winning team's seed for that season, stored as WTeamSeed;
# the helper TeamID key brought in by the merge is dropped afterwards
df = (
    pd.merge(df, tourney_seeds, how='left', left_on=['WTeamID', 'Season'], right_on=['TeamID', 'Season'])
    .rename(columns={'Seed': 'WTeamSeed'})
    .drop(columns='TeamID')
)

In [14]:
# left-join the losing team's seed for that season, stored as LTeamSeed;
# the helper TeamID key brought in by the merge is dropped afterwards
df = (
    pd.merge(df, tourney_seeds, how='left', left_on=['LTeamID', 'Season'], right_on=['TeamID', 'Season'])
    .rename(columns={'Seed': 'LTeamSeed'})
    .drop(columns='TeamID')
)

#### Save Copy of Merged Tourney Data

In [15]:
# snapshot the merged tournament frame before `df` is reused for regular-season data
tourney_df = df.copy(deep=True)

### Regular Season Data

#### Concat Men's and Women's Regular Season Data

In [16]:
# tag each regular-season frame with the competition it came from
m_reg_season_compact_results['gender'] = 'men'
w_reg_season_compact_results['gender'] = 'women'
m_reg_season_detailed_results['gender'] = 'men'
w_reg_season_detailed_results['gender'] = 'women'

In [17]:
# stack the men's rows on top of the women's rows (row-wise is pd.concat's default)
reg_season_compact = pd.concat([m_reg_season_compact_results, w_reg_season_compact_results])
reg_season_detailed = pd.concat([m_reg_season_detailed_results, w_reg_season_detailed_results])

#### Concat Conference Detail

In [19]:
# preview the first five rows of the combined team-conference lookup
team_conferences.head(n=5)

Unnamed: 0,Season,TeamID,ConfAbbrev
0,1985,1102,wac
1,1985,1103,ovc
2,1985,1104,sec
3,1985,1106,swac
4,1985,1108,swac


In [21]:
# work on a deep copy so the merges below leave reg_season_compact untouched
df = reg_season_compact.copy(deep=True)

In [22]:
# look up the winning team's conference for that season and store it as WTeamConf;
# the helper TeamID key brought in by the merge is dropped afterwards
df = (
    df.merge(team_conferences, how='left', left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])
    .rename(columns={'ConfAbbrev': 'WTeamConf'})
    .drop(columns='TeamID')
)

# same lookup for the losing team, stored as LTeamConf
df = (
    df.merge(team_conferences, how='left', left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'])
    .rename(columns={'ConfAbbrev': 'LTeamConf'})
    .drop(columns='TeamID')
)

#### Concat City Detail

In [24]:
# left-join the game_cities columns onto each game, matched on season, day and both team ids
df = pd.merge(df, game_cities, how='left', on=['Season', 'DayNum', 'WTeamID', 'LTeamID'])

#### Save Regular Season Data

In [25]:
# snapshot the merged regular-season frame under its own name
reg_season_df = df.copy(deep=True)