# 2021 March Madness Bracket Predictor

### Predicts and generates a 2021 March Madness bracket.

Developers:
- Brady Lange (03/08/2021)

## Libraries

In [1]:
# Import standard libraries
import os
import pandas as pd
import numpy as np

# Configure settings
# Use the documented option keys. pandas treats the key as a regular
# expression, so "display.max.columns" only worked because "." happens to
# match the underscore in "display.max_columns".
pd.set_option("display.max_columns", None)
# pd.set_option("display.max_rows", None)
# pd.set_option("display.precision", 2)

## Load Data

In [2]:
# The data directory is resolved against the parent of the current working
# directory.
base_path = os.path.abspath("..")
data_path = os.path.join(base_path, "data")

exclude_dirs = ["2020_challenge_data", "output"]
data_file_paths = {}

for root, dirs, files in os.walk(data_path, topdown=True):
    # Prune in place so the walk never descends into last year's challenge
    # data or the output files
    dirs[:] = [name for name in dirs if name not in exclude_dirs]
    for file_name in files:
        if not file_name.endswith(".csv"):
            continue
        # Key each CSV by its file name without the ".csv" extension
        data_file_paths[file_name[:-4]] = os.path.join(root, file_name)


def _read_data(name, **read_csv_kwargs):
    """Reads the CSV file registered under `name` into a Pandas data frame."""
    return pd.read_csv(data_file_paths[name], **read_csv_kwargs)


# Basic data
m_ncaa_tourney_compact_results_df = _read_data("m_ncaa_tourney_compact_results")
m_ncaa_tourney_seeds_df = _read_data("m_ncaa_tourney_seeds")
m_regular_season_compact_results_df = _read_data("m_regular_season_compact_results")
m_seasons_df = _read_data("m_seasons")
m_teams_df = _read_data("m_teams")

# Team box scores data
m_ncaa_tourney_detailed_results_df = _read_data("m_ncaa_tourney_detailed_results")
m_regular_season_detailed_results_df = _read_data("m_regular_season_detailed_results")

# Geography data
cities_df = _read_data("cities")
m_game_cities_df = _read_data("m_game_cities")

# Public rankings data
m_massey_ordinals_df = _read_data("m_massey_ordinals")

# Supplemental data
conferences_df = _read_data("conferences")
m_conference_tourney_games_df = _read_data("m_conference_tourney_games")
m_ncaa_tourney_seed_round_slots_df = _read_data("m_ncaa_tourney_seed_round_slots")
m_ncaa_tourney_slots_df = _read_data("m_ncaa_tourney_slots")
m_secondary_tourney_compact_results_df = _read_data("m_secondary_tourney_compact_results")
m_secondary_tourney_teams_df = _read_data("m_secondary_tourney_teams")
m_team_coaches_df = _read_data("m_team_coaches")
m_team_conferences_df = _read_data("m_team_conferences")
# Windows codepage 1252 encoded file
m_team_spellings_df = _read_data("m_team_spellings", encoding="cp1252")

## Explore Data

In [None]:
def explore_df(df, title="Data Frame"):
    """
    Explores a specified Pandas data frame by printing out all of its metrics
    and information neatly.
    
    Printed sections, in order: data type, first and last 5 rows, a
    description of the object-typed columns, `df.info()`, shape, row count,
    element count, columns, per-column value counts / minimum / maximum, and
    the number of null values per column.
    
    Args:
        df (pandas.DataFrame): Pandas Data Frame to explore.
        title (str): Title/name of data frame. Default is 'Data Frame'.
        
    Returns:
        None: Nothing.
    """
    heavy_rule = "======================================================================"
    light_rule = "----------------------------------------------------------------------"

    def print_section(heading, *values):
        """Prints a heading, a dashed rule and then the section's values."""
        print(heading)
        print(light_rule)
        print(*values)

    print(heavy_rule)
    print("{0}:".format(title))
    print(heavy_rule)
    print_section("Data Type:", type(df), "\n")
    print_section("First 5 Rows:", df.head(), "\n")
    print_section("Last 5 Rows:", df.tail(), "\n")
    # The builtin `object` replaces the `np.object` alias, which was removed
    # from NumPy. `describe` raises a ValueError when no column matches the
    # requested dtype, so check for object columns first.
    if df.select_dtypes(include=[object]).shape[1] > 0:
        print_section("Description:", df.describe(include=[object]), "\n")
    else:
        print_section("Description:", "No object columns to describe.", "\n")
    # `df.info()` writes its own output, so only the heading is printed here
    print("Information:")
    print(light_rule)
    df.info()
    print_section("\nNumber of Rows & Columns (Rows, Columns):", df.shape, "\n")
    print_section("Number of Rows:", len(df), "\n")
    print_section("Number of Elements (Rows x Columns):", df.size, "\n")
    print_section("Columns:", df.columns, "\n")
    for column in df.columns:
        print_section("Column:", column, "\n")
        print_section("'{0}' Value Counts:".format(column), df[column].value_counts(), "\n")
        print_section("Minimum '{0}' Value:".format(column), df[column].min(), "\n")
        print_section("Maximum '{0}' Value:".format(column), df[column].max(), "\n")
    print_section("Null Values:", df.isnull().sum(), "\n")

# Every loaded data frame paired with the title to print above its report,
# in the order the reports are generated.
frames_to_explore = [
    # Basic data
    (m_ncaa_tourney_compact_results_df, "Men's NCAA Tourney Compact Results"),
    (m_ncaa_tourney_seeds_df, "Men's NCAA Tourney Seeds"),
    (m_regular_season_compact_results_df, "Men's Regular Season Compact Results"),
    (m_seasons_df, "Men's Seasons"),
    (m_teams_df, "Men's Teams"),
    # Team box scores data
    (m_ncaa_tourney_detailed_results_df, "Men's NCAA Tourney Detailed Results"),
    (m_regular_season_detailed_results_df, "Men's Regular Season Detailed Results"),
    # Geography data
    (cities_df, "Cities"),
    (m_game_cities_df, "Men's Game Cities"),
    # Public rankings data
    (m_massey_ordinals_df, "Men's Massey Ordinals"),
    # Supplemental data
    (conferences_df, "Conferences"),
    (m_conference_tourney_games_df, "Men's Conference Tourney Games"),
    (m_ncaa_tourney_seed_round_slots_df, "Men's NCAA Tourney Seed Round Slots"),
    (m_ncaa_tourney_slots_df, "Men's NCAA Tourney Slots"),
    (m_secondary_tourney_compact_results_df, "Men's Secondary Tourney Compact Results"),
    (m_secondary_tourney_teams_df, "Men's Secondary Tourney Teams"),
    (m_team_coaches_df, "Men's Team Coaches"),
    (m_team_conferences_df, "Men's Team Conferences"),
    (m_team_spellings_df, "Men's Team Spellings"),
]

for frame_to_explore, frame_title in frames_to_explore:
    explore_df(df=frame_to_explore, title=frame_title)

## Preprocess Data

In [9]:
# Basic data: print every frame's columns first, then every frame's row count
basic_data_frames = [
    m_ncaa_tourney_compact_results_df,
    m_ncaa_tourney_seeds_df,
    m_regular_season_compact_results_df,
    m_seasons_df,
    m_teams_df,
]
for basic_data_frame in basic_data_frames:
    print(basic_data_frame.columns)

for basic_data_frame in basic_data_frames:
    print(len(basic_data_frame))

# Stack the tournament and regular season games. ignore_index=True gives the
# combined frames a fresh 0..n-1 index; without it the row labels of both
# inputs repeat, which makes label-based lookups ambiguous.
all_game_compact_results_df = pd.concat(
    [m_ncaa_tourney_compact_results_df, m_regular_season_compact_results_df],
    ignore_index=True,
)
all_game_detailed_results_df = pd.concat(
    [m_ncaa_tourney_detailed_results_df, m_regular_season_detailed_results_df],
    ignore_index=True,
)

# Attach team and season information to every tournament seed
df = pd.merge(m_ncaa_tourney_seeds_df, m_teams_df, how="inner", on="TeamID")
df = df.merge(m_seasons_df, how="inner", on="Season")
# NOTE(review): this join matches the seeded team against WTeamID only, so
# games where the team appears as LTeamID are dropped (presumably its losses;
# confirm this is intended). To join against every game instead of only
# tournament games, merge with all_game_detailed_results_df.
df = df.merge(
    m_ncaa_tourney_detailed_results_df,
    how="inner",
    left_on=["TeamID", "Season"],
    right_on=["WTeamID", "Season"],
)
df[df["TeamName"] == "Minnesota"]

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT'],
      dtype='object')
Index(['Season', 'Seed', 'TeamID'], dtype='object')
Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT'],
      dtype='object')
Index(['Season', 'DayZero', 'RegionW', 'RegionX', 'RegionY', 'RegionZ'], dtype='object')
Index(['TeamID', 'TeamName', 'FirstD1Season', 'LastD1Season'], dtype='object')
2251
2286
166880
37
371


Unnamed: 0,Season,Seed,TeamID,TeamName,FirstD1Season,LastD1Season,DayZero,RegionW,RegionX,RegionY,RegionZ,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,WOR,WDR,WAst,WTO,WStl,WBlk,WPF,LFGM,LFGA,LFGM3,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
437,2013,X11,1278,Minnesota,1985,2021,11/5/2012,East,South,Midwest,West,137,1278,83,1417,63,N,0,31,61,9,16,12,19,11,25,19,11,7,3,19,20,63,4,22,19,25,19,23,10,15,5,2,19
1101,2019,W10,1278,Minnesota,1985,2021,11/5/2018,East,West,Midwest,South,136,1278,86,1257,76,N,0,29,58,11,27,17,26,11,24,10,5,2,3,15,26,59,9,26,15,17,9,26,16,5,2,0,22


## Analysis