# 2021 March Madness Bracket Predictor

### Predicts and generates a 2021 March Madness bracket.

Developers:
- Brady Lange (03/12/2021)

## Libraries

In [1]:
# Import standard libraries
import os
import pandas as pd
import numpy as np

# Configure settings
# Use the canonical option key. The previous "display.max.columns" spelling
# only worked because pandas matches option keys as regular expressions, where
# "." happens to match the underscore.
pd.set_option("display.max_columns", None)
# pd.set_option("display.max_rows", None)
# pd.set_option("display.precision", 2)

## Load Data

In [2]:
# Data lives in the "data" folder of the current working directory's parent
base_path = os.path.abspath("..")
data_path = os.path.join(base_path, "data")

data_file_paths = {}
exclude_dirs = ["2020_challenge_data"]
csv_extension = ".csv"

for root, dirs, files in os.walk(data_path, topdown=True):
    # Prune excluded directories in place so os.walk never descends into them
    dirs[:] = [d for d in dirs if d not in exclude_dirs]
    # Key every CSV file by its name without the extension
    data_file_paths.update(
        (file_name[:-len(csv_extension)], os.path.join(root, file_name))
        for file_name in files
        if file_name.endswith(csv_extension)
    )

# Load every data set into its own data frame. Each key of `data_file_paths`
# is a CSV file name without its extension, so a missing file surfaces here as
# a KeyError on the corresponding line.

# Basic data
m_ncaa_tourney_compact_results_df = pd.read_csv(data_file_paths["m_ncaa_tourney_compact_results"])
m_ncaa_tourney_seeds_df = pd.read_csv(data_file_paths["m_ncaa_tourney_seeds"])
m_regular_season_compact_results_df = pd.read_csv(data_file_paths["m_regular_season_compact_results"])
m_seasons_df = pd.read_csv(data_file_paths["m_seasons"])
m_teams_df = pd.read_csv(data_file_paths["m_teams"])

# Team box scores data
m_ncaa_tourney_detailed_results_df = pd.read_csv(data_file_paths["m_ncaa_tourney_detailed_results"])
m_regular_season_detailed_results_df = pd.read_csv(data_file_paths["m_regular_season_detailed_results"])

# Geography data
cities_df = pd.read_csv(data_file_paths["cities"])
m_game_cities_df = pd.read_csv(data_file_paths["m_game_cities"])

# Public rankings data
m_massey_ordinals_df = pd.read_csv(data_file_paths["m_massey_ordinals"])

# Supplemental data
conferences_df = pd.read_csv(data_file_paths["conferences"])
m_conference_tourney_games_df = pd.read_csv(data_file_paths["m_conference_tourney_games"])
m_ncaa_tourney_seed_round_slots_df = pd.read_csv(data_file_paths["m_ncaa_tourney_seed_round_slots"])
m_ncaa_tourney_slots_df = pd.read_csv(data_file_paths["m_ncaa_tourney_slots"])
m_secondary_tourney_compact_results_df = pd.read_csv(data_file_paths["m_secondary_tourney_compact_results"])
m_secondary_tourney_teams_df = pd.read_csv(data_file_paths["m_secondary_tourney_teams"])
m_team_coaches_df = pd.read_csv(data_file_paths["m_team_coaches"])
m_team_conferences_df = pd.read_csv(data_file_paths["m_team_conferences"])
# Windows codepage 1252 encoded file, so the encoding is passed explicitly
m_team_spellings_df = pd.read_csv(data_file_paths["m_team_spellings"], encoding="cp1252")

# Sample submission data (supplies the matchup IDs to predict)
hist_sample_subm_df = pd.read_csv(data_file_paths["m_sample_submission_stage_01"])

## Explore Data

In [None]:
def _print_section(heading, *values):
    """
    Prints a section heading, a dashed rule and, when given, the section's
    values followed by a blank line.
    """
    print(heading)
    print("----------------------------------------------------------------------")
    if values:
        print(*values, "\n")


def explore_df(df, title="Data Frame"):
    """
    Explores a specified Pandas data frame by printing out all of its metrics
    and information neatly.
    
    Args:
        df (pandas.DataFrame): Pandas Data Frame to explore.
        title (str): Title/name of data frame. Default is 'Data Frame'.
        
    Returns:
        None: Nothing.
    """
    print("======================================================================")
    print("{0}:".format(title))
    print("======================================================================")
    _print_section("Data Type:", type(df))
    _print_section("First 5 Rows:", df.head())
    _print_section("Last 5 Rows:", df.tail())
    # `np.object` was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin `object` selects the same columns. A frame without any object
    # columns makes describe() raise, so report that instead of aborting the
    # remaining sections.
    try:
        description = df.describe(include=object)
    except ValueError:
        description = "No object columns to describe."
    _print_section("Description:", description)
    # DataFrame.info() prints its own report and returns None
    _print_section("Information:")
    df.info()
    _print_section("\nNumber of Rows & Columns (Rows, Columns):", df.shape)
    _print_section("Number of Rows:", len(df))
    _print_section("Number of Elements (Rows x Columns):", df.size)
    _print_section("Columns:", df.columns)
    for column in df.columns:
        _print_section("Column:", column)
        _print_section("'{0}' Value Counts:".format(column), df[column].value_counts())
        _print_section("Minimum '{0}' Value:".format(column), df[column].min())
        _print_section("Maximum '{0}' Value:".format(column), df[column].max())
    _print_section("Null Values:", df.isnull().sum())

# Explore every loaded data frame, in the same groups used when loading
for frame, frame_title in (
    # Basic data
    (m_ncaa_tourney_compact_results_df, "Men's NCAA Tourney Compact Results"),
    (m_ncaa_tourney_seeds_df, "Men's NCAA Tourney Seeds"),
    (m_regular_season_compact_results_df, "Men's Regular Season Compact Results"),
    (m_seasons_df, "Men's Seasons"),
    (m_teams_df, "Men's Teams"),
    # Team box scores data
    (m_ncaa_tourney_detailed_results_df, "Men's NCAA Tourney Detailed Results"),
    (m_regular_season_detailed_results_df, "Men's Regular Season Detailed Results"),
    # Geography data
    (cities_df, "Cities"),
    (m_game_cities_df, "Men's Game Cities"),
    # Public rankings data
    (m_massey_ordinals_df, "Men's Massey Ordinals"),
    # Supplemental data
    (conferences_df, "Conferences"),
    (m_conference_tourney_games_df, "Men's Conference Tourney Games"),
    (m_ncaa_tourney_seed_round_slots_df, "Men's NCAA Tourney Seed Round Slots"),
    (m_ncaa_tourney_slots_df, "Men's NCAA Tourney Slots"),
    (m_secondary_tourney_compact_results_df, "Men's Secondary Tourney Compact Results"),
    (m_secondary_tourney_teams_df, "Men's Secondary Tourney Teams"),
    (m_team_coaches_df, "Men's Team Coaches"),
    (m_team_conferences_df, "Men's Team Conferences"),
    (m_team_spellings_df, "Men's Team Spellings"),
):
    explore_df(df=frame, title=frame_title)

## Preprocess Data

In [53]:
# # Basic data
# print(m_ncaa_tourney_compact_results_df.columns)
# print(m_ncaa_tourney_seeds_df.columns)
# print(m_regular_season_compact_results_df.columns)
# print(m_seasons_df.columns)
# print(m_teams_df.columns)

# print(len(m_ncaa_tourney_compact_results_df))
# print(len(m_ncaa_tourney_seeds_df))
# print(len(m_regular_season_compact_results_df))
# print(len(m_seasons_df))
# print(len(m_teams_df))

def _add_team_names(results_df, teams_df):
    """
    Adds the winning and losing team names to a game results data frame.
    
    Args:
        results_df (pandas.DataFrame): Game results with 'WTeamID' and
            'LTeamID' columns.
        teams_df (pandas.DataFrame): Teams with 'TeamID' and 'TeamName'
            columns.
        
    Returns:
        pandas.DataFrame: The results with 'WTeamName' and 'LTeamName' columns
            appended.
    """
    team_names_df = teams_df[["TeamID", "TeamName"]]
    named_df = results_df
    for id_column, name_column in (("WTeamID", "WTeamName"), ("LTeamID", "LTeamName")):
        # Validate both sides: a duplicated TeamID would otherwise silently
        # multiply game rows (previously only the winner merge was checked)
        named_df = named_df.merge(
            team_names_df, left_on=id_column, right_on="TeamID", validate="many_to_one"
        ).drop(
            "TeamID", axis=1
        ).rename(
            columns={"TeamName": name_column}
        )
    return named_df


# Tournament and regular season games stacked into single data frames
all_game_compact_results_df = pd.concat([m_ncaa_tourney_compact_results_df, m_regular_season_compact_results_df])
all_game_detailed_results_df = pd.concat([m_ncaa_tourney_detailed_results_df, m_regular_season_detailed_results_df])

# NOTE: `df` is rebound immediately below, so only the detailed results
# version survives this cell and is displayed
df = _add_team_names(m_ncaa_tourney_compact_results_df, m_teams_df)

df = _add_team_names(all_game_detailed_results_df, m_teams_df)
df

0
1
2
3
4
5
6


KeyError: ''

## Feature Engineering

In [29]:
GAME_MINS = 40
OT_MINS = 5

def get_ppg(season_results_df):
    """
    Summarizes every team's season as points per game, win percentage and
    number of games played.
    
    Scores are scaled to a regulation-length game (GAME_MINS) so overtime
    games do not inflate the averages.
    
    Args:
        season_results_df (pandas.DataFrame): Game results with 'Season',
            'WTeamID', 'LTeamID', 'WScore', 'LScore' and 'NumOT' columns.
        
    Returns:
        pandas.DataFrame: One row per team and season with the columns
            'TeamID', 'Season', 'PPG', 'WPerc' and 'Games'.
    """
    games_df = season_results_df.copy()
    games_df["GameDuration"] = GAME_MINS + OT_MINS * games_df["NumOT"]

    def summarize_side(side):
        # Mean regulation-scaled score and game count for one side ("W"/"L")
        team_col = side + "TeamID"
        ppg_col = side + "TeamPPG"
        games_df[ppg_col] = (games_df[side + "Score"] / games_df["GameDuration"]) * GAME_MINS
        per_team = games_df.groupby(["Season", team_col])[ppg_col]
        return per_team.agg(**{side + "PPG": "mean", side + "Games": "count"}).reset_index()

    wins_df = summarize_side("W")
    losses_df = summarize_side("L")

    # Outer merge keeps teams that only ever won or only ever lost
    merged_df = pd.merge(
        wins_df, losses_df,
        left_on=["Season", "WTeamID"], right_on=["Season", "LTeamID"],
        how="outer",
    )
    # Whichever side is missing borrows the team ID from the other side and
    # contributes zero games
    merged_df = merged_df.fillna({
        "WTeamID": merged_df["LTeamID"], "WPPG": 0, "WGames": 0,
        "LTeamID": merged_df["WTeamID"], "LPPG": 0, "LGames": 0,
    })

    total_games = merged_df["WGames"] + merged_df["LGames"]
    weighted_points = merged_df["WPPG"] * merged_df["WGames"] + merged_df["LPPG"] * merged_df["LGames"]

    merged_df["PPG"] = weighted_points / total_games
    merged_df["WPerc"] = merged_df["WGames"] / total_games
    merged_df["Games"] = total_games.astype(int)
    merged_df["TeamID"] = merged_df["WTeamID"].astype(int)

    return merged_df[["TeamID", "Season", "PPG", "WPerc", "Games"]]
    
# Preview the per-team regular season scoring summary
print(get_ppg(m_regular_season_compact_results_df))

def get_efficiency(season_results_df):
    """
    Computes every team's mean offensive and defensive efficiency per season.
    
    All box score statistics are first scaled to a regulation-length game.
    Offensive efficiency is points made plus assists plus offensive rebounds.
    Defensive efficiency is the opponent's missed points and turnovers plus
    the team's own defensive rebounds, steals, blocks and personal fouls.
    
    Args:
        season_results_df (pandas.DataFrame): Detailed game results with
            'Season', 'DayNum', 'NumOT', the team ID columns and the 'W'/'L'
            prefixed box score columns.
        
    Returns:
        pandas.DataFrame: One row per season and team with the columns
            'Season', 'TeamID', 'OffEff' and 'DefEff'.
    """
    box_df = season_results_df.copy()
    box_df["GameDuration"] = GAME_MINS + OT_MINS * box_df["NumOT"]

    # Offense (FGM..TO), defense (DR, Stl, Blk) and fouls (PF), for both sides
    counting_stats = [
        "FGM", "FGA", "FGM3", "FGA3", "FTM", "FTA", "OR", "Ast", "TO",
        "DR", "Stl", "Blk",
        "PF",
    ]
    for side in ("W", "L"):
        for stat in counting_stats:
            stat_col = side + stat
            box_df[stat_col] = (box_df[stat_col] / box_df["GameDuration"]) * GAME_MINS

    # Points made, and points left on the floor through missed attempts
    for side in ("W", "L"):
        points_made = (
            (2 * (box_df[side + "FGM"] - box_df[side + "FGM3"]))
            + (3 * box_df[side + "FGM3"])
            + (1 * box_df[side + "FTM"])
        )
        points_attempted = (
            (2 * (box_df[side + "FGA"] - box_df[side + "FGA3"]))
            + (3 * box_df[side + "FGA3"])
            + (1 * box_df[side + "FTA"])
        )
        box_df[side + "PointsMade"] = points_made
        box_df[side + "PointsMissed"] = points_attempted - points_made

    # One frame per side, winners first, sharing the same column names
    team_frames = []
    for side, opponent in (("W", "L"), ("L", "W")):
        off_eff = box_df[side + "PointsMade"] + box_df[side + "Ast"] + box_df[side + "OR"]
        def_eff = (
            box_df[opponent + "PointsMissed"]
            + box_df[opponent + "TO"]
            + box_df[side + "DR"]
            + box_df[side + "Stl"]
            + box_df[side + "Blk"]
            + box_df[side + "PF"]
        )
        team_frames.append(pd.DataFrame({
            "TeamID": box_df[side + "TeamID"],
            "Season": box_df["Season"],
            "DayNum": box_df["DayNum"],
            "OffEff": off_eff,
            "DefEff": def_eff,
        }))

    eff_df = pd.concat(team_frames).sort_values(by=["Season", "DayNum"]).reset_index(drop=True)
    eff_df = eff_df.drop(columns=["DayNum"])

    return eff_df.groupby(by=["Season", "TeamID"]).agg("mean").reset_index()
    
# Preview the per-team regular season efficiency averages
print(get_efficiency(m_regular_season_detailed_results_df))

def get_rankings(rankings_df, ranking_day_num=133, system_name="MAS"):
    """
    Summarizes the public ordinal rankings of every team on a single ranking
    day.
    
    Args:
        rankings_df (pandas.DataFrame): Ordinal rankings with 'Season',
            'RankingDayNum', 'SystemName', 'TeamID' and 'OrdinalRank' columns.
        ranking_day_num (int): Ranking day to use; rows of every other day
            are ignored. Default is 133.
        system_name (str): Ranking system reported on its own in the
            'MasseyOrdinalRank' column. Default is 'MAS'.
        
    Returns:
        pandas.DataFrame: One row per season and team ranked on that day with
            the columns 'Season', 'TeamID', 'MeanOrdinalRank',
            'MedianOrdinalRank' (both across all systems) and
            'MasseyOrdinalRank' (missing when the chosen system did not rank
            the team).
    """
    day_rankings_df = rankings_df[rankings_df["RankingDayNum"] == ranking_day_num]

    # Consensus across every ranking system published on that day
    consensus_df = (
        day_rankings_df.groupby(["Season", "TeamID"])["OrdinalRank"]
        .agg(MeanOrdinalRank="mean", MedianOrdinalRank="median")
        .reset_index()
    )

    # The column keeps its historical name whichever system is selected so
    # downstream feature code keeps working
    is_selected_system = day_rankings_df["SystemName"] == system_name
    system_rankings_df = day_rankings_df.loc[
        is_selected_system, ["Season", "TeamID", "OrdinalRank"]
    ].rename(columns={"OrdinalRank": "MasseyOrdinalRank"})

    return pd.merge(consensus_df, system_rankings_df, on=["Season", "TeamID"], how="left")
    
# Preview the per-team ranking summary
print(get_rankings(m_massey_ordinals_df))

       TeamID  Season        PPG     WPerc  Games
0        1102    1985  63.083333  0.208333     24
1        1103    1985  61.043478  0.391304     23
2        1104    1985  68.318519  0.700000     30
3        1106    1985  71.625000  0.416667     24
4        1108    1985  83.000000  0.760000     25
...       ...     ...        ...       ...    ...
11589    1366    2005  57.964286  0.000000     28
11590    1312    2008  55.931034  0.000000     29
11591    1212    2013  49.678571  0.000000     28
11592    1212    2015  51.111111  0.000000     27
11593    1363    2015  52.265873  0.000000     28

[11594 rows x 5 columns]
      Season  TeamID      OffEff      DefEff
0       2003    1102   74.428571  115.821429
1       2003    1103  100.188477  135.788477
2       2003    1104   94.599206  147.968254
3       2003    1105   97.922222  153.617094
4       2003    1106   87.134921  148.186508
...      ...     ...         ...         ...
6182    2020    1463   94.643651  151.820996
6183    2020  

## Prepare Training Dataset

In [4]:
def prep_season(season_results_df):
    """
    Turns the game results of the 2015 through 2019 seasons into labelled
    matchups.
    
    Every game appears twice: once with the winner as 'TeamID_1' (Result 1)
    and once with the loser as 'TeamID_1' (Result 0).
    
    Args:
        season_results_df (pandas.DataFrame): Game results with 'Season',
            'WTeamID' and 'LTeamID' columns.
        
    Returns:
        pandas.DataFrame: Matchups with the columns 'ID'
            ('<Season>_<TeamID_1>_<TeamID_2>'), 'Season', 'TeamID_1',
            'TeamID_2' and 'Result'.
    """
    results_df = season_results_df.copy()

    in_window = (results_df["Season"] > 2014) & (results_df["Season"] < 2020)
    games_df = results_df.loc[in_window, ["Season", "WTeamID", "LTeamID"]].reset_index(drop=True)

    # Winner listed first -> 1, loser listed first -> 0
    winner_first_df = games_df.rename(columns={"WTeamID": "TeamID_1", "LTeamID": "TeamID_2"}).assign(Result=1)
    loser_first_df = games_df.rename(columns={"WTeamID": "TeamID_2", "LTeamID": "TeamID_1"}).assign(Result=0)

    matchups_df = pd.concat([winner_first_df, loser_first_df]).reset_index(drop=True)

    id_parts = [matchups_df[column].apply(str) for column in ("Season", "TeamID_1", "TeamID_2")]
    matchups_df["ID"] = id_parts[0] + "_" + id_parts[1] + "_" + id_parts[2]

    return matchups_df[["ID", "Season", "TeamID_1", "TeamID_2", "Result"]]
    
# Preview the labelled regular season matchups
print(prep_season(m_regular_season_compact_results_df))

                   ID  Season  TeamID_1  TeamID_2  Result
0      2015_1103_1420    2015      1103      1420       1
1      2015_1104_1406    2015      1104      1406       1
2      2015_1112_1291    2015      1112      1291       1
3      2015_1113_1152    2015      1113      1152       1
4      2015_1119_1102    2015      1119      1102       1
...               ...     ...       ...       ...     ...
53967  2019_1222_1153    2019      1222      1153       0
53968  2019_1426_1209    2019      1426      1209       0
53969  2019_1276_1277    2019      1276      1277       0
53970  2019_1382_1387    2019      1382      1387       0
53971  2019_1217_1463    2019      1217      1463       0

[53972 rows x 5 columns]


## Prepare Predictions

In [None]:
def prepare_predictions(sample_output_df, ncaa_tourney_results_df):
    """
    Expands the matchup IDs of a sample submission into feature keys and
    attaches the actual tournament result where the game was played.
    
    Args:
        sample_output_df (pandas.DataFrame): Sample submission with an 'ID'
            column formatted '<Season>_<TeamID_1>_<TeamID_2>' and, optionally,
            a 'Pred' column (dropped).
        ncaa_tourney_results_df (pandas.DataFrame): Tournament results with
            'Season', 'WTeamID' and 'LTeamID' columns. Only the 2015 through
            2019 seasons are used.
        
    Returns:
        pandas.DataFrame: One row per sample ID with the columns 'ID',
            'Season', 'TeamID_1', 'TeamID_2' and 'Result' (1 when 'TeamID_1'
            won, 0 when it lost, missing when the matchup was never played).
        
    Raises:
        ValueError: If an ID is not three underscore-separated integers.
    """
    features_df = sample_output_df.copy()

    # Parse by field instead of fixed character positions so team IDs of any
    # width are read correctly
    id_parts = features_df["ID"].str.extract(r"^(\d+)_(\d+)_(\d+)$")
    if id_parts.isna().to_numpy().any():
        raise ValueError("Every ID must look like '<Season>_<TeamID_1>_<TeamID_2>'.")
    for position, column in enumerate(["Season", "TeamID_1", "TeamID_2"]):
        features_df[column] = id_parts[position].astype("int64")
    features_df = features_df.drop(columns=["Pred"], errors="ignore")

    seasons = ncaa_tourney_results_df["Season"]
    games_df = ncaa_tourney_results_df.loc[
        (seasons > 2014) & (seasons < 2020), ["Season", "WTeamID", "LTeamID"]
    ]

    # Each game is listed from both teams' point of view so a sample ID
    # matches whichever team it names first
    winner_first_df = games_df.rename(columns={"WTeamID": "TeamID_1", "LTeamID": "TeamID_2"}).assign(Result=1)
    loser_first_df = games_df.rename(columns={"WTeamID": "TeamID_2", "LTeamID": "TeamID_1"}).assign(Result=0)
    outcomes_df = pd.concat([winner_first_df, loser_first_df], ignore_index=True)

    return features_df.merge(outcomes_df, on=["Season", "TeamID_1", "TeamID_2"], how="left")

## Merge Features

In [33]:
def merge_features(features_df, ppg_df, eff_df, rankings_df):
    """
    Attaches the team-level statistics of both teams to every matchup and
    reduces them to 'team 1 minus team 2' difference features.
    
    Args:
        features_df (pandas.DataFrame): Matchups with 'Season', 'TeamID_1'
            and 'TeamID_2' columns.
        ppg_df (pandas.DataFrame): Per team and season 'PPG', 'WPerc' and
            'Games', keyed by 'Season' and 'TeamID'.
        eff_df (pandas.DataFrame): Per team and season 'OffEff' and 'DefEff',
            keyed by 'Season' and 'TeamID'.
        rankings_df (pandas.DataFrame): Per team and season 'MeanOrdinalRank',
            'MedianOrdinalRank' and 'MasseyOrdinalRank', keyed by 'Season'
            and 'TeamID'.
        
    Returns:
        pandas.DataFrame: The matchups followed by the columns 'PPGDiff',
            'WPercDiff', 'OffEffDiff', 'DefEffDiff', 'MeanOrdinalRankDiff',
            'MedianOrdinalRankDiff' and 'MasseyOrdinalRankDiff'.
    """
    def attach_team_stats(frame, stats_df, stat_names, slot, unused):
        # Left-join the statistics of the team in `slot` (1 or 2), suffixed
        # with the slot number, then drop the join key and unused columns
        suffixed_df = stats_df.rename(
            columns={name: "{0}_{1}".format(name, slot) for name in stat_names}
        )
        joined_df = pd.merge(
            frame, suffixed_df,
            left_on=["Season", "TeamID_{0}".format(slot)],
            right_on=["Season", "TeamID"],
            how="left",
        ).reset_index(drop=True)
        return joined_df.drop(["TeamID"] + list(unused), axis=1)

    def add_differences(frame, stats_df, stat_names, unused=()):
        # Join both teams' statistics, keep only their differences
        for slot in (1, 2):
            frame = attach_team_stats(frame, stats_df, stat_names, slot, unused)
        for name in stat_names:
            frame[name + "Diff"] = frame[name + "_1"] - frame[name + "_2"]
        per_team_columns = [name + suffix for name in stat_names for suffix in ("_1", "_2")]
        return frame.drop(per_team_columns, axis=1)

    merged_df = features_df.copy()
    merged_df = add_differences(merged_df, ppg_df, ["PPG", "WPerc"], unused=["Games"])
    merged_df = add_differences(merged_df, eff_df, ["OffEff", "DefEff"])
    merged_df = add_differences(
        merged_df, rankings_df,
        ["MeanOrdinalRank", "MedianOrdinalRank", "MasseyOrdinalRank"],
    )

    return merged_df

## Create Training and Testing Sets

In [67]:
# Tournament matchups: every ID of the historical sample submission, with the
# actual result attached where the game was played
features_df = prepare_predictions(hist_sample_subm_df, m_ncaa_tourney_compact_results_df)
# Team-level season statistics, all computed from regular season data
ppg_df = get_ppg(m_regular_season_compact_results_df)
eff_df = get_efficiency(m_regular_season_detailed_results_df)
rankings_df = get_rankings(m_massey_ordinals_df)
tourney_df = merge_features(features_df, ppg_df, eff_df, rankings_df)

# Regular season matchups, described by the same difference features
train_features_df = prep_season(m_regular_season_compact_results_df)
reg_season_df = merge_features(train_features_df, ppg_df, eff_df, rankings_df)

## Split Training and Testing Sets

In [68]:
# Training set: regular season matchups
x_train = reg_season_df.loc[:, [
    "PPGDiff", "WPercDiff", "OffEffDiff", "DefEffDiff",
    "MeanOrdinalRankDiff", "MedianOrdinalRankDiff", "MasseyOrdinalRankDiff",
]]
y_train = reg_season_df.loc[:, ["Result"]]

# Testing set: tournament matchups without any missing value, which leaves
# only games that were actually played
tourney_actuals_df = tourney_df.dropna()
x_test = tourney_actuals_df.loc[:, [
    "PPGDiff", "WPercDiff", "OffEffDiff", "DefEffDiff",
    "MeanOrdinalRankDiff", "MedianOrdinalRankDiff", "MasseyOrdinalRankDiff",
]]
y_test = tourney_actuals_df.loc[:, ["Result"]]

## Train Model

In [116]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

# L2-penalised logistic regression on the matchup difference features
log_reg = LogisticRegression(penalty="l2", random_state=None, max_iter=1000, verbose=1, n_jobs=-1)

# Flatten the single-column `Result` frame into the 1-D target fit() expects
log_reg.fit(x_train, y_train.values.ravel())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   11.9s finished


LogisticRegression(max_iter=1000, n_jobs=-1, verbose=1)

## Predictions

In [176]:
# Probability of class label 1 for the tournament games that were played
y_pred = log_reg.predict_proba(x_test)[:, 1]

# `eps` was deprecated in scikit-learn 1.3 and later removed; the other
# keywords previously passed were already the defaults
print(log_loss(y_test, y_pred))

# Score every matchup of the sample submission, played or not, using the same
# feature columns (and order) the model was evaluated on
y_pred = log_reg.predict_proba(tourney_df[list(x_test.columns)])[:, 1]

stage_1_submission_df = tourney_df[["ID"]].copy()
stage_1_submission_df["Pred"] = y_pred
stage_1_submission_df.to_csv("brady_lange_m_submission_stage_01_log_reg.csv", index=False)

0.5652996256807145


In [122]:
from sklearn.linear_model import SGDClassifier

In [127]:
# Logistic regression fitted with stochastic gradient descent
# NOTE(review): newer scikit-learn releases spell this loss "log_loss" and no
# longer accept "log" - confirm against the installed version
sgd_clf = SGDClassifier(loss="log", verbose=1, n_jobs=-1)

# Flatten the single-column `Result` frame into the 1-D target fit() expects
sgd_clf.fit(x_train, y_train.values.ravel())

-- Epoch 1
Norm: 35.68, NNZs: 7, Bias: 14.035842, T: 53972, Avg. loss: 2808.358156
Total training time: 0.02 seconds.
-- Epoch 2
Norm: 30.08, NNZs: 7, Bias: 4.761078, T: 107944, Avg. loss: 481.632535
Total training time: 0.03 seconds.
-- Epoch 3
Norm: 11.17, NNZs: 7, Bias: -4.818560, T: 161916, Avg. loss: 285.561783
Total training time: 0.04 seconds.
-- Epoch 4
Norm: 17.99, NNZs: 7, Bias: 2.771133, T: 215888, Avg. loss: 201.189791
Total training time: 0.05 seconds.
-- Epoch 5
Norm: 12.56, NNZs: 7, Bias: -0.930281, T: 269860, Avg. loss: 158.951010
Total training time: 0.07 seconds.
-- Epoch 6
Norm: 10.71, NNZs: 7, Bias: 2.546443, T: 323832, Avg. loss: 128.685458
Total training time: 0.08 seconds.
-- Epoch 7
Norm: 10.87, NNZs: 7, Bias: 0.304693, T: 377804, Avg. loss: 110.124166
Total training time: 0.09 seconds.
-- Epoch 8
Norm: 10.88, NNZs: 7, Bias: 3.494769, T: 431776, Avg. loss: 93.701981
Total training time: 0.10 seconds.
-- Epoch 9
Norm: 10.40, NNZs: 7, Bias: 0.184492, T: 485748, Av

Norm: 4.91, NNZs: 7, Bias: -0.047234, T: 4155844, Avg. loss: 9.169844
Total training time: 0.74 seconds.
-- Epoch 78
Norm: 4.87, NNZs: 7, Bias: -0.023481, T: 4209816, Avg. loss: 9.102405
Total training time: 0.75 seconds.
-- Epoch 79
Norm: 4.82, NNZs: 7, Bias: 0.021268, T: 4263788, Avg. loss: 8.838970
Total training time: 0.76 seconds.
-- Epoch 80
Norm: 4.84, NNZs: 7, Bias: 0.014428, T: 4317760, Avg. loss: 8.834476
Total training time: 0.77 seconds.
-- Epoch 81
Norm: 4.77, NNZs: 7, Bias: 0.031370, T: 4371732, Avg. loss: 8.625005
Total training time: 0.78 seconds.
-- Epoch 82
Norm: 4.75, NNZs: 7, Bias: -0.062930, T: 4425704, Avg. loss: 8.588180
Total training time: 0.79 seconds.
-- Epoch 83
Norm: 4.72, NNZs: 7, Bias: 0.217286, T: 4479676, Avg. loss: 8.482059
Total training time: 0.80 seconds.
-- Epoch 84
Norm: 4.70, NNZs: 7, Bias: -0.019519, T: 4533648, Avg. loss: 8.414254
Total training time: 0.81 seconds.
-- Epoch 85
Norm: 4.64, NNZs: 7, Bias: 0.069910, T: 4587620, Avg. loss: 8.314755

-- Epoch 154
Norm: 3.19, NNZs: 7, Bias: 0.017965, T: 8311688, Avg. loss: 4.566859
Total training time: 1.50 seconds.
-- Epoch 155
Norm: 3.20, NNZs: 7, Bias: -0.015715, T: 8365660, Avg. loss: 4.533337
Total training time: 1.51 seconds.
-- Epoch 156
Norm: 3.19, NNZs: 7, Bias: -0.014961, T: 8419632, Avg. loss: 4.490514
Total training time: 1.52 seconds.
-- Epoch 157
Norm: 3.18, NNZs: 7, Bias: -0.051969, T: 8473604, Avg. loss: 4.522517
Total training time: 1.53 seconds.
-- Epoch 158
Norm: 3.16, NNZs: 7, Bias: -0.026923, T: 8527576, Avg. loss: 4.429396
Total training time: 1.54 seconds.
-- Epoch 159
Norm: 3.15, NNZs: 7, Bias: -0.086445, T: 8581548, Avg. loss: 4.392215
Total training time: 1.55 seconds.
-- Epoch 160
Norm: 3.13, NNZs: 7, Bias: 0.003064, T: 8635520, Avg. loss: 4.424744
Total training time: 1.56 seconds.
-- Epoch 161
Norm: 3.12, NNZs: 7, Bias: -0.053818, T: 8689492, Avg. loss: 4.376924
Total training time: 1.57 seconds.
-- Epoch 162
Norm: 3.10, NNZs: 7, Bias: -0.060027, T: 8743

Norm: 2.17, NNZs: 7, Bias: -0.002939, T: 12683420, Avg. loss: 2.958602
Total training time: 2.25 seconds.
-- Epoch 236
Norm: 2.16, NNZs: 7, Bias: -0.002388, T: 12737392, Avg. loss: 2.941771
Total training time: 2.26 seconds.
-- Epoch 237
Norm: 2.15, NNZs: 7, Bias: -0.033404, T: 12791364, Avg. loss: 2.980642
Total training time: 2.27 seconds.
-- Epoch 238
Norm: 2.13, NNZs: 7, Bias: -0.095895, T: 12845336, Avg. loss: 2.935477
Total training time: 2.28 seconds.
-- Epoch 239
Norm: 2.13, NNZs: 7, Bias: 0.005192, T: 12899308, Avg. loss: 2.937205
Total training time: 2.29 seconds.
-- Epoch 240
Norm: 2.17, NNZs: 7, Bias: 0.026283, T: 12953280, Avg. loss: 2.948477
Total training time: 2.30 seconds.
-- Epoch 241
Norm: 2.12, NNZs: 7, Bias: -0.057649, T: 13007252, Avg. loss: 2.914871
Total training time: 2.31 seconds.
-- Epoch 242
Norm: 2.10, NNZs: 7, Bias: -0.012374, T: 13061224, Avg. loss: 2.865812
Total training time: 2.31 seconds.
-- Epoch 243
Norm: 2.11, NNZs: 7, Bias: 0.000334, T: 13115196, 

Norm: 1.54, NNZs: 7, Bias: 0.023167, T: 17055152, Avg. loss: 2.252266
Total training time: 2.99 seconds.
-- Epoch 317
Norm: 1.54, NNZs: 7, Bias: 0.009842, T: 17109124, Avg. loss: 2.233893
Total training time: 3.00 seconds.
-- Epoch 318
Norm: 1.53, NNZs: 7, Bias: -0.040391, T: 17163096, Avg. loss: 2.222825
Total training time: 3.01 seconds.
-- Epoch 319
Norm: 1.52, NNZs: 7, Bias: -0.060243, T: 17217068, Avg. loss: 2.207343
Total training time: 3.02 seconds.
-- Epoch 320
Norm: 1.52, NNZs: 7, Bias: -0.010118, T: 17271040, Avg. loss: 2.201967
Total training time: 3.03 seconds.
-- Epoch 321
Norm: 1.51, NNZs: 7, Bias: 0.017694, T: 17325012, Avg. loss: 2.207915
Total training time: 3.04 seconds.
-- Epoch 322
Norm: 1.51, NNZs: 7, Bias: -0.005324, T: 17378984, Avg. loss: 2.186089
Total training time: 3.05 seconds.
-- Epoch 323
Norm: 1.50, NNZs: 7, Bias: -0.031470, T: 17432956, Avg. loss: 2.202434
Total training time: 3.06 seconds.
-- Epoch 324
Norm: 1.48, NNZs: 7, Bias: 0.005096, T: 17486928, A

SGDClassifier(loss='log', n_jobs=-1, verbose=1)

In [128]:
# Probability of class label 1 for the tournament games that were played
y_pred = sgd_clf.predict_proba(x_test)[:, 1]

# `eps` was deprecated in scikit-learn 1.3 and later removed; the other
# keywords previously passed were already the defaults
print(log_loss(y_test, y_pred))

0.6016407022548894


In [189]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with default settings
dt_clf = DecisionTreeClassifier()
# Flatten the single-column `Result` frame into the 1-D target fit() expects
dt_clf.fit(x_train, y_train.values.ravel())

DecisionTreeClassifier()

In [190]:
# Probability of class label 1 for the tournament games that were played
y_pred = dt_clf.predict_proba(x_test)[:, 1]

# `eps` was deprecated in scikit-learn 1.3 and later removed; the other
# keywords previously passed were already the defaults
print(log_loss(y_test, y_pred))

10.844571471111513


In [134]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default settings, trained on all available cores
rf_clf = RandomForestClassifier(n_jobs=-1)
# Flatten the single-column `Result` frame into the 1-D target fit() expects
rf_clf.fit(x_train, y_train.values.ravel())

RandomForestClassifier(n_jobs=-1)

In [137]:
# Probability of class label 1 for the tournament games that were played
y_pred = rf_clf.predict_proba(x_test)[:, 1]

# `eps` was deprecated in scikit-learn 1.3 and later removed; the other
# keywords previously passed were already the defaults
print(log_loss(y_test, y_pred))

# Feature importances, labelled with the columns the forest was trained on so
# the names cannot drift from the training data
pd.DataFrame(
    {
        "Importance": rf_clf.feature_importances_,
        "Features": list(x_train.columns),
    }
)

0.5816245446051252


Unnamed: 0,Importance,Features
0,0.09963,PPGDiff
1,0.131039,WPercDiff
2,0.097685,OffEffDiff
3,0.090546,DefEffDiff
4,0.217359,MeanOrdinalRankDiff
5,0.200532,MedianOrdinalRankDiff
6,0.163209,MasseyOrdinalRankDiff


In [141]:
from sklearn.svm import SVC

# Support vector classifier; probability=True is required for predict_proba
svc_clf = SVC(probability=True)
# Flatten the single-column `Result` frame into the 1-D target fit() expects
svc_clf.fit(x_train, y_train.values.ravel())

SVC(probability=True)

In [142]:
# Probability of class label 1 for the tournament games that were played
y_pred = svc_clf.predict_proba(x_test)[:, 1]

# `eps` was deprecated in scikit-learn 1.3 and later removed; the other
# keywords previously passed were already the defaults
print(log_loss(y_test, y_pred))

0.5588164514507269


In [167]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours using 1500 neighbours per prediction
knn_clf = KNeighborsClassifier(n_neighbors=1500, n_jobs=-1)
# Flatten the single-column `Result` frame into the 1-D target fit() expects
knn_clf.fit(x_train, y_train.values.ravel())

KNeighborsClassifier(n_jobs=-1, n_neighbors=1500)

In [175]:
# Probability of class label 1 for the tournament games that were played
y_pred = knn_clf.predict_proba(x_test)[:, 1]

# `eps` was deprecated in scikit-learn 1.3 and later removed; the other
# keywords previously passed were already the defaults
print(log_loss(y_test, y_pred))

# Score every matchup of the sample submission, played or not, using the same
# feature columns (and order) the model was evaluated on
y_pred = knn_clf.predict_proba(tourney_df[list(x_test.columns)])[:, 1]

stage_1_submission_df = tourney_df[["ID"]].copy()
stage_1_submission_df["Pred"] = y_pred
stage_1_submission_df.to_csv("brady_lange_m_submission_stage_01_knn.csv", index=False)

0.5542296317998373


In [178]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with logistic (sigmoid) hidden activations
mlp_clf = MLPClassifier(activation="logistic")
# Flatten the single-column `Result` frame into the 1-D target fit() expects
mlp_clf.fit(x_train, y_train.values.ravel())

MLPClassifier(activation='logistic')

In [179]:
# Probability of class label 1 for the tournament games that were played
y_pred = mlp_clf.predict_proba(x_test)[:, 1]

# `eps` was deprecated in scikit-learn 1.3 and later removed; the other
# keywords previously passed were already the defaults
print(log_loss(y_test, y_pred))

0.5621694191718448


In [180]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost ensemble with default settings
ada_boost_clf = AdaBoostClassifier()
# Flatten the single-column `Result` frame into the 1-D target fit() expects
ada_boost_clf.fit(x_train, y_train.values.ravel())

AdaBoostClassifier()

In [182]:
# Probability of class label 1 for the tournament games that were played
y_pred = ada_boost_clf.predict_proba(x_test)[:, 1]

# `eps` was deprecated in scikit-learn 1.3 and later removed; the other
# keywords previously passed were already the defaults
print(log_loss(y_test, y_pred))

0.6865761365486209


In [183]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings
gnb_clf = GaussianNB()
# Flatten the single-column `Result` frame into the 1-D target fit() expects
gnb_clf.fit(x_train, y_train.values.ravel())

GaussianNB()

In [184]:
# Probability of class label 1 for the tournament games that were played
y_pred = gnb_clf.predict_proba(x_test)[:, 1]

# `eps` was deprecated in scikit-learn 1.3 and later removed; the other
# keywords previously passed were already the defaults
print(log_loss(y_test, y_pred))

0.9086214524841149


In [185]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Quadratic discriminant analysis with default settings
qda_clf = QuadraticDiscriminantAnalysis()
# Flatten the single-column `Result` frame into the 1-D target fit() expects
qda_clf.fit(x_train, y_train.values.ravel())

QuadraticDiscriminantAnalysis()

In [186]:
# Probability of class label 1 for the tournament games that were played
y_pred = qda_clf.predict_proba(x_test)[:, 1]

# `eps` was deprecated in scikit-learn 1.3 and later removed; the other
# keywords previously passed were already the defaults
print(log_loss(y_test, y_pred))

0.5703967786426403
