# 2021 March Madness Bracket Predictor

### Predicts and generates a 2021 March Madness bracket.

Developers:
- Brady Lange (03/17/2021)

## Libraries

In [2]:
# Import standard libraries
import os
import pandas as pd
import numpy as np

# Configure settings
# pd.set_option("display.max.columns", None)
# pd.set_option("display.max.rows", None)
# pd.set_option("display.precision", 2)

## Configure Pathing

In [141]:
base_path = os.path.abspath("..")
data_path = os.path.join(base_path, "data")
output_path = os.path.join(data_path, "output_data")

m_subm_stage_1_output_path = os.path.join(output_path, "m_submission_stage_1.csv")
m_subm_stage_2_output_path = os.path.join(output_path, "m_submission_stage_2.csv")

data_file_paths = {}
exclude_dirs = ["initial_data", "ncaa_march_madness_2020_data", "MDataFiles_Stage1"]

for root, dirs, files in os.walk(data_path, topdown=True):
    # Exclude last year's challenge data
    dirs[:] = [d for d in dirs if d not in exclude_dirs]
    for file_name in files:
        if file_name.endswith(".csv"):
            data_file_paths[file_name[:-4]] = os.path.join(root, file_name)

## Load Data

In [91]:
# Basic data
m_ncaa_tourney_compact_results_df = pd.read_csv(data_file_paths["MNCAATourneyCompactResults"])
m_ncaa_tourney_seeds_df = pd.read_csv(data_file_paths["MNCAATourneySeeds"])
m_regular_season_compact_results_df = pd.read_csv(data_file_paths["MRegularSeasonCompactResults"])
m_seasons_df = pd.read_csv(data_file_paths["MSeasons"])
m_teams_df = pd.read_csv(data_file_paths["MTeams"])

# Team box scores data
m_ncaa_tourney_detailed_results_df = pd.read_csv(data_file_paths["MNCAATourneyDetailedResults"])
m_regular_season_detailed_results_df = pd.read_csv(data_file_paths["MRegularSeasonDetailedResults"])

# Geography data
cities_df = pd.read_csv(data_file_paths["Cities"])
m_game_cities_df = pd.read_csv(data_file_paths["MGameCities"])

# Public rankings data
m_massey_ordinals_df = pd.read_csv(data_file_paths["MMasseyOrdinals"])

# Supplemental data
conferences_df = pd.read_csv(data_file_paths["Conferences"])
m_conference_tourney_games_df = pd.read_csv(data_file_paths["MConferenceTourneyGames"])
m_ncaa_tourney_seed_round_slots_df = pd.read_csv(data_file_paths["MNCAATourneySeedRoundSlots"])
m_ncaa_tourney_slots_df = pd.read_csv(data_file_paths["MNCAATourneySlots"])
m_secondary_tourney_compact_results_df = pd.read_csv(data_file_paths["MSecondaryTourneyCompactResults"])
m_secondary_tourney_teams_df = pd.read_csv(data_file_paths["MSecondaryTourneyTeams"])
m_team_coaches_df = pd.read_csv(data_file_paths["MTeamCoaches"])
m_team_conferences_df = pd.read_csv(data_file_paths["MTeamConferences"])
# Windows codepage 1252 encoded file
m_team_spellings_df = pd.read_csv(data_file_paths["MTeamSpellings"], encoding="cp1252")

# External data
m_ncaa_bpi_21_df = pd.read_csv(data_file_paths["m_ncaa_bpi_2021"])

# Sample submission data
hist_sample_subm_df = pd.read_csv(data_file_paths["MSampleSubmissionStage1"])
sample_subm_df = pd.read_csv(data_file_paths["MSampleSubmissionStage2"])

## Explore Data

In [None]:
def explore_df(df, title="Data Frame"):
    """
    Explores a specified Pandas data frame by printing out all of it's metrics
    and information neatly.
    
    Args:
        df (pandas.DataFrame): Pandas Data Frame to explore.
        title (str): Title/name of data frame. Default is 'Data Frame'.
        
    Returns:
        None: Nothing.
    """
    print("======================================================================")
    print("{0}:".format(title))
    print("======================================================================")
    print("Data Type:")
    print("----------------------------------------------------------------------")
    print(type(df), "\n")
    print("First 5 Rows:")
    print("----------------------------------------------------------------------")
    print(df.head(), "\n")
    print("Last 5 Rows:")
    print("----------------------------------------------------------------------")
    print(df.tail(), "\n")
    print("Description:")
    print("----------------------------------------------------------------------")
    print(df.describe(include=np.object), "\n")
    print("Information:")
    print("----------------------------------------------------------------------")
    df.info()
    print("\nNumber of Rows & Columns (Rows, Columns):")
    print("----------------------------------------------------------------------")
    print(df.shape, "\n")
    print("Number of Rows:")
    print("----------------------------------------------------------------------")
    print(len(df), "\n")
    print("Number of Elements (Rows x Columns):")
    print("----------------------------------------------------------------------")
    print(df.size, "\n")
    print("Columns:")
    print("----------------------------------------------------------------------")
    print(df.columns, "\n")
    for column in df.columns:
        print("Column:")
        print("----------------------------------------------------------------------")
        print(column, "\n")
        print("'{0}' Value Counts:".format(column))
        print("----------------------------------------------------------------------")
        print(df[column].value_counts(), "\n")
        print("Minimum '{0}' Value:".format(column))
        print("----------------------------------------------------------------------")
        print(df[column].min(), "\n")
        print("Maximum '{0}' Value:".format(column))
        print("----------------------------------------------------------------------")
        print(df[column].max(), "\n")
    print("Null Values:")
    print("----------------------------------------------------------------------")
    print(df.isnull().sum(), "\n")

# Basic data
explore_df(df=m_ncaa_tourney_compact_results_df, title="Men's NCAA Tourney Compact Results")
explore_df(df=m_ncaa_tourney_seeds_df, title="Men's NCAA Tourney Seeds")
explore_df(df=m_regular_season_compact_results_df, title="Men's Regular Season Compact Results")
explore_df(df=m_seasons_df, title="Men's Seasons")
explore_df(df=m_teams_df, title="Men's Teams")

# Team box scores data
explore_df(df=m_ncaa_tourney_detailed_results_df, title="Men's NCAA Tourney Detailed Results")
explore_df(df=m_regular_season_detailed_results_df, title="Men's Regular Season Detailed Results")

# Geography data
explore_df(df=cities_df, title="Cities")
explore_df(df=m_game_cities_df, title="Men's Game Cities")

# Public rankings data
explore_df(df=m_massey_ordinals_df, title="Men's Massey Ordinals")

# Supplemental data
explore_df(df=conferences_df, title="Conferences")
explore_df(df=m_conference_tourney_games_df, title="Men's Conference Tourney Games")
explore_df(df=m_ncaa_tourney_seed_round_slots_df, title="Men's NCAA Tourney Seed Round Slots")
explore_df(df=m_ncaa_tourney_slots_df, title="Men's NCAA Tourney Slots")
explore_df(df=m_secondary_tourney_compact_results_df, title="Men's Secondary Tourney Compact Results")
explore_df(df=m_secondary_tourney_teams_df, title="Men's Secondary Tourney Teams")
explore_df(df=m_team_coaches_df, title="Men's Team Coaches")
explore_df(df=m_team_conferences_df, title="Men's Team Conferences")
explore_df(df=m_team_spellings_df, title="Men's Team Spellings")

all_game_compact_results_df = pd.concat([m_ncaa_tourney_compact_results_df, m_regular_season_compact_results_df])
all_game_detailed_results_df = pd.concat([m_ncaa_tourney_detailed_results_df, m_regular_season_detailed_results_df])

df = m_ncaa_tourney_compact_results_df.merge(
    m_teams_df[["TeamID", "TeamName"]], left_on="WTeamID", right_on="TeamID", validate="many_to_one"
).drop(
    "TeamID", axis=1
).rename(
    columns={"TeamName": "WTeamName"}
).merge(
    m_teams_df[["TeamID", "TeamName"]], left_on="LTeamID", right_on="TeamID"
).drop(
    "TeamID", axis=1
).rename(
    columns={"TeamName": "LTeamName"}
)
df

df = all_game_detailed_results_df.merge(
    m_teams_df[["TeamID", "TeamName"]], left_on="WTeamID", right_on="TeamID", validate="many_to_one"
).drop(
    "TeamID", axis=1
).rename(
    columns={"TeamName": "WTeamName"}
).merge(
    m_teams_df[["TeamID", "TeamName"]], left_on="LTeamID", right_on="TeamID"
).drop(
    "TeamID", axis=1
).rename(
    columns={"TeamName": "LTeamName"}
)
df

## Preprocess Data

In [92]:
m_ncaa_bpi_21_df["TEAM"] = m_ncaa_bpi_21_df["TEAM"].str.lower()

m_ncaa_bpi_21_df = pd.merge(m_ncaa_bpi_21_df, m_team_spellings_df, left_on="TEAM", right_on="TeamNameSpelling", how="inner")
m_ncaa_bpi_21_df.drop(["RPI RK", "TeamNameSpelling"], axis=1, inplace=True)

In [93]:
# GLOSSARY
# BPI RK: Rank in the Basketball Power Index (BPI) among all Division I teams.
# SOS RK: Strength of Schedule (SOS) rank among all Division I teams based on how a typical 25th ranked team would do against each team's schedule to date. SOS accounts for the game location, day's rest, travel distance, and high altitude in addition to opponent strength.
# SOR RK: Rank of Strength of Record (SOR) among all Division I teams. SOR reflects the chance a typical 25th ranked team would have team's record or better, given the schedule on a 0 to 100 scale, where 100 is best.
# RPI RK: Team's rank in official NCAA Ratings Percentage Index (RPI).
# https://www.espn.com/mens-college-basketball/bpi/_/view/overview/page/1

def get_bpi(bpi_df):
    bpi_df = bpi_df.copy()
    
    bpi_df.drop(["RK", "TEAM", "CONF", "W-L", "SOS RK", "SOR RK"], axis=1, inplace=True)
    
    bpi_df = bpi_df[["TeamID", "Season", "BPI RK"]].sort_values(by="TeamID").reset_index(drop=True)
    
    return bpi_df

def get_sos(sos_df):
    sos_df = sos_df.copy()
    
    sos_df.drop(["RK", "TEAM", "CONF", "W-L", "BPI RK", "SOR RK"], axis=1, inplace=True)
    
    sos_df = sos_df[["TeamID", "Season", "SOS RK"]].sort_values(by="TeamID").reset_index(drop=True)
    
    return sos_df

def get_sor(sor_df):
    sor_df = sor_df.copy()
    
    sor_df.drop(["RK", "TEAM", "CONF", "W-L", "BPI RK", "SOS RK"], axis=1, inplace=True)
    
    sor_df = sor_df[["TeamID", "Season", "SOR RK"]].sort_values(by="TeamID").reset_index(drop=True)
    
    return sor_df

## Feature Engineering

In [94]:
# Feature Engineering Ideas:
# - Feature engineer PPG, other metrics against opposing team per season [*PPG done]
# - Feature engineer PPG, other metrics against opposing team all-time [*PPG done]
# - Feature engineer data from people's bracket picks []
# - Feature engineer PPG, wins, other metrics against ranked opponents [*]
# - Feature engineer amount players with top stats for each team https://www.ncaa.com/stats/basketball-men/d1 []
# Features to consider:
# - https://www.sportsbettingdime.com/guides/strategy/7-attributes-of-march-madness-winners/
# - https://www.espn.com/mens-college-basketball/bpi

GAME_MINS = 40
OT_MINS = 5

def get_ppg(season_results_df):    
    season_results_df = season_results_df.copy()
    
    season_results_df["GameDuration"] = GAME_MINS + OT_MINS * season_results_df["NumOT"]
    
    season_results_df["WTeamPPG"] = (season_results_df["WScore"] / season_results_df["GameDuration"]) * GAME_MINS
    season_results_df["LTeamPPG"] = (season_results_df["LScore"] / season_results_df["GameDuration"]) * GAME_MINS
    
    w_team_ppg = season_results_df[["Season", "WTeamID", "WTeamPPG"]]
    w_team_ppg = w_team_ppg.groupby(["Season", "WTeamID"]).agg([("WPPG", "mean"), ("WGames", "count")])
    w_team_ppg.columns = w_team_ppg.columns.droplevel(0)
    w_team_ppg = w_team_ppg.reset_index()
    
    l_team_ppg = season_results_df[["Season", "LTeamID", "LTeamPPG"]]
    l_team_ppg = l_team_ppg.groupby(["Season", "LTeamID"]).agg([("LPPG", "mean"), ("LGames", "count")])
    l_team_ppg.columns = l_team_ppg.columns.droplevel(0)
    l_team_ppg = l_team_ppg.reset_index()
    
    ppg_df = pd.merge(w_team_ppg, l_team_ppg, left_on=["Season", "WTeamID"], right_on=["Season", "LTeamID"], how="outer")
    
    ppg_df = ppg_df.fillna({"WTeamID": ppg_df["LTeamID"], "WPPG": 0, "WGames": 0, "LTeamID": ppg_df["WTeamID"], "LPPG": 0, "LGames": 0})
    
    ppg_df["PPG"] = (ppg_df["WPPG"] * ppg_df["WGames"] + ppg_df["LPPG"] * ppg_df["LGames"]) / (ppg_df["WGames"] + ppg_df["LGames"])
    ppg_df["WPerc"] = ppg_df["WGames"] / (ppg_df["WGames"] + ppg_df["LGames"])
    ppg_df["Games"] = (ppg_df["WGames"] + ppg_df["LGames"]).astype(int)
    
    ppg_df["TeamID"] = ppg_df["WTeamID"].astype(int)
    
    ppg_df.drop(["WTeamID", "WPPG", "WGames", "LTeamID", "LPPG", "LGames"], axis=1, inplace=True)
    
    ppg_df = ppg_df[["TeamID", "Season", "PPG", "WPerc", "Games"]]
    
    return ppg_df
    
print(get_ppg(m_regular_season_compact_results_df))

def get_ppg_vs_team_season(season_results_df):
    season_results_df = season_results_df.copy()
    
    season_results_df["GameDuration"] = GAME_MINS + OT_MINS * season_results_df["NumOT"]
    
    season_results_df["WTeamScoreNorm"] = (season_results_df["WScore"] / season_results_df["GameDuration"]) * GAME_MINS
    season_results_df["LTeamScoreNorm"] = (season_results_df["LScore"] / season_results_df["GameDuration"]) * GAME_MINS
    
    team_vs_ppg = season_results_df[["Season", "WTeamID", "WTeamScoreNorm", "LTeamID", "LTeamScoreNorm"]]
    team_vs_ppg = team_vs_ppg.groupby(["Season", "WTeamID", "LTeamID"]).agg({"WTeamScoreNorm": [("WPPGAgainstTeam", "mean"), ("WGames", "count")], "LTeamScoreNorm": [("LPPGAgainstTeam", "mean")]})
    team_vs_ppg.columns = team_vs_ppg.columns.droplevel(0)
    team_vs_ppg = team_vs_ppg.reset_index()
    
    team_vs_ppg = pd.merge(team_vs_ppg, team_vs_ppg[["Season", "WTeamID", "LTeamID", "WGames"]], left_on=["Season", "WTeamID", "LTeamID"], right_on=["Season", "LTeamID", "WTeamID"], how="inner")
    team_vs_ppg.drop(["WTeamID_y", "LTeamID_y"], axis=1, inplace=True)
    team_vs_ppg = team_vs_ppg.rename(columns={"WTeamID_x": "WTeamID", "LTeamID_x": "LTeamID", "WGames_x": "WGamesAgainstTeam", "WGames_y": "WLGamesAgainstTeam"})

    team_vs_ppg["GamesAgainstTeam"] = (team_vs_ppg["WGamesAgainstTeam"] + team_vs_ppg["WLGamesAgainstTeam"]).astype(int)
    team_vs_ppg["WPercAgainstTeam"] = team_vs_ppg["WGamesAgainstTeam"] / team_vs_ppg["GamesAgainstTeam"]
    team_vs_ppg["WLPercAgainstTeam"] = team_vs_ppg["WLGamesAgainstTeam"] / team_vs_ppg["GamesAgainstTeam"]
    
    team_vs_ppg.drop(["WGamesAgainstTeam", "WLGamesAgainstTeam"], axis=1, inplace=True)
    
#     team_vs_ppg = team_vs_ppg[["WTeamID", "LTeamID", "Season", "WPPGAgainstTeam", "WPercAgainstTeam", "WLPercAgainstTeam" "LPPGAgainstTeam", GamesAgainstTeam"]]
    
    return team_vs_ppg

print(get_ppg_vs_team_season(m_regular_season_compact_results_df))

def get_ppg_vs_team_all_time(season_results_df):
    season_results_df = season_results_df.copy()
    
    season_results_df["GameDuration"] = GAME_MINS + OT_MINS * season_results_df["NumOT"]
    
    season_results_df["WTeamScoreNorm"] = (season_results_df["WScore"] / season_results_df["GameDuration"]) * GAME_MINS
    season_results_df["LTeamScoreNorm"] = (season_results_df["LScore"] / season_results_df["GameDuration"]) * GAME_MINS
    
    team_vs_ppg_all_tm = season_results_df[["Season", "WTeamID", "WTeamScoreNorm", "LTeamID", "LTeamScoreNorm"]]
    team_vs_ppg_all_tm = team_vs_ppg_all_tm.groupby(["WTeamID", "LTeamID"]).agg({"WTeamScoreNorm": [("WPPGAllTime", "mean"), ("WGamesAllTime", "count")], "LTeamScoreNorm": [("LPPGAllTime", "mean"), ("LGamesAllTime", "count")]})
    team_vs_ppg_all_tm.columns = team_vs_ppg_all_tm.columns.droplevel(0)
    team_vs_ppg_all_tm = team_vs_ppg_all_tm.reset_index()
    
    team_vs_ppg_all_tm = pd.merge(team_vs_ppg_all_tm, team_vs_ppg_all_tm, left_on=["WTeamID", "LTeamID"], right_on=["LTeamID", "WTeamID"], how="inner").reset_index(drop=True)
    team_vs_ppg_all_tm.drop(["WTeamID_y", "LTeamID_y"], axis=1, inplace=True)
    team_vs_ppg_all_tm = team_vs_ppg_all_tm.rename(
        columns={
            "WTeamID_x": "TeamID_1",
            "LTeamID_x": "TeamID_2",
            "WGamesAllTime_x": "WGamesAllTime_1",
            "LGamesAllTime_x": "LGamesAllTime_2",
            "WPPGAllTime_x": "WPPGAllTime_1",
            "LPPGAllTime_x": "LPPGAllTime_2",
            "WGamesAllTime_y": "WGamesAllTime_2",
            "LGamesAllTime_y": "LGamesAllTime_1",
            "WPPGAllTime_y": "WPPGAllTime_2",
            "LPPGAllTime_y": "LPPGAllTime_1"
        }
    )
    
    team_vs_ppg_all_tm.loc[
        team_vs_ppg_all_tm["TeamID_1"] > team_vs_ppg_all_tm["TeamID_2"],
        [
            "TeamID_1", "TeamID_2", "WGamesAllTime_1", "WGamesAllTime_2",
            "WPPGAllTime_1", "WPPGAllTime_2", "WGamesAllTime_2", "WGamesAllTime_1",
            "WPPGAllTime_2", "WPPGAllTime_1"
        ]
    ] = team_vs_ppg_all_tm.loc[
        team_vs_ppg_all_tm["TeamID_1"] > team_vs_ppg_all_tm["TeamID_2"],
        [
            "TeamID_2", "TeamID_1", "WGamesAllTime_2", "WGamesAllTime_1",
            "WPPGAllTime_2", "WPPGAllTime_1", "WGamesAllTime_1", "WGamesAllTime_2",
            "WPPGAllTime_1", "WPPGAllTime_2"
        ]
    ].values
    
    team_vs_ppg_all_tm[
        ["TeamID_1", "TeamID_2", "WGamesAllTime_1", "WGamesAllTime_2", "LGamesAllTime_1", "LGamesAllTime_2"]
    ] = team_vs_ppg_all_tm[
        ["TeamID_1", "TeamID_2", "WGamesAllTime_1", "WGamesAllTime_2", "LGamesAllTime_1", "LGamesAllTime_2"]
    ].astype(int)
    
    team_vs_ppg_all_tm["WLGamesAllTime"] = (team_vs_ppg_all_tm["WGamesAllTime_1"] + team_vs_ppg_all_tm["LGamesAllTime_1"]).astype(int)
    team_vs_ppg_all_tm["WPercAllTime_1"] = team_vs_ppg_all_tm["WGamesAllTime_1"] / team_vs_ppg_all_tm["WLGamesAllTime"]
    team_vs_ppg_all_tm["WPercAllTime_2"] = team_vs_ppg_all_tm["WGamesAllTime_2"] / team_vs_ppg_all_tm["WLGamesAllTime"]
    team_vs_ppg_all_tm["LPercAllTime_1"] = team_vs_ppg_all_tm["LGamesAllTime_1"] / team_vs_ppg_all_tm["WLGamesAllTime"]
    team_vs_ppg_all_tm["LPercAllTime_2"] = team_vs_ppg_all_tm["LGamesAllTime_2"] / team_vs_ppg_all_tm["WLGamesAllTime"]
    
    team_vs_ppg_all_tm = team_vs_ppg_all_tm[
        [
            "TeamID_1", "TeamID_2",
            "WPPGAllTime_1", "WGamesAllTime_1", "LPPGAllTime_1", "LGamesAllTime_1", "WPercAllTime_1", "LPercAllTime_1",
            "WPPGAllTime_2", "WGamesAllTime_2", "LPPGAllTime_2", "LGamesAllTime_2", "WPercAllTime_2", "LPercAllTime_2",
            "WLGamesAllTime"
        ]
    ]
    
    team_vs_ppg_all_tm.drop(
        ["WGamesAllTime_1", "WGamesAllTime_2", "LGamesAllTime_1", "LGamesAllTime_2"],
        axis=1,
        inplace=True
    )
    
    return team_vs_ppg_all_tm

print(get_ppg_vs_team_all_time(m_regular_season_compact_results_df))

def get_efficiency(season_results_df):
    season_results_df = season_results_df.copy()
    
    season_results_df["GameDuration"] = GAME_MINS + OT_MINS * season_results_df["NumOT"]
    
    # Winning teams
    # Winning offense
    season_results_df["WFGM"] = (season_results_df["WFGM"] / season_results_df["GameDuration"]) * GAME_MINS
    season_results_df["WFGA"] = (season_results_df["WFGA"] / season_results_df["GameDuration"]) * GAME_MINS
    season_results_df["WFGM3"] = (season_results_df["WFGM3"] / season_results_df["GameDuration"]) * GAME_MINS
    season_results_df["WFGA3"] = (season_results_df["WFGA3"] / season_results_df["GameDuration"]) * GAME_MINS
    season_results_df["WFTM"] = (season_results_df["WFTM"] / season_results_df["GameDuration"]) * GAME_MINS
    season_results_df["WFTA"] = (season_results_df["WFTA"] / season_results_df["GameDuration"]) * GAME_MINS
    season_results_df["WOR"] = (season_results_df["WOR"] / season_results_df["GameDuration"]) * GAME_MINS
    season_results_df["WAst"] = (season_results_df["WAst"] / season_results_df["GameDuration"]) * GAME_MINS
    season_results_df["WTO"] = (season_results_df["WTO"] / season_results_df["GameDuration"]) * GAME_MINS
    
    # Winning defense
    season_results_df["WDR"] = (season_results_df["WDR"] / season_results_df["GameDuration"]) * GAME_MINS
    season_results_df["WStl"] = (season_results_df["WStl"] / season_results_df["GameDuration"]) * GAME_MINS
    season_results_df["WBlk"] = (season_results_df["WBlk"] / season_results_df["GameDuration"]) * GAME_MINS
    
    # Winning offense & defense
    season_results_df["WPF"] = (season_results_df["WPF"] / season_results_df["GameDuration"]) * GAME_MINS
    
    # Losing teams
    # Losing offense
    season_results_df["LFGM"] = (season_results_df["LFGM"] / season_results_df["GameDuration"]) * GAME_MINS
    season_results_df["LFGA"] = (season_results_df["LFGA"] / season_results_df["GameDuration"]) * GAME_MINS
    season_results_df["LFGM3"] = (season_results_df["LFGM3"] / season_results_df["GameDuration"]) * GAME_MINS
    season_results_df["LFGA3"] = (season_results_df["LFGA3"] / season_results_df["GameDuration"]) * GAME_MINS
    season_results_df["LFTM"] = (season_results_df["LFTM"] / season_results_df["GameDuration"]) * GAME_MINS
    season_results_df["LFTA"] = (season_results_df["LFTA"] / season_results_df["GameDuration"]) * GAME_MINS
    season_results_df["LOR"] = (season_results_df["LOR"] / season_results_df["GameDuration"]) * GAME_MINS
    season_results_df["LAst"] = (season_results_df["LAst"] / season_results_df["GameDuration"]) * GAME_MINS
    season_results_df["LTO"] = (season_results_df["LTO"] / season_results_df["GameDuration"]) * GAME_MINS
    
    # Losing defense
    season_results_df["LDR"] = (season_results_df["LDR"] / season_results_df["GameDuration"]) * GAME_MINS
    season_results_df["LStl"] = (season_results_df["LStl"] / season_results_df["GameDuration"]) * GAME_MINS
    season_results_df["LBlk"] = (season_results_df["LBlk"] / season_results_df["GameDuration"]) * GAME_MINS
    
    # Losing offense & defense
    season_results_df["LPF"] = (season_results_df["LPF"] / season_results_df["GameDuration"]) * GAME_MINS
    
    season_results_df["WPointsMade"] = (2 * (season_results_df["WFGM"] - season_results_df["WFGM3"])) + (3 * season_results_df["WFGM3"]) + (1 * season_results_df["WFTM"])
    season_results_df["WPointsMissed"] = ((2 * (season_results_df["WFGA"] - season_results_df["WFGA3"])) + (3 * season_results_df["WFGA3"]) + (1 * season_results_df["WFTA"])) - season_results_df["WPointsMade"]
    
    season_results_df["LPointsMade"] = (2 * (season_results_df["LFGM"] - season_results_df["LFGM3"])) + (3 * season_results_df["LFGM3"]) + (1 * season_results_df["LFTM"])
    season_results_df["LPointsMissed"] = ((2 * (season_results_df["LFGA"] - season_results_df["LFGA3"])) + (3 * season_results_df["LFGA3"]) + (1 * season_results_df["LFTA"])) - season_results_df["LPointsMade"]
    
    season_results_df["WOffEff"] = season_results_df["WPointsMade"] + season_results_df["WAst"] + season_results_df["WOR"]
    season_results_df["WDefEff"] = season_results_df["LPointsMissed"] + season_results_df["LTO"] + season_results_df["WDR"] + season_results_df["WStl"] + season_results_df["WBlk"] + season_results_df["WPF"]
    
    season_results_df["LOffEff"] = season_results_df["LPointsMade"] + season_results_df["LAst"] + season_results_df["LOR"]
    season_results_df["LDefEff"] = season_results_df["WPointsMissed"] + season_results_df["WTO"] + season_results_df["LDR"] + season_results_df["LStl"] + season_results_df["LBlk"] + season_results_df["LPF"]
    
    w_eff_df = season_results_df[["WTeamID", "Season", "DayNum", "WOffEff", "WDefEff"]]
    w_eff_df = w_eff_df.rename(columns={"WTeamID": "TeamID", "WOffEff": "OffEff", "WDefEff": "DefEff"})
    
    l_eff_df = season_results_df[["LTeamID", "Season", "DayNum", "LOffEff", "LDefEff"]]
    l_eff_df = l_eff_df.rename(columns={"LTeamID": "TeamID", "LOffEff": "OffEff", "LDefEff": "DefEff"})
    
    eff_df = pd.concat([w_eff_df, l_eff_df]).sort_values(by=["Season", "DayNum"]).reset_index(drop=True)
    
    eff_df.drop(["DayNum"], axis=1, inplace=True)
    
    eff_df = eff_df.groupby(by=["Season", "TeamID"]).agg("mean").reset_index()
    
    return eff_df
    
print(get_efficiency(m_regular_season_detailed_results_df))

def get_rankings(rankings_df):
    rankings_df = rankings_df.copy()
    
    rankings_df = rankings_df[rankings_df["RankingDayNum"] == 133].reset_index(drop=True)
    
    mean_median_rankings_df = rankings_df.groupby(["Season", "TeamID"])[["OrdinalRank"]].agg([("MeanOrdinalRank", "mean"), ("MedianOrdinalRank", "median")])
    mean_median_rankings_df.columns = mean_median_rankings_df.columns.droplevel(0)
    mean_median_rankings_df = mean_median_rankings_df.reset_index()
    
    massey_rankings_df = rankings_df[rankings_df["SystemName"] == "MAS"].reset_index(drop=True)
    massey_rankings_df = massey_rankings_df.rename(columns={"OrdinalRank": "MasseyOrdinalRank"})
    massey_rankings_df = massey_rankings_df[["Season", "TeamID", "MasseyOrdinalRank"]].reset_index(drop=True)
    
    rankings_df = pd.merge(mean_median_rankings_df, massey_rankings_df, on=["Season", "TeamID"], how="left")
    
    return rankings_df
    
print(get_rankings(m_massey_ordinals_df))

       TeamID  Season        PPG     WPerc  Games
0        1102    1985  63.083333  0.208333     24
1        1103    1985  61.043478  0.391304     23
2        1104    1985  68.318519  0.700000     30
3        1106    1985  71.625000  0.416667     24
4        1108    1985  83.000000  0.760000     25
...       ...     ...        ...       ...    ...
11936    1312    2008  55.931034  0.000000     29
11937    1212    2013  49.678571  0.000000     28
11938    1212    2015  51.111111  0.000000     27
11939    1363    2015  52.265873  0.000000     28
11940    1152    2021  55.444444  0.000000      9

[11941 rows x 5 columns]
       Season  WTeamID  LTeamID  WPPGAgainstTeam  LPPGAgainstTeam  \
0        1985     1102     1140        82.000000        79.000000   
1        1985     1102     1218        58.000000        54.000000   
2        1985     1102     1461        52.000000        49.000000   
3        1985     1103     1122        80.000000        66.000000   
4        1985     1103     11

In [None]:
# rankings_df.sort_values(by=["Season", "MeanOrdinalRank"], ignore_index=True).loc[25:50, :]


def get_rankings_top_25(rankings_df, season_results_df):
    rankings_df = rankings_df.copy()
    season_results_df = season_results_df.copy()
    
#     rankings_df = rankings_df[rankings_df["RankingDayNum"] == 128].reset_index(drop=True)
    
#     rankings_df = rankings_df.groupby(["Season", "TeamID"])[["OrdinalRank"]].agg([("MeanOrdinalRank", "mean"), ("MedianOrdinalRank", "median")])
#     rankings_df.columns = rankings_df.columns.droplevel(0)
#     rankings_df = rankings_df.reset_index()
    
#     rankings_df = pd.merge(rankings_df, season_results_df, left_on=["Season", "TeamID"], right_on=["Season", "WTeamID"])
    
    ### Average the ranking day's ordinal values
    
    rankings_wk_1_df = rankings_df[rankings_df["RankingDayNum"] <= 7].reset_index(drop=True)
    season_wk_1_results_df = season_results_df[season_results_df["DayNum"] < 7].reset_index(drop=True)
    
    rankings_wk_1_df = rankings_wk_1_df.groupby(["Season", "TeamID"])[["OrdinalRank"]].agg([("MeanOrdinalRank", "mean"), ("MedianOrdinalRank", "median")])
    rankings_wk_1_df.columns = rankings_wk_1_df.columns.droplevel(0)
    rankings_wk_1_df = rankings_wk_1_df.reset_index()
    
    rankings_wk_1_df = pd.merge(rankings_wk_1_df, season_wk_1_results_df, left_on=["Season", "TeamID"], right_on=["Season", "WTeamID"], how="left")
    
    
    rankings_wk_2_df = rankings_df[(rankings_df["RankingDayNum"] > 7) & (rankings_df["RankingDayNum"] <= 14)].reset_index(drop=True)
    season_wk_2_results_df = season_results_df[(season_results_df["DayNum"] >= 7) & (season_results_df["DayNum"] < 14)].reset_index(drop=True)

    rankings_wk_2_df = rankings_wk_2_df.groupby(["Season", "TeamID"])[["OrdinalRank"]].agg([("MeanOrdinalRank", "mean"), ("MedianOrdinalRank", "median")])
    rankings_wk_2_df.columns = rankings_wk_2_df.columns.droplevel(0)
    rankings_wk_2_df = rankings_wk_2_df.reset_index()
    
    print(rankings_wk_2_df)
    
    rankings_df = pd.merge(rankings_wk_2_df, season_wk_2_results_df, left_on=["Season", "TeamID"], right_on=["Season", "WTeamID"], how="left")
    
#     rankings_df = rankings_df[rankings_df["MeanOrdinalRank"] < 26].reset_index(drop=True)

#     rankings_df = pd.merge(rankings_df, season_results_df, left_on=["Season", "TeamID"])
    
    
    
    
#     rankings_df = rankings_df[rankings_df["RankingDayNum"] == 133].reset_index(drop=True)
    
#     mean_median_rankings_df = rankings_df.groupby(["Season", "TeamID"])[["OrdinalRank"]].agg([("MeanOrdinalRank", "mean"), ("MedianOrdinalRank", "median")])
#     mean_median_rankings_df.columns = mean_median_rankings_df.columns.droplevel(0)
#     mean_median_rankings_df = mean_median_rankings_df.reset_index()
    
#     massey_rankings_df = rankings_df[rankings_df["SystemName"] == "MAS"].reset_index(drop=True)
#     massey_rankings_df = massey_rankings_df.rename(columns={"OrdinalRank": "MasseyOrdinalRank"})
#     massey_rankings_df = massey_rankings_df[["Season", "TeamID", "MasseyOrdinalRank"]].reset_index(drop=True)
    
#     rankings_df = pd.merge(mean_median_rankings_df, massey_rankings_df, on=["Season", "TeamID"], how="left")
    
    return rankings_df.sort_values(by=["Season", "MeanOrdinalRank"]).reset_index(drop=True)

get_rankings_top_25(m_massey_ordinals_df, m_regular_season_compact_results_df)
# m_massey_ordinals_df

# m_massey_ordinals_df.loc[:, "RankingDayNum"].value_counts().sort_index().index#[-20:]
# m_massey_ordinals_df.loc[:, "RankingDayNum"].value_counts().sort_index()[0:-1]
# len(m_massey_ordinals_df)
# m_regular_season_compact_results_df

## Prepare Training Dataset

In [95]:
def prep_season(season_results_df):
    season_results_df = season_results_df.copy()
    
    seasons = season_results_df["Season"]
    season_min_gt = seasons > 2014
    season_max_lt = seasons < 2020
    all_teams_df = season_results_df[season_min_gt & season_max_lt].reset_index(drop=True)
    
    w_teams_df = all_teams_df[["Season", "WTeamID", "LTeamID"]]
    w_teams_df = w_teams_df.rename(columns={"WTeamID": "TeamID_1", "LTeamID": "TeamID_2"})
    w_teams_df["Result"] = 1
    
    l_teams_df = all_teams_df[["Season", "WTeamID", "LTeamID"]]
    l_teams_df = l_teams_df.rename(columns={"WTeamID": "TeamID_2", "LTeamID": "TeamID_1"})
    l_teams_df["Result"] = 0
    
    all_teams_df = pd.concat([w_teams_df, l_teams_df]).reset_index(drop=True)
    
    all_teams_df["ID"] = all_teams_df["Season"].apply(str) + "_" + all_teams_df["TeamID_1"].apply(str) + "_" + all_teams_df["TeamID_2"].apply(str)
    
    all_teams_df = all_teams_df[["ID", "Season", "TeamID_1", "TeamID_2", "Result"]]
    
    return all_teams_df
    
print(prep_season(m_regular_season_compact_results_df))

                   ID  Season  TeamID_1  TeamID_2  Result
0      2015_1103_1420    2015      1103      1420       1
1      2015_1104_1406    2015      1104      1406       1
2      2015_1112_1291    2015      1112      1291       1
3      2015_1113_1152    2015      1113      1152       1
4      2015_1119_1102    2015      1119      1102       1
...               ...     ...       ...       ...     ...
53967  2019_1222_1153    2019      1222      1153       0
53968  2019_1426_1209    2019      1426      1209       0
53969  2019_1276_1277    2019      1276      1277       0
53970  2019_1382_1387    2019      1382      1387       0
53971  2019_1217_1463    2019      1217      1463       0

[53972 rows x 5 columns]


In [110]:
def prep_season(season_results_df):
    season_results_df = season_results_df.copy()
    
#     seasons = season_results_df["Season"]
#     season_min_gt = seasons > 2014
#     season_max_lt = seasons < 2020
    all_teams_df = season_results_df#[season_min_gt & season_max_lt].reset_index(drop=True)
    
    w_teams_df = all_teams_df[["Season", "WTeamID", "LTeamID"]]
    w_teams_df = w_teams_df.rename(columns={"WTeamID": "TeamID_1", "LTeamID": "TeamID_2"})
    w_teams_df["Result"] = 1
    
    l_teams_df = all_teams_df[["Season", "WTeamID", "LTeamID"]]
    l_teams_df = l_teams_df.rename(columns={"WTeamID": "TeamID_2", "LTeamID": "TeamID_1"})
    l_teams_df["Result"] = 0
    
    all_teams_df = pd.concat([w_teams_df, l_teams_df]).reset_index(drop=True)
    
    all_teams_df["ID"] = all_teams_df["Season"].apply(str) + "_" + all_teams_df["TeamID_1"].apply(str) + "_" + all_teams_df["TeamID_2"].apply(str)
    
    all_teams_df = all_teams_df[["ID", "Season", "TeamID_1", "TeamID_2", "Result"]]
    
    return all_teams_df
    
print(prep_season(m_regular_season_compact_results_df))

                    ID  Season  TeamID_1  TeamID_2  Result
0       1985_1228_1328    1985      1228      1328       1
1       1985_1106_1354    1985      1106      1354       1
2       1985_1112_1223    1985      1112      1223       1
3       1985_1165_1432    1985      1165      1432       1
4       1985_1192_1447    1985      1192      1447       1
...                ...     ...       ...       ...     ...
341471  2021_1433_1382    2021      1433      1382       0
341472  2021_1259_1159    2021      1259      1159       0
341473  2021_1261_1104    2021      1261      1104       0
341474  2021_1153_1222    2021      1153      1222       0
341475  2021_1326_1228    2021      1326      1228       0

[341476 rows x 5 columns]


## Prepare Predictions

In [96]:
def prepare_predictions(sample_output_df, ncaa_tourney_results_df):
    features_df = sample_output_df.copy()
    features_df["Season"] = features_df["ID"].apply(lambda row: int(row[:4]))
    features_df["TeamID_1"] = features_df["ID"].apply(lambda row: int(row[5:9]))
    features_df["TeamID_2"] = features_df["ID"].apply(lambda row: int(row[10:14]))
    features_df.drop(["Pred"], axis=1, inplace=True)
    
    ncaa_tourney_results_df = ncaa_tourney_results_df.copy()
    seasons = ncaa_tourney_results_df["Season"]
    season_min_gt = seasons > 2014
    season_max_lt = seasons < 2020
    all_teams_df = ncaa_tourney_results_df[season_min_gt & season_max_lt].reset_index(drop=True)
    
    w_teams_df = all_teams_df[["Season", "WTeamID", "LTeamID"]]
    w_teams_df = w_teams_df.rename(columns={"WTeamID": "TeamID_1", "LTeamID": "TeamID_2"})
    w_teams_df["Result"] = 1
    
    l_teams_df = all_teams_df[["Season", "WTeamID", "LTeamID"]]
    l_teams_df = l_teams_df.rename(columns={"WTeamID": "TeamID_2", "LTeamID": "TeamID_1"})
    l_teams_df["Result"] = 0
    
    all_teams_df = pd.concat([w_teams_df, l_teams_df]).reset_index(drop=True)
    
    features_df = features_df.merge(all_teams_df, on=["Season", "TeamID_1", "TeamID_2"], how="left")
    
    return features_df

## Merge Features

In [122]:
def merge_features(features_df, ppg_df, eff_df, rankings_df):
    features_df = features_df.copy()
    ppg_df = ppg_df.copy()
    eff_df = eff_df.copy()
    rankings_df = rankings_df.copy()
    
    w_ppg_df = ppg_df.rename(columns={"PPG": "PPG_1", "WPerc": "WPerc_1"})
    l_ppg_df = ppg_df.rename(columns={"PPG": "PPG_2", "WPerc": "WPerc_2"})
    
    features_df = pd.merge(features_df, w_ppg_df, left_on=["Season", "TeamID_1"], right_on=["Season", "TeamID"], how="left").reset_index(drop=True)
    features_df.drop(["TeamID", "Games"], axis=1, inplace=True)
    features_df = pd.merge(features_df, l_ppg_df, left_on=["Season", "TeamID_2"], right_on=["Season", "TeamID"], how="left").reset_index(drop=True)
    features_df.drop(["TeamID", "Games"], axis=1, inplace=True)
    
    w_eff_df = eff_df.rename(columns={"OffEff": "OffEff_1", "DefEff": "DefEff_1"})
    l_eff_df = eff_df.rename(columns={"OffEff": "OffEff_2", "DefEff": "DefEff_2"})
    
    features_df = pd.merge(features_df, w_eff_df, left_on=["Season", "TeamID_1"], right_on=["Season", "TeamID"], how="left")#.reset_index(drop=True)
    features_df.drop(["TeamID"], axis=1, inplace=True)
    features_df = pd.merge(features_df, l_eff_df, left_on=["Season", "TeamID_2"], right_on=["Season", "TeamID"], how="left")#.reset_index(drop=True)
    features_df.drop(["TeamID"], axis=1, inplace=True)
    
    features_df["PPGDiff"] = features_df["PPG_1"] - features_df["PPG_2"]
    features_df["WPercDiff"] = features_df["WPerc_1"] - features_df["WPerc_2"]
    features_df["OffEffDiff"] = features_df["OffEff_1"] - features_df["OffEff_2"]
    features_df["DefEffDiff"] = features_df["DefEff_1"] - features_df["DefEff_2"]
    features_df.drop(["PPG_1", "PPG_2", "WPerc_1", "WPerc_2", "OffEff_1", "OffEff_2", "DefEff_1", "DefEff_2"], axis=1, inplace=True)
    
    w_rankings_df = rankings_df.rename(columns={"MeanOrdinalRank": "MeanOrdinalRank_1", "MedianOrdinalRank": "MedianOrdinalRank_1", "MasseyOrdinalRank": "MasseyOrdinalRank_1"})
    l_rankings_df = rankings_df.rename(columns={"MeanOrdinalRank": "MeanOrdinalRank_2", "MedianOrdinalRank": "MedianOrdinalRank_2", "MasseyOrdinalRank": "MasseyOrdinalRank_2"})
    
    features_df = pd.merge(features_df, w_rankings_df, left_on=["Season", "TeamID_1"], right_on=["Season", "TeamID"], how="left").reset_index(drop=True)
    features_df.drop(["TeamID"], axis=1, inplace=True)
    features_df = pd.merge(features_df, l_rankings_df, left_on=["Season", "TeamID_2"], right_on=["Season", "TeamID"], how="left").reset_index(drop=True)
    features_df.drop(["TeamID"], axis=1, inplace=True)
    
    features_df["MeanOrdinalRankDiff"] = features_df["MeanOrdinalRank_1"] - features_df["MeanOrdinalRank_2"]
    features_df["MedianOrdinalRankDiff"] = features_df["MedianOrdinalRank_1"] - features_df["MedianOrdinalRank_2"]
    features_df["MasseyOrdinalRankDiff"] = features_df["MasseyOrdinalRank_1"] - features_df["MasseyOrdinalRank_2"]
    features_df.drop(["MeanOrdinalRank_1", "MeanOrdinalRank_2", "MedianOrdinalRank_1", "MedianOrdinalRank_2", "MasseyOrdinalRank_1", "MasseyOrdinalRank_2"], axis=1, inplace=True)
    
    bpi_df = get_bpi(m_ncaa_bpi_21_df)
    
    w_bpi_df = bpi_df.rename(columns={"BPI RK": "BPI RK_1"})
    l_bpi_df = bpi_df.rename(columns={"BPI RK": "BPI RK_2"})
    
    features_df = pd.merge(features_df, w_bpi_df, left_on=["Season", "TeamID_1"], right_on=["Season", "TeamID"], how="left").reset_index(drop=True)
    features_df.drop(["TeamID"], axis=1, inplace=True)
    features_df = pd.merge(features_df, l_bpi_df, left_on=["Season", "TeamID_2"], right_on=["Season", "TeamID"], how="left").reset_index(drop=True)
    features_df.drop(["TeamID"], axis=1, inplace=True)
    
    sos_df = get_sos(m_ncaa_bpi_21_df)
    
    w_sos_df = sos_df.rename(columns={"SOS RK": "SOS RK_1"})
    l_sos_df = sos_df.rename(columns={"SOS RK": "SOS RK_2"})
    
    features_df = pd.merge(features_df, w_sos_df, left_on=["Season", "TeamID_1"], right_on=["Season", "TeamID"], how="left").reset_index(drop=True)
    features_df.drop(["TeamID"], axis=1, inplace=True)
    features_df = pd.merge(features_df, l_sos_df, left_on=["Season", "TeamID_2"], right_on=["Season", "TeamID"], how="left").reset_index(drop=True)
    features_df.drop(["TeamID"], axis=1, inplace=True)
    
    sor_df = get_sor(m_ncaa_bpi_21_df)
    
    w_sor_df = sor_df.rename(columns={"SOR RK": "SOR RK_1"})
    l_sor_df = sor_df.rename(columns={"SOR RK": "SOR RK_2"})
    
    features_df = pd.merge(features_df, w_sor_df, left_on=["Season", "TeamID_1"], right_on=["Season", "TeamID"], how="left").reset_index(drop=True)
    features_df.drop(["TeamID"], axis=1, inplace=True)
    features_df = pd.merge(features_df, l_sor_df, left_on=["Season", "TeamID_2"], right_on=["Season", "TeamID"], how="left").reset_index(drop=True)
    features_df.drop(["TeamID"], axis=1, inplace=True)
    
    features_df["BPI RKDiff"] = features_df["BPI RK_1"] - features_df["BPI RK_2"]
    features_df["SOS RKDiff"] = features_df["SOS RK_1"] - features_df["SOS RK_2"]
    features_df["SOR RKDiff"] = features_df["SOR RK_1"] - features_df["SOR RK_2"]
    
    features_df.drop(["BPI RK_1", "BPI RK_2", "SOS RK_1", "SOS RK_2", "SOR RK_1", "SOR RK_2"], axis=1, inplace=True)
    
    return features_df

## Create Training and Testing Sets

In [111]:
features_df = prepare_predictions(hist_sample_subm_df, m_ncaa_tourney_compact_results_df)
ppg_df = get_ppg(m_regular_season_compact_results_df)
# ppg_vs_team_df = get_ppg_vs_team(m_regular_season_compact_results_df)
# ppg_vs_team_all_time_df = get_ppg_vs_team_all_time(m_regular_season_compact_results_df)
eff_df = get_efficiency(m_regular_season_detailed_results_df)
rankings_df = get_rankings(m_massey_ordinals_df)
tourney_df = merge_features(features_df, ppg_df, eff_df, rankings_df)

train_features_df = prep_season(m_regular_season_compact_results_df)
reg_season_df = merge_features(train_features_df, ppg_df, eff_df, rankings_df)

In [114]:
reg_season_df[reg_season_df["BPI RK_1"].isna()]
tourney_df[tourney_df["BPI RK_1"].isna()]

Unnamed: 0,ID,Season,TeamID_1,TeamID_2,Result,PPGDiff,WPercDiff,OffEffDiff,DefEffDiff,MeanOrdinalRankDiff,MedianOrdinalRankDiff,MasseyOrdinalRankDiff,BPI RK_1,BPI RK_2,SOS RK_1,SOS RK_2,SOR RK_1,SOR RK_2
0,2015_1107_1112,2015,1107,1112,,-10.978962,-0.161765,-15.602737,-13.001021,114.669633,118.0,90.0,,,,,,
1,2015_1107_1116,2015,1107,1116,,-12.266544,-0.014706,-20.880515,-15.938930,96.572859,100.0,81.0,,,,,,
2,2015_1107_1124,2015,1107,1124,,-3.444444,0.031250,-11.822917,-12.343750,104.701891,108.0,79.0,,,,,,
3,2015_1107_1125,2015,1107,1125,,-8.527666,0.072581,-12.703517,-8.132056,-10.844828,-12.0,-11.0,,,,,,
4,2015_1107_1129,2015,1107,1129,,-3.360641,0.008065,-2.926456,-2.106250,72.179310,74.5,42.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11385,2019_1449_1459,2019,1449,1459,,-11.376471,-0.101961,-15.229412,6.939216,17.703591,19.0,30.0,,,,,,
11386,2019_1449_1463,2019,1449,1463,,-11.069328,0.014706,-15.600840,-10.615546,-36.156250,-34.5,-32.0,,,,,,
11387,2019_1458_1459,2019,1458,1459,,-12.139394,-0.169697,-16.230303,5.339394,-4.492537,-5.0,7.0,,,,,,
11388,2019_1458_1463,2019,1458,1463,,-11.832251,-0.053030,-16.601732,-12.215368,-58.352379,-58.5,-55.0,,,,,,


## Split Training and Testing Sets

In [99]:
x_train = reg_season_df[[
    "PPGDiff", "WPercDiff", "OffEffDiff", "DefEffDiff",
    "MeanOrdinalRankDiff","MedianOrdinalRankDiff", "MasseyOrdinalRankDiff"
]]
y_train = reg_season_df[["Result"]]

tourney_actuals_df = tourney_df.copy().dropna()
x_test = tourney_actuals_df[[
    "PPGDiff", "WPercDiff", "OffEffDiff", "DefEffDiff",
    "MeanOrdinalRankDiff","MedianOrdinalRankDiff", "MasseyOrdinalRankDiff"
]]
y_test = tourney_actuals_df[["Result"]]

## Logistic Regression - Train Model

In [100]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

log_reg = LogisticRegression(penalty="l2", random_state=None, max_iter=1000, verbose=1, n_jobs=-1)

log_reg.fit(x_train, y_train.values.ravel())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    3.4s finished


LogisticRegression(max_iter=1000, n_jobs=-1, verbose=1)

## Logistic Regression - Predictions

In [101]:
y_pred = log_reg.predict_proba(x_test)
y_pred = y_pred[:, 1]

print(log_loss(y_test, y_pred, eps=1e-15, normalize=True, sample_weight=None, labels=None))

# y_pred = log_reg.predict_proba(
#     tourney_df[[
#         "PPGDiff", "WPercDiff", "OffEffDiff", "DefEffDiff",
#         "MeanOrdinalRankDiff","MedianOrdinalRankDiff", "MasseyOrdinalRankDiff"
#     ]]
# )
# y_pred = y_pred[:, 1]

# stage_1_submission_df = tourney_df[["ID"]].copy()
# stage_1_submission_df["Pred"] = y_pred
# stage_1_submission_df.to_csv("brady_lange_m_submission_stage_01_log_reg.csv", index=False)

ValueError: Found array with 0 sample(s) (shape=(0, 7)) while a minimum of 1 is required.

## Stochastic Gradient Descent - Train Model

In [None]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(loss="log", verbose=1, n_jobs=-1)

sgd_clf.fit(x_train, y_train.values.ravel())

## Stochastic Gradient Descent - Predictions

In [None]:
y_pred = sgd_clf.predict_proba(x_test)
y_pred = y_pred[:, 1]

print(log_loss(y_test, y_pred, eps=1e-15, normalize=True, sample_weight=None, labels=None))

## Gradient Boosting - Model Training

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

gb_clf = GradientBoostingClassifier(
    loss="deviance",
    learning_rate=0.1,
    n_estimators=500,
    subsample=0.5,
    criterion="friedman_mse",
    verbose=1
)

gb_clf.fit(x_train, y_train.values.ravel())

## Gradient Boosting - Predictions

In [None]:
y_pred = gb_clf.predict_proba(x_test)
y_pred = y_pred[:, 1]

print(log_loss(y_test, y_pred, eps=1e-15, normalize=True, sample_weight=None, labels=None))

y_pred = knn_clf.predict_proba(
    tourney_df[[
        "PPGDiff", "WPercDiff", "OffEffDiff", "DefEffDiff",
        "MeanOrdinalRankDiff","MedianOrdinalRankDiff", "MasseyOrdinalRankDiff"
    ]]
)
y_pred = y_pred[:, 1]

stage_1_submission_df = tourney_df[["ID"]].copy()
stage_1_submission_df["Pred"] = y_pred
stage_1_submission_df.to_csv("brady_lange_m_submission_stage_01_gb_clf.csv", index=False)

## Decision Tree - Train Model

In [None]:
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier()

dt_clf.fit(x_train, y_train.values.ravel())

## Decision Tree - Predictions

In [None]:
y_pred = dt_clf.predict_proba(x_test)
y_pred = y_pred[:, 1]

print(log_loss(y_test, y_pred, eps=1e-15, normalize=True, sample_weight=None, labels=None))

## Random Forest - Train Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_jobs=-1)

rf_clf.fit(x_train, y_train.values.ravel())

## Random Forest - Predictions

In [None]:
y_pred = rf_clf.predict_proba(x_test)
y_pred = y_pred[:, 1]

print(log_loss(y_test, y_pred, eps=1e-15, normalize=True, sample_weight=None, labels=None))

pd.DataFrame(
    {
        "Importance": rf_clf.feature_importances_,
        "Features": [
            "PPGDiff", "WPercDiff", "OffEffDiff", "DefEffDiff",
            "MeanOrdinalRankDiff","MedianOrdinalRankDiff", "MasseyOrdinalRankDiff"
        ]
    }
)

## Support Vector Classification - Train Model

In [None]:
from sklearn.svm import SVC

svc_clf = SVC(probability=True)

svc_clf.fit(x_train, y_train.values.ravel())

## Support Vector Classification - Predictions

In [None]:
y_pred = svc_clf.predict_proba(x_test)
y_pred = y_pred[:, 1]

print(log_loss(y_test, y_pred, eps=1e-15, normalize=True, sample_weight=None, labels=None))

## K Nearest Neighbors - Train Model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_neighbors=1500, n_jobs=-1)

knn_clf.fit(x_train, y_train.values.ravel())

## K Nearest Neighbors - Predictions

In [None]:
y_pred = knn_clf.predict_proba(x_test)
y_pred = y_pred[:, 1]

print(log_loss(y_test, y_pred, eps=1e-15, normalize=True, sample_weight=None, labels=None))

y_pred = knn_clf.predict_proba(
    tourney_df[[
        "PPGDiff", "WPercDiff", "OffEffDiff", "DefEffDiff",
        "MeanOrdinalRankDiff","MedianOrdinalRankDiff", "MasseyOrdinalRankDiff"
    ]]
)
y_pred = y_pred[:, 1]

stage_1_submission_df = tourney_df[["ID"]].copy()
stage_1_submission_df["Pred"] = y_pred
stage_1_submission_df.to_csv("brady_lange_m_submission_stage_01_knn.csv", index=False)

## Multilayer Perceptron - Train Model

In [None]:
from sklearn.neural_network import MLPClassifier

mlp_clf = MLPClassifier(activation="logistic")

mlp_clf.fit(x_train, y_train.values.ravel())

## Multilayer Perceptron - Predictions

In [None]:
y_pred = mlp_clf.predict_proba(x_test)
y_pred = y_pred[:, 1]

print(log_loss(y_test, y_pred, eps=1e-15, normalize=True, sample_weight=None, labels=None))

## AdaBoost - Train Model

In [None]:
from sklearn.ensemble import AdaBoostClassifier

ada_boost_clf = AdaBoostClassifier()

ada_boost_clf.fit(x_train, y_train.values.ravel())

## AdaBoost - Predictions

In [None]:
y_pred = ada_boost_clf.predict_proba(x_test)
y_pred = y_pred[:, 1]

print(log_loss(y_test, y_pred, eps=1e-15, normalize=True, sample_weight=None, labels=None))

## Gausian Naive Bayes - Train Model

In [None]:
from sklearn.naive_bayes import GaussianNB

gnb_clf = GaussianNB()

gnb_clf.fit(x_train, y_train.values.ravel())

## Gausian Naive Bayes - Predictions

In [None]:
y_pred = gnb_clf.predict_proba(x_test)
y_pred = y_pred[:, 1]

print(log_loss(y_test, y_pred, eps=1e-15, normalize=True, sample_weight=None, labels=None))

## Quadratic Discriminant Analysis - Train Model

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

qda_clf = QuadraticDiscriminantAnalysis()

qda_clf.fit(x_train, y_train.values.ravel())

## Quadratic Discriminant Analysis - Predictions

In [None]:
y_pred = qda_clf.predict_proba(x_test)
y_pred = y_pred[:, 1]

print(log_loss(y_test, y_pred, eps=1e-15, normalize=True, sample_weight=None, labels=None))

## Random Numbers

In [None]:
from random import random

y_pred_rand = [random() for item in range(0, 335)]

print(log_loss(y_test, y_pred_rand, eps=1e-15, normalize=True, sample_weight=None, labels=None))

## 2021 March Madness All Possible Matchups Predictions

In [None]:
# https://www.ncaa.com/news/basketball-men/article/2021-03-12/2021-bracketology-march-madness-predictions-andy-katz

# mock_mm_2021_matchups = [
#     "Gonzaga", "Prairie View/North Carolina A&T",
#     "Michigan", "Bryant/Hartford",
#     "Baylor", "Drexel",
#     "Illinois", "Nicholls State",
#     "San Diego State", "Loyola Chicago",
#     "BYU", "North Carolina",
#     "Missouri", "Rutgers",
#     "UCLA", "Florida",
#     "Oklahoma", "Colorado State/Syracuse",
#     "Virginia", "Wichita State",
#     "USC", "Toledo",
#     "Texas Tech", "Western Kentucky",
#     "Florida State", "Winthrop",
#     "Texas", "Morehead State",
#     "Oklahoma State", "UC Santa Barbara",
#     "Purdue", "Liberty",
#     "Creighton", "St. Bonaventure",
#     "Oregon", "Drake/Boise State",
#     "Clemson", "VCU",
#     "Colorado", "Louisville",
#     "Tennessee", "Georgia Tech",
#     "Virginia Tech", "Michigan State",
#     "UConn", "Maryland",
#     "Wisconsin", "LSU",
#     "Ohio State", "Grand Canyon",
#     "Alabama", "Appalachian State",
#     "Iowa", "Siena",
#     "Houston", "Cleveland State"
# ]

mock_mm_2021_matchups = [
    "Gonzaga", "Prairie View",
    "Michigan", "Bryant",
    "Baylor", "Drexel",
    "Illinois", "Nicholls State",
    "San Diego State", "Loyola Chicago",
    "BYU", "North Carolina",
    "Missouri", "Rutgers",
    "UCLA", "Florida",
    "Oklahoma", "Colorado State",
    "Virginia", "Wichita State",
    "USC", "Toledo",
    "Texas Tech", "Western Kentucky",
    "Florida State", "Winthrop",
    "Texas", "Morehead State",
    "Oklahoma State", "UC Santa Barbara",
    "Purdue", "Liberty",
    "Creighton", "St. Bonaventure",
    "Oregon", "Drake",
    "Clemson", "VCU",
    "Colorado", "Louisville",
    "Tennessee", "Georgia Tech",
    "Virginia Tech", "Michigan State",
    "UConn", "Maryland",
    "Wisconsin", "LSU",
    "Ohio State", "Grand Canyon",
    "Alabama", "Appalachian State",
    "Iowa", "Siena",
    "Houston", "Cleveland State"
]

# mm_2021_matchups = [
#     "Gonzaga", "Norfolk St/Appalachian State",
#     "Oklahoma", "Missouri",
#     "Virginia", "Ohio",
#     "Creighton", "UC Santa Barbara",
#     "Kansas", "Eastern Washington",
#     "USC", "Wichita State/Drake",
#     "Oregon", "VCU",
#     "Iowa", "Grand Canyon",
#     "Baylor", "Hartford",
#     "North Carolina", "Wisconsin",
#     "Purdue", "North Texas",
#     "Villanova", "Winthrop",
#     "Arkansas", "Colgate",
#     "Texas Tech", "Utah State",
#     "Florida", "Virginia Tech",
#     "Ohio State", "Oral Roberts",
#     "Illinois", "Drexel",
#     "Loyola-Chicago", "Georgia Tech",
#     "Oklahoma State", "Liberty",
#     "Tennessee", "Oregon State",
#     "West Virginia", "Morehead State",
#     "San Diego State", "Syracuse",
#     "Clemson", "Rutgers",
#     "Houston", "Cleveland State",
#     "Michigan", "Mount St. Mary's/Texas Southern",
#     "LSU", "St. Bonaventure",
#     "Florida State", "UNC Greensboro",
#     "Colorado", "Georgetown",
#     "Texas", "Abilene Christian",
#     "BYU", "Michigan State/UCLA",
#     "UConn", "Maryland",
#     "Alabama", "Iona"
# ]

mm_2021_matchups = [
    "Gonzaga", "Appalachian State",
    "Oklahoma", "Missouri",
    "Virginia", "Ohio",
    "Creighton", "UC Santa Barbara",
    "Kansas", "Eastern Washington",
    "USC", "Drake",
    "Oregon", "VCU",
    "Iowa", "Grand Canyon",
    "Baylor", "Hartford",
    "North Carolina", "Wisconsin",
    "Purdue", "North Texas",
    "Villanova", "Winthrop",
    "Arkansas", "Colgate",
    "Texas Tech", "Utah State",
    "Florida", "Virginia Tech",
    "Ohio State", "Oral Roberts",
    "Illinois", "Drexel",
    "Loyola-Chicago", "Georgia Tech",
    "Oklahoma State", "Liberty",
    "Tennessee", "Oregon State",
    "West Virginia", "Morehead State",
    "San Diego State", "Syracuse",
    "Clemson", "Rutgers",
    "Houston", "Cleveland State",
    "Michigan", "Texas Southern",
    "LSU", "St. Bonaventure",
    "Florida State", "UNC Greensboro",
    "Colorado", "Georgetown",
    "Texas", "Abilene Christian",
    "BYU", "Michigan State",
    "UConn", "Maryland",
    "Alabama", "Iona"
]

mock_mm_2021_matchups = [team.lower() for team in mock_mm_2021_matchups]
mm_2021_matchups = [team.lower() for team in mm_2021_matchups]

In [None]:
print(len(mock_mm_2021_matchups))
print(len(m_teams_df[m_teams_df["TeamName"].isin(mock_mm_2021_matchups)]))

mock_matchups_df = pd.DataFrame({"TeamName": mock_mm_2021_matchups})
mock_matchups_df = pd.merge(mock_matchups_df, m_team_spellings_df, left_on="TeamName", right_on="TeamNameSpelling", how="left").reset_index(drop=True)
mock_matchups_df.drop(["TeamNameSpelling"], axis=1, inplace=True)
mock_matchups_df

print(len(mm_2021_matchups))
print(len(m_teams_df[m_teams_df["TeamName"].isin(mm_2021_matchups)]))

matchups_df = pd.DataFrame({"TeamName": mm_2021_matchups})
matchups_df = pd.merge(matchups_df, m_team_spellings_df, left_on="TeamName", right_on="TeamNameSpelling", how="left").reset_index(drop=True)
matchups_df.drop(["TeamNameSpelling"], axis=1, inplace=True)
matchups_df

In [73]:
def prep2021(matchups_df, ppg_df, eff_df, rankings_df):
    matchups_df = pd.DataFrame(matchups_df.values.reshape(-1, 4), columns=["TeamName_1", "TeamID_1", "TeamName_2", "TeamID_2"])
    
    matchups_df.loc[
        matchups_df["TeamID_1"] > matchups_df["TeamID_2"], ["TeamID_1", "TeamID_2", "TeamName_1", "TeamName_2"]
    ] = matchups_df.loc[
        matchups_df["TeamID_1"] > matchups_df["TeamID_2"], ["TeamID_2", "TeamID_1", "TeamName_2", "TeamName_1"]
    ].values
    
    matchups_df["Season"] = 2021
    matchups_df["ID"] = matchups_df["Season"].apply(str) + "_" + matchups_df["TeamID_1"].apply(str) + "_" + matchups_df["TeamID_2"].apply(str)
    
    matchups_df = merge_features(matchups_df, ppg_df, eff_df, rankings_df)
    
    return matchups_df

In [116]:
def prep_season_2021(season_results_df):
    season_results_df = season_results_df.copy()
    
    seasons = season_results_df["Season"]
    all_teams_df = season_results_df[seasons == 2021].reset_index(drop=True)
    
    w_teams_df = all_teams_df[["Season", "WTeamID", "LTeamID"]]
    w_teams_df = w_teams_df.rename(columns={"WTeamID": "TeamID_1", "LTeamID": "TeamID_2"})
    w_teams_df["Result"] = 1
    
    l_teams_df = all_teams_df[["Season", "WTeamID", "LTeamID"]]
    l_teams_df = l_teams_df.rename(columns={"WTeamID": "TeamID_2", "LTeamID": "TeamID_1"})
    l_teams_df["Result"] = 0
    
    all_teams_df = pd.concat([w_teams_df, l_teams_df]).reset_index(drop=True)
    
    all_teams_df["ID"] = all_teams_df["Season"].apply(str) + "_" + all_teams_df["TeamID_1"].apply(str) + "_" + all_teams_df["TeamID_2"].apply(str)
    
    all_teams_df = all_teams_df[["ID", "Season", "TeamID_1", "TeamID_2", "Result"]]
    
    return all_teams_df
    
print(prep_season_2021(m_regular_season_compact_results_df))

                  ID  Season  TeamID_1  TeamID_2  Result
0     2021_1101_1190    2021      1101      1190       1
1     2021_1291_1288    2021      1291      1288       1
2     2021_1298_1203    2021      1298      1203       1
3     2021_1462_1324    2021      1462      1324       1
4     2021_1441_1423    2021      1441      1423       1
...              ...     ...       ...       ...     ...
7711  2021_1433_1382    2021      1433      1382       0
7712  2021_1259_1159    2021      1259      1159       0
7713  2021_1261_1104    2021      1261      1104       0
7714  2021_1153_1222    2021      1153      1222       0
7715  2021_1326_1228    2021      1326      1228       0

[7716 rows x 5 columns]


In [81]:
def prep_predictions_2021(sample_output_df, ncaa_tourney_results_df):
    features_df = sample_output_df.copy()
    features_df["Season"] = features_df["ID"].apply(lambda row: int(row[:4]))
    features_df["TeamID_1"] = features_df["ID"].apply(lambda row: int(row[5:9]))
    features_df["TeamID_2"] = features_df["ID"].apply(lambda row: int(row[10:14]))
    features_df.drop(["Pred"], axis=1, inplace=True)
    
    ncaa_tourney_results_df = ncaa_tourney_results_df.copy()
    seasons = ncaa_tourney_results_df["Season"]
    all_teams_df = ncaa_tourney_results_df[seasons == 2021].reset_index(drop=True)
    
    w_teams_df = all_teams_df[["Season", "WTeamID", "LTeamID"]]
    w_teams_df = w_teams_df.rename(columns={"WTeamID": "TeamID_1", "LTeamID": "TeamID_2"})
    w_teams_df["Result"] = 1
    
    l_teams_df = all_teams_df[["Season", "WTeamID", "LTeamID"]]
    l_teams_df = l_teams_df.rename(columns={"WTeamID": "TeamID_2", "LTeamID": "TeamID_1"})
    l_teams_df["Result"] = 0
    
    all_teams_df = pd.concat([w_teams_df, l_teams_df]).reset_index(drop=True)
    
    features_df = features_df.merge(all_teams_df, on=["Season", "TeamID_1", "TeamID_2"], how="left")
    
    return features_df

In [74]:
ppg_df = get_ppg(m_regular_season_compact_results_df)
eff_df = get_efficiency(m_regular_season_detailed_results_df)
rankings_df = get_rankings(m_massey_ordinals_df)

In [82]:
prep_predictions_2021(sample_subm_df, m_ncaa_tourney_compact_results_df)

Unnamed: 0,ID,Season,TeamID_1,TeamID_2,Result
0,2021_1101_1104,2021,1101,1104,
1,2021_1101_1111,2021,1101,1111,
2,2021_1101_1116,2021,1101,1116,
3,2021_1101_1124,2021,1101,1124,
4,2021_1101_1140,2021,1101,1140,
...,...,...,...,...,...
2273,2021_1452_1457,2021,1452,1457,
2274,2021_1452_1458,2021,1452,1458,
2275,2021_1455_1457,2021,1455,1457,
2276,2021_1455_1458,2021,1455,1458,


In [None]:
andy_kats_mock_mm_2021_df = prep2021(mock_matchups_df, ppg_df, eff_df, rankings_df)
andy_kats_mock_mm_2021_df

mm_2021_df = prep2021(matchups_df, ppg_df, eff_df, rankings_df)
mm_2021_df

In [123]:
features_df = prep_predictions_2021(sample_subm_df, m_ncaa_tourney_compact_results_df)
tourney_df = merge_features(features_df, ppg_df, eff_df, rankings_df)

features_df = prep_season_2021(m_regular_season_compact_results_df)
reg_season_df = merge_features(features_df, ppg_df, eff_df, rankings_df)

In [124]:
reg_season_df

Unnamed: 0,ID,Season,TeamID_1,TeamID_2,Result,PPGDiff,WPercDiff,OffEffDiff,DefEffDiff,MeanOrdinalRankDiff,MedianOrdinalRankDiff,MasseyOrdinalRankDiff,BPI RKDiff,SOS RKDiff,SOR RKDiff
0,2021_1101_1190,2021,1101,1190,1,8.053140,0.347826,15.584541,5.922705,-48.480000,-49.5,-39.0,-64,153,-93
1,2021_1291_1288,2021,1291,1288,1,-15.258107,-0.086124,-20.001595,-18.915205,-18.560000,-23.5,-27.0,-23,-72,-5
2,2021_1298_1203,2021,1298,1203,1,2.912854,0.539216,6.343137,-7.879811,-138.360000,-138.0,-101.0,-88,85,-188
3,2021_1462_1324,2021,1462,1324,1,1.593862,0.219048,3.283492,12.411323,-164.660000,-168.5,-156.0,-211,-66,-207
4,2021_1441_1423,2021,1441,1423,1,-1.057500,-0.015000,-1.211111,4.556111,-18.179184,-16.5,-41.0,23,-117,-3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7711,2021_1433_1382,2021,1433,1382,0,0.867521,-0.069231,-2.507692,3.086325,12.900000,12.0,17.0,32,-9,4
7712,2021_1259_1159,2021,1259,1159,0,-16.442939,-0.580392,-19.828164,-15.813983,145.460000,151.5,153.0,145,-92,202
7713,2021_1261_1104,2021,1261,1104,0,2.157613,-0.133333,0.081070,-0.428807,19.734615,19.5,19.0,15,1,17
7714,2021_1153_1222,2021,1153,1222,0,-7.394649,-0.362876,-10.454849,-5.357860,102.429231,105.0,91.0,123,-40,80


In [125]:
reg_season_2021_df = reg_season_df[reg_season_df["Season"] == 2021].copy()

x_train = reg_season_2021_df[[
    "PPGDiff", "WPercDiff", "OffEffDiff", "DefEffDiff",
    "MeanOrdinalRankDiff","MedianOrdinalRankDiff", "MasseyOrdinalRankDiff",
    "BPI RKDiff", "SOS RKDiff", "SOR RKDiff"
]]
y_train = reg_season_2021_df[["Result"]]
x_test = tourney_df[[
    "PPGDiff", "WPercDiff", "OffEffDiff", "DefEffDiff",
    "MeanOrdinalRankDiff","MedianOrdinalRankDiff", "MasseyOrdinalRankDiff",
    "BPI RKDiff", "SOS RKDiff", "SOR RKDiff"
]]

In [129]:
tourney_df

Unnamed: 0,ID,Season,TeamID_1,TeamID_2,Result,PPGDiff,WPercDiff,OffEffDiff,DefEffDiff,MeanOrdinalRankDiff,MedianOrdinalRankDiff,MasseyOrdinalRankDiff,BPI RKDiff,SOS RKDiff,SOR RKDiff,TeamWProb_1
0,2021_1101_1104,2021,1101,1104,,-3.262319,0.026087,-0.013043,-14.255072,76.614615,76.0,98.0,74,262,70,0.142567
1,2021_1101_1111,2021,1101,1111,,9.189533,0.284420,17.036957,4.563446,-96.880000,-96.5,-76.0,-92,51,-114,0.831619
2,2021_1101_1116,2021,1101,1116,,-5.747239,0.040373,-3.369393,-11.196342,70.210769,70.0,95.0,69,234,65,0.159385
3,2021_1101_1124,2021,1101,1124,,-7.635467,-0.090580,-8.093599,2.098631,80.441538,81.0,103.0,81,227,72,0.116458
4,2021_1101_1140,2021,1101,1140,,-0.495652,0.066087,3.014957,-2.065739,62.813333,60.0,81.0,54,214,56,0.194185
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2273,2021_1452_1457,2021,1452,1457,,-3.278292,-0.291667,-4.224588,-1.801852,-46.321538,-51.5,-61.0,-59,-317,-30,0.695932
2274,2021_1452_1458,2021,1452,1458,,7.194409,0.080460,12.629147,7.633206,-8.221538,-9.0,13.0,5,12,-13,0.545738
2275,2021_1455_1457,2021,1455,1457,,-8.670322,-0.221491,-13.376462,-0.358187,-9.060000,-8.0,-29.0,12,-240,-4,0.628749
2276,2021_1455_1458,2021,1455,1458,,1.802380,0.150635,3.477274,9.076870,29.040000,34.5,45.0,76,89,13,0.470613


In [126]:
log_reg = LogisticRegression(penalty="l2", random_state=None, max_iter=1000, verbose=1, n_jobs=-1)

log_reg.fit(x_train, y_train.values.ravel())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    2.6s finished


LogisticRegression(max_iter=1000, n_jobs=-1, verbose=1)

In [152]:
y_pred = log_reg.predict_proba(x_test)
y_pred = y_pred[:, 1]

tourney_df["TeamWProb_1"] = y_pred

# tourney_df[["TeamName_1", "TeamName_2", "TeamWProb_1"]]

tourney_df_disp_teams = pd.merge(tourney_df[["ID", "TeamID_1", "TeamID_2", "TeamWProb_1"]], m_teams_df[["TeamName", "TeamID"]], left_on="TeamID_1", right_on="TeamID", how="left")
tourney_df_disp_teams.drop(["TeamID"], axis=1, inplace=True)
tourney_df_disp_teams = pd.merge(tourney_df_disp_teams, m_teams_df[["TeamName", "TeamID"]], left_on="TeamID_2", right_on="TeamID", how="left")
tourney_df_disp_teams.drop(["TeamID"], axis=1, inplace=True)
tourney_df_disp_teams = tourney_df_disp_teams.rename(columns={"TeamName_x": "TeamName_1", "TeamName_y": "TeamName_2"})
# tourney_df_disp_teams.drop(["TeamID_1", "TeamID_2"], axis=1, inplace=True)
tourney_df_disp_teams = tourney_df_disp_teams[["ID", "TeamID_1", "TeamID_2", "TeamName_1", "TeamName_2", "TeamWProb_1"]]
tourney_df_disp_teams

# tourney_df_disp_teams.to_csv("bracket.csv", index=False)

# y_pred = log_reg.predict_proba(
#     tourney_df[[
#         "PPGDiff", "WPercDiff", "OffEffDiff", "DefEffDiff",
#         "MeanOrdinalRankDiff","MedianOrdinalRankDiff", "MasseyOrdinalRankDiff",
#         "BPI RKDiff", "SOS RKDiff", "SOR RKDiff"
#     ]]
# )
# y_pred = y_pred[:, 1]

# stage_2_submission_df = tourney_df[["ID"]].copy()
# stage_2_submission_df["Pred"] = y_pred

# stage_2_submission_df["TeamID_1"] = stage_1_submission_df["ID"].apply(lambda row: row[5:9])
# stage_2_submission_df["TeamID_2"] = stage_1_submission_df["ID"].apply(lambda row: row[10:14])
# stage_2_submission_df.merge(m_teams_df)
# stage_2_submission_df
# stage_2_submission_df.to_csv(m_subm_stage_2_output_path, index=False)

Unnamed: 0,ID,TeamID_1,TeamID_2,TeamName_1,TeamName_2,TeamWProb_1
0,2021_1101_1104,1101,1104,Abilene Chr,Alabama,0.142567
1,2021_1101_1111,1101,1111,Abilene Chr,Appalachian St,0.831619
2,2021_1101_1116,1101,1116,Abilene Chr,Arkansas,0.159385
3,2021_1101_1124,1101,1124,Abilene Chr,Baylor,0.116458
4,2021_1101_1140,1101,1140,Abilene Chr,BYU,0.194185
...,...,...,...,...,...,...
2273,2021_1452_1457,1452,1457,West Virginia,Winthrop,0.695932
2274,2021_1452_1458,1452,1458,West Virginia,Wisconsin,0.545738
2275,2021_1455_1457,1455,1457,Wichita St,Winthrop,0.628749
2276,2021_1455_1458,1455,1458,Wichita St,Wisconsin,0.470613


In [199]:
first_rd_tms = [
    "Gonzaga", "Appalachian State",
    "Oklahoma", "Missouri",
    "Creighton", "UC Santa Barbara",
    "Virginia", "Ohio",
    "USC", "Drake",
    "Kansas", "Eastern Washington",
    "Oregon", "VCU",
    "Iowa", "Grand Canyon",
    "Michigan", "Texas Southern",
    "LSU", "St. Bonaventure",
    "Colorado", "Georgetown",
    "Florida State", "UNC Greensboro",
    "BYU", "Michigan State",
    "Texas", "Abilene Christian",
    "UConn", "Maryland",
    "Alabama", "Iona",
    "Baylor", "Hartford",
    "North Carolina", "Wisconsin",
    "Villanova", "Winthrop",
    "Purdue", "North Texas",
    "Texas Tech", "Utah State",
    "Arkansas", "Colgate",
    "Florida", "Virginia Tech",
    "Ohio State", "Oral Roberts",
    "Illinois", "Drexel",
    "Loyola-Chicago", "Georgia Tech",
    "Tennessee", "Oregon State",
    "Oklahoma State", "Liberty",
    "San Diego State", "Syracuse",
    "West Virginia", "Morehead State",
    "Clemson", "Rutgers",
    "Houston", "Cleveland State"
]

## Generate Winners from Each Round

In [315]:
def generate_mm_bracket(first_rd_tms, preds_df):
    preds_df = preds_df.copy()
    
    first_rd_tms = [team.lower() for team in first_rd_tms]
    first_rd_tms_df = pd.DataFrame({"TeamName": first_rd_tms})
    first_rd_tms_df = pd.merge(first_rd_tms_df, m_team_spellings_df, left_on="TeamName", right_on="TeamNameSpelling", how="left").reset_index(drop=True)
    first_rd_tms_df.drop(["TeamNameSpelling"], axis=1, inplace=True)
    
    df = pd.DataFrame(first_rd_tms_df.values.reshape(-1, 4), columns=["TeamName_1", "TeamID_1", "TeamName_2", "TeamID_2"])
    
    df.loc[
        df["TeamID_1"] > df["TeamID_2"], ["TeamID_1", "TeamID_2", "TeamName_1", "TeamName_2"]
    ] = df.loc[
        df["TeamID_1"] > df["TeamID_2"], ["TeamID_2", "TeamID_1", "TeamName_2", "TeamName_1"]
    ].values
    
    df["Season"] = 2021
    df["ID"] = df["Season"].apply(str) + "_" + df["TeamID_1"].apply(str) + "_" + df["TeamID_2"].apply(str)
    
    df = pd.merge(df, preds_df[["ID", "TeamWProb_1"]], on="ID", how="inner")
    
    first_rd_w_tms = [row["TeamName_1"] if row["TeamWProb_1"] >= 0.5 else row["TeamName_2"] for index, row in df.iterrows()]
    
    second_rd_tms_df = pd.DataFrame({"TeamName": first_rd_w_tms})
    second_rd_tms_df = pd.merge(second_rd_tms_df, m_team_spellings_df, left_on="TeamName", right_on="TeamNameSpelling", how="left").reset_index(drop=True)
    second_rd_tms_df.drop(["TeamNameSpelling"], axis=1, inplace=True)
    
    df = pd.DataFrame(second_rd_tms_df.values.reshape(-1, 4), columns=["TeamName_1", "TeamID_1", "TeamName_2", "TeamID_2"])
    
    df.loc[
        df["TeamID_1"] > df["TeamID_2"], ["TeamID_1", "TeamID_2", "TeamName_1", "TeamName_2"]
    ] = df.loc[
        df["TeamID_1"] > df["TeamID_2"], ["TeamID_2", "TeamID_1", "TeamName_2", "TeamName_1"]
    ].values
    
    df["Season"] = 2021
    df["ID"] = df["Season"].apply(str) + "_" + df["TeamID_1"].apply(str) + "_" + df["TeamID_2"].apply(str)
    
    df = pd.merge(df, preds_df[["ID", "TeamWProb_1"]], on="ID", how="inner")
    
    second_rd_w_tms = [row["TeamName_1"] if row["TeamWProb_1"] >= 0.5 else row["TeamName_2"] for index, row in df.iterrows()]
    
    sweet_16_tms_df = pd.DataFrame({"TeamName": second_rd_w_tms})
    sweet_16_tms_df = pd.merge(sweet_16_tms_df, m_team_spellings_df, left_on="TeamName", right_on="TeamNameSpelling", how="left").reset_index(drop=True)
    sweet_16_tms_df.drop(["TeamNameSpelling"], axis=1, inplace=True)
    
    df = pd.DataFrame(sweet_16_tms_df.values.reshape(-1, 4), columns=["TeamName_1", "TeamID_1", "TeamName_2", "TeamID_2"])
    
    df.loc[
        df["TeamID_1"] > df["TeamID_2"], ["TeamID_1", "TeamID_2", "TeamName_1", "TeamName_2"]
    ] = df.loc[
        df["TeamID_1"] > df["TeamID_2"], ["TeamID_2", "TeamID_1", "TeamName_2", "TeamName_1"]
    ].values
    
    df["Season"] = 2021
    df["ID"] = df["Season"].apply(str) + "_" + df["TeamID_1"].apply(str) + "_" + df["TeamID_2"].apply(str)
    
    df = pd.merge(df, preds_df[["ID", "TeamWProb_1"]], on="ID", how="inner")
    
    sweet_16_w_tms = [row["TeamName_1"] if row["TeamWProb_1"] >= 0.5 else row["TeamName_2"] for index, row in df.iterrows()]
    
    elite_8_tms_df = pd.DataFrame({"TeamName": sweet_16_w_tms})
    elite_8_tms_df = pd.merge(elite_8_tms_df, m_team_spellings_df, left_on="TeamName", right_on="TeamNameSpelling", how="left").reset_index(drop=True)
    elite_8_tms_df.drop(["TeamNameSpelling"], axis=1, inplace=True)
    
    df = pd.DataFrame(elite_8_tms_df.values.reshape(-1, 4), columns=["TeamName_1", "TeamID_1", "TeamName_2", "TeamID_2"])
    
    df.loc[
        df["TeamID_1"] > df["TeamID_2"], ["TeamID_1", "TeamID_2", "TeamName_1", "TeamName_2"]
    ] = df.loc[
        df["TeamID_1"] > df["TeamID_2"], ["TeamID_2", "TeamID_1", "TeamName_2", "TeamName_1"]
    ].values
    
    df["Season"] = 2021
    df["ID"] = df["Season"].apply(str) + "_" + df["TeamID_1"].apply(str) + "_" + df["TeamID_2"].apply(str)
    
    df = pd.merge(df, preds_df[["ID", "TeamWProb_1"]], on="ID", how="inner")
    
    elite_8_w_tms = [row["TeamName_1"] if row["TeamWProb_1"] >= 0.5 else row["TeamName_2"] for index, row in df.iterrows()]
    
    final_4_tms_df = pd.DataFrame({"TeamName": elite_8_w_tms})
    final_4_tms_df = pd.merge(final_4_tms_df, m_team_spellings_df, left_on="TeamName", right_on="TeamNameSpelling", how="left").reset_index(drop=True)
    final_4_tms_df.drop(["TeamNameSpelling"], axis=1, inplace=True)
    
    df = pd.DataFrame(final_4_tms_df.values.reshape(-1, 4), columns=["TeamName_1", "TeamID_1", "TeamName_2", "TeamID_2"])
    
    df.loc[
        df["TeamID_1"] > df["TeamID_2"], ["TeamID_1", "TeamID_2", "TeamName_1", "TeamName_2"]
    ] = df.loc[
        df["TeamID_1"] > df["TeamID_2"], ["TeamID_2", "TeamID_1", "TeamName_2", "TeamName_1"]
    ].values
    
    df["Season"] = 2021
    df["ID"] = df["Season"].apply(str) + "_" + df["TeamID_1"].apply(str) + "_" + df["TeamID_2"].apply(str)
    
    df = pd.merge(df, preds_df[["ID", "TeamWProb_1"]], on="ID", how="inner")
    
    final_4_w_tms = [row["TeamName_1"] if row["TeamWProb_1"] >= 0.5 else row["TeamName_2"] for index, row in df.iterrows()]
    
    champ_tms_df = pd.DataFrame({"TeamName": final_4_w_tms})
    champ_tms_df = pd.merge(champ_tms_df, m_team_spellings_df, left_on="TeamName", right_on="TeamNameSpelling", how="left").reset_index(drop=True)
    champ_tms_df.drop(["TeamNameSpelling"], axis=1, inplace=True)
    
    df = pd.DataFrame(champ_tms_df.values.reshape(-1, 4), columns=["TeamName_1", "TeamID_1", "TeamName_2", "TeamID_2"])
    
    df.loc[
        df["TeamID_1"] > df["TeamID_2"], ["TeamID_1", "TeamID_2", "TeamName_1", "TeamName_2"]
    ] = df.loc[
        df["TeamID_1"] > df["TeamID_2"], ["TeamID_2", "TeamID_1", "TeamName_2", "TeamName_1"]
    ].values
    
    df["Season"] = 2021
    df["ID"] = df["Season"].apply(str) + "_" + df["TeamID_1"].apply(str) + "_" + df["TeamID_2"].apply(str)
    
    df = pd.merge(df, preds_df[["ID", "TeamWProb_1"]], on="ID", how="inner")
    
    champ_w_tm = [row["TeamName_1"] if row["TeamWProb_1"] >= 0.5 else row["TeamName_2"] for index, row in df.iterrows()]
    
    nl = "\n"
    print(f"First Round Winners:{nl + '- '}{(nl + '- ').join((*first_rd_w_tms, ))}{nl}")
    print(f"Second Round Winners:{nl + '- '}{(nl + '- ').join((*second_rd_w_tms, ))}{nl}")
    print(f"Sweet 16 Winners:{nl + '- '}{(nl + '- ').join((*sweet_16_w_tms, ))}{nl}")
    print(f"Elite 8 Winners:{nl + '- '}{(nl + '- ').join((*elite_8_w_tms, ))}{nl}")
    print(f"Final 4 Winners:{nl + '- '}{(nl + '- ').join((*final_4_w_tms, ))}{nl}")
    print(f"NCAA Championship Winner:{(nl + '- ')}{nl.join((*champ_w_tm, ))}{nl}")
    
generate_mm_bracket(first_rd_tms, tourney_df_disp_teams)

First Round Winners:
- gonzaga
- missouri
- creighton
- virginia
- usc
- kansas
- oregon
- iowa
- michigan
- lsu
- colorado
- florida state
- byu
- texas
- uconn
- alabama
- baylor
- north carolina
- villanova
- purdue
- texas tech
- arkansas
- florida
- ohio state
- illinois
- georgia tech
- tennessee
- oklahoma state
- san diego state
- west virginia
- clemson
- houston

Second Round Winners:
- gonzaga
- creighton
- kansas
- iowa
- michigan
- florida state
- texas
- alabama
- baylor
- purdue
- arkansas
- ohio state
- illinois
- oklahoma state
- san diego state
- houston

Sweet 16 Winners:
- gonzaga
- iowa
- michigan
- alabama
- baylor
- arkansas
- illinois
- houston

Elite 8 Winners:
- gonzaga
- michigan
- baylor
- illinois

Final 4 Winners:
- gonzaga
- baylor

NCAA Championship Winner:
- gonzaga

