In [1]:
import sys
import os
import json
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier, LassoCV
from sklearn.metrics import roc_auc_score, accuracy_score
sys.path.append("/Users/atticussoane/Desktop/atticus_tools/")
from recursive_selection import FeatureSelector
from dataclasses import dataclass

In [2]:
validation_data = pd.read_csv("./engineered_data/model_ready.csv.gz", compression = "gzip")
pd.set_option("max.columns", 100)
validation_data.head()

Unnamed: 0,Season,DayNum,team1_TeamID,team1_Score,team2_TeamID,team2_Score,team1_seedN,team1_DRB_DIF,team1_ORB_DIF,team1_PPG,team1_PPP,team1_TRB_DIF,team1_ast_TO,team1_e_fg,team1_fg,team1_free_throw_pct,team1_opp_PPG,team1_opp_PPP,team1_opp_ast_TO,team1_opp_efg,team1_opp_fg,team1_opp_three_point_pct,team1_opp_trueshooting,team1_three_point_pct,team1_total_point_differential,team1_true_shooting,team1_acc,team1_big_east,team1_big_ten,team1_big_twelve,team1_sec,team1_pac12,team1_MOR,team1_AP_top10,team1_AP_top25,team2_seedN,team2_DRB_DIF,team2_ORB_DIF,team2_PPG,team2_PPP,team2_TRB_DIF,team2_ast_TO,team2_e_fg,team2_fg,team2_free_throw_pct,team2_opp_PPG,team2_opp_PPP,team2_opp_ast_TO,team2_opp_efg,team2_opp_fg,team2_opp_three_point_pct,team2_opp_trueshooting,team2_three_point_pct,team2_total_point_differential,team2_true_shooting,team2_acc,team2_big_east,team2_big_ten,team2_big_twelve,team2_sec,team2_pac12,team2_MOR,team2_AP_top10,team2_AP_top25,team1_win
0,2003,134,1421,92,1411,84,16,10,-42,71.206897,0.866162,-32,0.804255,0.486339,0.429265,0.762768,78.448276,0.937078,1.236559,0.518382,0.455882,0.36711,0.553323,0.360153,-210,0.539424,0,0,0,0,0,0,277.0,0,0,16,58,37,72.8,0.878704,95,0.932166,0.501206,0.447527,0.619952,70.833333,0.85366,0.960465,0.487307,0.424945,0.32518,0.515957,0.320721,59,0.538334,0,0,0,0,0,0,293.0,0,0,1
1,2003,146,1328,47,1393,63,1,77,52,71.166667,0.930072,129,1.200565,0.512972,0.446934,0.707885,60.166667,0.799394,0.76399,0.446284,0.404747,0.326781,0.488641,0.393673,330,0.549827,0,0,0,1,0,0,6.0,1,0,3,134,-43,80.103448,0.929051,91,1.098734,0.512195,0.470067,0.693431,69.896552,0.799883,1.100239,0.443703,0.390075,0.30687,0.479169,0.330435,296,0.551677,0,1,0,0,0,0,21.0,0,1,0
2,2003,144,1120,78,1393,79,10,59,-9,70.1,0.884178,50,0.842767,0.521328,0.469835,0.662162,65.566667,0.822586,0.801688,0.481165,0.420247,0.333333,0.512977,0.344898,136,0.55299,0,0,0,0,1,0,44.0,0,0,3,134,-43,80.103448,0.929051,91,1.098734,0.512195,0.470067,0.693431,69.896552,0.799883,1.100239,0.443703,0.390075,0.30687,0.479169,0.330435,296,0.551677,0,1,0,0,0,0,21.0,0,1,0
3,2003,144,1268,58,1277,60,6,88,-16,80.785714,0.929044,72,1.339241,0.517756,0.464487,0.703642,66.857143,0.759001,0.703704,0.432193,0.378538,0.312715,0.479165,0.397895,390,0.554477,1,0,0,0,0,0,9.0,0,1,7,125,42,67.225806,0.891803,167,0.935698,0.498737,0.450758,0.734694,60.967742,0.826801,0.781038,0.458013,0.400641,0.338374,0.512773,0.366265,194,0.552539,0,0,1,0,0,0,16.0,0,0,0
4,2003,143,1458,57,1246,63,5,25,61,70.724138,0.969208,86,1.265574,0.522393,0.464491,0.741135,58.689655,0.825396,0.8,0.479612,0.430481,0.339492,0.513566,0.359127,349,0.566212,0,0,1,0,0,0,15.0,0,1,1,134,54,77.53125,0.949992,188,1.151111,0.537988,0.487426,0.706767,64.09375,0.799199,0.652751,0.464146,0.412325,0.328014,0.502864,0.355932,430,0.57388,0,0,0,0,1,0,1.0,1,0,0


# REGULAR SEASON STATISTICS MODEL SOLO PREDICTIONS

In [3]:
with open("./models/reg_season_stats_SVM.pk", "rb") as f: #load SVM model trained on regular season statistics
    stat_based_model = pickle.load(f)
with open("./models/regular_season_stat_standard_scaler.pk", "rb") as f: #load data scaler from training set
    stat_scaler = pickle.load(f)
with open("./models/regular_season_stats_features.json", "r+") as f: #load features used in training 
    stat_features = json.load(f)

In [4]:
def scale_features(data, binary_features, load_scaler = None, save_scaler = False):
    if binary_features:
        to_scale = data.drop(columns = binary_features)
    else:
        to_scale = data
    scale_columns = list(to_scale.columns)
    npX = np.array(to_scale)
    
    if not load_scaler:
        ss = StandardScaler()
        ss.fit(npX)
    else:
        ss = load_scaler
    npX = ss.transform(npX)
    scaled_X = pd.DataFrame(npX)
    scaled_X.columns = scale_columns
    if binary_features:
        scaled_X = pd.concat([scaled_X, data[binary_features]], axis = 1)
    if save_scaler:
        return(scaled_X, ss)
    else:
        return(scaled_X)

In [5]:
validation_data = validation_data[validation_data.Season > 2014].reset_index(drop = True)
stat_validation = validation_data[stat_features]
scaled_stat_validation = scale_features(stat_validation, [], load_scaler = stat_scaler)
validation_target = validation_data.team1_win

In [6]:
scaled_stat_validation.describe()

Unnamed: 0,team1_ast_TO,team1_PPG,team1_PPP,team2_ast_TO,team2_PPG,team2_PPP,team1_opp_ast_TO,team1_opp_PPG,team1_opp_PPP,team2_opp_ast_TO,team2_opp_PPG,team2_opp_PPP,team1_total_point_differential,team2_total_point_differential,team1_TRB_DIF,team2_TRB_DIF
count,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0
mean,1.150133,0.824866,1.13545,1.172297,0.772341,1.115704,-0.135146,-0.391384,-0.284147,-0.094587,-0.393466,-0.266219,2.215116,2.126553,1.437634,1.404127
std,0.790595,0.674327,0.608569,0.843559,0.706305,0.629542,0.695509,0.663115,0.544328,0.655988,0.756517,0.584081,1.196762,1.203035,1.591265,1.588184
min,-0.80592,-0.964233,-0.570596,-0.813293,-1.09213,-0.576947,-1.714622,-2.592113,-2.196854,-1.604606,-2.259978,-2.209929,-0.570251,-1.104437,-3.544749,-3.514742
25%,0.57824,0.316561,0.695702,0.593376,0.256589,0.66665,-0.583912,-0.817011,-0.54719,-0.482819,-0.854958,-0.627067,1.36911,1.293251,0.362834,0.305801
50%,1.066434,0.789405,1.128493,1.116738,0.725387,1.120791,-0.170382,-0.344271,-0.239503,-0.145675,-0.353969,-0.234061,2.194927,2.094954,1.343231,1.36166
75%,1.681122,1.244691,1.536513,1.768087,1.366729,1.568001,0.326435,0.085652,0.061205,0.301216,0.092223,0.09568,2.946009,3.015429,2.35164,2.514769
max,3.448225,2.698819,2.833833,3.468598,2.677988,2.8226,1.772384,1.3792,1.532517,1.598651,1.641643,1.54255,5.827025,5.784276,5.628968,5.585097


While it immediately becomes clear that the training scaler did not scale this data to mean 0 and unit variance, this intuitively makes sense. These are tournament teams: they're statistics are going to be above the mean.

In [7]:
y_preds = stat_based_model.predict(scaled_stat_validation)

In [8]:
roc_score = roc_auc_score(validation_target, y_preds)
acc_score = accuracy_score(validation_target, y_preds)

print("The ROC AUC score for the regular season statistics model was: {}".format(roc_score))
print()
print("The accuracy score for the regular season statistics model was: {}".format(acc_score))

The ROC AUC score for the regular season statistics model was: 0.6596806387225549

The accuracy score for the regular season statistics model was: 0.6597014925373135


# RANKINGS MODEL SOLO PREDICTIONS

This will require a little elbow grease... I prepared the season-end and in-season metrics slightly differently so the season end frame does not have all of the features that the regular season frame has (doesn't have power6 wins, top 50 wins, etc)... should be a quick fix.

In [9]:
with open("./models/reg_season_rankings_SVM.pk", "rb") as f: #load SVM model trained on regular season rankings
    ranking_based_model = pickle.load(f)
with open("./models/regular_season_rankings_standard_scaler.pk", "rb") as f: #load data scaler from training set
    ranking_scaler = pickle.load(f)
with open("./models/regular_season_ranking_features.json", "r+") as f: #load features used in training 
    ranking_features = json.load(f)

In [10]:
end_of_season_rankings = pd.read_csv("./engineered_data/finalrankings.csv.gz", compression = "gzip")

In [11]:
end_of_season_rankings.head()

Unnamed: 0,7OT,ACU,ADE,AP,ARG,AUS,AWS,BBT,BCM,BD,BIH,BKM,BLS,BNM,BNT,BOB,BPI,BRZ,BUR,BWE,CJB,CMV,CNG,COL,COX,CPA,CPR,CRO,CRW,D1A,DAV,DC,DC2,DCI,DDB,DES,DII,DOK,DOL,DUN,DWH,EBP,ECK,ENT,ERD,ESR,FAS,FMG,FSH,GRN,...,ROH,RPI,RSE,RSL,RT,RTB,RTH,RTP,RTR,SAG,SAP,SAU,SCR,SE,SEL,SFX,SGR,SIM,SMN,SMS,SP,SPR,SPW,STF,STH,STM,STR,STS,TBD,TMR,TPR,TRK,TRP,TRX,TSR,TW,UCS,UPS,USA,WIL,WLK,WMR,WMV,WOB,WOL,WTE,YAG,ZAM,season,teamID
0,,,,,141.0,,,,,,172.0,,,,,146.0,,161.0,,,,,,162.0,,,,,,,,,,,,,,,175.0,97.0,149.0,,156.0,155.0,147.0,,,,,161.0,...,,158.0,,,,,146.0,,,149.0,,154.0,,162.0,155.0,,,,,,,,,,,,168.0,,,,,,,,146.0,,,,,,165.0,,,155.0,157.0,156.0,,,2003,1102
1,,,,,180.0,,,,,,177.0,,,,,179.0,,174.0,,,,,,172.0,,,,,,,,,,,,,,,174.0,165.0,168.0,,194.0,176.0,128.0,,,,,168.0,...,,182.0,,,,,168.0,,,172.0,,170.0,,167.0,169.0,,,,,,,,,,,,147.0,,,,,,,,172.0,,,,,,172.0,,,177.0,171.0,161.0,,,2003,1103
2,,,,,37.0,,,,,,40.0,,,,,35.0,,34.0,,,,,,43.0,,,,,,,,,,,,,,,39.0,43.0,33.0,,51.0,34.0,36.0,,,,,40.0,...,,38.0,,,,,31.0,,,37.0,,32.0,,41.0,38.0,,,,,,,,,,,,33.0,,,,,,,,35.0,,,,,,36.0,,,37.0,37.0,28.0,,,2003,1104
3,,,,,307.0,,,,,,312.0,,,,,313.0,,305.0,,,,,,310.0,,,,,,,,,,,,,,,315.0,306.0,315.0,,314.0,309.0,309.0,,,,,313.0,...,,313.0,,,,,312.0,,,312.0,,312.0,,314.0,312.0,,,,,,,,,,,,289.0,,,,,,,,315.0,,,,,,310.0,,,312.0,311.0,299.0,,,2003,1105
4,,,,,252.0,,,,,,269.0,,,,,265.0,,219.0,,,,,,256.0,,,,,,,,,,,,,,,266.0,305.0,269.0,,245.0,259.0,283.0,,,,,279.0,...,,248.0,,,,,274.0,,,268.0,,264.0,,276.0,267.0,,,,,,,,,,,,294.0,,,,,,,,279.0,,,,,,254.0,,,265.0,270.0,256.0,,,2003,1106


In [12]:
full_regular_season = pd.read_csv("./engineered_data/all_game_level.csv.gz", compression = "gzip")
full_regular_season.head()

Unnamed: 0,DayNum,LScore,LTeamID,L_DRB_DIF,L_ORB_DIF,L_PPG,L_PPP,L_TRB_DIF,L_ast_TO,L_e_fg,L_fg,L_free_throw_pct,L_opp_PPG,L_opp_PPP,L_opp_ast_TO,L_opp_efg,L_opp_fg,L_opp_three_point_pct,L_opp_trueshooting,L_three_point_pct,L_total_point_differential,L_true_shooting,Season,WLoc,WScore,WTeamID,W_DRB_DIF,W_ORB_DIF,W_PPG,W_PPP,W_TRB_DIF,W_ast_TO,W_e_fg,W_fg,W_free_throw_pct,W_opp_PPG,W_opp_PPP,W_opp_ast_TO,W_opp_efg,W_opp_fg,W_opp_three_point_pct,W_opp_trueshooting,W_three_point_pct,W_total_point_differential,W_true_shooting,W_COL,L_COL,W_DOL,L_DOL,W_POM,L_POM,W_RTH,L_RTH,W_SAG,L_SAG,W_WLK,L_WLK,W_WOL,L_WOL,W_MOR,L_MOR,W_AP_,L_AP_,W_USA_,L_USA_,W_Conf,L_Conf,W_power6_wins,W_top50_wins,L_power6_wins,L_top50_wins
0,10,62,1328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2003,N,68,1104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27,9,27,11,15,20,9,25,11,27,14,23,5,21,13,4,0,0,0,0,1,1,0,0,0,0
1,10,63,1393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2003,N,70,1272,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,52,37,72,16,41,40,15,20,61,60,80,110,11,35,40,57,0,0,0,0,0,1,0,0,0,0
2,11,61,1437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2003,N,73,1266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18,47,10,52,25,46,34,71,32,54,33,24,38,85,52,36,0,0,0,0,0,1,0,0,0,0
3,11,50,1457,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2003,N,56,1296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,155,169,144,194,173,267,199,265,190,248,171,244,215,249,188,235,0,0,0,0,0,0,0,0,0,0
4,11,71,1208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2003,N,77,1400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11,12,9,13,9,49,19,21,18,46,20,65,19,20,18,29,0,0,0,0,1,1,0,0,0,0


In [13]:
team_ids = validation_data[["Season", "team1_TeamID", "team2_TeamID"]]
seasons = {}
for season in team_ids.Season.unique():
    team1 = list(team_ids[team_ids.Season == season].team1_TeamID.unique())
    team2 = list(team_ids[team_ids.Season == season].team2_TeamID.unique())
    all_teams = set(team1 + team2)
    seasons[str(season)] = all_teams

In [14]:
rankings = ["AP", "USA", "COL", "DOL", "MOR", "POM", "RTH", "SAG", "WLK", "WOL"]
other = ["Conf", "power6_wins", "top50_wins"]

In [19]:
all_frames = []
for season in seasons:
    frame_list = []
    search_teams = seasons[season]
    season = int(season)
    for team in search_teams:
        obs = end_of_season_rankings[(end_of_season_rankings.season == season) &
                                    (end_of_season_rankings.teamID == team)][rankings]
        obs = obs.assign(teamID = team)
        obs = obs.assign(season = season)
        frame_list.append(obs)
    ranking_frame = pd.concat(frame_list, axis = 0)
    ranking_frame.assign(season = season)
    all_frames.append(ranking_frame)
ratings_systems = pd.concat(all_frames, axis = 0).reset_index(drop = True)

In [21]:
ratings_systems = ratings_systems.fillna(-1)
def top_25(x):
    if x > 0:
        return(1)
    else:
        return(0)
    
ratings_systems["AP_"] = ratings_systems.AP.map(lambda x: top_25(x))
ratings_systems["USA_"] = ratings_systems.USA.map(lambda x: top_25(x))

In [22]:
ratings_systems = ratings_systems.drop(columns = ["AP", "USA"])
ratings_systems.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 340 entries, 0 to 339
Data columns (total 12 columns):
COL       340 non-null float64
DOL       340 non-null float64
MOR       340 non-null float64
POM       340 non-null float64
RTH       340 non-null float64
SAG       340 non-null float64
WLK       340 non-null float64
WOL       340 non-null float64
teamID    340 non-null int64
season    340 non-null int64
AP_       340 non-null int64
USA_      340 non-null int64
dtypes: float64(8), int64(4)
memory usage: 32.0 KB


In [23]:
ratings_systems.head()

Unnamed: 0,COL,DOL,MOR,POM,RTH,SAG,WLK,WOL,teamID,season,AP_,USA_
0,43.0,59.0,51.0,34.0,52.0,50.0,50.0,52.0,1153,2015,0,0
1,153.0,150.0,266.0,207.0,197.0,204.0,203.0,132.0,1411,2015,0,0
2,139.0,148.0,129.0,121.0,135.0,144.0,148.0,148.0,1412,2015,0,0
3,144.0,147.0,164.0,148.0,154.0,145.0,134.0,124.0,1157,2015,0,0
4,107.0,112.0,103.0,94.0,113.0,97.0,103.0,103.0,1414,2015,0,0


In [43]:
win_cols = []
for k in range(len(ratings_systems)):
    d = {}
    team = ratings_systems.iloc[k]["teamID"]
    season = ratings_systems.iloc[k]["season"]
    try:
        d["conf"] = full_regular_season[(full_regular_season.LTeamID == team) &
                (full_regular_season.Season == season)].L_Conf.mode().values[0]
    except:
        d["conf"] = full_regular_season[(full_regular_season.WTeamID == team) &
                           (full_regular_season.Season == season)].W_Conf.mode().values[0]
    power6_index = full_regular_season[((full_regular_season.LTeamID == team) |
                                  (full_regular_season.WTeamID == team)) &
                                 (full_regular_season.Season == season)].index.max()
    if full_regular_season.iloc[power6_index]["WTeamID"] == team:
        d["power6_wins"] = full_regular_season.iloc[power6_index]["W_power6_wins"] + 1
        d["top50_wins"] = full_regular_season.iloc[power6_index]["W_top50_wins"] + 1
    elif full_regular_season.iloc[power6_index]["LTeamID"] == team:
        d["power6_wins"] = full_regular_season.iloc[power6_index]["L_power6_wins"]
        d["top50_wins"] = full_regular_season.iloc[power6_index]["L_top50_wins"]
    win_cols.append(d)

other = pd.DataFrame(win_cols)
ratings_systems = pd.concat([ratings_systems, other], axis = 1)

In [47]:
ratings_systems.head()

Unnamed: 0,COL,DOL,MOR,POM,RTH,SAG,WLK,WOL,teamID,season,AP_,USA_,conf,power6_wins,top50_wins
0,43.0,59.0,51.0,34.0,52.0,50.0,50.0,52.0,1153,2015,0,0,0,1,3
1,153.0,150.0,266.0,207.0,197.0,204.0,203.0,132.0,1411,2015,0,0,0,3,2
2,139.0,148.0,129.0,121.0,135.0,144.0,148.0,148.0,1412,2015,0,0,0,1,2
3,144.0,147.0,164.0,148.0,154.0,145.0,134.0,124.0,1157,2015,0,0,0,2,1
4,107.0,112.0,103.0,94.0,113.0,97.0,103.0,103.0,1414,2015,0,0,0,1,2


In [51]:
team1_ids = team_ids[["Season", "team1_TeamID"]]
team2_ids = team_ids[["Season", "team2_TeamID"]]

team1_ratings = team1_ids.merge(ratings_systems, how = 'left', left_on = ["Season", "team1_TeamID"],
                           right_on = ["season", "teamID"])
team2_ratings = team2_ids.merge(ratings_systems, how = "left", left_on = ["Season", "team2_TeamID"],
                               right_on = ["season", "teamID"]) 

In [57]:
team1_ratings.to_csv("./engineered_data/validation_rankings_full.csv.gz", index = False,
                    compression = "gzip")
team2_ratings.to_csv("./engineered_data/validation_rankings2_full.csv.gz", index = False,
                    compression = "gzip")
team1_ratings = team1_ratings.drop(columns = ["Season", "team1_TeamID", "season", "teamID"])
team1_cols = list(team1_ratings.columns)
team1_cols = ["team1_" + i for i in team1_cols]
team1_ratings.columns = team1_cols

In [61]:
team2_ratings = team2_ratings.drop(columns = ["Season", "team2_TeamID", "season", "teamID"])
team2_cols = list(team2_ratings.columns)
team2_cols = ["team2_" + i for i in team2_cols]
team2_ratings.columns = team2_cols

In [63]:
validation_rankings = pd.concat([team1_ratings, team2_ratings, validation_target], axis = 1)

In [65]:
validation_rankings.to_csv("./engineered_data/model_ready_validation_rankings.csv.gz", index = False,
                          compression = "gzip")

In [70]:
validation_rankings = validation_rankings.rename(columns = {"team1_conf" : "team1_Conf",
                                                           "team2_conf" : "team2_Conf"})
validation_rankings = validation_rankings[ranking_features]

In [71]:
validation_rankings

Unnamed: 0,team1_COL,team1_DOL,team1_MOR,team1_POM,team1_RTH,team1_SAG,team1_WLK,team1_WOL,team1_power6_wins,team1_top50_wins,team2_COL,team2_DOL,team2_MOR,team2_POM,team2_RTH,team2_SAG,team2_WLK,team2_WOL,team2_power6_wins,team2_top50_wins,team1_AP_,team2_AP_,team1_USA_,team2_USA_,team1_Conf,team2_Conf
0,5.0,5.0,5.0,7.0,5.0,5.0,5.0,4.0,20,10,26.0,24.0,14.0,17.0,19.0,16.0,16.0,25.0,15,7,1,1,1,1,1,1
1,28.0,41.0,62.0,40.0,44.0,48.0,43.0,36.0,4,1,48.0,49.0,48.0,39.0,54.0,45.0,56.0,39.0,0,2,0,0,0,0,0,0
2,46.0,42.0,33.0,49.0,36.0,33.0,41.0,46.0,15,5,43.0,59.0,51.0,34.0,52.0,50.0,50.0,52.0,1,3,0,0,0,0,1,0
3,12.0,15.0,18.0,15.0,14.0,14.0,14.0,16.0,15,10,91.0,93.0,91.0,71.0,112.0,78.0,80.0,74.0,1,1,1,0,1,0,1,0
4,39.0,44.0,47.0,43.0,38.0,43.0,37.0,45.0,13,8,40.0,40.0,22.0,38.0,32.0,32.0,33.0,40.0,12,4,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330,11.0,13.0,22.0,18.0,14.0,20.0,16.0,13.0,16,6,27.0,21.0,30.0,24.0,19.0,29.0,24.0,27.0,13,10,1,0,1,0,1,1
331,10.0,7.0,2.0,2.0,8.0,5.0,5.0,5.0,6,6,47.0,53.0,53.0,41.0,44.0,44.0,43.0,49.0,13,9,1,0,1,0,0,1
332,17.0,20.0,6.0,13.0,15.0,11.0,11.0,17.0,18,7,9.0,12.0,27.0,20.0,12.0,14.0,17.0,15.0,19,13,1,1,1,1,1,1
333,3.0,3.0,10.0,8.0,6.0,7.0,7.0,4.0,21,8,36.0,24.0,34.0,36.0,28.0,42.0,37.0,36.0,14,7,1,0,1,0,1,1


In [73]:
binary_features = ["team1_AP_", "team2_AP_", "team1_USA_", "team2_USA_", "team1_Conf", "team2_Conf"]

In [74]:
scaled_validation_rankings = scale_features(validation_rankings, binary_features, 
                                           load_scaler = ranking_scaler)

In [75]:
scaled_validation_rankings.describe()

Unnamed: 0,team1_COL,team1_DOL,team1_MOR,team1_POM,team1_RTH,team1_SAG,team1_WLK,team1_WOL,team1_power6_wins,team1_top50_wins,team2_COL,team2_DOL,team2_MOR,team2_POM,team2_RTH,team2_SAG,team2_WLK,team2_WOL,team2_power6_wins,team2_top50_wins,team1_AP_,team2_AP_,team1_USA_,team2_USA_,team1_Conf,team2_Conf
count,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0
mean,-1.301543,-1.294729,-1.261793,-1.270123,-1.25264,-1.257931,-1.276537,-1.307644,3.913153,3.864499,-1.280226,-1.271583,-1.244056,-1.260206,-1.237066,-1.245802,-1.257542,-1.283791,3.903546,4.225027,0.528358,0.507463,0.552239,0.501493,0.653731,0.656716
std,0.482425,0.498406,0.534066,0.528717,0.570251,0.547653,0.522287,0.480447,2.819093,2.884616,0.507122,0.531082,0.566055,0.552909,0.591061,0.567084,0.554209,0.505019,2.852078,3.110706,0.499942,0.500692,0.498007,0.500746,0.476492,0.475515
min,-1.690125,-1.690351,-1.689448,-1.689353,-1.688742,-1.690533,-1.690545,-1.692275,-0.432599,-0.412912,-1.685412,-1.684109,-1.683118,-1.683964,-1.685046,-1.683302,-1.684085,-1.688019,-0.434256,-0.415088,0.0,0.0,0.0,0.0,0.0,0.0
25%,-1.599502,-1.599802,-1.5988,-1.598778,-1.598182,-1.599919,-1.594958,-1.591563,0.503023,1.013601,-1.604939,-1.603715,-1.592604,-1.598495,-1.604616,-1.602878,-1.593647,-1.607558,0.312296,1.005788,0.0,0.0,0.0,0.0,0.0,0.0
50%,-1.458533,-1.458948,-1.447719,-1.457883,-1.45731,-1.458965,-1.459122,-1.460637,4.806881,3.866628,-1.433935,-1.452978,-1.451803,-1.452695,-1.453809,-1.44203,-1.452966,-1.446635,4.791608,3.847541,1.0,1.0,1.0,1.0,1.0,1.0
75%,-1.23701,-1.222515,-1.205991,-1.226413,-1.23594,-1.20726,-1.222668,-1.228999,6.303876,6.006397,-1.227724,-1.206773,-1.175232,-1.221426,-1.192409,-1.190705,-1.211798,-1.225366,6.284712,6.689293,1.0,1.0,1.0,1.0,1.0,1.0
max,1.088978,1.207213,1.140793,1.349952,1.54125,1.279586,1.146903,1.248525,9.672113,12.425707,1.100953,1.250248,1.424542,1.413027,1.451747,1.272277,1.280271,1.118072,9.644196,12.372798,1.0,1.0,1.0,1.0,1.0,1.0


In [76]:
y_preds = ranking_based_model.predict(scaled_validation_rankings)
roc_score = roc_auc_score(validation_target, y_preds)
acc_score = accuracy_score(validation_target, y_preds)

print("The ROC AUC score for the regular season statistics model was: {}".format(roc_score))
print()
print("The accuracy score for the regular season statistics model was: {}".format(acc_score))

The ROC AUC score for the regular season statistics model was: 0.7194539492443683

The accuracy score for the regular season statistics model was: 0.7194029850746269


# FULL REGULAR SEASON MODEL

In [5]:
validation_data = validation_data[validation_data.Season > 2014].reset_index(drop = True)

In [8]:
validation_ratings = pd.read_csv("./engineered_data/model_ready_validation_rankings.csv.gz",
                          compression = "gzip")

In [15]:
validation_data = validation_data.drop(columns = [
    "team1_acc", "team1_big_east", "team1_big_ten", "team1_big_twelve", "team1_sec", "team1_pac12", "team1_MOR",
    "team1_AP_top10", "team1_AP_top25"
])

In [17]:
validation_data = pd.concat([validation_data, validation_ratings], axis = 1)

In [19]:
validation_target = validation_data["team1_win"]

In [20]:
with open("./models/full_reg_feature_set.json", "r+") as f:
    feature_set = json.load(f)

In [22]:
feature_set

['team1_COL',
 'team1_DOL',
 'team1_DRB_DIF',
 'team1_MOR',
 'team1_ORB_DIF',
 'team1_POM',
 'team1_PPG',
 'team1_PPP',
 'team1_RTH',
 'team1_SAG',
 'team1_TRB_DIF',
 'team1_WLK',
 'team1_WOL',
 'team1_ast_TO',
 'team1_e_fg',
 'team1_fg',
 'team1_free_throw_pct',
 'team1_opp_PPG',
 'team1_opp_PPP',
 'team1_opp_ast_TO',
 'team1_opp_efg',
 'team1_opp_fg',
 'team1_opp_three_point_pct',
 'team1_opp_trueshooting',
 'team1_power6_wins',
 'team1_three_point_pct',
 'team1_top50_wins',
 'team1_total_point_differential',
 'team1_true_shooting',
 'team2_COL',
 'team2_DOL',
 'team2_DRB_DIF',
 'team2_MOR',
 'team2_ORB_DIF',
 'team2_POM',
 'team2_PPG',
 'team2_PPP',
 'team2_RTH',
 'team2_SAG',
 'team2_TRB_DIF',
 'team2_WLK',
 'team2_WOL',
 'team2_ast_TO',
 'team2_e_fg',
 'team2_fg',
 'team2_free_throw_pct',
 'team2_opp_PPG',
 'team2_opp_PPP',
 'team2_opp_ast_TO',
 'team2_opp_efg',
 'team2_opp_fg',
 'team2_opp_three_point_pct',
 'team2_opp_trueshooting',
 'team2_power6_wins',
 'team2_three_point_pct'

In [25]:
validation_data = validation_data.rename(columns = {
    "team1_conf" : "team1_Conf",
    "team2_conf" : "team2_Conf"
})

In [26]:
validation_final = validation_data[feature_set]

In [27]:
validation_final.head()

Unnamed: 0,team1_COL,team1_DOL,team1_DRB_DIF,team1_MOR,team1_ORB_DIF,team1_POM,team1_PPG,team1_PPP,team1_RTH,team1_SAG,team1_TRB_DIF,team1_WLK,team1_WOL,team1_ast_TO,team1_e_fg,team1_fg,team1_free_throw_pct,team1_opp_PPG,team1_opp_PPP,team1_opp_ast_TO,team1_opp_efg,team1_opp_fg,team1_opp_three_point_pct,team1_opp_trueshooting,team1_power6_wins,team1_three_point_pct,team1_top50_wins,team1_total_point_differential,team1_true_shooting,team2_COL,team2_DOL,team2_DRB_DIF,team2_MOR,team2_ORB_DIF,team2_POM,team2_PPG,team2_PPP,team2_RTH,team2_SAG,team2_TRB_DIF,team2_WLK,team2_WOL,team2_ast_TO,team2_e_fg,team2_fg,team2_free_throw_pct,team2_opp_PPG,team2_opp_PPP,team2_opp_ast_TO,team2_opp_efg,team2_opp_fg,team2_opp_three_point_pct,team2_opp_trueshooting,team2_power6_wins,team2_three_point_pct,team2_top50_wins,team2_total_point_differential,team2_true_shooting,team1_AP_,team1_Conf,team1_USA_,team2_AP_,team2_Conf,team2_USA_
0,5.0,5.0,161,5.0,44,7.0,80.606061,1.029731,5.0,5.0,205,5.0,4.0,1.380054,0.568617,0.502128,0.691391,65.575758,0.845801,0.951807,0.473589,0.429311,0.320225,0.504777,20,0.385802,10,496,0.601211,26.0,24.0,153,14.0,78,17.0,71.882353,0.948979,19.0,16.0,231,16.0,25.0,1.473418,0.53818,0.471234,0.632787,63.411765,0.8628,0.978723,0.456892,0.39978,0.323484,0.50781,15,0.386124,7,288,0.560448,1,1,1,1,1,1
1,28.0,41.0,28,62.0,-64,40.0,68.212121,0.95944,44.0,48.0,-36,43.0,36.0,1.203166,0.528704,0.462963,0.681876,60.909091,0.828852,0.890869,0.47267,0.420598,0.316988,0.508593,4,0.352066,1,241,0.572145,48.0,49.0,51,48.0,3,39.0,69.870968,0.979931,54.0,45.0,54,56.0,39.0,1.14511,0.535067,0.453446,0.727941,60.935484,0.85015,0.866841,0.474303,0.419195,0.311734,0.513606,0,0.391304,2,277,0.571999,0,0,0,0,0,0
2,46.0,42.0,126,33.0,23,49.0,70.0,0.912366,36.0,33.0,149,41.0,46.0,1.138889,0.507349,0.453363,0.68484,64.515152,0.8522,0.968831,0.454898,0.400664,0.35,0.503729,15,0.334501,5,181,0.550031,43.0,59.0,155,51.0,4,34.0,62.375,0.885883,52.0,50.0,159,50.0,52.0,0.929095,0.503795,0.452878,0.673913,55.28125,0.784173,0.910256,0.444577,0.38975,0.328571,0.474039,1,0.333333,3,227,0.54118,0,1,0,0,0,0
3,12.0,15.0,127,18.0,106,15.0,69.125,0.898471,14.0,14.0,233,14.0,16.0,1.164141,0.49745,0.433994,0.666667,60.3125,0.819463,1.23038,0.457751,0.404429,0.297078,0.492297,15,0.382253,10,282,0.535344,91.0,93.0,61,91.0,-91,71.0,70.625,0.963539,112.0,78.0,-30,80.0,74.0,1.236842,0.51867,0.474912,0.732523,62.4375,0.811377,0.865263,0.450413,0.381936,0.326301,0.502647,1,0.318471,1,262,0.564007,1,1,1,0,0,0
4,39.0,44.0,112,47.0,-39,43.0,73.6875,0.889865,38.0,43.0,73,37.0,45.0,1.07265,0.504732,0.456362,0.688679,67.71875,0.806272,0.854214,0.446123,0.400806,0.313043,0.481838,13,0.338858,8,191,0.54037,40.0,40.0,79,22.0,30,38.0,70.393939,0.921547,32.0,32.0,109,33.0,40.0,1.103746,0.495733,0.437333,0.683358,65.424242,0.867208,1.178248,0.455248,0.403303,0.331633,0.500093,12,0.360791,4,164,0.534328,0,1,0,0,1,0


In [29]:
validation_final.to_csv("./engineered_data/full_regular_season_validation.csv.gz", index = False,
                       compression = "gzip")

In [31]:
with open("./models/regular_season_full_SVM.pk", "rb") as f: #load SVM model trained on regular season rankings
    full_model = pickle.load(f)
with open("./models/model_ready_full_regular_season_scaler.pk", "rb") as f: #load data scaler from training set
    full_scaler = pickle.load(f)

In [32]:
scaled_v = full_scaler.transform(validation_final)

ValueError: operands could not be broadcast together with shapes (335,64) (58,) (335,64) 