# Regular Season Modeling - 2003-2019

There is far more data available for the regular season. I have compiled a dataset of rankings, team stats, and conferences at a game by game level for every season from 2003 - 2019 (See __00DataSetCreation-FeatureEngineering.ipynb__). 

In [66]:
from concurrent.futures import ProcessPoolExecutor
import sys
import os
import json
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier, LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
sys.path.append("/Users/atticussoane/Desktop/atticus_tools/")
from recursive_selection import FeatureSelector
from dataclasses import dataclass, field

In [2]:
pd.set_option("max.columns", 80)
regular_season = pd.read_csv("./engineered_data/all_game_level.csv.gz", compression = "gzip")
regular_season.head()

Unnamed: 0,DayNum,LScore,LTeamID,L_DRB_DIF,L_ORB_DIF,L_PPG,L_PPP,L_TRB_DIF,L_ast_TO,L_e_fg,L_fg,L_free_throw_pct,L_opp_PPG,L_opp_PPP,L_opp_ast_TO,L_opp_efg,L_opp_fg,L_opp_three_point_pct,L_opp_trueshooting,L_three_point_pct,L_total_point_differential,L_true_shooting,Season,WLoc,WScore,WTeamID,W_DRB_DIF,W_ORB_DIF,W_PPG,W_PPP,W_TRB_DIF,W_ast_TO,W_e_fg,W_fg,W_free_throw_pct,W_opp_PPG,W_opp_PPP,W_opp_ast_TO,W_opp_efg,W_opp_fg,W_opp_three_point_pct,W_opp_trueshooting,W_three_point_pct,W_total_point_differential,W_true_shooting,W_COL,L_COL,W_DOL,L_DOL,W_POM,L_POM,W_RTH,L_RTH,W_SAG,L_SAG,W_WLK,L_WLK,W_WOL,L_WOL,W_MOR,L_MOR,W_AP_,L_AP_,W_USA_,L_USA_,W_Conf,L_Conf,W_power6_wins,W_top50_wins,L_power6_wins,L_top50_wins
0,10,62,1328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2003,N,68,1104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27,9,27,11,15,20,9,25,11,27,14,23,5,21,13,4,0,0,0,0,1,1,0,0,0,0
1,10,63,1393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2003,N,70,1272,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,52,37,72,16,41,40,15,20,61,60,80,110,11,35,40,57,0,0,0,0,0,1,0,0,0,0
2,11,61,1437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2003,N,73,1266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18,47,10,52,25,46,34,71,32,54,33,24,38,85,52,36,0,0,0,0,0,1,0,0,0,0
3,11,50,1457,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2003,N,56,1296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,155,169,144,194,173,267,199,265,190,248,171,244,215,249,188,235,0,0,0,0,0,0,0,0,0,0
4,11,71,1208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2003,N,77,1400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11,12,9,13,9,49,19,21,18,46,20,65,19,20,18,29,0,0,0,0,1,1,0,0,0,0


First, I need to create a target variable. To do this, I need to create a target variable. I will begin by randomly splitting the training dataset and assinging a variable: "team1_win".

In [3]:
group1 = []
splitter = KFold(2, shuffle = True)
indices = splitter.split(regular_season)
random_splits = []
for train, test in indices:
    group1.append(train)

In [4]:
team1_group = regular_season.iloc[group1[0]]
team2_group = regular_season.iloc[group1[1]]

In [5]:
all_cols = list(regular_season.columns)
team1_cols = []
team2_cols = []

for k in range(len(all_cols)):
    if all_cols[k][0] == "L":
        team1_cols.append(all_cols[k].replace("L", "team2", 1))
        team2_cols.append(all_cols[k].replace("L", "team1", 1))
    elif all_cols[k][0] == "W":
        team1_cols.append(all_cols[k].replace("W", "team1", 1))
        team2_cols.append(all_cols[k].replace("W", "team2", 1))
    else:
        team1_cols.append(all_cols[k])
        team2_cols.append(all_cols[k])

In [6]:
team1_group.columns = team1_cols
team2_group.columns = team2_cols

df = pd.concat([team1_group, team2_group], axis = 0, sort = True).\
sort_values(by = ["Season", "DayNum"]).reset_index(drop = True)

In [7]:
missing_locations = df[df.team1Loc.isnull()].index
for index in missing_locations:
    df.at[index, "team1Loc"] = df.iloc[index]["team2Loc"]
    
df = df.drop(columns = ["team2Loc"])
team1_win = df.team1Score > df.team2Score
team1_win = team1_win.astype("int32")
df = pd.concat([df, team1_win], axis = 1)
df = df.rename(columns = {0 : "team1_win"})

With the target variable created, I am going to create a couple more features to use in modeling. I want to create features to reflect strength of schedule, so I need to record power 5 wins and ranked team wins.

In [8]:
df.head()

Unnamed: 0,DayNum,Season,team1Loc,team1Score,team1TeamID,team1_AP_,team1_COL,team1_Conf,team1_DOL,team1_DRB_DIF,team1_MOR,team1_ORB_DIF,team1_POM,team1_PPG,team1_PPP,team1_RTH,team1_SAG,team1_TRB_DIF,team1_USA_,team1_WLK,team1_WOL,team1_ast_TO,team1_e_fg,team1_fg,team1_free_throw_pct,team1_opp_PPG,team1_opp_PPP,team1_opp_ast_TO,team1_opp_efg,team1_opp_fg,team1_opp_three_point_pct,team1_opp_trueshooting,team1_power6_wins,team1_three_point_pct,team1_top50_wins,team1_total_point_differential,team1_true_shooting,team2Score,team2TeamID,team2_AP_,team2_COL,team2_Conf,team2_DOL,team2_DRB_DIF,team2_MOR,team2_ORB_DIF,team2_POM,team2_PPG,team2_PPP,team2_RTH,team2_SAG,team2_TRB_DIF,team2_USA_,team2_WLK,team2_WOL,team2_ast_TO,team2_e_fg,team2_fg,team2_free_throw_pct,team2_opp_PPG,team2_opp_PPP,team2_opp_ast_TO,team2_opp_efg,team2_opp_fg,team2_opp_three_point_pct,team2_opp_trueshooting,team2_power6_wins,team2_three_point_pct,team2_top50_wins,team2_total_point_differential,team2_true_shooting,team1_win
0,10,2003,N,68,1104,0,27,1,27,0.0,13,0.0,15,0.0,0.0,9,11,0.0,0,14,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,62,1328,0,9,1,11,0.0,4,0.0,20,0.0,0.0,25,27,0.0,0,23,21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,1
1,10,2003,N,70,1272,0,52,0,72,0.0,40,0.0,41,0.0,0.0,15,61,0.0,0,80,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,63,1393,0,37,1,16,0.0,57,0.0,40,0.0,0.0,20,60,0.0,0,110,35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,1
2,11,2003,N,77,1400,0,11,1,9,0.0,18,0.0,9,0.0,0.0,19,18,0.0,0,20,19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,71,1208,0,12,1,13,0.0,29,0.0,49,0.0,0.0,21,46,0.0,0,65,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,1
3,11,2003,H,81,1458,0,30,1,33,0.0,31,0.0,26,0.0,0.0,22,19,0.0,0,25,22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,55,1186,0,80,0,115,0.0,177,0.0,144,0.0,0.0,124,139,0.0,0,114,115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,1
4,11,2003,N,61,1437,0,47,1,52,0.0,36,0.0,46,0.0,0.0,71,54,0.0,0,24,85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,73,1266,0,18,0,10,0.0,52,0.0,25,0.0,0.0,34,32,0.0,0,33,38,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0


In [10]:
df.to_csv("./engineered_data/full_regular_season_with_keys.csv.gz", index = False,
         compression = "gzip")

In [11]:
drop_cols = ["DayNum", "Season", "team1Loc", "team1Score", "team1TeamID", "team2Score",
            "team2TeamID"]
training_data = df.drop(columns = drop_cols)

In [12]:
for column in training_data.columns:
    if len(training_data[training_data[column].isnull()]) != 0:
        print(column)

team2_free_throw_pct


In [14]:
team2_free_throw_mean = training_data["team2_free_throw_pct"].mean()
training_data["team2_free_throw_pct"] = training_data.team2_free_throw_pct.fillna(team2_free_throw_mean)

In [19]:
train_target = training_data["team1_win"]
training_data = training_data.drop(columns = ["team1_win"])

# COUPLE OF FUNCTIONS / OBJECTS I FIND USEFUL

In [15]:
def feature_frame_correlation(feature_frame, tol = None):
    feature_corr = feature_frame.corr().abs()
    correlated_pairs = feature_corr.where(np.triu(np.ones(feature_corr.shape), k =1).\
                                          astype(np.bool)).stack().sort_values(ascending = False)
    if tol:
        return(correlated_pairs[correlated_pairs > tol])
    else:
        return(correlated_pairs)
    
def get_correlation_with_winning(subset, df, target):
    df = pd.concat([df, target], axis = 1)
    partial = df[subset]
    partial_corr = partial.corr().abs()
    correlated_pairs = partial_corr.where(np.triu(np.ones(partial_corr.shape), k =1)\
                                      .astype(np.bool)).stack().sort_values(ascending = False)
    with_target = []
    for i in correlated_pairs.index:
        if "team1_win" in i:
            with_target.append(i)
    return(correlated_pairs[with_target])

def scale_features(data, binary_features, load_scaler = None, save_scaler = False):
    if binary_features:
        to_scale = data.drop(columns = binary_features)
    else:
        to_scale = data
    scale_columns = list(to_scale.columns)
    npX = np.array(to_scale)
    
    if not load_scaler:
        ss = StandardScaler()
        ss.fit(npX)
    else:
        ss = load_scaler
    npX = ss.transform(npX)
    scaled_X = pd.DataFrame(npX)
    scaled_X.columns = scale_columns
    if binary_features:
        scaled_X = pd.concat([scaled_X, data[binary_features]], axis = 1)
    if save_scaler:
        return(scaled_X, ss)
    else:
        return(scaled_X)

def recursive_VIF(training_data, tol, binary_features):
    over_tolerance = []
    if binary_features:
        training_data = training_data.drop(columns = binary_features)
    while True:
        training_data = training_data.assign(constant = 1)
        all_cols = list(training_data.columns)
        npX = np.array(training_data)
        vif = [variance_inflation_factor(npX, i) for i in np.arange(npX.shape[1])]
        vif_ = pd.Series(vif, index = all_cols)
        vif_ = vif_.drop('constant')
        max_vif = vif_.idxmax()
        if vif_.max() > tol:
            training_data = training_data.drop(columns = [max_vif, 'constant'])
            over_tolerance.append((max_vif, vif_[max_vif]))
        else:
            return(over_tolerance)
        
def display_coef(training_data, model):
    features = list(training_data.columns)
    try:
        coefs = list(abs(model.coef_[0]))
    except:
        coefs = list(abs(model.coef_))
    coef_frame = pd.DataFrame({"feature" : features, "coefficient" : coefs}).\
    sort_values(by = ["coefficient"], ascending = False)
    zero_val = coef_frame[coef_frame.coefficient == 0.].feature
    print("There are {} coefficients with zero coefficient".format(len(zero_val)))
    print()
    print(zero_val.values)
    return(coef_frame)

In [18]:
@dataclass
class CVWrapper():
    data: any
    target: pd.core.frame.Series
    algorithm: object
    params: dict
    scoring: str = "roc_auc"
    cv: int = 5
    n_jobs: int = -1
    best_eval: float = 0.
    best_subset: list = None
        
    def evaluate(self):
        if type(self.data) == dict:
            for key in self.data:
                cv_score = self.cv_(self.data[key], self.target, self.algorithm, self.params, self.scoring,
                                  self.cv, self.n_jobs)
                if cv_score > self.best_eval:
                    self.best_eval = cv_score
                    self.best_subset = list(self.data[key].columns)
                print("CV Score of {} was: {}".format(key, cv_score))
        else:
            cv_score = self.cv_(self.data, self.target, self.algorithm, self.params, self.scoring,
                              self.cv, self.n_jobs)
            self.best_eval = cv_score
            self.best_subset = list(self.data.columns)
            print("CV Score was {}".format(self.best_eval))
        
    @staticmethod
    def cv_(xtrain, ytrain, algorithm, params, scoring, cv, n_jobs):
        if params:
            algorithm.set_params(**params)
        cv_score = cross_val_score(algorithm, xtrain, ytrain, scoring = scoring,
                                   cv = cv, n_jobs = n_jobs).mean()
        return(cv_score)
    
@dataclass
class GridSearchWrapper():
    xtrain: pd.core.frame.DataFrame
    ytrain: pd.core.frame.Series
    algorithm: object
    init_params: dict
    search_params : dict
    scoring = "roc_auc"
    cv = 5
    n_jobs = -1
    best_score: float = None
    best_params: dict = None
    
    def search(self):
        self.algorithm.set_params(**self.init_params)
        grid_search = GridSearchCV(self.algorithm, param_grid = self.search_params, scoring = self.scoring,
                                  n_jobs = self.n_jobs, cv = self.cv)
        grid_search.fit(self.xtrain, self.ytrain)
        self.best_score = grid_search.best_score_
        self.best_params = grid_search.best_params_
        print("Best score: {}".format(self.best_score))
        print()
        print("Best parameters: {}".format(self.best_params))

# BASELINE CROSS-VALIDATION SCORE

In [20]:
lr_params = {
    "C" : 0.1,
    "penalty" : "l1",
    "solver" : "liblinear",
    "fit_intercept" : True
}

cv = CVWrapper(training_data, train_target, LogisticRegression(), lr_params)
cv.evaluate()

CV Score was 0.8221410267928924


# FEATURE SELECTION

In [24]:
game1_indices = training_data[(training_data.team1_PPG == 0.) |
                             (training_data.team2_PPG == 0.)].index
training_data = training_data.drop(game1_indices, axis = 0).reset_index(drop = True)
train_target = train_target.drop(game1_indices, axis = 0).reset_index(drop = True)

In [26]:
binary_features = ["team1_AP_", "team1_Conf", "team1_USA_", "team2_AP_", "team2_Conf",
                  "team2_USA_"]
scaled_training, ss = scale_features(training_data, binary_features, save_scaler = True)

In [27]:
with open("./models/model_ready_full_regular_season_scaler.pk", "wb") as f:
    pickle.dump(ss, f)

## LassoCV

In [30]:
lassoCV_params = {
    "eps" : 0.001,
    "n_alphas" : 100,
    "alphas" : None,
    "fit_intercept" : False,
    "cv" : 10,
    "n_jobs" : -1,
    "selection" : "random"
}

lasso = LassoCV().set_params(**lassoCV_params)
lasso.fit(scaled_training, train_target)

coefs = display_coef(scaled_training, lasso)
coefs

There are 30 coefficients with zero coefficient

['team2_opp_ast_TO' 'team2_opp_efg' 'team1_e_fg' 'team2_opp_fg'
 'team1_RTH' 'team2_true_shooting' 'team1_PPP' 'team1_POM' 'team1_ORB_DIF'
 'team2_opp_PPG' 'team2_TRB_DIF' 'team2_fg' 'team1_TRB_DIF'
 'team1_opp_PPG' 'team1_ast_TO' 'team1_opp_efg' 'team1_opp_fg'
 'team1_opp_three_point_pct' 'team1_three_point_pct' 'team1_top50_wins'
 'team2_DOL' 'team2_e_fg' 'team2_DRB_DIF' 'team1_DOL' 'team2_POM'
 'team2_PPP' 'team1_fg' 'team2_WLK' 'team2_WOL' 'team1_COL']


Unnamed: 0,feature,coefficient
59,team1_Conf,0.472589
62,team2_Conf,0.383937
58,team1_AP_,0.230935
61,team2_AP_,0.216030
37,team2_RTH,0.196470
...,...,...
36,team2_PPP,0.000000
15,team1_fg,0.000000
40,team2_WLK,0.000000
41,team2_WOL,0.000000


In [34]:
lasso_drop = list(coefs[coefs.coefficient == 0.]["feature"].values)
lasso_reduced = training_data.drop(columns = lasso_drop)

## TREE BASED

In [36]:
selector = FeatureSelector(scaled_training, train_target, params = {"n_estimators" : 100})
selector.recursive_selection()

'Cannot reduce feature frame anymore. Reduce drop size if desired'

In [43]:
tree_reduced = scaled_training[selector.best_subset]

# VIF 

In [47]:
vif_drop = recursive_VIF(scaled_training, 5.0, binary_features)
vif_drop

  vif = 1. / (1. - r_squared_i)


[('team1_DRB_DIF', inf),
 ('team2_DRB_DIF', inf),
 ('team2_true_shooting', 71.3910048496342),
 ('team1_true_shooting', 69.18244040254731),
 ('team1_opp_efg', 45.29295135956014),
 ('team2_opp_efg', 42.53384637571467),
 ('team1_opp_PPP', 38.47413713166314),
 ('team2_RTH', 38.02383773896671),
 ('team2_PPP', 35.97599752158112),
 ('team1_RTH', 35.910130264244685),
 ('team2_DOL', 30.976780180435863),
 ('team1_DOL', 30.8730557430566),
 ('team1_SAG', 26.856979333263855),
 ('team2_SAG', 26.458395146046534),
 ('team1_e_fg', 26.197469929921997),
 ('team2_opp_trueshooting', 23.280700643526107),
 ('team2_COL', 23.241689866481),
 ('team1_WLK', 23.05405242527077),
 ('team2_e_fg', 22.465727682155688),
 ('team1_COL', 20.229717774994477),
 ('team2_POM', 20.188666566949475),
 ('team1_PPP', 18.769284985795903),
 ('team1_POM', 16.6396712763891),
 ('team2_WLK', 16.48872224328948),
 ('team1_opp_trueshooting', 14.168700292905832),
 ('team2_opp_PPP', 12.159107568641284),
 ('team1_PPG', 8.653005524057601),
 ('t

In [48]:
vif_drop = [i[0] for i in vif_drop]
vif_reduced = scaled_training.drop(columns = vif_drop)

## Collinearity

In [61]:
over_75 = feature_frame_correlation(scaled_training, tol = 0.75)
to_drop = list(set([i[0] for i in over_75.index]))

In [63]:
col_reduced = scaled_training.drop(columns = to_drop)

In [64]:
feature_sets = {
    "full" : scaled_training,
    "lasso_reduced" : lasso_reduced,
    "tree_reduced" : tree_reduced,
    "vif_reduced" : vif_reduced,
    "collinearity_reduced" : col_reduced
}

SVM_params = {
    "loss" : "squared_hinge",
    "penalty" : "l1",
    "learning_rate" : "adaptive",
    "eta0" : 0.1,
    "alpha" : 0.001,
    "fit_intercept" : False
}

cv = CVWrapper(feature_sets, train_target, SGDClassifier(), SVM_params)
cv.evaluate()

CV Score of full was: 0.8164933058975642
CV Score of lasso_reduced was: 0.7073020140657433
CV Score of tree_reduced was: 0.8157587625834749
CV Score of vif_reduced was: 0.8088405195116556
CV Score of collinearity_reduced was: 0.7699785656405892


In [65]:
log_params = {
    "fit_intercept" : False,
    "penalty" : "l1",
    "solver" : "liblinear", 
    "C" : 0.1
}

cv = CVWrapper(feature_sets, train_target, LogisticRegression(), log_params)
cv.evaluate()

CV Score of full was: 0.8162301738954838
CV Score of lasso_reduced was: 0.8125826670613474
CV Score of tree_reduced was: 0.8156832047832483
CV Score of vif_reduced was: 0.8088523734568334
CV Score of collinearity_reduced was: 0.7702882485451907


In [105]:
@dataclass
class L1Reducer():
    xtrain: pd.core.frame.DataFrame
    ytrain: pd.core.frame.Series
    algorithm: object = LogisticRegression()
    params: dict =\
        field(default_factory = {"penalty" : "l1", "solver" : "liblinear", "fit_intercept" : False, "C" : 0.1})
    scoring: str = "roc"
    reduction_size: int = 1
    n_folds: int = 5
    n_jobs: any = None
    drop_zero: bool = True
    best_eval: float = 0.
    best_subset: list = field(default_factory = list)
    
    def recursive_selection(self):
        while True:
            current_eval, coefficients = self.cross_validate()
            if current_eval > self.best_eval:
                self.best_eval = current_eval
                self.best_subset = list(self.xtrain.columns)
            coefficient_frame = pd.DataFrame({"feature" : list(self.xtrain.columns),
                                             "coefficient" : coefficients})
            coefficient_frame = coefficient_frame.sort_values(by = ["coefficient"], 
                                                             ascending = True)
            if self.drop_zero and len(coefficient_frame[coefficient_frame.coefficient == 0.]) != 0:
                to_drop = list(coefficient_frame[coefficient_frame.coefficient == 0].feature.values)
            else:
                if self.xtrain.shape[1] <= self.reduction_size:
                    return("Finished evaluating")
                else:
                    to_drop = list(coefficient_frame.iloc[:self.reduction_size].feature.values)
            self.xtrain = self.xtrain.drop(columns = to_drop)
                
    def cross_validate(self):
        evaluation = 0.
        splitter = KFold(n_splits = self.n_folds)
        result_list = []
        importance = np.zeros(self.xtrain.shape[1])
        with ProcessPoolExecutor(max_workers = self.n_jobs) as executor:
            for train, test in splitter.split(self.xtrain):
                xtrain, xtest = self.xtrain.iloc[train], self.xtrain.iloc[test]
                ytrain, ytest = self.ytrain.iloc[train], self.ytrain.iloc[test]
                result_list.append(executor.submit(self.get_coefs, xtrain, ytrain,
                                                  xtest, ytest, self.algorithm, self.params,
                                                  self.scoring, self.n_folds))
        for future in result_list:
            evaluation += future.result()[0]
            importance += future.result()[1][0]
        return(evaluation, importance)
    
    @staticmethod
    def get_coefs(xtrain, ytrain, xtest, ytest, algorithm, params, scoring, n_folds):
        algorithm.set_params(**params)
        algorithm.fit(xtrain, ytrain)
        ypreds = algorithm.predict(xtest)
        if scoring == "roc":
            score = roc_auc_score(ytest, ypreds) / n_folds
        elif scoring == "acc":
            score = accuracy_score(ytest, ypreds) / n_folds
        else:
            raise ValueError("Scoring metric not supported")
        coefficients = algorithm.coef_ / n_folds
        return(score, coefficients)     

In [101]:
#logistic regression
l1 = L1Reducer(scaled_training, train_target, reduction_size = 1)
l1.recursive_selection()

'Finished evaluating'

In [102]:
log_reduced = l1.best_subset

In [106]:
SVM_params = {
    "loss" : "squared_hinge",
    "penalty" : "l1",
    "learning_rate" : "adaptive",
    "eta0" : 0.1,
    "alpha" : 0.001,
    "fit_intercept" : False
}

svm = L1Reducer(scaled_training, train_target, algorithm = SGDClassifier(),
               params = SVM_params, reduction_size = 5)
svm.recursive_selection()

'Finished evaluating'

In [107]:
svm_reduced = svm.best_subset

The highest performing feature set is the full feature set

# MODEL TUNING 

In [116]:
#set non-default hyperparameters before grid search
SVM_init = {
    "fit_intercept" : False,
    "eta0" : 0.1,
}

SVM_param_search = {
    "loss" : ["hinge", "squared_hinge"],
    "penalty" : ["l1", "l2", "elasticnet"],
    "alpha" : [0.0001, 0.001, 0.1],
    "learning_rate" : ["optimal", "adaptive"]    
}

tune_SVM = GridSearchWrapper(scaled_training, train_target, SGDClassifier(), SVM_init, SVM_param_search)
tune_SVM.search()

Best score: 0.8163889758786199

Best parameters: {'alpha': 0.001, 'learning_rate': 'adaptive', 'loss': 'squared_hinge', 'penalty': 'l1'}


In [117]:
huber_init = {
    "fit_intercept" : False,
    "eta0" : 0.1,
    "loss" : "modified_huber"
}

huber_param_search = {
    "penalty" : ["l1", "l2", "elasticnet"],
    "alpha" : [0.0001, 0.001, 0.1],
    "learning_rate" : ["optimal", "adaptive"]    
}

tune_Huber = GridSearchWrapper(scaled_training, train_target, SGDClassifier(), huber_init, huber_param_search)
tune_Huber.search()

Best score: 0.8162705405644545

Best parameters: {'alpha': 0.001, 'learning_rate': 'adaptive', 'penalty': 'l1'}


In [119]:
sgd_params = {
    "fit_intercept" : False,
    "eta0" : 0.1,
    "loss" : "squared_hinge",
    "learning_rate" : "adaptive",
    "penalty" : "l1",
    "alpha" : 0.001
}

sgd_final = SGDClassifier().set_params(**sgd_params)
sgd_final.fit(scaled_training, train_target)

with open("./models/regular_season_full_SVM.pk", "wb") as f:
    pickle.dump(sgd_final, f)

In [120]:
full_regular_season_features = list(scaled_training.columns)
with open("./models/full_reg_feature_set.json", "w+") as f:
    json.dump(full_regular_season_features, f)