In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics

In [16]:
df = pd.read_csv("compare.csv")

In [17]:
X = df.drop(["home_win"], axis=1)
y = df.home_win.ravel()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=49)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Model One: GradientBoostingClassifier

In [4]:
params = {
    'loss':['exponential','deviance'],
    'learning_rate':[0.05,0.1,0.2,0.3,0.4],
    'n_estimators':[10,250,500],
    'criterion':['friedman_mse', 'mse', 'mae'],
    'min_samples_split':[1,5,10],
    'min_samples_leaf':[1,4,8],
    'min_weight_fraction_leaf':[0,0.05,0.1],
    'max_depth':[2,3,4,7,8,9,None],
    'min_impurity_decrease':[0,0.01,0.05],
    'max_features':['sqrt','log2',8],
    'warm_start':[True,False],
    'n_iter_no_change':[25],
    'ccp_alpha':[0,1000,2000]   
}

In [5]:
# I ran this code block separately on a VM, to get the parameters
# featured below
'''
# run a RandomizedSearch
import warnings
warnings.simplefilter("ignore")
gbc = GradientBoostingClassifier()
gb_cv = RandomizedSearchCV(gbc, params, n_jobs=-1, n_iter=500)
gb_cv.fit(X_train, y_train)
print(gb_cv.best_params_)
'''

'warm_start': False, 'n_iter_no_change': 25, 'n_estimators': 250, 'min_weight_fraction_leaf': 0, 'min_samples_split': 5, 'min_samples_leaf': 4, 'min_impurity_decrease': 0, 'max_features': 8, 'max_depth': 3, 'loss': 'exponential', 'learning_rate': 0.1, 'criterion': 'friedman_mse'

SyntaxError: invalid syntax (<ipython-input-5-63858458e4ea>, line 13)

In [6]:
# check AUROC of gbc with best parameters found in random search
gbc = GradientBoostingClassifier(warm_start= False, random_state= 49,
                                 n_iter_no_change= 25, n_estimators= 500, 
                                 min_weight_fraction_leaf= 0.05, min_samples_split= 5,
                                 min_samples_leaf= 1, min_impurity_decrease= 0.05,
                                 max_features= 'sqrt', max_depth= 2, loss= 'exponential',
                                 learning_rate= 0.4, criterion = 'mae', ccp_alpha= 0)
gbc.fit(X_train,y_train)
y_pred = gbc.predict(X_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("Random Search's Tuned GBC AUROC= " + str(round(roc_auc,3)))

Random Search's Tuned GBC AUROC= 0.637


In [9]:
# check AUROC of gbc with best parameters found in PaperSpace's 500 iteration random search
gbc = GradientBoostingClassifier(warm_start= False,
                                 n_iter_no_change= 25, n_estimators= 250, 
                                 min_weight_fraction_leaf= 0, min_samples_split= 5,
                                 min_samples_leaf= 4, min_impurity_decrease= 0,
                                 max_features= 8, max_depth= 3, loss= 'exponential',
                                 learning_rate= 0.1, criterion = 'friedman_mse', ccp_alpha= 0)
gbc.fit(X_train,y_train)
y_pred = gbc.predict(X_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("Random Search's Tuned GBC AUROC= " + str(round(roc_auc,3)))

Random Search's Tuned GBC AUROC= 0.636


In [6]:
# hyperopt for tuning (in lieu of random search)
from hyperopt import tpe,hp,fmin,STATUS_OK,Trials
from hyperopt.pyll.base import scope

In [7]:
# define hyperparameter space
space = {
    'loss': 'exponential',
    'criterion': 'mae',
    'learning_rate': hp.quniform("learning_rate",0.01,0.5,0.01),
    'n_estimators': hp.choice('n_estimators',np.arange(10, 1000, 50, dtype=int)),
    'min_samples_split':hp.choice("min_samples_split",np.arange(2, 10, 1, dtype=int)),
    'min_samples_leaf': hp.choice("min_samples_leaf",np.arange(2, 10, 1, dtype=int)),
    'min_weight_fraction_leaf':hp.choice("min_weight_fraction_leaf",[0,0.05,0.1]),
    'max_depth': hp.uniform("max_depth",1,12),
    'min_impurity_decrease':hp.choice("min_impurity_decrease",[0,0.01,0.05]),
    'max_features':hp.choice("max_features",['sqrt','log2',8]),
    'warm_start':hp.choice("warm_start", [True,False]),
    'ccp_alpha':hp.uniform("ccp_alpha",0,2500)
}

In [11]:
# define objective function
def objective(space):
    clf = GradientBoostingClassifier(loss= space['loss'],
                                     learning_rate=space['learning_rate'],
                                     n_estimators=space['n_estimators'],
                                     criterion = space['criterion'],
                                     min_samples_split=space['min_samples_split'],
                                     min_samples_leaf=space['min_samples_leaf'],
                                     min_weight_fraction_leaf=space['min_weight_fraction_leaf'],
                                     max_depth=space['max_depth'],
                                     min_impurity_decrease=space['min_impurity_decrease'],
                                     max_features=space['max_features'],
                                     warm_start=space['warm_start'],
                                     ccp_alpha=space['ccp_alpha'],
                                    )

    clf.fit(X_train,y_train)
    acc = cross_val_score(clf, X_train, y_train, cv=5).mean()
    return{'loss':-acc, 'status': STATUS_OK }

In [12]:
# initialize trials object
trials = Trials()

best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=25,
    trials=trials
)

print("Best: {}".format(best))

100%|██████████████████████████████████████████| 25/25 [66:14:20<00:00, 9538.44s/trial, best loss: -0.5979743622430622]
Best: {'ccp_alpha': 1120.172193833575, 'learning_rate': 0.43, 'max_depth': 8.121216684877313, 'max_features': 0, 'min_impurity_decrease': 0, 'min_samples_leaf': 6, 'min_samples_split': 6, 'min_weight_fraction_leaf': 2, 'n_estimators': 10, 'warm_start': 0}


In [23]:
# test hyperopt's recommended gbc here for AUROC
gbc = GradientBoostingClassifier(learning_rate=0.43,
                                     n_estimators=10,
                                     criterion='mse',
                                     min_samples_split=6,
                                     min_samples_leaf=6,
                                     min_weight_fraction_leaf=0.1,
                                     max_depth=8,
                                     max_features=None,
                                     warm_start=False,
                                     ccp_alpha=0
                                    )
gbc.fit(X_train,y_train)
y_pred = gbc.predict(X_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("HyperOpt's Tuned GBC AUROC= " + str(round(roc_auc,3)))
# check accuracy
predictions = pd.DataFrame(y_pred, columns=['home_win_prob'])
predictions['binary'] = predictions['home_win_prob'] > 0.5
y_pred = np.array(predictions['binary'])
print("Tuned GBC Accuracy = " + str(round(accuracy_score(y_test,y_pred),3)))

HyperOpt's Tuned GBC AUROC= 0.626
Tuned GBC Accuracy = 0.654


In [24]:
# test hyperopt's recommended gbc here for AUROC
gbc = GradientBoostingClassifier(learning_rate=0.42,
                                     n_estimators=16,
                                     criterion='mse',
                                     min_samples_split=5,
                                     min_samples_leaf=6,
                                     min_weight_fraction_leaf=0,
                                     max_depth=10,
                                     min_impurity_decrease=0,
                                     max_features=8,
                                     warm_start=True,
                                     ccp_alpha=0
                                    )
gbc.fit(X_train,y_train)
y_pred = gbc.predict(X_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("HyperOpt's Tuned GBC AUROC= " + str(round(roc_auc,3)))
# check accuracy
predictions = pd.DataFrame(y_pred, columns=['home_win_prob'])
predictions['binary'] = predictions['home_win_prob'] > 0.5
y_pred = np.array(predictions['binary'])
print("Tuned GBC Accuracy = " + str(round(accuracy_score(y_test,y_pred),3)))

HyperOpt's Tuned GBC AUROC= 0.599
Tuned GBC Accuracy = 0.622


I tried RandomSearch and HyperOpt to see which could find the better set of hyperparameters for our GradientBoostingClassifier.

Our RandomSearch found a model with an accuracy of __________
Our HyperOpt found a model with an accuracy of _______.

The best performing GradientBoostingClassifier we have has these hyperparameters:

In [None]:
# gbc final model
gbc = GradientBoostingClassifier()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics

df = pd.read_csv("/storage/compare.csv")

X = df.drop(["home_win"], axis=1)
y = df.home_win.ravel()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=49)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

params = {
    'loss':['exponential','deviance'],
    'learning_rate':[0.05,0.1,0.2,0.3,0.4],
    'n_estimators':[10,250,500],
    'criterion':['friedman_mse', 'mse', 'mae'],
    'min_samples_split':[2,5,10],
    'min_samples_leaf':[2,4,8],
    'min_weight_fraction_leaf':[0,0.05,0.1],
    'max_depth':[2,3,4,7,8,9,None],
    'min_impurity_decrease':[0,0.01,0.05],
    'max_features':['sqrt','log2',8],
    'warm_start':[True,False],
    'n_iter_no_change':[25]
}

# run a RandomizedSearch here
import warnings
warnings.simplefilter("ignore")
gbc = GradientBoostingClassifier()
gb_cv = RandomizedSearchCV(gbc, params, n_jobs=-1, n_iter=500)
gb_cv.fit(X_train, y_train)
print(gb_cv.best_params_)

## Model Two: Logistic Regression

In [76]:
params = {
    'penalty':['l1','l2','elasticnet','none'],
    'fit_intercept':[True,False],
    'random_state':[49],
    'solver':['newton-cg','lbfgs','liblinear','sag','saga'],
    'max_iter':[10,25,100,500],
    'warm_start':[True,False],
    'n_jobs':[6]
}

In [77]:
# run a GridSearch here
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import warnings
warnings.simplefilter("ignore")

logit = LogisticRegression()
logit_cv = GridSearchCV(logit, param_grid=params, scoring="roc_auc", n_jobs=6)
logit_cv.fit(X_train, y_train)
print(logit_cv.best_params_)

{'fit_intercept': True, 'max_iter': 10, 'n_jobs': 6, 'penalty': 'none', 'random_state': 49, 'solver': 'sag', 'warm_start': True}


In [79]:
# check AUROC for model using best params from the GridSearch
logit = LogisticRegression(max_iter=10,penalty='none',solver='sag',warm_start=True)
logit.fit(X_train,y_train)
y_pred = logit.predict(X_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("GridSearch Tuned Logit AUROC= " + str(round(roc_auc,3)))

GridSearch Tuned Logit AUROC= 0.625


Because it is less computationally intensive, I ran a GridSearch to tune the Logistic Regression model. It returned a final model result with an AUROC of 0.625.
The final model's hyperparameters are included below:

In [80]:
# final Logit Model
logit = LogisticRegression(max_iter=10,penalty='none',solver='sag',warm_start=True)

## Model Three: Neural Network

## Model Four: Soft Voting Classifier

In [None]:
# soft voting classifier
clf1 = gbc
clf2 = logit
clf3 = NN

eclf = VotingClassifier(
    estimators=[('gbc', clf1), ('logit', clf2), ('NN', clf3)], voting='soft')

In [None]:
for clf, label in zip([clf1, clf2, clf3, eclf], ['Gradient Boost', 'Logit', 'Neural Network', 'Ensemble']):
    scores = cross_val_score(clf, X_train, y_train, cv=5)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

In [None]:
eclf.fit(X_train,y_train)
y_pred = eclf.predict(X_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("Soft voting AUROC: " + str(round(roc_auc,3)))

Conclusion: the best performing of the three individual models is __________.
The soft voting classifier, by comparison, performs _________________.

For the purpose of most accurately predicting game outcomes, I would opt to use the ___________.