In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from keras.callbacks import EarlyStopping
from keras import Sequential
from keras.layers import Input, Dense
from keras.utils.np_utils import to_categorical
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
import warnings
import pickle

In [20]:
df = pd.read_csv("compare.csv")

X = df.drop(["home_win"], axis=1)
y = df.home_win.ravel()
X_train, X_test, y_train, y_test = train_test_split(X, y)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [3]:
# function to create + fit model, required for KerasClassifier

def create_model(layers,nodes,activation,batch_size,optimizer):
    model = Sequential()
    model.add(Input(shape=(14,)))
    for x in range(layers):
        model.add(Dense(nodes, activation=activation))
    model.add(Dense(2, activation='sigmoid'))
    model.compile(optimizer=optimizer,loss='categorical_crossentropy',metrics=['accuracy'])
    model.fit(predictors,target,batch_size=batch_size,epochs=50,validation_data=(X_val, y_val),callbacks=callbacks,verbose=0)
    model._estimator_type = 'regressor'
    return model

# adjust format of inputs for neural network classifier
predictors = np.matrix(X_train)
target = to_categorical(y_train)
X_val = np.matrix(X_test)
y_val = to_categorical(y_test)

# early stopping callback to expedite tuning search
callbacks = [EarlyStopping(monitor="val_loss", min_delta=1e-3,patience=3,verbose=0)]

In [4]:
clf1 = GradientBoostingClassifier(loss='exponential', n_estimators=250,
                                 min_samples_split=5, min_samples_leaf=4,
                                 max_features=8).fit(X_train,y_train)
clf2 = LogisticRegression(penalty='l1',solver='saga',warm_start=True).fit(X_train,y_train)
clf3 = create_model(16,128,'relu',64,'adam')

In [5]:
def soft_voting(input,*args):
    '''takes an input array and fitted predictors, and returns an array of their predictions averaged'''
    preds = {}
    clfs=[]
    vote_preds = []
    for arg in args:
        if arg._estimator_type == 'classifier':
            pred = arg.predict_proba(input)
            preds[arg] = pred
        else:
            new_preds = []
            pred = arg.predict(input)
            for entry in pred:
                new_preds.append(entry[1])
            preds[arg] = new_preds
    for key in preds.keys():
        clfs.append(key)
    for x in range(len(input)):
        votes = []
        for clf in clfs:
            votes.append(preds[clf][x])
        vote_preds.append(np.mean(votes))
    return vote_preds

def hard_voting(input,*args):
    '''takes an input array and fitted predictors, and returns an array of their prediction consensus'''
    preds = {}
    clfs=[]
    vote_preds = []
    for arg in args:
        if arg._estimator_type == 'classifier':
            pred = arg.predict(input)
            preds[arg] = pred
        else:
            new_preds = []
            pred = (arg.predict(input) > 0.5).astype("int32")
            for entry in pred:
                new_preds.append(entry[1])
            preds[arg] = new_preds
    for key in preds.keys():
        clfs.append(key)
    for x in range(len(input)):
        votes = []
        for clf in clfs:
            votes.append(preds[clf][x])
        vote_preds.append(np.mean(votes))
    return vote_preds

In [14]:
def check_models(X_test,y_test):
    '''takes an input subset of our dataset, and returns the AUROC for each of our models'''
    scores = []
    for clf in [clf1,clf2,clf3]:
        if clf._estimator_type =='classifier':
            y_pred = clf.predict(X_test)
            fpr, tpr, thresholds = roc_curve(y_test, y_pred)
            roc_auc = auc(fpr, tpr)
            scores.append(roc_auc)
        else:
            y_pred = []
            preds = clf.predict(X_test)
            for entry in preds:
                y_pred.append(entry[1])
            fpr, tpr, thresholds = roc_curve(y_test, y_pred)
            roc_auc = auc(fpr, tpr)
            scores.append(roc_auc)
    return scores

In [9]:
clf1.predict(X_test).shape

(5694,)

In [10]:
clf2.predict(X_test).shape

(5694,)

In [11]:
clf3.predict(X_test).shape

(5694, 2)

In [15]:
check_models(X_test,y_test)

[0.6157571253021276, 0.6182168292542753, 0.6759997572002083]

In [17]:
check_models(X_test,y_test)

[0.6520221370796084, 0.6165839103368097, 0.6936722747522878]

In [19]:
check_models(X_test,y_test)

[0.6422555168408828, 0.6080925280681377, 0.6915832365466512]

In [21]:
check_models(X_test,y_test)

[0.6415262354913883, 0.618715445352853, 0.7039357960103998]