<a href="https://colab.research.google.com/github/ceb263/nhl/blob/main/xG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Imports and input data
import os
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import log_loss, roc_auc_score

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from keras.layers.core import Dense, Activation, Dropout
from keras.models import Sequential, load_model

%matplotlib inline

pd.set_option('display.max_rows', 150)

In [None]:
!unzip data_2012-2019.zip

Archive:  data_2012-2019.zip
  inflating: pbp_2019.pkl            
  inflating: __MACOSX/._pbp_2019.pkl  
  inflating: pbp_2012.pkl            
  inflating: __MACOSX/._pbp_2012.pkl  
  inflating: pbp_2013.pkl            
  inflating: __MACOSX/._pbp_2013.pkl  
  inflating: pbp_2014.pkl            
  inflating: __MACOSX/._pbp_2014.pkl  
  inflating: pbp_2015.pkl            
  inflating: __MACOSX/._pbp_2015.pkl  
  inflating: pbp_2016.pkl            
  inflating: __MACOSX/._pbp_2016.pkl  
  inflating: pbp_2017.pkl            
  inflating: __MACOSX/._pbp_2017.pkl  
  inflating: pbp_2018.pkl            
  inflating: __MACOSX/._pbp_2018.pkl  


In [None]:
# read data
plays = pd.read_pickle('pbp_2012.pkl')
plays['Season'] = 2012
plays2013 = pd.read_pickle('pbp_2013.pkl')
plays2013['Season'] = 2013
plays2014 = pd.read_pickle('pbp_2014.pkl')
plays2014['Season'] = 2014
plays2015 = pd.read_pickle('pbp_2015.pkl')
plays2015['Season'] = 2015
plays2016 = pd.read_pickle('pbp_2016.pkl')
plays2016['Season'] = 2016
plays2017 = pd.read_pickle('pbp_2017.pkl')
plays2017['Season'] = 2017
plays2018 = pd.read_pickle('pbp_2018.pkl')
plays2018['Season'] = 2018
plays2019 = pd.read_pickle('pbp_2019.pkl')
plays2019['Season'] = 2019
plays = pd.concat([plays, plays2013], ignore_index=True)
plays = pd.concat([plays, plays2014], ignore_index=True)
plays = pd.concat([plays, plays2015], ignore_index=True)
plays = pd.concat([plays, plays2016], ignore_index=True)
plays = pd.concat([plays, plays2017], ignore_index=True)
plays = pd.concat([plays, plays2018], ignore_index=True)
plays = pd.concat([plays, plays2019], ignore_index=True)

del plays2013, plays2014, plays2015, plays2016, plays2017, plays2018, plays2019

In [None]:
# data preprocessing
def preprocess_plays(df):
    # remove null location data
    df = df.loc[~df['xC'].isnull()]
    df = df.loc[~df['yC'].isnull()]

    # get previous event time and location
    df = df.sort_values(by=['Game_Id','Period','Seconds_Elapsed'])
    df['prev_Game_Id'] = df['Game_Id'].shift(1)
    df['prev_Period'] = df['Period'].shift(1)
    df['keepPrev'] = ((df['prev_Game_Id']==df['Game_Id']) & (df['prev_Period']==df['Period'])).astype(int)
    df['prev_Event'] = df['Event'].shift(1)
    df['prev_Seconds_Elapsed'] = df['Seconds_Elapsed'].shift(1)
    df['prev_xC'] = df['xC'].shift(1)
    df['prev_yC'] = df['yC'].shift(1)
    df.at[df['keepPrev']==0, ['prev_Event','prev_Seconds_Elapsed','prev_xC','prev_yC']] = np.NaN
    df = df.loc[~df['prev_Event'].isnull()]

    # get time elapsed, and distance from previous event
    df['timeSincePrev'] = df['Seconds_Elapsed'] - df['prev_Seconds_Elapsed']
    df['distanceSincePrev'] = np.sqrt(np.square(df['xC']-df['prev_xC']) + np.square(df['yC']-df['prev_yC']))
    df['yDistanceSincePrev'] = np.abs(df['yC'] - df['prev_yC'])

    # remove invalid data
    df = df.loc[(df['timeSincePrev']>0) | (df['timeSincePrev'].isnull())]

    # filter for only shots and shot attempts
    df = df.loc[df['Event'].isin(['BLOCK','MISS','SHOT','GOAL'])]

    # filter for only most common game states
    df = df.loc[df['Strength'].isin(['5x5','4x5','3x5','5x4','4x4','5x3','4x3','6x5','5x6','3x4','3x3'])]

    # get previous shot time and location, and then calculate derived metrics
    df['prevShot_Game_Id'] = df['Game_Id'].shift(1)
    df['prevShot_Period'] = df['Period'].shift(1)
    df['keepPrevShot'] = ((df['prevShot_Game_Id']==df['Game_Id']) & (df['prevShot_Period']==df['Period'])).astype(int)
    df['prevShot_Seconds_Elapsed'] = df['Seconds_Elapsed'].shift(1)
    df['prevShot_xC'] = df['xC'].shift(1)
    df['prevShot_yC'] = df['yC'].shift(1)
    df['prevShot_Ev_Team'] = df['Ev_Team'].shift(1)
    df['prevShot_sameTeam'] = (df['prevShot_Ev_Team']==df['Ev_Team']).astype(int)
    df.at[df['keepPrevShot']==0, ['prevShot_Seconds_Elapsed','prevShot_xC','prevShot_yC','prevShot_Ev_Team']] = np.NaN
    df['timeSincePrevShot'] = df['Seconds_Elapsed'] - df['prevShot_Seconds_Elapsed']
    df['distanceSincePrevShot'] = np.sqrt(np.square(df['xC']-df['prevShot_xC']) + np.square(df['yC']-df['prevShot_yC']))
    df['yDistanceSincePrevShot'] = np.abs(df['yC'] - df['prevShot_yC'])

    # adjust shot locations so everything is on the same side of the ice
    # TODO this isn't quite right - shots from the D zone (into an empty net, for example), will not be adjusted correctly
    df['loc_adjust_factor'] = (((df['xC']>0).astype(int).astype(float)) - 0.5) * 2
    df['x_adj'] = df['xC']*df['loc_adjust_factor']
    df['y_adj'] = df['yC']*df['loc_adjust_factor']
    df['prev_loc_adjust_factor'] = (((df['prev_xC']>0).astype(int).astype(float)) - 0.5) * 2
    df['prev_x_adj'] = df['prev_xC']*df['prev_loc_adjust_factor']
    df['prev_y_adj'] = df['prev_yC']*df['prev_loc_adjust_factor']
    df['prevShot_loc_adjust_factor'] = (((df['prevShot_xC']>0).astype(int).astype(float)) - 0.5) * 2
    df['prevShot_x_adj'] = df['prevShot_xC']*df['prevShot_loc_adjust_factor']
    df['prevShot_y_adj'] = df['prevShot_yC']*df['prevShot_loc_adjust_factor']

    # fill nulls
    df[['prevShot_Seconds_Elapsed','prevShot_yC','prevShot_y_adj','prevShot_xC']] = \
        df[['prevShot_Seconds_Elapsed','prevShot_yC','prevShot_y_adj','prevShot_xC']].fillna(0)
    df[['prevShot_x_adj','distanceSincePrevShot','yDistanceSincePrevShot']] = df[['prevShot_x_adj','distanceSincePrevShot','yDistanceSincePrevShot']].fillna(-1)
    df[['timeSincePrevShot']] = df[['timeSincePrevShot']].fillna(1200)

    # fix time since prev shot if prev shot was in another period
    df.at[df['timeSincePrevShot']<0, 'timeSincePrevShot'] = 1200

    # adjust score to be score for and against, instead of home and away
    df['homeTeamShot'] = (df['Home_Team']==df['Ev_Team']).astype(int)
    df['scoreFor'] = (df['Home_Score']*df['homeTeamShot']) + (df['Away_Score']*(1-df['homeTeamShot']))
    df['scoreAgainst'] = (df['Away_Score']*df['homeTeamShot']) + (df['Home_Score']*(1-df['homeTeamShot']))
    df['scoreDiff'] = df['scoreFor'] - df['scoreAgainst']

    # add target variable
    df['goal'] = (df['Event']=='GOAL').astype(int)

    return df

In [None]:
shots = preprocess_plays(plays)

In [None]:
# select feature and target columns
X = shots[['Period','Seconds_Elapsed','scoreFor','scoreAgainst','scoreDiff','xC','yC','prev_Seconds_Elapsed','prev_xC','prev_yC','timeSincePrev','distanceSincePrev',
           'yDistanceSincePrev','prevShot_Seconds_Elapsed','prevShot_xC','prevShot_yC','prevShot_sameTeam','timeSincePrevShot','distanceSincePrevShot',
           'yDistanceSincePrevShot','x_adj','y_adj','prev_x_adj','prev_y_adj','prevShot_x_adj','prevShot_y_adj','Strength','Ev_Zone','Type','prev_Event','goal','Season']]
y = shots[['goal','Season']]

# train/test split
#[X_train, X_test, y_train, y_test] = train_test_split(X, y, test_size=0.3, random_state=26)
X_train = X.loc[X['Season']!=2019]
X_test = X.loc[X['Season']==2019]
y_train = y.loc[y['Season']!=2019]['goal'].values
y_test = y.loc[y['Season']==2019]['goal'].values

# categorical feature encodings
mean_codes_strength = X_train.groupby(['Strength'])['goal'].mean().to_dict()
mean_codes_zone = X_train.groupby(['Ev_Zone'])['goal'].mean().to_dict()
mean_codes_type = X_train.groupby(['Type'])['goal'].mean().to_dict()
mean_codes_prevEvent = X_train.groupby(['prev_Event'])['goal'].mean().to_dict()

tot_count = X_train['goal'].count()
count_codes_strength = (X_train.groupby(['Strength'])['goal'].count()/tot_count).to_dict()
count_codes_zone = (X_train.groupby(['Ev_Zone'])['goal'].count()/tot_count).to_dict()
count_codes_type = (X_train.groupby(['Type'])['goal'].count()/tot_count).to_dict()
count_codes_prevEvent = (X_train.groupby(['prev_Event'])['goal'].count()/tot_count).to_dict()

X_train['Strength_meanEnc'] = X_train['Strength'].map(mean_codes_strength)
X_test['Strength_meanEnc'] = X_test['Strength'].map(mean_codes_strength)
X_train['Ev_Zone_meanEnc'] = X_train['Ev_Zone'].map(mean_codes_zone)
X_test['Ev_Zone_meanEnc'] = X_test['Ev_Zone'].map(mean_codes_zone)
X_train['Type_meanEnc'] = X_train['Type'].map(mean_codes_type)
X_test['Type_meanEnc'] = X_test['Type'].map(mean_codes_type)
X_train['prev_Event_meanEnc'] = X_train['prev_Event'].map(mean_codes_prevEvent)
X_test['prev_Event_meanEnc'] = X_test['prev_Event'].map(mean_codes_prevEvent)
X_train['Strength_countEnc'] = X_train['Strength'].map(count_codes_strength)
X_test['Strength_countEnc'] = X_test['Strength'].map(count_codes_strength)
X_train['Ev_Zone_countEnc'] = X_train['Ev_Zone'].map(count_codes_zone)
X_test['Ev_Zone_countEnc'] = X_test['Ev_Zone'].map(count_codes_zone)
X_train['Type_countEnc'] = X_train['Type'].map(count_codes_type)
X_test['Type_countEnc'] = X_test['Type'].map(count_codes_type)
X_train['prev_Event_countEnc'] = X_train['prev_Event'].map(count_codes_prevEvent)
X_test['prev_Event_countEnc'] = X_test['prev_Event'].map(count_codes_prevEvent)

X_train_df = X_train.copy(deep=True)
X_test_df = X_test.copy(deep=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [None]:
features = ['Period','Seconds_Elapsed','scoreDiff','yC','prev_Seconds_Elapsed','timeSincePrev','distanceSincePrev',
           'prevShot_Seconds_Elapsed','prevShot_sameTeam','timeSincePrevShot',
           'x_adj','y_adj','Strength_meanEnc','Ev_Zone_meanEnc','Type_meanEnc']
X_train = X_train_df[features].values
X_test = X_test_df[features].values

# apply scaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Model Testing

In [25]:
[_, X_subsample, _, y_subsample] = train_test_split(X_train, y_train, test_size=0.05, random_state=26)

In [26]:
X_subsample.shape

(41460, 15)

In [30]:
model_test = GradientBoostingClassifier()

param_dist = {
    'max_depth' : [12,15],
    'min_samples_leaf' : [1000,5000]
}

random_search = GridSearchCV(model_test, param_dist, scoring=['neg_log_loss','roc_auc'], refit='neg_log_loss', cv=3, return_train_score=True)
random_search.fit(X_subsample, y_subsample)

report_cols = ['mean_test_neg_log_loss','mean_test_roc_auc']+['param_'+param for param in param_dist]
report = pd.DataFrame(random_search.cv_results_)[report_cols].sort_values(by='mean_test_neg_log_loss', ascending=False)
report

Unnamed: 0,mean_test_neg_log_loss,mean_test_roc_auc,param_max_depth,param_min_samples_leaf
0,-0.159329,0.801846,12,1000
2,-0.159639,0.800997,15,1000
1,-0.161883,0.789792,12,5000
3,-0.161883,0.789792,15,5000


# Selected Model Training

Originally was testing different types of models (neural nets, gradient boosting trees, random forest, K nearest neighbors, and logistic regression), with a meta model built on top. Neural networks, gradient boosting, and random forest all performed similarly, and better than KNN or logistic regression. Performance on test data did not improve from those models by themselves to adding a meta model, so the final model is just a gradient boosting classifier.

The final model trained on data from the 2012 - 2018 seasons, and used the 2019 season as a test set. The model achieved a LogLoss of 0.1623 and an AUC of 0.8255 on the test data (min_samples_leaf=200, max_depth=10).

In [35]:
X = np.concatenate((X_train, X_test))
y = np.concatenate((y_train, y_test))

In [36]:
model = GradientBoostingClassifier(min_samples_leaf=200, max_depth=10)
model.fit(X, y)
pickle.dump(model, open('xG_gb.pkl', 'wb'))

In [33]:
model_gb = GradientBoostingClassifier(min_samples_leaf=200, max_depth=10)
model_gb.fit(X_train, y_train)
preds_gb = model_gb.predict_proba(X_test)[:,1]
print ('LogLoss score: {}'.format(str(log_loss(y_test, preds_gb))))
print ('AUC score: {}'.format(str(roc_auc_score(y_test, preds_gb))))

LogLoss score: 0.16234477772192488
AUC score: 0.8254788756324156


In [34]:
gb_features = pd.DataFrame()
gb_features['feature'] = features
gb_features['importance'] = model_gb.feature_importances_
gb_features.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
10,x_adj,0.219661
13,Ev_Zone_meanEnc,0.160604
3,yC,0.132226
11,y_adj,0.096482
2,scoreDiff,0.068501
14,Type_meanEnc,0.064492
0,Period,0.047695
6,distanceSincePrev,0.043232
9,timeSincePrevShot,0.03482
7,prevShot_Seconds_Elapsed,0.03095


In [None]:
pickle.dump(model_gb, open('gb.pkl', 'wb'))

In [None]:
model_gb = pickle.load(open('gb.pkl', 'rb'))

In [None]:
preds_gb = model_gb.predict_proba(X_train)[:,1]
print ('LogLoss score: {}'.format(str(log_loss(y_train, preds_gb))))
print ('AUC score: {}'.format(str(roc_auc_score(y_train, preds_gb))))
np.save('train_preds_gb.npy', preds_gb)

LogLoss score: 0.1581178923761096
AUC score: 0.8192213657832067


In [None]:
model_gb2 = GradientBoostingClassifier(max_depth=3, min_weight_fraction_leaf=0.001)
model_gb2.fit(X_train, y_train)
preds_gb2 = model_gb2.predict_proba(X_test)[:,1]
print ('LogLoss score: {}'.format(str(log_loss(y_test, preds_gb2))))
print ('AUC score: {}'.format(str(roc_auc_score(y_test, preds_gb2))))

LogLoss score: 0.1658067883554617
AUC score: 0.8163280936824684


In [None]:
pickle.dump(model_gb2, open('gb2.pkl', 'wb'))

In [None]:
model_gb2 = pickle.load(open('gb2.pkl', 'rb'))

In [None]:
preds_gb2 = model_gb2.predict_proba(X_train)[:,1]
print ('LogLoss score: {}'.format(str(log_loss(y_train, preds_gb2))))
print ('AUC score: {}'.format(str(roc_auc_score(y_train, preds_gb2))))
np.save('train_preds_gb2.npy', preds_gb2)

LogLoss score: 0.15827058176118222
AUC score: 0.818599677692736


In [None]:
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)
preds_lr = model_lr.predict_proba(X_test)[:,1]
print ('LogLoss score: {}'.format(str(log_loss(y_test, preds_lr))))
print ('AUC score: {}'.format(str(roc_auc_score(y_test, preds_lr))))

LogLoss score: 0.17561467421598137
AUC score: 0.7684700372074639


In [None]:
pickle.dump(model_lr, open('lr.pkl', 'wb'))

In [None]:
model_lr = pickle.load(open('lr.pkl', 'rb'))

In [None]:
preds_lr = model_lr.predict_proba(X_train)[:,1]
print ('LogLoss score: {}'.format(str(log_loss(y_train, preds_lr))))
print ('AUC score: {}'.format(str(roc_auc_score(y_train, preds_lr))))
np.save('train_preds_lr.npy', preds_lr)

LogLoss score: 0.16806671387027766
AUC score: 0.7675220516382505


In [None]:
model_rf = RandomForestClassifier()
model_rf.fit(X_train, y_train)
preds_rf = model_rf.predict_proba(X_test)[:,1]
print ('LogLoss score: {}'.format(str(log_loss(y_test, preds_rf))))
print ('AUC score: {}'.format(str(roc_auc_score(y_test, preds_rf))))

LogLoss score: 0.2025529015331919
AUC score: 0.7921210034054453


In [None]:
rf_features = pd.DataFrame()
rf_features['feature'] = features
rf_features['importance'] = model_rf.feature_importances_
rf_features.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
6,distanceSincePrev,0.13315
4,prev_Seconds_Elapsed,0.118159
1,Seconds_Elapsed,0.118
7,prevShot_Seconds_Elapsed,0.117813
10,x_adj,0.094005
11,y_adj,0.074657
3,yC,0.074262
9,timeSincePrevShot,0.062439
2,scoreDiff,0.055062
5,timeSincePrev,0.040902


In [None]:
pickle.dump(model_rf, open('rf.pkl', 'wb'))

In [None]:
model_rf = pickle.load(open('rf.pkl', 'rb'))

In [None]:
preds_rf = model_rf.predict_proba(X_train)[:,1]
print ('LogLoss score: {}'.format(str(log_loss(y_train, preds_rf))))
print ('AUC score: {}'.format(str(roc_auc_score(y_train, preds_rf))))
np.save('train_preds_rf.npy', preds_rf)

LogLoss score: 0.14290832802468434
AUC score: 0.8902308978742874


In [None]:
def create_model_nn(input_dim):
    model = Sequential()
    model.add(Dense(512, input_dim=input_dim))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(256))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(128))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(64))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))

    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy')
    return model

In [None]:
model_nn = create_model_nn(X_train.shape[1])
model_nn.fit(X_train, y_train, verbose=1, epochs=10, batch_size=2048)
preds_nn = model_nn.predict(X_test)[:,0]
print ('LogLoss score: {}'.format(str(log_loss(y_test, np.clip(preds_nn, a_min=10e-5, a_max = 1-10e-5)))))
print ('AUC score: {}'.format(str(roc_auc_score(y_test, np.clip(preds_nn, a_min=10e-5, a_max = 1-10e-5)))))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
LogLoss score: 0.1641512059568805
AUC score: 0.8188887203127735


In [None]:
model_nn.save('nn.h5')

In [None]:
model_nn = load_model('nn.h5')

In [None]:
preds_nn = model_nn.predict(X_train)[:,0]
print ('LogLoss score: {}'.format(str(log_loss(y_train, np.clip(preds_nn, a_min=10e-5, a_max = 1-10e-5)))))
print ('AUC score: {}'.format(str(roc_auc_score(y_train, np.clip(preds_nn, a_min=10e-5, a_max = 1-10e-5)))))
np.save('train_preds_nn.npy', preds_nn)

LogLoss score: 0.16151307566498868
AUC score: 0.8208424245852204


In [None]:
def create_model_nn2(input_dim):
    model = Sequential()
    model.add(Dense(2048, input_dim=input_dim))
    model.add(Activation('relu'))
    model.add(Dropout(0.25))
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dropout(0.25))

    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy')
    return model

In [None]:
model_nn2 = create_model_nn2(X_train.shape[1])
model_nn2.fit(X_train, y_train, verbose=1, epochs=10, batch_size=2048)
preds_nn2 = model_nn2.predict(X_test)[:,0]
print ('LogLoss score: {}'.format(str(log_loss(y_test, np.clip(preds_nn2, a_min=10e-5, a_max = 1-10e-5)))))
print ('AUC score: {}'.format(str(roc_auc_score(y_test, np.clip(preds_nn2, a_min=10e-5, a_max = 1-10e-5)))))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
LogLoss score: 0.1644034389894902
AUC score: 0.8174647173880174


In [None]:
model_nn2.save('nn2.h5')

In [None]:
model_nn2 = load_model('nn2.h5')

In [None]:
preds_nn2 = model_nn2.predict(X_train)[:,0]
print ('LogLoss score: {}'.format(str(log_loss(y_train, np.clip(preds_nn2, a_min=10e-5, a_max = 1-10e-5)))))
print ('AUC score: {}'.format(str(roc_auc_score(y_train, np.clip(preds_nn2, a_min=10e-5, a_max = 1-10e-5)))))
np.save('train_preds_nn2.npy', preds_nn2)

LogLoss score: 0.15588459537329236
AUC score: 0.8238504049570357


# Meta Model

In [None]:
np.save('test_actuals.npy', y_test)
np.save('train_actuals.npy', y_train)

In [None]:
preds_gb = np.load('train_preds_gb.npy')
preds_gb2 = np.load('train_preds_gb2.npy')
preds_lr = np.load('train_preds_lr.npy')
preds_nn = np.load('train_preds_nn.npy')
preds_nn2 = np.load('train_preds_nn2.npy')
preds_rf = np.load('train_preds_rf.npy')

preds_gb_test = np.load('test_preds_gb.npy')
preds_gb2_test = np.load('test_preds_gb2.npy')
preds_lr_test = np.load('test_preds_lr.npy')
preds_nn_test = np.load('test_preds_nn.npy')
preds_nn2_test = np.load('test_preds_nn2.npy')
preds_rf_test = np.load('test_preds_rf.npy')

In [None]:
X_meta = np.column_stack((preds_gb, preds_gb2, preds_rf, preds_lr, preds_nn, preds_nn2))
X_meta = np.concatenate((X_train, X_meta), axis=1)
X_meta_test = np.column_stack((preds_gb_test, preds_gb2_test, preds_rf_test, preds_lr_test, preds_nn_test, preds_nn2_test))
X_meta_test = np.concatenate((X_test, X_meta_test), axis=1)

In [None]:
def create_model_nn_meta(input_dim):
    model = Sequential()
    model.add(Dense(1, input_dim=input_dim))
    #model.add(Activation('relu'))
    #model.add(Dropout(0.5))

    #model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy')
    return model

In [None]:
#model_nn_meta = create_model_nn_meta(X_meta.shape[1])
model_nn_meta.fit(X_meta, y_train, verbose=1, epochs=10, batch_size=512)
preds_nn_meta = model_nn_meta.predict(X_meta_test)[:,0]
print ('LogLoss score: {}'.format(str(log_loss(y_test, np.clip(preds_nn_meta, a_min=10e-5, a_max = 1-10e-5)))))
print ('AUC score: {}'.format(str(roc_auc_score(y_test, np.clip(preds_nn_meta, a_min=10e-5, a_max = 1-10e-5)))))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
LogLoss score: 0.16684700429050514
AUC score: 0.814733401315431


In [None]:
model_test = RandomForestClassifier()

param_dist = {
    'max_depth' : [1,5,10]
}

random_search = GridSearchCV(model_test, param_dist, scoring=['neg_log_loss','roc_auc'], refit='neg_log_loss', cv=3, return_train_score=True)
random_search.fit(X_meta, y_train)

report_cols = ['mean_test_neg_log_loss','mean_test_roc_auc']+['param_'+param for param in param_dist]
report = pd.DataFrame(random_search.cv_results_)[report_cols].sort_values(by='mean_test_neg_log_loss', ascending=False)
report

Unnamed: 0,mean_test_neg_log_loss,mean_test_roc_auc,param_max_depth
2,-0.094348,0.962418,10
1,-0.116484,0.940895,5
0,-0.161838,0.771469,1


In [None]:
model_meta = RandomForestClassifier(max_depth=1)
model_meta.fit(X_meta, y_train)
test_preds_meta = model_meta.predict_proba(X_meta_test)[:,1]
print ('LogLoss score: {}'.format(str(log_loss(y_test, test_preds_meta))))
print ('AUC score: {}'.format(str(roc_auc_score(y_test, test_preds_meta))))

LogLoss score: 0.17531882694905196
AUC score: 0.7488173236352991


In [None]:
model_meta = GradientBoostingClassifier(max_depth=1)
model_meta.fit(X_meta, y_train)
test_preds_meta = model_meta.predict_proba(X_meta_test)[:,1]
print ('LogLoss score: {}'.format(str(log_loss(y_test, test_preds_meta))))
print ('AUC score: {}'.format(str(roc_auc_score(y_test, test_preds_meta))))

LogLoss score: 0.1699710727159766
AUC score: 0.8054097221707287


In [None]:
print ('LogLoss score: {}'.format(str(log_loss(y_test, np.mean(X_meta_test, axis=1)))))
print ('AUC score: {}'.format(str(roc_auc_score(y_test, np.mean(X_meta_test, axis=1)))))

LogLoss score: 0.165079201893919
AUC score: 0.819913135514671
