In [1]:
# Use with NHL_Model_Data_Transform_v5.py
import pandas as pd
import numpy as np
import seaborn as sns
import os
import matplotlib.pyplot as plt

from tensorflow import keras


rng = 69

In [2]:
df = pd.read_csv(r"C:\Users\zchodaniecky\OneDrive - Franklin Templeton\Documents\Python\NHL_data\NHL_Data_Transformed.csv")
#df = pd.read_csv(r"C:\Users\zanec\OneDrive\Documents\Python\NHL_data\NHL_Data_All_Games_Transformed.csv")

In [3]:
df = df.drop(columns=['team_Home','team_Away','gameId','home_or_away_Home','home_or_away_Away','win_or_lose_Away'])

In [4]:
# df.hist(bins=50, figsize=(20,15))
# plt.show()

In [5]:
# Heatmap to visualize correlations
corr_matrix = df.corr(numeric_only=True)
# sns.heatmap(corr_matrix, cmap="PiYG")

In [6]:
corr_matrix['win_or_lose_Home'].sort_values(ascending=False)

win_or_lose_Home             1.000000
seasonPointsPerGame_Home     0.301171
fenwickPercentageAvg_Home    0.158833
goalDiffAvg_Home             0.111469
pointsFromGameAvg_Home       0.105522
hitsDiffAvg_Away             0.074194
reboundsForAvg_Home          0.059812
penaltiesAgainstTotal        0.028012
penaltiesForTotal           -0.019402
hitsDiffAvg_Home            -0.038123
reboundsForAvg_Away         -0.057037
pointsFromGameAvg_Away      -0.112578
goalDiffAvg_Away            -0.113146
fenwickPercentageAvg_Away   -0.164881
seasonPointsPerGame_Away    -0.292426
Name: win_or_lose_Home, dtype: float64

In [7]:
# Visualize distributions of attributes
# from pandas.plotting import scatter_matrix

# attributes = ['fenwickPercentageAvg_Home','goalDiffAvg_Home','seasonPointsPerGame_Home','seasonPointsPerGame_Away']
# scatter_matrix(df[attributes], figsize=(12,8));

In [8]:
# These are tentative just to test
df_Final = df
# df_Final = df.drop(columns=['corsiPercentageAvg_Away','penaltiesAgainstTotal','shotsOnGoalDiffAvg_Away'])

In [9]:
from sklearn.model_selection import train_test_split

# Split into 80% training data and 20% testing data
train_set, test_set = train_test_split(df_Final, test_size=0.2, random_state=rng)

nhl = train_set.drop('win_or_lose_Home', axis=1)
nhl_labels = train_set['win_or_lose_Home'].copy()

X_train = train_set.drop('win_or_lose_Home', axis=1)
Y_train = train_set['win_or_lose_Home'].copy()

X_valid = train_set.drop('win_or_lose_Home', axis=1)
Y_valid = train_set['win_or_lose_Home'].copy()

X_test = test_set.drop('win_or_lose_Home', axis=1)
Y_test = test_set['win_or_lose_Home'].copy()

df_Final = df_Final.drop('win_or_lose_Home', axis=1)

In [10]:
# Pipeline constructor used to run transformation steps in order
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
])

X_train_prepared= num_pipeline.fit_transform(X_train)
X_valid_prepared= num_pipeline.fit_transform(X_valid)
X_test_prepared= num_pipeline.fit_transform(X_test)


In [11]:
# set epochs to use same num in all calls
n_epochs = 75

In [12]:
def build_model(learning_rate = 0.00335195324238683, n_hidden=1, n_neurons=15, input_shape=[X_train.shape[1]]):
    model = keras.models.Sequential()
    model.add(keras.layers.InputLayer(shape=input_shape))
    if n_hidden > 0:
        for _ in range(n_hidden):
            model.add(keras.layers.Dropout(rate=0.2))
            model.add(keras.layers.Dense(n_neurons,activation='selu',kernel_initializer="lecun_normal"))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',optimizer= keras.optimizers.Adam(learning_rate=learning_rate))
    return model

In [13]:
# Build model using best params from hypertuning

model = build_model

In [14]:
from scikeras.wrappers import KerasClassifier

# Seems like you need to specify some items again when using the scikeras wrapper
keras_clf = KerasClassifier(model=model, epochs=n_epochs, batch_size=5, verbose=0, random_state=42,
                           loss='binary_crossentropy',optimizer='adam',metrics='accuracy')



# Stop early if model is not getting better after # of patience epochs, restore to best model
early_stopping_cb = keras.callbacks.EarlyStopping(patience = 10, restore_best_weights = True)
save_file = r"C:\Users\zanec\OneDrive\Documents\Python\NHL_data\Best_NHL_Model.keras"
checkpoint_cb = keras.callbacks.ModelCheckpoint(save_file, save_best_only = True)

keras_clf.fit(X_train_prepared, Y_train, 
              validation_data = (X_valid_prepared, Y_valid),
              callbacks=[early_stopping_cb,checkpoint_cb])



In [15]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold


kfold = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
Y_pred = cross_val_predict(keras_clf, X_train_prepared, Y_train, cv=kfold)

from sklearn.metrics import confusion_matrix
confusion_matrix(Y_train, Y_pred)
# [Predict Loss - Actual Loss  |  Predict Win - Actual Loss]
# [Predict Loss - Actual Win   |  Predict Win - Actual Win]

array([[ 931,  596],
       [ 483, 1254]], dtype=int64)

In [16]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

print('When claiming is a win, correct % is ', precision_score(Y_train, Y_pred))
print('Detected % of wins' ,recall_score(Y_train, Y_pred))
print('F1 score is ', f1_score(Y_train, Y_pred))
print('Training Accuracy score is ', accuracy_score(Y_train, Y_pred))


When claiming is a win, correct % is  0.6778378378378378
Detected % of wins 0.7219343696027634
F1 score is  0.6991915249512127
Training Accuracy score is  0.6694240196078431


In [17]:
# Train Accuracy
Y_train_pred = keras_clf.predict(X_train_prepared)
print(accuracy_score(Y_train, Y_train_pred))

Y_test_pred = keras_clf.predict(X_test_prepared)
print(accuracy_score(Y_test, Y_test_pred))

0.6850490196078431
0.6715686274509803


In [18]:
df_Predict = pd.read_csv(r"C:\Users\zchodaniecky\OneDrive - Franklin Templeton\Documents\Python\NHL_data\NHL_Data_Predict.csv")
#df_Predict = pd.read_csv(r"C:\Users\zanec\OneDrive\Documents\Python\NHL_data\NHL_Data_All_Games_Predict.csv")

In [19]:
X_real_predict = df_Predict.iloc[:,2:]

np.array((df_Predict.iloc[:,1].values,keras_clf.predict(X_real_predict))).T

array([['TBL', 0],
       ['DET', 0],
       ['PIT', 0],
       ['NJD', 1]], dtype=object)

In [20]:
X_real_predict

Unnamed: 0,seasonPointsPerGame_Home,goalDiffAvg_Home,hitsDiffAvg_Home,pointsFromGameAvg_Home,fenwickPercentageAvg_Home,reboundsForAvg_Home,seasonPointsPerGame_Away,goalDiffAvg_Away,hitsDiffAvg_Away,pointsFromGameAvg_Away,fenwickPercentageAvg_Away,reboundsForAvg_Away,penaltiesForTotal,penaltiesAgainstTotal
0,1.073171,-0.4,-5.58,0.65,0.5,2.59,0.939024,0.03,-5.79,0.77,0.43,1.82,6.14,6.48
1,0.975309,0.04,2.47,0.97,0.48,3.41,0.777778,0.12,-2.88,0.75,0.39,3.21,5.38,7.78
2,0.939024,0.65,-8.76,1.32,0.52,5.14,1.109756,1.08,-2.71,1.28,0.56,3.97,7.02,7.03
3,0.95122,-0.59,-5.75,0.76,0.42,1.74,1.0,1.43,3.79,1.71,0.51,2.67,6.84,5.27
