In [1]:
# Use with NHL_Model_Data_Transform_v5.py
import pandas as pd
import numpy as np
import seaborn as sns
import os
import matplotlib.pyplot as plt
import datetime
import json
import requests
from tensorflow import keras

rng = 42

In [2]:
#df = pd.read_csv(r"C:\Users\zchodaniecky\OneDrive - Franklin Templeton\Documents\Python\NHL_data\NHL_Data_Transformed.csv")
df = pd.read_csv(r"C:\Users\zanec\OneDrive\Documents\Python\NHL_data\NHL_Data_All_Games_Transformed.csv")

In [3]:
df = df.drop(columns=['team_Home','team_Away','gameId','home_or_away_Home','home_or_away_Away','win_or_lose_Away'])

In [4]:
# df.hist(bins=50, figsize=(20,15))
# plt.show()

In [5]:
# Heatmap to visualize correlations
corr_matrix = df.corr(numeric_only=True)
# sns.heatmap(corr_matrix, cmap="PiYG")

In [6]:
corr_matrix['win_or_lose_Home'].sort_values(ascending=False)

win_or_lose_Home             1.000000
seasonPointsPerGame_Home     0.276991
fenwickPercentageAvg_Home    0.145742
goalDiffAvg_Home             0.099019
pointsFromGameAvg_Home       0.094528
hitsDiffAvg_Away             0.075956
reboundsForAvg_Home          0.051855
penaltiesAgainstTotal        0.024796
penaltiesForTotal           -0.016065
hitsDiffAvg_Home            -0.026004
reboundsForAvg_Away         -0.058582
pointsFromGameAvg_Away      -0.101494
goalDiffAvg_Away            -0.102619
fenwickPercentageAvg_Away   -0.154375
seasonPointsPerGame_Away    -0.272244
Name: win_or_lose_Home, dtype: float64

In [7]:
# Visualize distributions of attributes
# from pandas.plotting import scatter_matrix

# attributes = ['fenwickPercentageAvg_Home','goalDiffAvg_Home','seasonPointsPerGame_Home','seasonPointsPerGame_Away']
# scatter_matrix(df[attributes], figsize=(12,8));

In [8]:
# These are tentative just to test
df_Final = df
# df_Final = df.drop(columns=['corsiPercentageAvg_Away','penaltiesAgainstTotal','shotsOnGoalDiffAvg_Away'])

In [9]:
from sklearn.model_selection import train_test_split

# Split into 80% training data and 20% testing data
train_set, test_set = train_test_split(df_Final, test_size=0.2, random_state=rng)

nhl = train_set.drop('win_or_lose_Home', axis=1)
nhl_labels = train_set['win_or_lose_Home'].copy()

X_train = train_set.drop('win_or_lose_Home', axis=1)
Y_train = train_set['win_or_lose_Home'].copy()

X_valid = train_set.drop('win_or_lose_Home', axis=1)
Y_valid = train_set['win_or_lose_Home'].copy()

X_test = test_set.drop('win_or_lose_Home', axis=1)
Y_test = test_set['win_or_lose_Home'].copy()

df_Final = df_Final.drop('win_or_lose_Home', axis=1)

In [10]:
# Pipeline constructor used to run transformation steps in order
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
])

X_train_prepared= num_pipeline.fit_transform(X_train)
X_valid_prepared= num_pipeline.fit_transform(X_valid)
X_test_prepared= num_pipeline.fit_transform(X_test)


In [11]:
# set epochs to use same num in all calls
n_epochs = 75

In [12]:
def build_model(learning_rate = 0.0009144243340277702, n_hidden=2, n_neurons=5, input_shape=[X_train.shape[1]]):
    model = keras.models.Sequential()
    model.add(keras.layers.InputLayer(shape=input_shape))
    if n_hidden > 0:
        for _ in range(n_hidden):
            model.add(keras.layers.Dropout(rate=0.2))
            model.add(keras.layers.Dense(n_neurons,activation='selu',kernel_initializer="lecun_normal"))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',optimizer= keras.optimizers.Adam(learning_rate=learning_rate))
    return model

In [13]:
# Build model using best params from hypertuning

model = build_model

In [14]:
from scikeras.wrappers import KerasClassifier

# Seems like you need to specify some items again when using the scikeras wrapper
keras_clf = KerasClassifier(model=model, epochs=n_epochs, batch_size=5, verbose=0, random_state=42,
                           loss='binary_crossentropy',optimizer='adam',metrics='accuracy')



# Stop early if model is not getting better after # of patience epochs, restore to best model
early_stopping_cb = keras.callbacks.EarlyStopping(patience = 10, restore_best_weights = True)
save_file = r"C:\Users\zanec\OneDrive\Documents\Python\NHL_data\Best_NHL_Model.keras"
checkpoint_cb = keras.callbacks.ModelCheckpoint(save_file, save_best_only = True)

keras_clf.fit(X_train_prepared, Y_train, 
              validation_data = (X_valid_prepared, Y_valid),
              callbacks=[early_stopping_cb,checkpoint_cb])



In [16]:
from sklearn.metrics import accuracy_score

# Train Accuracy
Y_train_pred = keras_clf.predict(X_train_prepared)
print(accuracy_score(Y_train, Y_train_pred))

Y_test_pred = keras_clf.predict(X_test_prepared)
print(accuracy_score(Y_test, Y_test_pred))

0.6710264429911856
0.6636363636363637


In [17]:
#df_Predict = pd.read_csv(r"C:\Users\zchodaniecky\OneDrive - Franklin Templeton\Documents\Python\NHL_data\NHL_Data_Predict.csv")
df_Predict = pd.read_csv(r"C:\Users\zanec\OneDrive\Documents\Python\NHL_data\NHL_Data_All_Games_Predict.csv")

In [18]:
X_real_predict = df_Predict.iloc[:,2:]
results_dict = {1 : 'win', 0 : 'lose'}

home_team_results = list(keras_clf.predict(X_real_predict))

for i, item in enumerate(home_team_results):
    if item in results_dict:
        home_team_results[i] = results_dict[item]

np.array((df_Predict.iloc[:,1].values,keras_clf.predict(X_real_predict))).T
        

array([['TBL', 1],
       ['DET', 1],
       ['PIT', 1],
       ['NJD', 1]], dtype=object)

In [19]:
df_results = pd.DataFrame(columns=['home_team_short','home_team_long','home_team_odds','home_team_result',
                                   'away_team_short','away_team_long','away_team_odds'])

In [20]:
teams_dict = {'BOS':'Boston Bruins',
'BUF':'Buffalo Sabres',
'CGY':'Calgary Flames',
'CAR':'Carolina Hurricanes',
'CHI':'Chicago Blackhawks',
'COL':'Colorado Avalanche',
'CBJ':'Columbus Blue Jackets',
'DAL':'Dallas Stars',
'DET':'Detroit Red Wings',
'EDM':'Edmonton Oilers',
'FLA':'Florida Panthers',
'LAK':'Los Angeles Kings',
'MIN':'Minnesota Wild',
'MTL':'Montréal Canadiens',
'NSH':'Nashville Predators',
'NJD':'New Jersey Devils',
'NYI':'New York Islanders',
'NYR':'New York Rangers',
'OTT':'Ottawa Senators',
'PHI':'Philadelphia Flyers',
'PIT':'Pittsburgh Penguins',
'SJS':'San Jose Sharks',
'SEA':'Seattle Kraken',
'STL':'St Louis Blues',
'TBL':'Tampa Bay Lightning',
'MTL':'Toronto Maple Leafs',
'UTH':'Utah Hockey Club',
'VAN':'Vancouver Canucks',
'VGK':'Vegas Golden Knights',
'WPG':'Winnipeg Jets'}

In [21]:
away_teams = df_Predict.iloc[:,0].tolist()
home_teams = df_Predict.iloc[:,1].tolist()

In [22]:
df_results['home_team_short'] = home_teams
df_results['home_team_long'] = df_results['home_team_short'].map(teams_dict)
df_results['home_team_result'] = home_team_results
df_results['away_team_short'] = away_teams
df_results['away_team_long'] = df_results['away_team_short'].map(teams_dict)

In [23]:
# Pull game odds

commence_time = '2024-09-27T00:15:00Z'

odds_URL = 'https://api.the-odds-api.com/v4/sports/icehockey_nhl/odds/?'\
           'apiKey=94588626e22d896e9f196d0745f00928&bookmakers=draftkings&markets=h2h&'\
           f'oddsFormat=american'#&commenceTimeTo={commence_time}'

print(f'Hitting this url: {odds_URL}')

Hitting this url: https://api.the-odds-api.com/v4/sports/icehockey_nhl/odds/?apiKey=94588626e22d896e9f196d0745f00928&bookmakers=draftkings&markets=h2h&oddsFormat=american


In [24]:
# API is limited to 500 pulls per month so only use when needed

result = requests.get(odds_URL)

# Check if the request was successful
if result.status_code == 200:
    # Parse the JSON response
    data = result.json()
    
    teams_of_interest = ['Dallas Cowboys','New York Giants']
    odds_list = []
    teams_list = []
    # Extracting 'price' values from the JSON data
    for game in data:
        bookmakers = game.get('bookmakers', [])
        if bookmakers:
            markets = bookmakers[0].get('markets', [])
            if markets:
                outcomes = markets[0].get('outcomes', [])
                if outcomes:
                    for outcome in outcomes:
                        team_name = outcome.get('name', 'N/A')
                        if team_name not in teams_of_interest:  # Check if team is in the list
                            price = outcome.get('price', 'N/A')
                            odds_list.append(price)
                            teams_list.append(team_name)
                            #print(f'Team: {team_name}, Price: {price}')
else:
    print(f"Failed to retrieve data. Status code: {result.status_code}")

In [25]:
odds_dict = dict(zip(teams_list,odds_list))

odds_dict['Detroit Red Wings']

df_results['home_team_odds'] = df_results['home_team_long'].map(odds_dict)
df_results['away_team_odds'] = df_results['away_team_long'].map(odds_dict)

In [26]:
df_results.head()

Unnamed: 0,home_team_short,home_team_long,home_team_odds,home_team_result,away_team_short,away_team_long,away_team_odds
0,TBL,Tampa Bay Lightning,105,win,BUF,Buffalo Sabres,100
1,DET,Detroit Red Wings,-130,win,MTL,Toronto Maple Leafs,110
2,PIT,Pittsburgh Penguins,110,win,NSH,Nashville Predators,-115
3,NJD,New Jersey Devils,-130,win,NYI,New York Islanders,-155
