In [1]:
import pandas as pd
import hockey_scraper
import datetime as dt
import numpy as np
import sqlalchemy
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# SQL CONNECTION

engine = sqlalchemy.create_engine('mysql+pymysql://root:Sp1d3rman@localhost:3306/new_nhl_data')

## BUILDING THE MODEL

In [3]:
# NEW NHL MODEL

# Reading in all games data from nhl_data SQL db.
df_sql = pd.read_sql("game_data", engine)

#Checking that last nights games were added correctly.

df_sql.tail()

Unnamed: 0,Team,GP,TOI/GP,W,L,OTL,ROW,Points,Point %,CF/60,...,LDGF/60,LDGA/60,LDGF%,LDSH%,LDSV%,SH%,SV%,PDO,Date,id
27783,Tampa Bay Lightning,1,60.0,1,0,,,,,46.0,...,2,0,100.0,22.22,100.0,18.52,94.74,1.133,2021-06-28,27834
27784,Montreal Canadiens,1,60.0,0,1,,,,,67.0,...,1,2,33.33,5.56,80.0,2.33,86.96,0.893,2021-06-30,27835
27785,Tampa Bay Lightning,1,60.0,1,0,,,,,46.0,...,2,1,66.67,20.0,94.44,13.04,97.67,1.107,2021-06-30,27836
27786,Tampa Bay Lightning,1,60.0,1,0,,,,,48.0,...,2,1,66.67,16.67,96.0,20.0,91.43,1.114,2021-07-02,27837
27787,Montreal Canadiens,1,60.0,0,1,,,,,59.0,...,1,2,33.33,4.0,83.33,8.57,80.0,0.886,2021-07-02,27838


In [4]:
df_sql.reset_index().set_index(['index','Date']).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Team,GP,TOI/GP,W,L,OTL,ROW,Points,Point %,CF/60,...,LDSF%,LDGF/60,LDGA/60,LDGF%,LDSH%,LDSV%,SH%,SV%,PDO,id
index,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,2009-10-01,Washington Capitals,1,60.0,1,0,0.0,1.0,2.0,1.0,56.0,...,70.0,0,0,-,0,100,11.76,95.0,1.068,9
1,2009-10-01,Boston Bruins,1,60.0,0,1,0.0,0.0,0.0,0.0,47.0,...,30.0,0,0,-,0,100,5.0,88.24,0.932,10
2,2009-10-01,Montreal Canadiens,1,64.7833,1,0,0.0,1.0,2.0,1.0,59.27,...,38.46,0,0,-,0,100,14.81,93.48,1.083,11
3,2009-10-01,Toronto Maple Leafs,1,64.7833,0,0,1.0,0.0,1.0,0.5,85.21,...,61.54,0,0,-,0,100,6.52,85.19,0.917,12
4,2009-10-01,San Jose Sharks,1,60.0,0,1,0.0,0.0,0.0,0.0,75.0,...,81.48,0,1,0.00,0,80,5.0,75.0,0.8,13


In [5]:
df_sql = df_sql.replace([np.inf, -np.inf, '-'], np.nan)

In [6]:
predictors = [x for x in df_sql.columns if x not in ['Team', 'Teamopp', 'Date', 'W','L']]

In [7]:
df_sql[predictors] = df_sql[predictors].astype(float)

## Convert into rolling measures

In [8]:
rolling_df = df_sql.reset_index().set_index(['index','Date','W','L']).groupby('Team').rolling(10).mean().shift(1)

In [9]:
rolling_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,GP,TOI/GP,OTL,ROW,Points,Point %,CF/60,CA/60,CF%,FF/60,...,LDSF%,LDGF/60,LDGA/60,LDGF%,LDSH%,LDSV%,SH%,SV%,PDO,id
Team,index,Date,W,L,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
Anaheim Ducks,43,2009-10-03,0,1,,,,,,,,,,,...,,,,,,,,,,
Anaheim Ducks,56,2009-10-06,0,0,,,,,,,,,,,...,,,,,,,,,,
Anaheim Ducks,68,2009-10-08,1,0,,,,,,,,,,,...,,,,,,,,,,
Anaheim Ducks,102,2009-10-10,1,0,,,,,,,,,,,...,,,,,,,,,,
Anaheim Ducks,122,2009-10-11,0,1,,,,,,,,,,,...,,,,,,,,,,


In [10]:
final = rolling_df.reset_index(level=['Team', 'Date','W','L']).sort_index()

In [11]:
final.head()

Unnamed: 0_level_0,Team,Date,W,L,GP,TOI/GP,OTL,ROW,Points,Point %,...,LDSF%,LDGF/60,LDGA/60,LDGF%,LDSH%,LDSV%,SH%,SV%,PDO,id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Washington Capitals,2009-10-01,1,0,1.0,61.66666,,,,,...,53.766,0.9,0.2,,7.218,98.175,10.193,91.276,1.0147,27810.0
1,Boston Bruins,2009-10-01,0,1,1.0,60.5,0.0,0.3,0.8,0.4,...,53.799,0.3,0.4,,2.046,96.0,6.267,87.657,0.9391,4792.8
2,Montreal Canadiens,2009-10-01,1,0,1.0,60.61,,,,,...,43.649,0.3,0.7,,2.823,94.273,10.076,87.754,0.9784,27701.0
3,Toronto Maple Leafs,2009-10-01,0,0,1.0,60.11333,,,,,...,49.932,1.0,0.3,,9.722,98.377,11.219,94.542,1.0575,27824.4
4,San Jose Sharks,2009-10-01,0,1,1.0,63.735,,,,,...,56.013,0.5,0.6,,3.506,96.4,10.141,88.589,0.9872,27654.9


In [12]:
df_a = final.loc[final.index % 2 == 0, :] # Away
df_b = final.loc[final.index % 2 != 0, :] # Home

In [13]:
n_visitwins = len(df_a[(df_a.W == 1) & (df_a.Date >= '2021-01-13')])

n_matches = len(df_a[df_a.Date >= '2021-01-13'])

win_rate = (float(n_visitwins)/ (n_matches)) *100

home_adv = (float(100-win_rate)-win_rate) /100

print (f'Visitor wins {n_visitwins}')
print(f'Visitor win % {win_rate:.2f}')
print(f'Home win % {100 - win_rate:.2f}')
print(f'Home adv: {home_adv*100:.2f}')

Visitor wins 439
Visitor win % 46.70
Home win % 53.30
Home adv: 6.60


In [14]:
df_c = pd.concat([df_a, df_b.set_index(df_a.index)], axis = 1)

In [15]:
suffix = 'opp'
df_c.columns = [name if duplicated == False else name + suffix for duplicated, name in zip(df_c.columns.duplicated(),df_c.columns)]

In [16]:
df_c.set_index('Date', inplace = True)

In [17]:
df = df_c.drop(columns=['Team','Teamopp','Dateopp','GP','TOI/GP','L','OTL','ROW','Points','Point %','GPopp','TOI/GPopp','Wopp','Lopp','OTLopp','ROWopp','Pointsopp','Point %opp','id','idopp','HDSF/60opp', 'FA/60opp', 'LDSV%', 'SF/60', 'FF/60opp', 'SF/60opp', 'HDSF%opp', 'FA/60', 'SF%opp', 'SA/60opp', 'SA/60', 'FF%', 'HDSA/60opp', 'SCGF/60opp', 'LDSV%opp', 'FF/60', 'HDSF/60', 'SF%', 'HDSA/60', 'HDSF%', 'FF%opp'])

In [18]:
check_null = df.isnull().sum()
check_null[check_null.gt(5000)]

HDGF%        5891
MDGF%       12952
LDGF%       13733
HDGF%opp     5901
MDGF%opp    12925
LDGF%opp    13743
dtype: int64

In [19]:
df.drop(['HDGF%','MDGF%','LDGF%','HDGF%opp','MDGF%opp','LDGF%opp'],inplace=True,axis=1)

In [20]:
predictors = [x for x in df.columns if x not in ['Team','Teamopp','Date','W','L']]

In [21]:
df = df.dropna()

In [22]:
X = df.drop(columns=['W'])
y = df['W']

In [23]:
# scaler = preprocessing.StandardScaler()
# X = scaler.fit_transform(X)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, random_state = 0, shuffle = False)

In [25]:
model = LogisticRegression(C = 0.00026366508987303583,max_iter = 1000, penalty = 'l2', solver='lbfgs')
model.fit(X_train, y_train)

LogisticRegression(C=0.00026366508987303583, max_iter=1000)

In [26]:
model.score(X_test, y_test)

0.5728194726166329

## Fetching live data and predicting on it

In [27]:
# # REGULAR SEASON ONLY

# games = pd.read_html('https://www.hockey-reference.com/leagues/NHL_2021_games.html')
# games[0].drop(columns = ['Unnamed: 5','G','G.1','Att.','LOG','Notes'], inplace = True)
# games = games[0]
# games['Date'] = pd.to_datetime(games["Date"],format="%Y-%m-%d")
# today = pd.Timestamp("today").floor("D")
# games = games.loc[(games.Date == today)]
# games = games.replace('St. Louis Blues', 'St Louis Blues')
# games

In [28]:
# PLAYOFFS ONLY
df_s = hockey_scraper.scrape_schedule(dt.date.today().strftime('%Y-%m-%d'), dt.date.today().strftime('%Y-%m-%d'))

lookup_dict = { 
    'WSH':'Washington Capitals'
    , 'NSH': 'Nashville Predators'
    , 'STL': 'St Louis Blues'
    , 'BOS': 'Boston Bruins'
    , 'CAR': 'Carolina Hurricanes'
    , 'COL': 'Colorado Avalanche'
    , 'T.B': 'Tampa Bay Lightning'
    , 'FLA': 'Florida Panthers'
    , 'TOR': 'Toronto Maple Leafs'
    , 'MTL': 'Montreal Canadiens'
    , 'VGK': 'Vegas Golden Knights'
    , 'MIN': 'Minnesota Wild'
    , 'EDM': 'Edmonton Oilers'
    , 'WPG': 'Winnipeg Jets'
    , 'VAN': 'Vancouver Canucks'
    , 'CGY': 'Calgary Flames'
    , 'PIT': 'Pittsburgh Penguins'
    , 'NYI': 'New York Islanders'
}

df_s['Home'] = [lookup_dict[x] for x in df_s.home_team]
df_s['Visitor'] = [lookup_dict[x] for x in df_s.away_team]

games = df_s[['date','Visitor','Home']]

Scraping the schedule between 2021-07-05 and 2021-07-05


In [29]:
# Initialise columns for each predictor column
for col in predictors:
    games[col] = np.nan


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[0m

In [30]:
def update_row_with_features(row):
    
    n= 15 # Number of games back to get data

    # Fetch the last n games mean from original dataset for the particular teams of the game
    home_rec = df_sql[df_sql.Team.eq(row['Home'])].iloc[-n:].mean()
    visit_rec = df_sql[df_sql.Team.eq(row['Visitor'])].iloc[-n:].mean()
    home_rec.index = [x+'opp' for x in home_rec.index]

    #Convert into dictionary for easier addition to dataframe
    home_rec = home_rec.to_dict()
    visit_rec = visit_rec.to_dict()
    visit_rec.update(home_rec)
    
    #Update dataframe row using dictionary
    for k,v in visit_rec.items():
        if k in predictors:
            games.loc[row.name,k] = v

In [31]:
# Add feature values to each row of dataframe for predictions
games.apply(update_row_with_features,axis=1)

DataFrame.mean and DataFrame.median with numeric_only=None will include datetime64 and datetime64tz columns in a future version.
[0mDataFrame.mean and DataFrame.median with numeric_only=None will include datetime64 and datetime64tz columns in a future version.
[0m

0    None
dtype: object

In [32]:
games

Unnamed: 0,date,Visitor,Home,CF/60,CA/60,CF%,GF/60,GA/60,GF%,xGF/60,...,LDCF%opp,LDSF/60opp,LDSA/60opp,LDSF%opp,LDGF/60opp,LDGA/60opp,LDSH%opp,SH%opp,SV%opp,PDOopp
0,2021-07-05,Tampa Bay Lightning,Montreal Canadiens,51.170667,55.540667,48.092667,3.185333,1.644667,67.000667,2.596,...,48.128667,12.122,13.768,46.532,0.2,0.866667,1.113333,9.094667,91.452667,1.0054


In [33]:
out = pd.DataFrame(data = {'v_team': games['Visitor'], 'v_prob': np.round(model.predict_proba(games[predictors])[:,1],3),'h_prob': np.round(model.predict_proba(games[predictors])[:,0],3),'h_team': games['Home']})

In [34]:
out = pd.DataFrame(data = {'v_team': games['Visitor'], 'v_prob': np.round(model.predict_proba(games[predictors])[:,1],3),'v_odds': np.round(1 / out['v_prob'],2),'h_prob': np.round(model.predict_proba(games[predictors])[:,0],3),'h_odds':np.round(1 / out['h_prob'],2),'h_team': games['Home']})

In [35]:
out

Unnamed: 0,v_team,v_prob,v_odds,h_prob,h_odds,h_team
0,Tampa Bay Lightning,0.476,2.1,0.524,1.91,Montreal Canadiens


In [36]:
out.to_csv('daily projections.csv')

## Removing Correlated Features

In [37]:
# import seaborn as sns
# plt.figure(figsize=(12,10))
# cor = X_train.corr()
# sns.heatmap(cor, annot=True, cmap=plt.cm.CMRmap_r)
# plt.show()

In [38]:
# def correlation(dataset, threshold):
#     col_corr = set() # Set all the names of correlated columns
#     corr_matrix = dataset.corr()
#     for i in range(len(corr_matrix.columns)):
#         for j in range(i):
#             if abs(corr_matrix.iloc[i,j]) > threshold: # getting absolute coeff value
#                 colname=corr_matrix.columns[i] # getting name of column
#                 col_corr.add(colname)
#     return col_corr

In [39]:
# corr_features = correlation(X_train, 0.9)
# len(set(corr_features))

In [40]:
len(predictors)

97

In [41]:
# print(corr_features)