In [1]:
import pandas as pd
import hockey_scraper
import datetime as dt
import numpy as np
import sqlalchemy
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# SQL CONNECTION

engine = sqlalchemy.create_engine('mysql+pymysql://root:<PASSWORD>@localhost:3306/nhl_data')

In [3]:
# NEW NHL MODEL

# Reading in all games data from nhl_data SQL db.
df_sql = pd.read_sql("game_data", engine)

#Checking that last nights games were added correctly.

df_sql.tail()

Unnamed: 0,Team,GP,TOI/GP,W,L,OTL,ROW,Points,Point %,CF/60,...,LDGF/60,LDGA/60,LDGF%,LDSH%,LDSV%,SH%,SV%,PDO,Date,ID
27635,Vancouver Canucks,1,63.4,0,0,1,0,1,0.5,51.1,...,2,2,50,22.22,88.89,14.29,83.33,0.976,2021-05-16,27648
27636,Calgary Flames,1,60.0,0,1,0,0,0,0.0,58.0,...,0,0,-,0.0,100.0,5.0,83.33,0.883,2021-05-18,27649
27637,Vancouver Canucks,1,60.0,1,0,0,1,2,1.0,41.0,...,0,0,-,0.0,100.0,16.67,95.0,1.117,2021-05-18,27650
27638,Vancouver Canucks,1,60.0,0,1,0,0,0,0.0,43.0,...,0,0,-,0.0,100.0,9.52,78.57,0.881,2021-05-19,27651
27639,Calgary Flames,1,60.0,1,0,0,1,2,1.0,52.0,...,0,0,-,0.0,100.0,21.43,90.48,1.119,2021-05-19,27652


In [4]:
df_sql.reset_index().set_index(['index','Date']).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Team,GP,TOI/GP,W,L,OTL,ROW,Points,Point %,CF/60,...,LDSF%,LDGF/60,LDGA/60,LDGF%,LDSH%,LDSV%,SH%,SV%,PDO,ID
index,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,2009-10-01,Washington Capitals,1,60.0,1,0,0,1,2,1.0,56.0,...,70.0,0,0,-,0,100,11.76,95.0,1.068,9
1,2009-10-01,Boston Bruins,1,60.0,0,1,0,0,0,0.0,47.0,...,30.0,0,0,-,0,100,5.0,88.24,0.932,10
2,2009-10-01,Montreal Canadiens,1,64.7833,1,0,0,1,2,1.0,59.27,...,38.46,0,0,-,0,100,14.81,93.48,1.083,11
3,2009-10-01,Toronto Maple Leafs,1,64.7833,0,0,1,0,1,0.5,85.21,...,61.54,0,0,-,0,100,6.52,85.19,0.917,12
4,2009-10-01,San Jose Sharks,1,60.0,0,1,0,0,0,0.0,75.0,...,81.48,0,1,0,0,80,5.0,75.0,0.8,13


In [5]:
df_sql = df_sql.replace([np.inf, -np.inf, '-'], np.nan)

In [6]:
predictors = [x for x in df_sql.columns if x not in ['Team', 'Teamopp', 'Date', 'W','L']]

In [7]:
df_sql[predictors] = df_sql[predictors].astype(float)

## Convert into rolling measures

In [8]:
rolling_df = df_sql.reset_index().set_index(['index','Date','W','L']).groupby('Team').rolling(10).mean().shift(1)

In [9]:
rolling_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,GP,TOI/GP,OTL,ROW,Points,Point %,CF/60,CA/60,CF%,FF/60,...,LDSF%,LDGF/60,LDGA/60,LDGF%,LDSH%,LDSV%,SH%,SV%,PDO,ID
Team,index,Date,W,L,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
Anaheim Ducks,43,2009-10-03,0,1,,,,,,,,,,,...,,,,,,,,,,
Anaheim Ducks,56,2009-10-06,0,0,,,,,,,,,,,...,,,,,,,,,,
Anaheim Ducks,68,2009-10-08,1,0,,,,,,,,,,,...,,,,,,,,,,
Anaheim Ducks,102,2009-10-10,1,0,,,,,,,,,,,...,,,,,,,,,,
Anaheim Ducks,122,2009-10-11,0,1,,,,,,,,,,,...,,,,,,,,,,


In [10]:
final = rolling_df.reset_index(level=['Team', 'Date','W','L']).sort_index()

In [11]:
final.head()

Unnamed: 0_level_0,Team,Date,W,L,GP,TOI/GP,OTL,ROW,Points,Point %,...,LDSF%,LDGF/60,LDGA/60,LDGF%,LDSH%,LDSV%,SH%,SV%,PDO,ID
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Washington Capitals,2009-10-01,1,0,1.0,60.965,0.0,0.7,1.4,0.7,...,56.055,0.6,0.5,,4.464,95.462,11.019,91.936,1.0295,27518.0
1,Boston Bruins,2009-10-01,0,1,1.0,60.5,0.0,0.3,0.8,0.4,...,53.799,0.3,0.4,,2.046,96.0,6.267,87.657,0.9391,4792.8
2,Montreal Canadiens,2009-10-01,1,0,1.0,61.09167,0.2,0.5,1.2,0.6,...,51.348,0.9,0.6,,6.661,95.289,12.528,85.319,0.9784,27518.1
3,Toronto Maple Leafs,2009-10-01,0,0,1.0,60.51667,0.1,0.6,1.3,0.65,...,51.055,0.4,0.4,,4.374,97.654,10.213,91.517,1.0173,27480.7
4,San Jose Sharks,2009-10-01,0,1,1.0,60.21833,0.0,0.8,1.6,0.8,...,49.272,0.4,0.5,,3.159,97.111,12.155,92.868,1.0501,27456.2


In [12]:
df_a = final.loc[final.index % 2 == 0, :] # Away
df_b = final.loc[final.index % 2 != 0, :] # Home

In [13]:
# HOME & AWAY WIN % OVERVIEW

n_visitwins = len(df_a[(df_a.W == 1) & (df_a.Date >= '2021-01-13')])

n_matches = len(df_a[df_a.Date >= '2021-01-13'])

win_rate = (float(n_visitwins)/ (n_matches)) *100

home_adv = (float(100-win_rate)-win_rate) /100

print (f'Visitor wins {n_visitwins}')
print(f'Visitor win % {win_rate:.2f}')
print(f'Home win % {100 - win_rate:.2f}')
print(f'Home adv: {home_adv*100:.2f}')

Visitor wins 405
Visitor win % 46.66
Home win % 53.34
Home adv: 6.68


In [14]:
df_c = pd.concat([df_a, df_b.set_index(df_a.index)], axis = 1)

In [15]:
suffix = 'opp'
df_c.columns = [name if duplicated == False else name + suffix for duplicated, name in zip(df_c.columns.duplicated(),df_c.columns)]

In [16]:
df_c.set_index('Date', inplace = True)

In [17]:
df = df_c.drop(columns=['Team','Teamopp','Dateopp','GP','TOI/GP','L','OTL','ROW','Points','Point %','GPopp','TOI/GPopp','Wopp','Lopp','OTLopp','ROWopp','Pointsopp','Point %opp','ID','IDopp','HDSF/60opp', 'FA/60opp', 'LDSV%', 'SF/60', 'FF/60opp', 'SF/60opp', 'HDSF%opp', 'FA/60', 'SF%opp', 'SA/60opp', 'SA/60', 'FF%', 'HDSA/60opp', 'SCGF/60opp', 'LDSV%opp', 'FF/60', 'HDSF/60', 'SF%', 'HDSA/60', 'HDSF%', 'FF%opp'])

In [18]:
check_null = df.isnull().sum()
check_null[check_null.gt(5000)]

HDGF%        5857
MDGF%       12885
LDGF%       13659
HDGF%opp     5870
MDGF%opp    12851
LDGF%opp    13669
dtype: int64

In [19]:
df.drop(['HDGF%','MDGF%','LDGF%','HDGF%opp','MDGF%opp','LDGF%opp'],inplace=True,axis=1)

In [20]:
predictors = [x for x in df.columns if x not in ['Team','Teamopp','Date','W','L']]

In [21]:
df = df.dropna()

In [22]:
X = df.drop(columns=['W'])
y = df['W']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, random_state = 0, shuffle = False)

In [24]:
model = LogisticRegression(C = 0.00026366508987303583,max_iter = 1000, penalty = 'l2', solver='lbfgs')
model.fit(X_train, y_train)

LogisticRegression(C=0.00026366508987303583, max_iter=1000)

In [25]:
model.score(X_test, y_test)

0.5769854132901134

## Fetching Live Data & Predicting 

In [49]:
## REGULAR SEASON ONLY - OLD (BELOW WORKS BETTER - ONLY USE IF ERROR WITH OTHER VERSION)

# games = pd.read_html('https://www.hockey-reference.com/leagues/NHL_2021_games.html')
# games[0].drop(columns = ['Unnamed: 5','G','G.1','Att.','LOG','Notes'], inplace = True)
# games = games[0]
# games['Date'] = pd.to_datetime(games["Date"],format="%Y-%m-%d")
# today = pd.Timestamp("today").floor("D")
# games = games.loc[(games.Date == today)]
# games = games.replace('St. Louis Blues', 'St Louis Blues')
# games
df_s = hockey_scraper.scrape_schedule(dt.date.today().strftime('%Y-%m-%d'), dt.date.today().strftime('%Y-%m-%d'))

lookup_dict = {   
'ANA': "Anaheim Ducks",
'ARI': "Arizona Coyotes",
'ATL': "Atlanta Thrashers",
'BOS': "Boston Bruins",
'BUF': "Buffalo Sabres",
'CAR': "Carolina Hurricanes",
'CBJ': "Columbus Blue Jackets",
'CGY': "Calgary Flames",
'CHI': "Chicago Blackhawks",
'COL': "Colorado Avalanche",
'DAL': "Dallas Stars",
'DET': "Detroit Red Wings",
'EDM': "Edmonton Oilers",
'FLA': "Florida Panthers",
'L.A': "Los Angeles Kings",
'MIN': "Minnesota Wild",
'MTL': "Montreal Canadiens",
'N.J': "New Jersey Devils",
'NSH': "Nashville Predators",
'NYI': "New York Islanders",
'NYR': "New York Rangers",
'OTT': "Ottawa Senators",
'PHI': "Philadelphia Flyers",
'PHX': "Phoenix Coyotes",
'PIT': "Pittsburgh Penguins",
'S.J': "San Jose Sharks",
'SEATTLE KRAKEN': "Seattle Kraken",
'STL': "St Louis Blues",
'T.B': "Tampa Bay Lightning",
'TOR': "Toronto Maple Leafs",
'VAN': "Vancouver Canucks",
'VGK': "Vegas Golden Knights",
'WPG': "Winnipeg Jets",
'WSH': "Washington Capitals"
}


df_s['Home'] = [lookup_dict[x] for x in df_s.home_team]
df_s['Visitor'] = [lookup_dict[x] for x in df_s.away_team]

games = df_s[['date','Visitor','Home']]
games

Scraping the schedule between 2021-10-01 and 2021-10-01


Unnamed: 0,date,Visitor,Home
0,2021-10-01,Pittsburgh Penguins,Buffalo Sabres
1,2021-10-01,New York Rangers,New Jersey Devils
2,2021-10-01,Dallas Stars,Florida Panthers
3,2021-10-01,Carolina Hurricanes,Tampa Bay Lightning
4,2021-10-01,Montreal Canadiens,Ottawa Senators
5,2021-10-01,St Louis Blues,Chicago Blackhawks
6,2021-10-01,Vancouver Canucks,Calgary Flames
7,2021-10-01,Los Angeles Kings,Vegas Golden Knights
8,2021-10-01,Edmonton Oilers,Seattle Kraken


In [50]:
# Initialise columns for each predictor column
for col in predictors:
    games[col] = np.nan


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[0m

In [51]:
def update_row_with_features(row):
    
    n= 15 # Number of games back to get data

    # Fetch the last n games mean from original dataset for the particular teams of the game
    home_rec = df_sql[df_sql.Team.eq(row['Home'])].iloc[-n:].mean()
    visit_rec = df_sql[df_sql.Team.eq(row['Visitor'])].iloc[-n:].mean()
    home_rec.index = [x+'opp' for x in home_rec.index]

    #Convert into dictionary for easier addition to dataframe
    home_rec = home_rec.to_dict()
    visit_rec = visit_rec.to_dict()
    visit_rec.update(home_rec)
    
    #Update dataframe row using dictionary
    for k,v in visit_rec.items():
        if k in predictors:
            games.loc[row.name,k] = v

In [52]:
# Add feature values to each row of dataframe for predictions
games.apply(update_row_with_features,axis=1)

DataFrame.mean and DataFrame.median with numeric_only=None will include datetime64 and datetime64tz columns in a future version.
[0mDataFrame.mean and DataFrame.median with numeric_only=None will include datetime64 and datetime64tz columns in a future version.
[0m

0    None
1    None
2    None
3    None
4    None
5    None
6    None
7    None
8    None
dtype: object

In [53]:
games

Unnamed: 0,date,Visitor,Home,CF/60,CA/60,CF%,GF/60,GA/60,GF%,xGF/60,...,LDCF%opp,LDSF/60opp,LDSA/60opp,LDSF%opp,LDGF/60opp,LDGA/60opp,LDSH%opp,SH%opp,SV%opp,PDOopp
0,2021-10-01,Pittsburgh Penguins,Buffalo Sabres,52.316,52.154667,50.179333,3.649333,2.585333,63.870667,2.372,...,44.632,13.6,17.605333,43.784667,0.666667,0.866667,5.002,8.825333,90.143333,0.989667
1,2021-10-01,New York Rangers,New Jersey Devils,48.4,58.933333,44.776,2.933333,3.0,48.49,2.418,...,47.258,10.706667,12.347333,46.478667,0.466667,0.4,5.563333,10.938,87.135333,0.980667
2,2021-10-01,Dallas Stars,Florida Panthers,57.536,48.538667,53.804,2.875333,3.076,45.666667,2.594667,...,57.152667,15.015333,11.468,58.092667,0.6,0.333333,3.726,10.276,89.42,0.996933
3,2021-10-01,Carolina Hurricanes,Tampa Bay Lightning,57.338,50.707333,52.826667,3.04,2.235333,56.588,3.25,...,53.553333,11.64,12.248,49.130667,0.333333,0.333333,3.749333,9.888667,89.596667,0.994867
4,2021-10-01,Montreal Canadiens,Ottawa Senators,52.792,49.293333,51.661333,2.234,3.504667,38.611333,2.147333,...,44.226667,9.439333,13.018,41.009333,0.466667,0.466667,4.312667,13.453333,92.302,1.0576
5,2021-10-01,St Louis Blues,Chicago Blackhawks,48.786667,55.322667,46.772,3.28,2.678,55.918667,2.013333,...,49.332,11.901333,12.792667,47.832,0.466667,0.6,3.463333,11.080667,88.652667,0.9974
6,2021-10-01,Vancouver Canucks,Calgary Flames,47.473333,58.394667,44.857333,2.515333,3.845333,38.832,1.988,...,56.401333,12.461333,10.165333,55.934,0.4,0.466667,2.512667,10.174,89.765333,0.999333
7,2021-10-01,Los Angeles Kings,Vegas Golden Knights,47.354,58.895333,44.464,2.066,3.264667,40.444667,2.123333,...,54.249333,12.890667,10.094667,56.632667,0.6,0.466667,4.915333,11.434667,92.803333,1.042267
8,2021-10-01,Edmonton Oilers,Seattle Kraken,53.294,54.345333,49.516667,3.253333,2.59,57.054667,2.636667,...,,,,,,,,,,


In [54]:
out = pd.DataFrame(data = {'v_team': games['Visitor'], 'v_prob': np.round(model.predict_proba(games[predictors])[:,1],3),'h_prob': np.round(model.predict_proba(games[predictors])[:,0],3),'h_team': games['Home']})

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [38]:
out = pd.DataFrame(data = {'v_team': games['Visitor'], 'v_prob': np.round(model.predict_proba(games[predictors])[:,1],3),'v_odds': np.round(1 / out['v_prob'],2),'h_prob': np.round(model.predict_proba(games[predictors])[:,0],3),'h_odds':np.round(1 / out['h_prob'],2),'h_team': games['Home']})

KeyError: "None of [Index(['CF/60', 'CA/60', 'CF%', 'GF/60', 'GA/60', 'GF%', 'xGF/60', 'xGA/60',\n       'xGF%', 'SCF/60', 'SCA/60', 'SCF%', 'SCSF/60', 'SCSA/60', 'SCSF%',\n       'SCGF/60', 'SCGA/60', 'SCGF%', 'SCSH%', 'SCSV%', 'HDCF/60', 'HDCA/60',\n       'HDCF%', 'HDGF/60', 'HDGA/60', 'HDSH%', 'HDSV%', 'MDCF/60', 'MDCA/60',\n       'MDCF%', 'MDSF/60', 'MDSA/60', 'MDSF%', 'MDGF/60', 'MDGA/60', 'MDSH%',\n       'MDSV%', 'LDCF/60', 'LDCA/60', 'LDCF%', 'LDSF/60', 'LDSA/60', 'LDSF%',\n       'LDGF/60', 'LDGA/60', 'LDSH%', 'SH%', 'SV%', 'PDO', 'CF/60opp',\n       'CA/60opp', 'CF%opp', 'GF/60opp', 'GA/60opp', 'GF%opp', 'xGF/60opp',\n       'xGA/60opp', 'xGF%opp', 'SCF/60opp', 'SCA/60opp', 'SCF%opp',\n       'SCSF/60opp', 'SCSA/60opp', 'SCSF%opp', 'SCGA/60opp', 'SCGF%opp',\n       'SCSH%opp', 'SCSV%opp', 'HDCF/60opp', 'HDCA/60opp', 'HDCF%opp',\n       'HDGF/60opp', 'HDGA/60opp', 'HDSH%opp', 'HDSV%opp', 'MDCF/60opp',\n       'MDCA/60opp', 'MDCF%opp', 'MDSF/60opp', 'MDSA/60opp', 'MDSF%opp',\n       'MDGF/60opp', 'MDGA/60opp', 'MDSH%opp', 'MDSV%opp', 'LDCF/60opp',\n       'LDCA/60opp', 'LDCF%opp', 'LDSF/60opp', 'LDSA/60opp', 'LDSF%opp',\n       'LDGF/60opp', 'LDGA/60opp', 'LDSH%opp', 'SH%opp', 'SV%opp', 'PDOopp'],\n      dtype='object')] are in the [columns]"

In [39]:
out

NameError: name 'out' is not defined

In [None]:
out.to_csv('daily projections.csv')