In [1]:
import pandas as pd
import hockey_scraper
import datetime as dt
import numpy as np
import sqlalchemy
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# SQL CONNECTION

engine = sqlalchemy.create_engine('mysql+pymysql://root:Sp1d3rman@localhost:3306/nhl_data')

In [3]:
# NEW NHL MODEL

# Reading in all games data from nhl_data SQL db.
df_sql = pd.read_sql("game_data", engine)

#Checking that last nights games were added correctly.

df_sql.tail()

Unnamed: 0,Team,GP,TOI/GP,W,L,OTL,ROW,Points,Point %,CF/60,...,LDGF/60,LDGA/60,LDGF%,LDSH%,LDSV%,SH%,SV%,PDO,Date,ID
28581,Winnipeg Jets,1,60.0,1,0,0,1,2,1.0,55.0,...,0,0,-,0.0,100.0,11.76,92.86,1.046,2021-12-19,28625
28582,Minnesota Wild,1,60.0,0,1,0,0,0,0.0,47.0,...,0,0,-,0.0,100.0,17.39,75.86,0.933,2021-12-20,28626
28583,Dallas Stars,1,60.0,1,0,0,1,2,1.0,53.0,...,0,0,-,0.0,100.0,24.14,82.61,1.067,2021-12-20,28627
28584,Tampa Bay Lightning,1,60.0,1,0,0,1,2,1.0,47.0,...,1,0,100,12.5,100.0,21.05,92.68,1.137,2021-12-21,28628
28585,Vegas Golden Knights,1,60.0,0,1,0,0,0,0.0,61.0,...,0,1,0,0.0,87.5,7.32,78.95,0.863,2021-12-21,28629


In [4]:
df_sql.reset_index().set_index(['index','Date']).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Team,GP,TOI/GP,W,L,OTL,ROW,Points,Point %,CF/60,...,LDSF%,LDGF/60,LDGA/60,LDGF%,LDSH%,LDSV%,SH%,SV%,PDO,ID
index,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,2009-10-01,Washington Capitals,1,60.0,1,0,0,1,2,1.0,56.0,...,70.0,0,0,-,0,100,11.76,95.0,1.068,9
1,2009-10-01,Boston Bruins,1,60.0,0,1,0,0,0,0.0,47.0,...,30.0,0,0,-,0,100,5.0,88.24,0.932,10
2,2009-10-01,Montreal Canadiens,1,64.7833,1,0,0,1,2,1.0,59.27,...,38.46,0,0,-,0,100,14.81,93.48,1.083,11
3,2009-10-01,Toronto Maple Leafs,1,64.7833,0,0,1,0,1,0.5,85.21,...,61.54,0,0,-,0,100,6.52,85.19,0.917,12
4,2009-10-01,San Jose Sharks,1,60.0,0,1,0,0,0,0.0,75.0,...,81.48,0,1,0,0,80,5.0,75.0,0.8,13


In [5]:
df_sql = df_sql.replace([np.inf, -np.inf, '-'], np.nan)

In [6]:
predictors = [x for x in df_sql.columns if x not in ['Team', 'Teamopp', 'Date', 'W','L']]

In [7]:
df_sql[predictors] = df_sql[predictors].astype(float)

## Convert into rolling measures

In [8]:
rolling_df = df_sql.reset_index().set_index(['index','Date','W','L']).groupby('Team').rolling(10).mean().shift(1)

In [9]:
final = rolling_df.reset_index(level=['Team', 'Date','W','L']).sort_index()

In [10]:
df_a = final.loc[final.index % 2 == 0, :] # Away
df_b = final.loc[final.index % 2 != 0, :] # Home

In [11]:
# HOME & AWAY WIN % OVERVIEW

n_visitwins = len(df_a[(df_a.W == 1) & (df_a.Date >= '2021-10-12')])

n_matches = len(df_a[df_a.Date >= '2021-10-12'])

win_rate = (float(n_visitwins)/ (n_matches)) *100

home_adv = (float(100-win_rate)-win_rate) /100

print (f'Visitor wins {n_visitwins}')
print(f'Visitor win % {win_rate:.2f}')
print(f'Home win % {100 - win_rate:.2f}')
print(f'Home adv: {home_adv*100:.2f}')

Visitor wins 213
Visitor win % 45.03
Home win % 54.97
Home adv: 9.94


In [12]:
df_c = pd.concat([df_a, df_b.set_index(df_a.index)], axis = 1)

In [13]:
suffix = 'opp'
df_c.columns = [name if duplicated == False else name + suffix for duplicated, name in zip(df_c.columns.duplicated(),df_c.columns)]

In [14]:
df_c.set_index('Date', inplace = True)

In [15]:
df = df_c.drop(columns=['Team','Teamopp','Dateopp','GP','TOI/GP','L','OTL','ROW','Points','Point %','GPopp','TOI/GPopp','Wopp','Lopp','OTLopp','ROWopp','Pointsopp','Point %opp','ID','IDopp','HDSF/60opp', 'FA/60opp', 'LDSV%', 'SF/60', 'FF/60opp', 'SF/60opp', 'HDSF%opp', 'FA/60', 'SF%opp', 'SA/60opp', 'SA/60', 'FF%', 'HDSA/60opp', 'SCGF/60opp', 'LDSV%opp', 'FF/60', 'HDSF/60', 'SF%', 'HDSA/60', 'HDSF%', 'FF%opp'])

In [16]:
check_null = df.isnull().sum()
check_null[check_null.gt(5000)]

HDGF%        6043
MDGF%       13294
LDGF%       14124
HDGF%opp     6050
MDGF%opp    13260
LDGF%opp    14138
dtype: int64

In [17]:
df.drop(['HDGF%','MDGF%','LDGF%','HDGF%opp','MDGF%opp','LDGF%opp'],inplace=True,axis=1)

In [18]:
predictors = [x for x in df.columns if x not in ['Team','Teamopp','Date','W','L']]

In [19]:
df = df.dropna()

In [20]:
X = df.drop(columns=['W'])
y = df['W']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, random_state = 0, shuffle = False)

In [22]:
model = LogisticRegression(C = 0.00026366508987303583,max_iter = 1000, penalty = 'l2', solver='lbfgs')
model.fit(X_train, y_train)

LogisticRegression(C=0.00026366508987303583, max_iter=1000)

In [23]:
model.score(X_test, y_test)

0.573661586557249

In [24]:
test_probs = pd.DataFrame(model.predict_proba(X_test))

home_predict_rate = test_probs[test_probs[0]>test_probs[1]].shape[0] / test_probs.shape[0]
home_win_rate = 1-(y_test.sum() / len(y_test))
print(f'Home win rate {float(home_win_rate):.2f}, Model predicts home team: {home_predict_rate:.2f}')

Home win rate 0.53, Model predicts home team: 0.73


## Fetching Live Data & Predicting 

In [25]:
## REGULAR SEASON ONLY - OLD (BELOW WORKS BETTER - ONLY USE IF ERROR WITH OTHER VERSION)

# games = pd.read_html('https://www.hockey-reference.com/leagues/NHL_2021_games.html')
# games[0].drop(columns = ['Unnamed: 5','G','G.1','Att.','LOG','Notes'], inplace = True)
# games = games[0]
# games['Date'] = pd.to_datetime(games["Date"],format="%Y-%m-%d")
# today = pd.Timestamp("today").floor("D")
# games = games.loc[(games.Date == today)]
# games = games.replace('St. Louis Blues', 'St Louis Blues')
# games
df_s = hockey_scraper.scrape_schedule(dt.date.today().strftime('%Y-%m-%d'), dt.date.today().strftime('%Y-%m-%d'))

lookup_dict = {   
'ANA': "Anaheim Ducks",
'ARI': "Arizona Coyotes",
'ATL': "Atlanta Thrashers",
'BOS': "Boston Bruins",
'BUF': "Buffalo Sabres",
'CAR': "Carolina Hurricanes",
'CBJ': "Columbus Blue Jackets",
'CGY': "Calgary Flames",
'CHI': "Chicago Blackhawks",
'COL': "Colorado Avalanche",
'DAL': "Dallas Stars",
'DET': "Detroit Red Wings",
'EDM': "Edmonton Oilers",
'FLA': "Florida Panthers",
'L.A': "Los Angeles Kings",
'MIN': "Minnesota Wild",
'MTL': "Montreal Canadiens",
'N.J': "New Jersey Devils",
'NSH': "Nashville Predators",
'NYI': "New York Islanders",
'NYR': "New York Rangers",
'OTT': "Ottawa Senators",
'PHI': "Philadelphia Flyers",
'PHX': "Phoenix Coyotes",
'PIT': "Pittsburgh Penguins",
'S.J': "San Jose Sharks",
'SEATTLE KRAKEN': "Seattle Kraken",
'STL': "St Louis Blues",
'T.B': "Tampa Bay Lightning",
'TOR': "Toronto Maple Leafs",
'VAN': "Vancouver Canucks",
'VGK': "Vegas Golden Knights",
'WPG': "Winnipeg Jets",
'WSH': "Washington Capitals"
}


df_s['Home'] = [lookup_dict[x] for x in df_s.home_team]
df_s['Visitor'] = [lookup_dict[x] for x in df_s.away_team]

games = df_s[['date','Visitor','Home']]
games

Scraping the schedule between 2021-12-22 and 2021-12-22


Unnamed: 0,date,Visitor,Home
0,2021-12-22,Colorado Avalanche,Buffalo Sabres
1,2021-12-22,Montreal Canadiens,New York Rangers
2,2021-12-22,Winnipeg Jets,Dallas Stars
3,2021-12-22,Edmonton Oilers,Los Angeles Kings


In [26]:
# Initialise columns for each predictor column
for col in predictors:
    games[col] = np.nan


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[0m

In [27]:
def update_row_with_features(row):
    
    n= 15 # Number of games back to get data

    # Fetch the last n games mean from original dataset for the particular teams of the game
    home_rec = df_sql[df_sql.Team.eq(row['Home'])].iloc[-n:].mean()
    visit_rec = df_sql[df_sql.Team.eq(row['Visitor'])].iloc[-n:].mean()
    home_rec.index = [x+'opp' for x in home_rec.index]

    #Convert into dictionary for easier addition to dataframe
    home_rec = home_rec.to_dict()
    visit_rec = visit_rec.to_dict()
    visit_rec.update(home_rec)
    
    #Update dataframe row using dictionary
    for k,v in visit_rec.items():
        if k in predictors:
            games.loc[row.name,k] = v

In [28]:
# Add feature values to each row of dataframe for predictions
games.apply(update_row_with_features,axis=1)

DataFrame.mean and DataFrame.median with numeric_only=None will include datetime64 and datetime64tz columns in a future version.
[0mDropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
[0mDataFrame.mean and DataFrame.median with numeric_only=None will include datetime64 and datetime64tz columns in a future version.
[0mDropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
[0m

0    None
1    None
2    None
3    None
dtype: object

In [29]:
games

Unnamed: 0,date,Visitor,Home,CF/60,CA/60,CF%,GF/60,GA/60,GF%,xGF/60,...,LDCF%opp,LDSF/60opp,LDSA/60opp,LDSF%opp,LDGF/60opp,LDGA/60opp,LDSH%opp,SH%opp,SV%opp,PDOopp
0,2021-12-22,Colorado Avalanche,Buffalo Sabres,62.145333,53.477333,53.746,4.795333,3.461333,58.181333,3.080667,...,49.439333,15.147333,14.836,50.942667,0.2,0.6,1.196667,8.004667,88.594667,0.966067
1,2021-12-22,Montreal Canadiens,New York Rangers,51.429333,61.25,45.536,2.108667,3.703333,32.666667,2.294,...,46.119333,11.779333,13.39,46.888,0.533333,0.533333,4.412,9.974667,92.659333,1.026267
2,2021-12-22,Winnipeg Jets,Dallas Stars,59.910667,54.330667,52.406,2.579333,2.979333,42.627333,3.114667,...,50.782667,11.433333,12.374,48.352667,0.266667,0.333333,2.809333,10.748,90.262667,1.009933
3,2021-12-22,Edmonton Oilers,Los Angeles Kings,58.902667,54.969333,51.876667,2.728,3.194667,43.937333,3.208667,...,52.768667,14.189333,12.893333,52.264667,0.133333,0.466667,0.947333,7.218667,91.050667,0.982867


In [30]:
out = pd.DataFrame(data = {'v_team': games['Visitor'], 'v_prob': np.round(model.predict_proba(games[predictors])[:,1],3),'h_prob': np.round(model.predict_proba(games[predictors])[:,0],3),'h_team': games['Home']})

In [31]:
out = pd.DataFrame(data = {'v_team': games['Visitor'], 'v_prob': np.round(model.predict_proba(games[predictors])[:,1],3),'v_odds': np.round(1 / out['v_prob'],2),'h_prob': np.round(model.predict_proba(games[predictors])[:,0],3),'h_odds':np.round(1 / out['h_prob'],2),'h_team': games['Home']})

In [32]:
out

Unnamed: 0,v_team,v_prob,v_odds,h_prob,h_odds,h_team
0,Colorado Avalanche,0.64,1.56,0.36,2.78,Buffalo Sabres
1,Montreal Canadiens,0.311,3.22,0.689,1.45,New York Rangers
2,Winnipeg Jets,0.468,2.14,0.532,1.88,Dallas Stars
3,Edmonton Oilers,0.452,2.21,0.548,1.82,Los Angeles Kings


In [33]:
out.to_csv('daily projections.csv')