<a href="https://colab.research.google.com/github/ccstevie/nhl_model/blob/main/model2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from io import StringIO

import pandas as pd
import requests

In [2]:
# Game-level table for seasons 20132014 through 20232024 with loc=H
# (presumably the home-team rows -- confirm against the site's query docs).
urlH = "https://www.naturalstattrick.com/games.php?fromseason=20132014&thruseason=20232024&stype=2&sit=all&loc=H&team=All&rate=n"
# The timeout keeps a stalled connection from hanging the kernel forever;
# raise_for_status() stops the run on an HTTP error status instead of
# letting a bad response flow into the parsing cells.
reqH = requests.get(urlH, timeout=120)
reqH.raise_for_status()
reqH.status_code

200

In [3]:
# Game-level table for seasons 20132014 through 20232024 with loc=A
# (presumably the away-team rows -- confirm against the site's query docs).
urlA = "https://www.naturalstattrick.com/games.php?fromseason=20132014&thruseason=20232024&stype=2&sit=all&loc=A&team=All&rate=n"
# The timeout keeps a stalled connection from hanging the kernel forever;
# raise_for_status() stops the run on an HTTP error status instead of
# letting a bad response flow into the parsing cells.
reqA = requests.get(urlA, timeout=120)
reqA.raise_for_status()
reqA.status_code

200

In [4]:
# Parse the first HTML table out of the response that was already downloaded
# into `reqH`, instead of handing pandas the URL and fetching the page twice.
# "-" cells are read as NaN.
dfH = pd.read_html(StringIO(reqH.text), header=0, na_values=["-"])[0]
# Drop the columns listed below and tag every remaining column with "Home ".
# Done without inplace=True so the cell yields the same frame when re-run.
dfH = dfH.drop(
    columns=["Unnamed: 2", "Game", "TOI", "Attendance", "GF%", "PDO", "SH%", "SV%", "CF", "CA", "FF",
             "FA", "SF", "SA", "xGF", "xGA", "SCF", "SCA", "HDCF", "HDCA", "HDSF", "HDSA", "HDGF", "HDGA", "MDCF", "MDCA",
             "MDSF", "MDSA", "MDGF", "MDGA", "LDCF", "LDCA", "LDSF", "LDSA", "LDGF", "LDGA"]
).add_prefix("Home ")

In [5]:
# Show the column labels of dfH (each carries the "Home " prefix).
dfH.columns

Index(['Home Team', 'Home CF%', 'Home FF%', 'Home SF%', 'Home GF', 'Home GA',
       'Home xGF%', 'Home SCF%', 'Home HDCF%', 'Home HDSF%', 'Home HDGF%',
       'Home HDSH%', 'Home HDSV%', 'Home MDCF%', 'Home MDSF%', 'Home MDGF%',
       'Home MDSH%', 'Home MDSV%', 'Home LDCF%', 'Home LDSF%', 'Home LDGF%',
       'Home LDSH%', 'Home LDSV%'],
      dtype='object')

In [6]:
# Parse the first HTML table out of the response that was already downloaded
# into `reqA`, instead of handing pandas the URL and fetching the page twice.
# "-" cells are read as NaN.
dfA = pd.read_html(StringIO(reqA.text), header=0, na_values=["-"])[0]
# Drop the columns listed below and tag every remaining column with "Away ".
# Done without inplace=True so the cell yields the same frame when re-run.
dfA = dfA.drop(
    columns=["Unnamed: 2", "Game", "TOI", "Attendance", "GF%", "PDO", "SH%", "SV%", "CF", "CA", "FF",
             "FA", "SF", "SA", "xGF", "xGA", "SCF", "SCA", "HDCF", "HDCA", "HDSF", "HDSA", "HDGF", "HDGA", "MDCF", "MDCA",
             "MDSF", "MDSA", "MDGF", "MDGA", "LDCF", "LDCA", "LDSF", "LDSA", "LDGF", "LDGA"]
).add_prefix("Away ")

In [7]:
# Merge home and away games to obtain the matchup table.
# NOTE(review): the join is on the positional row index, so it assumes both
# tables list the same games in the same order -- confirm, since the "Game"
# column that could verify this is dropped before the merge.
df = pd.merge(dfH, dfA, left_index=True, right_index=True)

In [8]:
# Remove rows with any missing value, then drop the away-side goal columns
# (the model targets are taken from "Home GF" / "Home GA").
# Rebinding instead of mutating in place, plus errors="ignore", lets the cell
# be re-run without raising KeyError on the already-removed columns.
df = df.dropna().drop(columns=["Away GF", "Away GA"], errors="ignore")

In [9]:
# Feature matrix: everything except the team names and the two goal columns.
X = df.drop(columns=["Home Team", "Away Team", "Home GF", "Home GA"])
# Convert percentage values to decimal form (whole frame at once).
X = X / 100.0
X.head(1)

Unnamed: 0,Home CF%,Home FF%,Home SF%,Home xGF%,Home SCF%,Home HDCF%,Home HDSF%,Home HDGF%,Home HDSH%,Home HDSV%,...,Away MDCF%,Away MDSF%,Away MDGF%,Away MDSH%,Away MDSV%,Away LDCF%,Away LDSF%,Away LDGF%,Away LDSH%,Away LDSV%
1,0.4911,0.5063,0.5224,0.4631,0.4902,0.6667,0.6471,0.6667,0.3636,0.6667,...,0.6061,0.6471,0.6667,0.1818,0.8333,0.537,0.4839,0.0,0.0,0.9375


In [10]:
# Two-column target: goals for and goals against from the home side's row.
target_cols = ["Home GF", "Home GA"]
y = df[target_cols]

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

# Seeded random forest, wrapped in MultiOutputRegressor to predict both
# target columns.
base_model = RandomForestRegressor(random_state=42)
model = MultiOutputRegressor(estimator=base_model)
model.fit(X_train, y_train)

In [13]:
from sklearn.metrics import mean_squared_error

# Predict on both splits first, then score each against its true targets.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

print("Training Mean Squared Error:", train_mse)
print("Test Mean Squared Error:", test_mse)

Training Mean Squared Error: 0.0799782832278481
Test Mean Squared Error: 0.5092370662460568


In [14]:
from sklearn.model_selection import cross_val_score

num_folds = 5

# The scorer is 'neg_mean_squared_error', so flip the sign to get plain MSE
# for each fold.
mse_scores = -cross_val_score(model, X, y, cv=num_folds, scoring='neg_mean_squared_error')
mean_mse = mse_scores.mean()

print("Cross-Validation Mean Squared Error Scores:", mse_scores, sep="\n")
print("Mean MSE:", mean_mse)

Cross-Validation Mean Squared Error Scores:
[0.51662192 0.52304399 0.57999209 0.44851978 0.67199019]
Mean MSE: 0.5480335937188036


In [15]:
from datetime import date, timedelta

# The stats window is the 30 days ending today.
today = date.today()
start = today - timedelta(days=30)

print("Today's date:", today)
print("30 days ago:", start)

Today's date: 2024-03-05
30 days ago: 2024-02-04


In [16]:
# Team table for season 20232024, limited to the [start, today] window via
# the fd / td query parameters.
url = f"https://www.naturalstattrick.com/teamtable.php?fromseason=20232024&thruseason=20232024&stype=2&sit=all&score=all&rate=n&team=all&loc=B&gpf=410&fd={start}&td={today}"
# The timeout keeps a stalled connection from hanging the kernel forever;
# raise_for_status() stops the run on an HTTP error status instead of
# letting a bad response flow into the parsing cells.
req = requests.get(url, timeout=120)
req.raise_for_status()
req.status_code

200

In [17]:
# Parse the first HTML table out of the response already held in `req`
# rather than passing the URL and downloading the page a second time.
# The first column becomes the index and "-" cells are read as NaN.
df2 = pd.read_html(StringIO(req.text), header=0, index_col=0, na_values=["-"])[0]

In [18]:
from get_todays_games import getGames

# getGames comes from a project-local module that is not shown here.
# The loop that consumes `matchups` unpacks each tuple as (away, home);
# presumably that is the order getGames returns -- verify against
# get_todays_games.
matchups = getGames()
matchups


[('Panthers', 'Devils'),
 ('Blue Jackets', 'Penguins'),
 ('Oilers', 'Bruins'),
 ('Blues', 'Islanders'),
 ('Canadiens', 'Predators'),
 ('Kraken', 'Jets'),
 ('Blackhawks', 'Coyotes'),
 ('Canucks', 'Kings'),
 ('Stars', 'Sharks')]

In [19]:
# Build one row per scheduled game: the home team's row from df2 with every
# column prefixed "Home ", joined side by side with the away team's row
# prefixed "Away ".
# The per-game frames are collected in a list and concatenated once, instead
# of growing `res` with pd.concat on every iteration (quadratic copying, and
# concatenating onto an empty frame).
matchup_frames = []

for away, home in matchups:
    # regex=False: the name is matched as a literal substring of "Team".
    # NOTE(review): a name that matches zero rows silently drops the game, and
    # one that matches several rows yields several rows -- confirm each name
    # from getGames identifies exactly one team.
    home_df = (
        df2[df2["Team"].str.contains(home, regex=False)]
        .add_prefix("Home ")
        .reset_index(drop=True)
    )
    away_df = (
        df2[df2["Team"].str.contains(away, regex=False)]
        .add_prefix("Away ")
        .reset_index(drop=True)
    )
    # Both sides were reset to a 0-based index, so an index join pairs them up.
    matchup_frames.append(pd.merge(home_df, away_df, left_index=True, right_index=True))

# pd.concat raises on an empty list, so keep the empty-frame result the
# original produced when there are no games.
res = pd.concat(matchup_frames, ignore_index=True) if matchup_frames else pd.DataFrame()

res.head(10)

Unnamed: 0,Home Team,Home GP,Home TOI,Home W,Home L,Home OTL,Home ROW,Home Points,Home Point %,Home CF,...,Away LDSA,Away LDSF%,Away LDGF,Away LDGA,Away LDGF%,Away LDSH%,Away LDSV%,Away SH%,Away SV%,Away PDO
0,New Jersey Devils,14,843:40,6,7,1,6,13,0.464,1002,...,160,54.15,8,3,72.73,4.23,98.13,10.83,94.71,1.055
1,Pittsburgh Penguins,13,782:39,5,7,1,5,11,0.423,782,...,175,48.22,7,9,43.75,4.29,94.86,10.0,89.52,0.995
2,Boston Bruins,14,868:13,5,4,5,4,15,0.536,806,...,175,50.42,11,10,52.38,6.18,94.29,11.3,88.24,0.995
3,New York Islanders,11,669:01,6,3,2,6,14,0.636,642,...,182,45.83,4,7,36.36,2.6,96.15,10.09,91.52,1.016
4,Nashville Predators,11,661:36,9,2,0,9,18,0.818,729,...,152,50.16,9,7,56.25,5.88,95.39,10.45,88.59,0.99
5,Winnipeg Jets,13,780:58,9,4,0,9,18,0.692,781,...,162,47.91,8,8,50.0,5.37,95.06,8.83,92.55,1.014
6,Arizona Coyotes,13,782:09,2,9,2,2,6,0.231,817,...,171,46.39,1,8,11.11,0.68,95.32,6.15,88.64,0.948
7,Los Angeles Kings,12,729:33,8,4,0,7,16,0.667,736,...,161,50.46,5,8,38.46,3.05,95.03,9.72,86.93,0.967
8,San Jose Sharks,9,545:00,1,7,1,1,3,0.167,488,...,173,50.43,3,6,33.33,1.7,96.53,9.13,90.43,0.996


In [20]:
# Keep only the columns the matchup frame shares with the training frame,
# then strip the team names and goal columns to leave the model inputs.
shared_index = res.columns.intersection(df.columns)
common_cols = list(shared_index)
matchups_df = res[common_cols]
final_df = matchups_df.drop(columns=["Home Team", "Away Team", "Home GA", "Home GF"])

In [21]:
# Convert percentage values to decimal form.
# The scaled frame is rebuilt from matchups_df each time, so re-running this
# cell cannot divide already-scaled values by 100 again (the in-place loop
# over final_df did).
final_df = matchups_df.drop(columns=["Home Team", "Away Team", "Home GA", "Home GF"]) / 100.0
final_df.head(10)

Unnamed: 0,Home CF%,Home FF%,Home SF%,Home xGF%,Home SCF%,Home HDCF%,Home HDSF%,Home HDGF%,Home HDSH%,Home HDSV%,...,Away MDCF%,Away MDSF%,Away MDGF%,Away MDSH%,Away MDSV%,Away LDCF%,Away LDSF%,Away LDGF%,Away LDSH%,Away LDSV%
0,0.5496,0.5431,0.5528,0.5606,0.5332,0.5592,0.5654,0.4783,0.1642,0.767,...,0.558,0.5502,0.76,0.1652,0.9362,0.5407,0.5415,0.7273,0.0423,0.9813
1,0.5022,0.512,0.523,0.5129,0.5034,0.541,0.53,0.4359,0.1604,0.766,...,0.5114,0.5,0.45,0.0978,0.8804,0.5,0.4822,0.4375,0.0429,0.9486
2,0.4764,0.4736,0.4817,0.4943,0.494,0.5095,0.5437,0.4792,0.1679,0.7826,...,0.5092,0.4655,0.4091,0.0833,0.8952,0.5429,0.5042,0.5238,0.0618,0.9429
3,0.4838,0.5074,0.522,0.5145,0.4792,0.5267,0.5402,0.625,0.2128,0.85,...,0.4863,0.494,0.45,0.1084,0.8706,0.4641,0.4583,0.3636,0.026,0.9615
4,0.5317,0.5225,0.5049,0.5247,0.5137,0.4784,0.4654,0.5714,0.2162,0.8588,...,0.4722,0.4817,0.5217,0.1304,0.8889,0.5091,0.5016,0.5625,0.0588,0.9539
5,0.4893,0.4862,0.4957,0.456,0.4816,0.4611,0.4473,0.5227,0.217,0.8397,...,0.5525,0.5118,0.5714,0.092,0.9277,0.487,0.4791,0.5,0.0537,0.9506
6,0.5034,0.4914,0.4716,0.489,0.5064,0.4744,0.4631,0.3684,0.1489,0.7798,...,0.3985,0.3883,0.1765,0.0411,0.8783,0.4791,0.4639,0.1111,0.0068,0.9532
7,0.502,0.5239,0.5245,0.5103,0.5037,0.4891,0.4798,0.5143,0.2169,0.8111,...,0.5412,0.5506,0.6,0.1224,0.9,0.5274,0.5046,0.3846,0.0305,0.9503
8,0.4203,0.405,0.4189,0.3868,0.4062,0.4159,0.4104,0.4091,0.1636,0.8354,...,0.5422,0.5284,0.5357,0.124,0.8796,0.4938,0.5043,0.3333,0.017,0.9653


In [22]:
# Run the fitted model on today's matchup features. The result is wrapped
# into two named columns in the next cell.
predictions = model.predict(final_df)

In [23]:
# Put the team names next to the two predicted goal columns, aligned on the
# row index.
predictions_df = pd.concat(
    [
        matchups_df[["Home Team", "Away Team"]],
        pd.DataFrame(predictions, columns=["X Home Goals", "X Away Goals"]),
    ],
    axis=1,
)

In [24]:
# Stamp every prediction with the run date and make "Date" the first column.
# The column order is built from the non-date columns, so re-running the cell
# replaces the existing "Date" column instead of feeding a duplicate "Date"
# label into a reindex.
other_cols = [col for col in predictions_df.columns if col != "Date"]
predictions_df = predictions_df[other_cols].assign(Date=today)[["Date", *other_cols]]

In [25]:
# Predicted goal margin from the home side's point of view, to 2 decimals.
goal_margin = predictions_df["X Home Goals"] - predictions_df["X Away Goals"]
predictions_df["X Home Spread"] = goal_margin.round(2)

In [26]:
def _pick_winner(row):
    """Return the team with more predicted goals, or 'Draw' when neither leads."""
    if row['X Home Goals'] > row['X Away Goals']:
        return row['Home Team']
    if row['X Away Goals'] > row['X Home Goals']:
        return row['Away Team']
    return 'Draw'


predictions_df['X Winner'] = predictions_df.apply(_pick_winner, axis=1)

In [27]:
# Display the finished predictions table.
predictions_df

Unnamed: 0,Date,Home Team,Away Team,X Home Goals,X Away Goals,X Home Spread,X Winner
0,2024-03-05,New Jersey Devils,Florida Panthers,2.68,3.9,-1.22,Florida Panthers
1,2024-03-05,Pittsburgh Penguins,Columbus Blue Jackets,3.29,4.02,-0.73,Columbus Blue Jackets
2,2024-03-05,Boston Bruins,Edmonton Oilers,3.42,4.02,-0.6,Edmonton Oilers
3,2024-03-05,New York Islanders,St Louis Blues,3.34,3.51,-0.17,St Louis Blues
4,2024-03-05,Nashville Predators,Montreal Canadiens,4.13,3.04,1.09,Nashville Predators
5,2024-03-05,Winnipeg Jets,Seattle Kraken,3.35,3.0,0.35,Winnipeg Jets
6,2024-03-05,Arizona Coyotes,Chicago Blackhawks,3.77,3.51,0.26,Arizona Coyotes
7,2024-03-05,Los Angeles Kings,Vancouver Canucks,3.79,3.2,0.59,Los Angeles Kings
8,2024-03-05,San Jose Sharks,Dallas Stars,3.32,3.73,-0.41,Dallas Stars


In [28]:
# Write today's predictions to csv
f = open("predictions.csv", 'w')

for col in predictions_df.columns.values:
    f.write(col + ",")

f.write("\n")

for col in predictions_df.values:
    for row in col:
        f.write(str(row) + ",")
    f.write("\n")

f.close()

In [29]:
# Add predictions to record.csv
f = open("record.csv", 'a')

for col in predictions_df.values:
    for row in col:
        f.write(str(row) + ",")
    f.write("\n")

f.close()