<a href="https://colab.research.google.com/github/ccstevie/nhl_model/blob/main/model2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import requests
import pandas as pd

In [3]:
# Natural Stat Trick per-game team table for seasons 2013-14 through 2023-24.
# NOTE(review): loc=H presumably restricts the table to home-team rows — confirm against the site.
urlH = "https://www.naturalstattrick.com/games.php?fromseason=20132014&thruseason=20232024&stype=2&sit=all&loc=H&team=All&rate=n"
# requests applies no timeout by default, so an unresponsive server would hang
# the notebook indefinitely; bound the wait explicitly.
reqH = requests.get(urlH, timeout=60)
# Displayed so the reader can confirm the request succeeded (200).
reqH.status_code

200

In [4]:
# Same Natural Stat Trick per-game query as the home table, but with loc=A.
# NOTE(review): loc=A presumably restricts the table to away-team rows — confirm against the site.
urlA = "https://www.naturalstattrick.com/games.php?fromseason=20132014&thruseason=20232024&stype=2&sit=all&loc=A&team=All&rate=n"
# requests applies no timeout by default, so an unresponsive server would hang
# the notebook indefinitely; bound the wait explicitly.
reqA = requests.get(urlA, timeout=60)
# Displayed so the reader can confirm the request succeeded (200).
reqA.status_code

200

In [5]:
from io import BytesIO

# Parse the first HTML table out of the response that was already downloaded
# into reqH, instead of asking pandas to fetch the same large page a second time.
reqH.raise_for_status()  # fail with a clear HTTP error rather than "no tables found"
dfH = (
    pd.read_html(BytesIO(reqH.content), header=0, na_values=["-"])[0]
    # Drop identifiers, raw counts and summary columns; what remains is the
    # team name, GF/GA and the percentage-based columns.
    .drop(columns=["Unnamed: 2", "Game", "TOI", "Attendance", "GF%", "PDO", "SH%", "SV%", "CF", "CA", "FF",
                   "FA", "SF", "SA", "xGF", "xGA", "SCF", "SCA", "HDCF", "HDCA", "HDSF", "HDSA", "HDGF", "HDGA", "MDCF", "MDCA",
                   "MDSF", "MDSA", "MDGF", "MDGA", "LDCF", "LDCA", "LDSF", "LDSA", "LDGF", "LDGA"])
    # Prefix every column so home and away stats stay distinguishable after merging.
    .add_prefix("Home ")
)

In [6]:
# Show the columns that survive the drop, now carrying the "Home " prefix.
dfH.columns

Index(['Home Team', 'Home CF%', 'Home FF%', 'Home SF%', 'Home GF', 'Home GA',
       'Home xGF%', 'Home SCF%', 'Home HDCF%', 'Home HDSF%', 'Home HDGF%',
       'Home HDSH%', 'Home HDSV%', 'Home MDCF%', 'Home MDSF%', 'Home MDGF%',
       'Home MDSH%', 'Home MDSV%', 'Home LDCF%', 'Home LDSF%', 'Home LDGF%',
       'Home LDSH%', 'Home LDSV%'],
      dtype='object')

In [7]:
from io import BytesIO

# Parse the first HTML table out of the response that was already downloaded
# into reqA, instead of asking pandas to fetch the same large page a second time.
reqA.raise_for_status()  # fail with a clear HTTP error rather than "no tables found"
dfA = (
    pd.read_html(BytesIO(reqA.content), header=0, na_values=["-"])[0]
    # Drop identifiers, raw counts and summary columns; what remains is the
    # team name, GF/GA and the percentage-based columns.
    .drop(columns=["Unnamed: 2", "Game", "TOI", "Attendance", "GF%", "PDO", "SH%", "SV%", "CF", "CA", "FF",
                   "FA", "SF", "SA", "xGF", "xGA", "SCF", "SCA", "HDCF", "HDCA", "HDSF", "HDSA", "HDGF", "HDGA", "MDCF", "MDCA",
                   "MDSF", "MDSA", "MDGF", "MDGA", "LDCF", "LDCA", "LDSF", "LDSA", "LDGF", "LDGA"])
    # Prefix every column so home and away stats stay distinguishable after merging.
    .add_prefix("Away ")
)

In [8]:
# Merge home and away games to obtain the matchup table: each home row is
# paired with the away row that has the same index label.
# NOTE(review): this relies on both tables listing games in the same order — confirm.
df = dfH.merge(dfA, left_index=True, right_index=True)

In [9]:
# Discard every row that has a missing value, then remove the away-side goal
# columns.
df = df.dropna().drop(columns=["Away GF", "Away GA"])

In [10]:
# Feature matrix: everything except the team names and the two goal columns.
# Convert percentage values to decimal form in one vectorised step.
X = df.drop(columns=["Home Team", "Away Team", "Home GF", "Home GA"]).div(100.0)
X.head(1)

Unnamed: 0,Home CF%,Home FF%,Home SF%,Home xGF%,Home SCF%,Home HDCF%,Home HDSF%,Home HDGF%,Home HDSH%,Home HDSV%,...,Away MDCF%,Away MDSF%,Away MDGF%,Away MDSH%,Away MDSV%,Away LDCF%,Away LDSF%,Away LDGF%,Away LDSH%,Away LDSV%
1,0.4911,0.5063,0.5224,0.4631,0.4902,0.6667,0.6471,0.6667,0.3636,0.6667,...,0.6061,0.6471,0.6667,0.1818,0.8333,0.537,0.4839,0.0,0.0,0.9375


In [11]:
# Two regression targets per game: goals for and goals against the home team.
y = df.loc[:, ["Home GF", "Home GA"]]

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the games for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

# Random forest wrapped in MultiOutputRegressor so the two goal targets are
# predicted together through one model object.
base_model = RandomForestRegressor(random_state=42)
model = MultiOutputRegressor(estimator=base_model)
model.fit(X_train, y_train)

In [14]:
from sklearn.metrics import mean_squared_error
# Score the fitted model on the data it was trained on and on the held-out
# split; comparing the two errors shows how well it generalises.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

print("Training Mean Squared Error:", train_mse)
print("Test Mean Squared Error:", test_mse)

Training Mean Squared Error: 0.0799782832278481
Test Mean Squared Error: 0.5092370662460568


In [15]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation over the full data set.
num_folds = 5

# The scorer reports negated MSE, so flip the sign to get ordinary
# (positive) mean squared errors.
mse_scores = -cross_val_score(model, X, y, cv=num_folds, scoring='neg_mean_squared_error')
mean_mse = mse_scores.mean()

print("Cross-Validation Mean Squared Error Scores:")
print(mse_scores)
print("Mean MSE:", mean_mse)

Cross-Validation Mean Squared Error Scores:
[0.51662192 0.52304399 0.57999209 0.44851978 0.67199019]
Mean MSE: 0.5480335937188036


In [16]:
from datetime import date, timedelta

# Date window for the recent-form stats: the 30 days ending today.
today = date.today()
start = today - timedelta(days=30)

print("Today's date:", today)
print("30 days ago:", start)

Today's date: 2024-03-04
30 days ago: 2024-02-03


In [17]:
# Natural Stat Trick team table for the 2023-24 season, limited to the
# start..today date window through the fd/td query parameters.
url = f"https://www.naturalstattrick.com/teamtable.php?fromseason=20232024&thruseason=20232024&stype=2&sit=all&score=all&rate=n&team=all&loc=B&gpf=410&fd={start}&td={today}"
# requests applies no timeout by default, so an unresponsive server would hang
# the notebook indefinitely; bound the wait explicitly.
req = requests.get(url, timeout=60)
# Displayed so the reader can confirm the request succeeded (200).
req.status_code

200

In [18]:
from io import BytesIO

# Parse the team table out of the response already held in req instead of
# downloading the same page a second time.
req.raise_for_status()  # fail with a clear HTTP error rather than "no tables found"
df2 = pd.read_html(BytesIO(req.content), header=0, index_col=0, na_values=["-"])[0]

In [19]:
from get_todays_games import getGames

# Today's schedule from the project-local helper. The next cell unpacks each
# entry as (away, home) and matches the names against the "Team" column.
matchups = getGames()
matchups


[('Golden Knights', 'Blue Jackets'),
 ('Panthers', 'Rangers'),
 ('Blues', 'Flyers'),
 ('Bruins', 'Maple Leafs'),
 ('Blackhawks', 'Avalanche'),
 ('Kraken', 'Flames')]

In [20]:
# Build one wide row per scheduled game: the home team's recent stats
# (prefixed "Home ") next to the away team's (prefixed "Away ").
matchup_frames = []

for away, home in matchups:
    # regex=False: team names are literal text, not regular expressions.
    home_df = (
        df2[df2["Team"].str.contains(home, regex=False)]
        .add_prefix('Home ')
        .reset_index(drop=True)
    )
    away_df = (
        df2[df2["Team"].str.contains(away, regex=False)]
        .add_prefix('Away ')
        .reset_index(drop=True)
    )
    # After the index reset both frames start at 0, so merging on the index
    # places the two teams of this game side by side.
    matchup_frames.append(pd.merge(home_df, away_df, left_index=True, right_index=True))

# Concatenate once at the end; growing a DataFrame inside the loop re-copies
# all earlier rows on every iteration. With no games, keep the empty frame.
res = pd.concat(matchup_frames, ignore_index=True) if matchup_frames else pd.DataFrame()

res.head(10)

Unnamed: 0,Home Team,Home GP,Home TOI,Home W,Home L,Home OTL,Home ROW,Home Points,Home Point %,Home CF,...,Away LDSA,Away LDSF%,Away LDGF,Away LDGA,Away LDGF%,Away LDSH%,Away LDSV%,Away SH%,Away SV%,Away PDO
0,Columbus Blue Jackets,10,600:00,4,6,0,4,8,0.4,655,...,149,45.42,5,6,45.45,4.03,95.97,10.97,88.6,0.996
1,New York Rangers,12,729:40,10,1,1,10,21,0.875,727,...,147,54.06,7,3,70.0,4.05,97.96,10.62,94.86,1.055
2,Philadelphia Flyers,12,720:54,7,4,1,7,15,0.625,776,...,165,45.9,4,7,36.36,2.86,95.76,10.58,90.83,1.014
3,Toronto Maple Leafs,13,785:54,10,3,0,9,20,0.769,818,...,174,44.05,6,9,40.0,4.38,94.83,8.92,89.16,0.981
4,Colorado Avalanche,13,785:35,5,6,2,5,12,0.462,797,...,159,45.36,1,8,11.11,0.76,94.97,6.94,88.86,0.958
5,Calgary Flames,11,663:56,8,3,0,8,16,0.727,715,...,146,48.04,8,7,53.33,5.93,95.21,8.36,92.31,1.007


In [21]:
# Keep only the columns that also exist in the training frame, listed in the
# training frame's order, so the features reach the model in the order it was
# fitted on.
common_cols = [col for col in df.columns if col in res.columns]
matchups_df = res[common_cols]
# Remove team names and goal columns; what remains are the model features.
final_df = matchups_df.drop(columns=["Home Team", "Away Team", "Home GA", "Home GF"])

In [22]:
# Convert percentage values to decimal form, the same scaling applied to the
# training features.
final_df = final_df.div(100.0)
final_df.head(10)

Unnamed: 0,Home CF%,Home FF%,Home SF%,Home xGF%,Home SCF%,Home HDCF%,Home HDSF%,Home HDGF%,Home HDSH%,Home HDSV%,...,Away MDCF%,Away MDSF%,Away MDGF%,Away MDSH%,Away MDSV%,Away LDCF%,Away LDSF%,Away LDGF%,Away LDSH%,Away LDSV%
0,0.5101,0.5069,0.4962,0.5169,0.5162,0.5137,0.5215,0.5143,0.2118,0.7821,...,0.4439,0.4745,0.4483,0.1398,0.8447,0.4458,0.4542,0.4545,0.0403,0.9597
1,0.4699,0.4706,0.4646,0.4607,0.4728,0.4422,0.451,0.5172,0.163,0.875,...,0.5614,0.5487,0.7391,0.1589,0.9318,0.5406,0.5406,0.7,0.0405,0.9796
2,0.5139,0.5449,0.557,0.5225,0.495,0.4912,0.5059,0.5,0.2093,0.7857,...,0.4878,0.5065,0.45,0.1154,0.8553,0.4549,0.459,0.3636,0.0286,0.9576
3,0.4946,0.5132,0.5378,0.5359,0.5075,0.5508,0.5604,0.5625,0.2328,0.7692,...,0.4853,0.52,0.4375,0.0673,0.9063,0.4681,0.4405,0.4,0.0438,0.9483
4,0.4984,0.4978,0.5037,0.4745,0.5088,0.506,0.5088,0.4722,0.1478,0.8288,...,0.395,0.3882,0.2143,0.0455,0.8942,0.4703,0.4536,0.1111,0.0076,0.9497
5,0.517,0.5225,0.5007,0.5462,0.5122,0.463,0.422,0.5789,0.3014,0.84,...,0.5658,0.5333,0.5833,0.0875,0.9286,0.4948,0.4804,0.5333,0.0593,0.9521


In [25]:
# Predict both targets for every game today; the output columns follow the
# target order used when fitting (y = ["Home GF", "Home GA"]).
predictions = model.predict(final_df)

In [64]:
# Label the raw prediction array and put the team names in front of it.
team_names = matchups_df.loc[:, ["Home Team", "Away Team"]]
goal_predictions = pd.DataFrame(predictions, columns=["X Home Goals", "X Away Goals"])
predictions_df = pd.concat([team_names, goal_predictions], axis=1)

In [65]:
# Stamp every row with today's date and make "Date" the first column.
column_order = ['Date', *predictions_df.columns]
predictions_df = predictions_df.assign(Date=today)[column_order]

In [66]:
# Predicted goal margin from the home team's perspective, to two decimals.
goal_margin = predictions_df["X Home Goals"] - predictions_df["X Away Goals"]
predictions_df["X Home Spread"] = goal_margin.round(2)

In [67]:
def _predicted_winner(row):
    """Return the team with the higher predicted goals, or 'Draw' when neither is higher."""
    home_goals = row['X Home Goals']
    away_goals = row['X Away Goals']
    if home_goals > away_goals:
        return row['Home Team']
    if away_goals > home_goals:
        return row['Away Team']
    return 'Draw'


predictions_df['X Winner'] = predictions_df.apply(_predicted_winner, axis=1)

In [68]:
# Display the finished prediction table for today's games.
predictions_df

Unnamed: 0,Date,Home Team,Away Team,X Home Goals,X Away Goals,X Home Spread,X Winner
0,2024-03-04,Columbus Blue Jackets,Vegas Golden Knights,3.42,3.56,-0.14,Vegas Golden Knights
1,2024-03-04,New York Rangers,Florida Panthers,2.97,3.46,-0.49,Florida Panthers
2,2024-03-04,Philadelphia Flyers,St Louis Blues,3.35,3.66,-0.31,St Louis Blues
3,2024-03-04,Toronto Maple Leafs,Boston Bruins,4.15,3.43,0.72,Toronto Maple Leafs
4,2024-03-04,Colorado Avalanche,Chicago Blackhawks,3.6,2.66,0.94,Colorado Avalanche
5,2024-03-04,Calgary Flames,Seattle Kraken,3.91,2.97,0.94,Calgary Flames


In [69]:
# Write today's predictions to csv, replacing any previous file.
# The layout is unchanged: every field, including the last one on a line, is
# followed by a comma. The `with` block closes the file even if a write fails.
with open("predictions.csv", 'w') as f:
    # Header line: one entry per column name.
    f.write("".join(col + "," for col in predictions_df.columns.values) + "\n")

    # One line per predicted game.
    for row in predictions_df.values:
        f.write("".join(str(value) + "," for value in row) + "\n")

In [70]:
# Add predictions to record.csv: append one line per predicted game, with no
# header. The layout is unchanged (every field is followed by a comma), so new
# lines match the ones already in the file. The `with` block closes the file
# even if a write fails.
with open("record.csv", 'a') as f:
    for row in predictions_df.values:
        f.write("".join(str(value) + "," for value in row) + "\n")