<a href="https://colab.research.google.com/github/ccstevie/nhl_model/blob/main/model2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from io import StringIO

import pandas as pd
import requests

In [2]:
# Per-game table requested with loc=H (treated as the home side downstream),
# covering fromseason=20132014 through thruseason=20232024.
urlH = "https://www.naturalstattrick.com/games.php?fromseason=20132014&thruseason=20232024&stype=2&sit=all&loc=H&team=All&rate=n"
# An explicit timeout keeps a stalled connection from hanging the kernel indefinitely.
reqH = requests.get(urlH, timeout=30)
# Displayed as the cell output so a failed download is visible immediately.
reqH.status_code

200

In [3]:
# Per-game table requested with loc=A (treated as the away side downstream),
# covering fromseason=20132014 through thruseason=20232024.
urlA = "https://www.naturalstattrick.com/games.php?fromseason=20132014&thruseason=20232024&stype=2&sit=all&loc=A&team=All&rate=n"
# An explicit timeout keeps a stalled connection from hanging the kernel indefinitely.
reqA = requests.get(urlA, timeout=30)
# Displayed as the cell output so a failed download is visible immediately.
reqA.status_code

200

In [4]:
# Parse the page already downloaded into `reqH` instead of handing the URL to
# pandas, which would fetch it a second time; fail loudly if that download
# did not succeed.
reqH.raise_for_status()
dfH = (
    # First table on the page; "-" cells are read as missing values.
    pd.read_html(StringIO(reqH.text), header=0, na_values=["-"])[0]
    # Discard identifiers, raw counts and summary stats so that only the team
    # name, GF/GA and the remaining percentage columns are left.
    .drop(columns=["Unnamed: 2", "Game", "TOI", "Attendance", "GF%", "PDO", "SH%", "SV%", "CF", "CA", "FF",
                   "FA", "SF", "SA", "xGF", "xGA", "SCF", "SCA", "HDCF", "HDCA", "HDSF", "HDSA", "HDGF", "HDGA", "MDCF", "MDCA",
                   "MDSF", "MDSA", "MDGF", "MDGA", "LDCF", "LDCA", "LDSF", "LDSA", "LDGF", "LDGA"])
    # Label every remaining column as belonging to the home side.
    .add_prefix("Home ")
)

In [5]:
# Show the column labels that remain after the drop and the "Home " prefixing.
dfH.columns

Index(['Home Team', 'Home CF%', 'Home FF%', 'Home SF%', 'Home GF', 'Home GA',
       'Home xGF%', 'Home SCF%', 'Home HDCF%', 'Home HDSF%', 'Home HDGF%',
       'Home HDSH%', 'Home HDSV%', 'Home MDCF%', 'Home MDSF%', 'Home MDGF%',
       'Home MDSH%', 'Home MDSV%', 'Home LDCF%', 'Home LDSF%', 'Home LDGF%',
       'Home LDSH%', 'Home LDSV%'],
      dtype='object')

In [6]:
# Parse the page already downloaded into `reqA` instead of handing the URL to
# pandas, which would fetch it a second time; fail loudly if that download
# did not succeed.
reqA.raise_for_status()
dfA = (
    # First table on the page; "-" cells are read as missing values.
    pd.read_html(StringIO(reqA.text), header=0, na_values=["-"])[0]
    # Same column selection as the home table, so both sides expose matching features.
    .drop(columns=["Unnamed: 2", "Game", "TOI", "Attendance", "GF%", "PDO", "SH%", "SV%", "CF", "CA", "FF",
                   "FA", "SF", "SA", "xGF", "xGA", "SCF", "SCA", "HDCF", "HDCA", "HDSF", "HDSA", "HDGF", "HDGA", "MDCF", "MDCA",
                   "MDSF", "MDSA", "MDGF", "MDGA", "LDCF", "LDCA", "LDSF", "LDSA", "LDGF", "LDGA"])
    # Label every remaining column as belonging to the away side.
    .add_prefix("Away ")
)

In [7]:
# Combine the home and away game tables into a single matchup table, pairing
# rows that share the same positional index.
# NOTE(review): assumes both tables list the same games in the same row order — TODO confirm.
df = dfH.merge(dfA, how="inner", left_index=True, right_index=True)

In [8]:
# Remove every game that has a missing value in any column, then discard the
# away-side goal columns (presumably they mirror "Home GA" / "Home GF" — TODO confirm).
df = df.dropna(axis=0, how="any").drop(["Away GF", "Away GA"], axis=1)

In [9]:
# Feature matrix: everything except the team names and the two goal columns,
# rescaled from percentage values to decimal form in one vectorised step.
X = df.drop(columns=["Home Team", "Away Team", "Home GF", "Home GA"]).div(100.0)
X.head(1)

Unnamed: 0,Home CF%,Home FF%,Home SF%,Home xGF%,Home SCF%,Home HDCF%,Home HDSF%,Home HDGF%,Home HDSH%,Home HDSV%,...,Away MDCF%,Away MDSF%,Away MDGF%,Away MDSH%,Away MDSV%,Away LDCF%,Away LDSF%,Away LDGF%,Away LDSH%,Away LDSV%
1,0.4911,0.5063,0.5224,0.4631,0.4902,0.6667,0.6471,0.6667,0.3636,0.6667,...,0.6061,0.6471,0.6667,0.1818,0.8333,0.537,0.4839,0.0,0.0,0.9375


In [10]:
# Targets: goals scored by and against the home team in each game.
y = df.loc[:, ["Home GF", "Home GA"]]

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the games for evaluation; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

# Seeded random forest used as the per-target estimator.
base_model = RandomForestRegressor(random_state=42)
# Wrap it so both target columns ("Home GF", "Home GA") are predicted together.
model = MultiOutputRegressor(estimator=base_model)
model.fit(X_train, y_train)

In [13]:
from sklearn.metrics import mean_squared_error
# Predict on both splits first, then score them.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

train_mse = mean_squared_error(y_true=y_train, y_pred=y_train_pred)
test_mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred)

# Reporting both side by side makes a gap between seen and unseen data visible.
print("Training Mean Squared Error:", train_mse)
print("Test Mean Squared Error:", test_mse)

Training Mean Squared Error: 0.0799782832278481
Test Mean Squared Error: 0.5092370662460568


In [14]:
from sklearn.model_selection import cross_val_score

num_folds = 5

# The scorer reports negated MSE, so the sign is flipped straight away to get
# ordinary (positive) mean squared errors, one per fold.
mse_scores = -cross_val_score(model, X, y, cv=num_folds, scoring='neg_mean_squared_error')
mean_mse = mse_scores.mean()

print("Cross-Validation Mean Squared Error Scores:")
print(mse_scores)
print("Mean MSE:", mean_mse)

Cross-Validation Mean Squared Error Scores:
[0.51662192 0.52304399 0.57999209 0.44851978 0.67199019]
Mean MSE: 0.5480335937188036


In [15]:
from datetime import date, timedelta

# Length, in days, of the trailing window whose team stats feed the predictions.
# Kept in one place so the window and the printed label cannot drift apart.
LOOKBACK_DAYS = 30

today = date.today()
print("Today's date:", today)
# First day of the window: LOOKBACK_DAYS before today.
start = today - timedelta(days=LOOKBACK_DAYS)
print(f"{LOOKBACK_DAYS} days ago:", start)

Today's date: 2024-03-06
30 days ago: 2024-02-05


In [16]:
# Team table for the 2023-24 season, limited to the window between `start` and `today`
# via the fd/td query parameters.
url = f"https://www.naturalstattrick.com/teamtable.php?fromseason=20232024&thruseason=20232024&stype=2&sit=all&score=all&rate=n&team=all&loc=B&gpf=410&fd={start}&td={today}"
# An explicit timeout keeps a stalled connection from hanging the kernel indefinitely.
req = requests.get(url, timeout=30)
# Displayed as the cell output so a failed download is visible immediately.
req.status_code

200

In [17]:
# Parse the page already downloaded into `req` instead of handing the URL to
# pandas, which would fetch it a second time; fail loudly if that download
# did not succeed.
req.raise_for_status()
# First table on the page, first column as the index, "-" cells read as missing values.
df2 = pd.read_html(StringIO(req.text), header=0, index_col=0, na_values=["-"])[0]

In [18]:
from get_todays_games import getGames

# Today's schedule from the project helper; each entry is later unpacked as (away, home).
matchups = getGames()
# Display the schedule so the games being predicted are visible in the notebook.
matchups


[('Sabres', 'Maple Leafs'), ('Red Wings', 'Avalanche'), ('Senators', 'Ducks')]

In [19]:
# Pair each scheduled game's home-team row with its away-team row from `df2`.
# Frames are collected in a list and concatenated once: growing a DataFrame with
# pd.concat inside the loop re-copies all earlier rows on every iteration and
# relies on concatenating with an empty frame.
matchup_frames = []

for away, home in matchups:
    # regex=False matches the name as literal text instead of compiling it as a pattern.
    home_df = df2[df2["Team"].str.contains(home, regex=False)]
    home_df = home_df.add_prefix('Home ').reset_index(drop=True)
    away_df = df2[df2["Team"].str.contains(away, regex=False)]
    away_df = away_df.add_prefix('Away ').reset_index(drop=True)
    # Both sides were re-indexed from 0, so the index merge lines them up row for row.
    matchup_frames.append(pd.merge(home_df, away_df, left_index=True, right_index=True))

# pd.concat rejects an empty list, so an empty schedule falls back to an empty frame.
res = pd.concat(matchup_frames, ignore_index=True) if matchup_frames else pd.DataFrame()

res.head(10)

Unnamed: 0,Home Team,Home GP,Home TOI,Home W,Home L,Home OTL,Home ROW,Home Points,Home Point %,Home CF,...,Away LDSA,Away LDSF%,Away LDGF,Away LDGA,Away LDGF%,Away LDSH%,Away LDSV%,Away SH%,Away SV%,Away PDO
0,Toronto Maple Leafs,14,845:54,10,4,0,9,20,0.714,881,...,166,53.5,10,5,66.67,5.24,96.99,8.65,91.42,1.001
1,Colorado Avalanche,14,845:35,6,6,2,6,14,0.5,856,...,153,46.88,6,4,60.0,4.44,97.39,12.54,90.06,1.026
2,Anaheim Ducks,11,665:00,4,6,1,4,9,0.409,531,...,152,49.83,2,12,14.29,1.32,92.11,9.78,87.83,0.976


In [20]:
# Keep the columns present in both frames, listed in the training frame's (`df`)
# column order rather than `res`'s, so the features handed to the model line up
# with the columns it was fitted on.
common_cols = [col for col in df.columns if col in res.columns]
matchups_df = res[common_cols]
# Same four non-feature columns that were removed when the training features were built.
final_df = matchups_df.drop(columns=["Home Team", "Away Team", "Home GA", "Home GF"])

In [21]:
# Rescale every feature from percentage values to decimal form, matching the
# scaling applied to the training features.
final_df = final_df.div(100.0)
final_df.head(10)

Unnamed: 0,Home CF%,Home FF%,Home SF%,Home xGF%,Home SCF%,Home HDCF%,Home HDSF%,Home HDGF%,Home HDSH%,Home HDSV%,...,Away MDCF%,Away MDSF%,Away MDGF%,Away MDSH%,Away MDSV%,Away LDCF%,Away LDSF%,Away LDGF%,Away LDSH%,Away LDSV%
0,0.4989,0.5186,0.5386,0.5359,0.5099,0.548,0.5495,0.54,0.2213,0.77,...,0.5222,0.5512,0.6111,0.0973,0.9239,0.5238,0.535,0.6667,0.0524,0.9699
1,0.4968,0.4976,0.5034,0.4713,0.5101,0.5042,0.506,0.5,0.1508,0.8455,...,0.49,0.506,0.6,0.1765,0.8795,0.4969,0.4688,0.6,0.0444,0.9739
2,0.4148,0.4004,0.3907,0.3529,0.408,0.3415,0.3439,0.3902,0.2963,0.7573,...,0.4976,0.5475,0.6,0.1531,0.8765,0.5011,0.4983,0.1429,0.0132,0.9211


In [22]:
# One prediction row per matchup; the two outputs follow the target order used at fit time ("Home GF", "Home GA").
predictions = model.predict(final_df)

In [23]:
# Team names for each matchup, in the same row order as the feature rows.
team_labels = matchups_df[["Home Team", "Away Team"]]
# The model's two outputs are home goals for and home goals against; the
# latter is reported as the away team's expected goals.
goal_estimates = pd.DataFrame(predictions, columns=["X Home Goals", "X Away Goals"])
predictions_df = pd.concat([team_labels, goal_estimates], axis=1)

In [24]:
# Stamp every row with the run date and move that column to the front.
# Written so that re-running the cell overwrites the existing "Date" column
# instead of adding a duplicate one.
predictions_df = predictions_df.assign(Date=today)
predictions_df = predictions_df[["Date", *predictions_df.columns.drop("Date")]]

In [25]:
# Expected goal margin from the home team's point of view, to two decimals.
home_goals = predictions_df["X Home Goals"]
away_goals = predictions_df["X Away Goals"]
predictions_df["X Home Spread"] = (home_goals - away_goals).round(2)

In [26]:
def _expected_winner(row):
    """Return the team with more predicted goals, or 'Draw' when neither side leads."""
    if row['X Home Goals'] > row['X Away Goals']:
        return row['Home Team']
    if row['X Away Goals'] > row['X Home Goals']:
        return row['Away Team']
    return 'Draw'


predictions_df['X Winner'] = predictions_df.apply(_expected_winner, axis=1)

In [27]:
# Show the finished predictions table (date, teams, expected goals, spread, winner).
predictions_df

Unnamed: 0,Date,Home Team,Away Team,X Home Goals,X Away Goals,X Home Spread,X Winner
0,2024-03-06,Toronto Maple Leafs,Buffalo Sabres,3.8,3.26,0.54,Toronto Maple Leafs
1,2024-03-06,Colorado Avalanche,Detroit Red Wings,3.24,3.49,-0.25,Detroit Red Wings
2,2024-03-06,Anaheim Ducks,Ottawa Senators,4.14,4.04,0.1,Anaheim Ducks


In [28]:
# Write today's predictions to csv.
# DataFrame.to_csv emits the header row followed by one line per game, quotes
# any field that contains a comma, and closes the file even if writing fails.
# index=False leaves out the row index, which the file never contained.
predictions_df.to_csv("predictions.csv", index=False)