In [9]:
import pandas as pd

In [10]:
home_data = pd.read_csv('home_games.csv')
away_data = pd.read_csv('away_games.csv')

In [11]:
# Drop unused columns
home_data.drop(columns=["Unnamed: 2", "TOI", "GF%", "PDO", "SH%", "SV%", "CF", "CA", "FF", 
                        "FA", "SF", "SA", "xGA", "SCF", "SCA", "HDCF", "HDCA", "HDGF", "HDGA"], inplace=True)
away_data.drop(columns=["Unnamed: 2", "TOI", "GF%", "PDO", "SH%", "SV%", "CF", "CA", "FF", 
                        "FA", "SF", "SA", "xGA", "SCF", "SCA", "HDCF", "HDCA", "HDGF", "HDGA"], inplace=True)

In [12]:
# Merge datasets, each row represents 1 game with advanced stats for both teams
merged_data = pd.merge(home_data, away_data, on='Game', suffixes=('_home', '_away'))

In [13]:
merged_data.columns

Index(['Game', 'Team_home', 'CF%_home', 'FF%_home', 'SF%_home', 'GF_home',
       'GA_home', 'xGF_home', 'xGF%_home', 'SCF%_home', 'HDCF%_home',
       'HDGF%_home', 'HDSH%_home', 'HDSV%_home', 'Team_away', 'CF%_away',
       'FF%_away', 'SF%_away', 'GF_away', 'GA_away', 'xGF_away', 'xGF%_away',
       'SCF%_away', 'HDCF%_away', 'HDGF%_away', 'HDSH%_away', 'HDSV%_away'],
      dtype='object')

In [14]:
# Drop NA
merged_data.replace('-', pd.NA, inplace=True)
merged_data.dropna(inplace=True)

In [15]:
# Convert "Game" column to Date
merged_data["Game"] = merged_data["Game"].str.split(" - ").str[0]
merged_data.rename(columns={'Game': 'Date'}, inplace=True)
# Convert 'Date' column from string to datetime
merged_data['Date'] = pd.to_datetime(merged_data['Date'])

In [16]:
drop = ["Date", "Team_home", "Team_away", "GF_home", "GA_home", "GF_away", "GA_away", "xGF_home", "xGF_away"]

for col in merged_data:
    if col not in drop:
        try:
            merged_data[col] = merged_data[col].astype(float)
            merged_data[col] = merged_data[col] / 100.0
        except:
            continue

In [17]:
merged_data

Unnamed: 0,Date,Team_home,CF%_home,FF%_home,SF%_home,GF_home,GA_home,xGF_home,xGF%_home,SCF%_home,...,SF%_away,GF_away,GA_away,xGF_away,xGF%_away,SCF%_away,HDCF%_away,HDGF%_away,HDSH%_away,HDSV%_away
0,2013-10-01,Montreal Canadiens,0.5895,0.6286,0.6042,2,2,2.42,0.6277,0.5789,...,0.3958,2,2,1.43,0.3723,0.4211,0.2353,0.3333,0.3333,0.8000
1,2013-10-01,Chicago Blackhawks,0.5526,0.5882,0.6098,4,1,1.44,0.5750,0.5714,...,0.3902,1,4,1.07,0.4250,0.4286,0.2000,0.0000,0.0000,0.7143
2,2013-10-01,Edmonton Oilers,0.5517,0.4918,0.5227,2,4,1.54,0.3679,0.4783,...,0.4773,4,2,2.64,0.6321,0.5217,0.6842,0.5000,0.1111,0.6667
3,2013-10-02,Philadelphia Flyers,0.5714,0.5273,0.4474,0,2,1.56,0.5740,0.5882,...,0.5526,2,0,1.16,0.4260,0.4118,0.3846,1.0000,0.4000,1.0000
4,2013-10-02,Detroit Red Wings,0.6825,0.7200,0.7105,2,1,1.39,0.6347,0.6786,...,0.2895,1,2,0.80,0.3653,0.3214,0.3750,0.5000,0.3333,0.7500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13124,2024-03-19,Edmonton Oilers,0.4302,0.3846,0.4595,2,1,1.43,0.4317,0.4444,...,0.5405,1,2,1.88,0.5683,0.5556,0.5000,0.3333,0.2500,0.7143
13125,2024-03-19,Anaheim Ducks,0.5217,0.5094,0.5250,0,3,1.33,0.5109,0.4000,...,0.4750,3,0,1.28,0.4891,0.6000,0.3333,1.0000,0.5000,1.0000
13126,2024-03-19,Los Angeles Kings,0.4516,0.5000,0.5610,5,1,2.36,0.6473,0.5000,...,0.4390,1,5,1.29,0.3527,0.5000,0.3158,0.0000,0.0000,0.5000
13127,2024-03-19,Vancouver Canucks,0.5062,0.5769,0.6452,1,0,3.38,0.7669,0.6829,...,0.3548,0,1,1.03,0.2331,0.3171,0.1579,0.0000,0.0000,0.8889


In [18]:
# Function to calculate games in the last 4 days
def games_in_last_4_days(group):
    group = group.sort_values('Date')  # Ensure games are sorted by date
    count_list = []  # Store counts

    for i in range(len(group)):
        current_date = group.iloc[i]['Date']
        past_games = group[(group['Date'] >= current_date - pd.Timedelta(days=4)) & 
                           (group['Date'] < current_date)]
        count_list.append(len(past_games))

    group['Games in Last 4 Days'] = count_list
    return group

In [19]:
# Apply the function separately for home and away teams
merged_data = merged_data.groupby('Team_home', group_keys=False).apply(games_in_last_4_days)
merged_data.rename(columns={'Games in Last 4 Days': 'Games in Last 4 Days_home'}, inplace=True)

merged_data = merged_data.groupby('Team_away', group_keys=False).apply(games_in_last_4_days)
merged_data.rename(columns={'Games in Last 4 Days': 'Games in Last 4 Days_away'}, inplace=True)


In [20]:
# Calculate goals above expected for home and away teams
merged_data['Goals Above Expected_home'] = merged_data['GF_home'] - merged_data['xGF_home']
merged_data['Goals Above Expected_away'] = merged_data['GF_away'] - merged_data['xGF_away']

In [21]:
# Calculate rolling averages for home and away teams
def calculate_rolling_averages(group, window=3):
    group['Rolling Goals Above Expected_home'] = group['Goals Above Expected_home'].rolling(window=window, min_periods=1).sum()
    group['Rolling Goals Above Expected_away'] = group['Goals Above Expected_away'].rolling(window=window, min_periods=1).sum()
    
    return group

In [22]:
# Apply rolling averages to each home and away team
merged_data = merged_data.groupby('Team_home').apply(calculate_rolling_averages)
merged_data = merged_data.groupby('Team_away').apply(calculate_rolling_averages)

In [23]:
# Add rolling averages for goals scored and conceded (from earlier code)
merged_data['GF_avg_home'] = merged_data.groupby('Team_home')['GF_home'].transform(lambda x: x.rolling(window=5, min_periods=1).mean())
merged_data['GA_avg_home'] = merged_data.groupby('Team_home')['GA_home'].transform(lambda x: x.rolling(window=5, min_periods=1).mean())
merged_data['GF_avg_away'] = merged_data.groupby('Team_away')['GF_away'].transform(lambda x: x.rolling(window=5, min_periods=1).mean())
merged_data['GA_avg_away'] = merged_data.groupby('Team_away')['GA_away'].transform(lambda x: x.rolling(window=5, min_periods=1).mean())

In [24]:
merged_data.columns

Index(['Date', 'Team_home', 'CF%_home', 'FF%_home', 'SF%_home', 'GF_home',
       'GA_home', 'xGF_home', 'xGF%_home', 'SCF%_home', 'HDCF%_home',
       'HDGF%_home', 'HDSH%_home', 'HDSV%_home', 'Team_away', 'CF%_away',
       'FF%_away', 'SF%_away', 'GF_away', 'GA_away', 'xGF_away', 'xGF%_away',
       'SCF%_away', 'HDCF%_away', 'HDGF%_away', 'HDSH%_away', 'HDSV%_away',
       'Games in Last 4 Days_home', 'Games in Last 4 Days_away',
       'Goals Above Expected_home', 'Goals Above Expected_away',
       'Rolling Goals Above Expected_home',
       'Rolling Goals Above Expected_away', 'GF_avg_home', 'GA_avg_home',
       'GF_avg_away', 'GA_avg_away'],
      dtype='object')

In [25]:
# Define features and target
features = [
    # Home team stats
    'CF%_home', 'FF%_home', 'SF%_home', 'xGF%_home', 'SCF%_home', 'HDCF%_home',
    
    # Away team stats
    'CF%_away', 'FF%_away', 'SF%_away', 'xGF%_away', 'SCF%_away', 'HDCF%_away',
    
    # Rolling averages
    'GF_avg_home', 'GA_avg_home', 'GF_avg_away', 'GA_avg_away',
    
    # Goals above expected (rolling)
    'Rolling Goals Above Expected_home', 'Rolling Goals Above Expected_away',

    # Number of games in 4 day window
    'Games in Last 4 Days_home', 'Games in Last 4 Days_away'
]

target_home = 'GF_home'
target_away = 'GF_away'

In [192]:
# Split the data into features and targets
X = merged_data[features]
y_home = merged_data[target_home]
y_away = merged_data[target_away]

In [193]:
# Select a team to inspect (e.g., 'Toronto Maple Leafs')
team_to_inspect = 'Toronto Maple Leafs'

# Filter the dataset for games where the selected team is the away team
team_games = merged_data[merged_data['Team_home'] == team_to_inspect]

# Display relevant columns
inspection_columns = ['Date', 'Team_home', 'Games in Last 4 Days_home']
print(team_games[inspection_columns].head(10))  # Inspect the first 10 games

          Date            Team_home  Games in Last 4 Days_home
21  2013-10-05  Toronto Maple Leafs                          0
36  2013-10-08  Toronto Maple Leafs                          1
63  2013-10-12  Toronto Maple Leafs                          1
96  2013-10-17  Toronto Maple Leafs                          0
128 2013-10-22  Toronto Maple Leafs                          0
312 2013-11-19  Toronto Maple Leafs                          0
409 2013-12-03  Toronto Maple Leafs                          0
422 2013-12-05  Toronto Maple Leafs                          1
449 2013-12-08  Toronto Maple Leafs                          1
489 2013-12-14  Toronto Maple Leafs                          0


In [194]:
import numpy as np

# Combine home and away scores into a single target variable
y = np.column_stack((y_home, y_away))  # y_home and y_away are from your earlier code

In [195]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [196]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

# Define the model
model = RandomForestRegressor(random_state=42)

# Wrap the model in MultiOutputRegressor (if needed, though RandomForestRegressor supports multi-output natively)
multi_output_model = MultiOutputRegressor(model)

# Train the model
multi_output_model.fit(X_train, y_train)

In [197]:
from sklearn.metrics import mean_absolute_error

# Predict on the test set
y_pred = multi_output_model.predict(X_test)

# Calculate MAE for home and away scores
mae_home = mean_absolute_error(y_test[:, 0], y_pred[:, 0])  # Home team scores
mae_away = mean_absolute_error(y_test[:, 1], y_pred[:, 1])  # Away team scores

print(f"MAE for Home Team Goals: {mae_home}")
print(f"MAE for Away Team Goals: {mae_away}")

MAE for Home Team Goals: 0.8694817073170732
MAE for Away Team Goals: 0.7796080139372822


In [198]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'estimator__n_estimators': [100, 200, 300],
    'estimator__max_depth': [None, 10, 20, 30],
    'estimator__min_samples_split': [2, 5, 10],
    'estimator__min_samples_leaf': [1, 2, 4]
}

# Use GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=multi_output_model, param_grid=param_grid, 
                           cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate the best model
y_pred = best_model.predict(X_test)
mae_home = mean_absolute_error(y_test[:, 0], y_pred[:, 0])
mae_away = mean_absolute_error(y_test[:, 1], y_pred[:, 1])

print(f"Best Model MAE for Home Team Goals: {mae_home}")
print(f"Best Model MAE for Away Team Goals: {mae_away}")

Best Model MAE for Home Team Goals: 0.8614322367229664
Best Model MAE for Away Team Goals: 0.7739479999978218


In [201]:
# Access feature importance for home and away scores
importances_home = best_model.estimators_[0].feature_importances_
importances_away = best_model.estimators_[1].feature_importances_

# Create DataFrames to visualize feature importance
feature_importance_home = pd.DataFrame({'Feature': features, 'Importance': importances_home})
feature_importance_away = pd.DataFrame({'Feature': features, 'Importance': importances_away})

print("Feature Importance for Home Team Goals:")
print(feature_importance_home.sort_values(by='Importance', ascending=False))

print("\nFeature Importance for Away Team Goals:")
print(feature_importance_away.sort_values(by='Importance', ascending=False))

Feature Importance for Home Team Goals:
                              Feature  Importance
16  Rolling Goals Above Expected_home    0.413261
12                        GF_avg_home    0.245222
15                        GA_avg_away    0.036754
3                           xGF%_home    0.033714
9                           xGF%_away    0.033283
17  Rolling Goals Above Expected_away    0.029223
11                         HDCF%_away    0.022464
6                            CF%_away    0.021158
0                            CF%_home    0.020522
5                          HDCF%_home    0.020326
13                        GA_avg_home    0.018628
8                            SF%_away    0.015266
2                            SF%_home    0.014985
4                           SCF%_home    0.014319
10                          SCF%_away    0.013749
14                        GF_avg_away    0.013560
1                            FF%_home    0.011916
7                            FF%_away    0.011704
19        

In [202]:
import joblib

# Save the model
joblib.dump(best_model, 'nhl_score_predictor.pkl')

['nhl_score_predictor.pkl']

### Predict

In [2]:
import joblib

best_model = joblib.load('nhl_score_predictor.pkl')

In [3]:
from datetime import date, timedelta
import requests

# we will use the last 30 days as our data range
today = date.today()
print("Today's date:", today)
start = today - timedelta(days=30)
print("30 days ago:", start)

Today's date: 2025-03-20
30 days ago: 2025-02-18


In [4]:
url = f"https://www.naturalstattrick.com/teamtable.php?fromseason=20242025&thruseason=20242025&stype=2&sit=all&score=all&rate=n&team=all&loc=B&gpf=410&fd={start}&td={today}"
req = requests.get(url)
req.status_code

200

In [5]:
import pandas as pd

df = pd.read_html(url, header=0, index_col = 0, na_values=["-"])[0]

In [6]:
from get_todays_games import getGames

matchups = getGames()
matchups

[('Avalanche', 'Senators'),
 ('Flames', 'Devils'),
 ('Maple Leafs', 'Rangers'),
 ('Flyers', 'Capitals'),
 ('Panthers', 'Blue Jackets'),
 ('Canadiens', 'Islanders'),
 ('Canucks', 'Blues'),
 ('Ducks', 'Predators'),
 ('Lightning', 'Stars'),
 ('Kings', 'Blackhawks'),
 ('Jets', 'Oilers'),
 ('Sabres', 'Hockey Club'),
 ('Bruins', 'Golden Knights'),
 ('Hurricanes', 'Sharks')]

In [7]:
res = pd.DataFrame()

for away, home in matchups:
    # Extract home and away team stats
    home_df = df[df["Team"].str.contains(home)]
    away_df = df[df["Team"].str.contains(away)]
    
    # Add prefixes to away team stats
    home_df = home_df.add_suffix('_home')
    away_df = away_df.add_suffix('_away')
    
    # Merge home and away stats into a single row
    home_df = home_df.reset_index(drop=True)
    away_df = away_df.reset_index(drop=True)
    matchup_df = pd.merge(home_df, away_df, left_index=True, right_index=True)
    
    # Append to the result DataFrame
    res = pd.concat([res, matchup_df], ignore_index=True)

In [26]:
common_cols = [col for col in res.columns.intersection(merged_data.columns)]
matchups_df = res[common_cols]

In [27]:
drop = ["Team_home", "Team_away", "GA_home", "GF_home", "GA_away", "GF_away", "xGF_home", "xGF_away"]

# Convert percentage values to decimal form
for col in matchups_df:
    if col not in drop:
        try:
            matchups_df = matchups_df.copy()
            matchups_df[col] = matchups_df[col].astype(float) / 100.0
        except:
            continue

In [28]:
matchups_df

Unnamed: 0,Team_home,CF%_home,FF%_home,SF%_home,GF_home,GA_home,xGF_home,xGF%_home,SCF%_home,HDCF%_home,...,SF%_away,GF_away,GA_away,xGF_away,xGF%_away,SCF%_away,HDCF%_away,HDGF%_away,HDSH%_away,HDSV%_away
0,Ottawa Senators,0.4903,0.4819,0.48,40,36,38.8,0.5215,0.4943,0.5057,...,0.5623,43,24,40.34,0.5786,0.5567,0.5251,0.4828,0.1892,0.8101
1,New Jersey Devils,0.4812,0.4749,0.4803,31,36,32.61,0.4533,0.486,0.4596,...,0.4894,25,31,29.08,0.438,0.4708,0.438,0.4815,0.1667,0.8462
2,New York Rangers,0.4931,0.4917,0.4584,40,38,43.45,0.4965,0.4904,0.4441,...,0.4627,48,43,41.48,0.4928,0.4738,0.4876,0.4324,0.1818,0.7789
3,Washington Capitals,0.5318,0.5318,0.531,51,36,39.23,0.5156,0.5292,0.5302,...,0.5201,28,42,31.9,0.4769,0.4862,0.5217,0.4,0.1687,0.6957
4,Columbus Blue Jackets,0.4896,0.491,0.465,34,37,37.03,0.4813,0.4684,0.5,...,0.5651,29,19,39.36,0.5863,0.5657,0.5709,0.64,0.1569,0.8615
5,New York Islanders,0.4962,0.496,0.516,31,36,38.56,0.4696,0.4939,0.4795,...,0.4785,40,25,32.23,0.4634,0.4223,0.4449,0.56,0.1795,0.8451
6,St Louis Blues,0.5115,0.5256,0.535,51,31,37.5,0.5614,0.5357,0.5476,...,0.4915,35,39,36.48,0.4841,0.5085,0.4781,0.4706,0.1975,0.8144
7,Nashville Predators,0.5064,0.5122,0.5152,28,39,41.15,0.5478,0.5083,0.5101,...,0.4337,45,50,39.56,0.4536,0.4578,0.4582,0.449,0.2418,0.7672
8,Dallas Stars,0.4725,0.4524,0.4545,47,38,37.37,0.4894,0.5029,0.5189,...,0.5046,39,25,39.76,0.5613,0.5229,0.5424,0.7083,0.2073,0.9041
9,Chicago Blackhawks,0.4332,0.4226,0.4342,35,52,30.15,0.417,0.411,0.4412,...,0.5172,35,35,37.32,0.5111,0.513,0.4808,0.4412,0.1852,0.7979


In [29]:
urlH = f"https://www.naturalstattrick.com/games.php?fromseason=20242025&thruseason=20242025&stype=2&sit=5v5&loc=H&team=All&rate=n"
reqH = requests.get(urlH)
reqH.status_code

200

In [30]:
urlA = f"https://www.naturalstattrick.com/games.php?fromseason=20242025&thruseason=20242025&stype=2&sit=5v5&loc=A&team=All&rate=n"
reqA = requests.get(urlA)
reqA.status_code

200

In [31]:
home_szn_data = pd.read_html(urlH, header=0, na_values=["-"])[0]
away_szn_data = pd.read_html(urlA, header=0, na_values=["-"])[0]

In [32]:
full_szn_data = pd.merge(home_szn_data, away_szn_data, on='Game', suffixes=('_home', '_away'))

In [33]:
full_szn_data["Game"] = full_szn_data["Game"].str.split(" - ").str[0]
full_szn_data.rename(columns={"Game": "Date"}, inplace=True)
full_szn_data['Date'] = pd.to_datetime(full_szn_data['Date'])

In [34]:
full_szn_data = full_szn_data[["Date"] + common_cols]

In [35]:
drop = ["Team_home", "Team_away", "GA_home", "GF_home", "GA_away", "GF_away", "xGF_home", "xGF_away"]

# Convert percentage values to decimal form
for col in full_szn_data:
    if col not in drop:
        try:
            full_szn_data = full_szn_data.copy()
            full_szn_data[col] = full_szn_data[col].astype(float) / 100.0
        except:
            continue

In [36]:
full_szn_data.columns

Index(['Date', 'Team_home', 'CF%_home', 'FF%_home', 'SF%_home', 'GF_home',
       'GA_home', 'xGF_home', 'xGF%_home', 'SCF%_home', 'HDCF%_home',
       'HDGF%_home', 'HDSH%_home', 'HDSV%_home', 'Team_away', 'CF%_away',
       'FF%_away', 'SF%_away', 'GF_away', 'GA_away', 'xGF_away', 'xGF%_away',
       'SCF%_away', 'HDCF%_away', 'HDGF%_away', 'HDSH%_away', 'HDSV%_away'],
      dtype='object')

In [37]:
from datetime import datetime, timedelta

# Get today's date
today = datetime.today()

# Function to calculate games in the last 4 days
def games_in_last_4_days(team_games, today):
    # Filter games within the last 4 days (excluding today)
    last_4_days_games = team_games[(team_games['Date'] >= today - timedelta(days=4)) & 
                                 (team_games['Date'] < today)]
    return len(last_4_days_games)

# Function to calculate rolling averages and goals above expected
def calculate_rolling_stats(team_games, window=5):
    # Combine GF and GA for home and away games
    team_games['GF'] = team_games.apply(lambda row: row['GF_home'] if row['Team_home'] == team else row['GF_away'], axis=1)
    team_games['GA'] = team_games.apply(lambda row: row['GA_home'] if row['Team_home'] == team else row['GA_away'], axis=1)
    team_games['xGF'] = team_games.apply(lambda row: row['xGF_home'] if row['Team_home'] == team else row['xGF_away'], axis=1)
    
    # Calculate rolling averages for GF and GA
    team_games['GF_avg'] = team_games['GF'].rolling(window=window, min_periods=1).mean()
    team_games['GA_avg'] = team_games['GA'].rolling(window=window, min_periods=1).mean()
    
    # Calculate rolling goals above expected
    team_games['Rolling Goals Above Expected'] = (team_games['GF'] - team_games['xGF']).rolling(window=window, min_periods=1).sum()
    
    # Return the latest values
    return team_games.iloc[-1][['GF_avg', 'GA_avg', 'Rolling Goals Above Expected']]

# Create a dictionary to store the latest stats for each team
latest_stats = {}

# Iterate through each team in full_szn_data
for team in pd.concat([full_szn_data['Team_home'], full_szn_data['Team_away']]).unique():
    # Filter historical data for the team (up to today)
    team_games = full_szn_data[(full_szn_data['Team_home'] == team) | 
                               (full_szn_data['Team_away'] == team)]
    team_games = team_games[team_games['Date'] < today].sort_values('Date')
    
    # Calculate games in the last 4 days
    games_last_4_days = games_in_last_4_days(team_games, today)
    
    # Calculate rolling stats
    rolling_stats = calculate_rolling_stats(team_games)
    
    # Store the latest stats for the team
    latest_stats[team] = {
        'Games in Last 4 Days': games_last_4_days,
        'GF_avg': rolling_stats['GF_avg'],
        'GA_avg': rolling_stats['GA_avg'],
        'Rolling Goals Above Expected': rolling_stats['Rolling Goals Above Expected']
    }

# Add features to matchups_df
for index, row in matchups_df.iterrows():
    home_team = row['Team_home']
    away_team = row['Team_away']
    
    # Add home team features
    matchups_df.at[index, 'Games in Last 4 Days_home'] = latest_stats[home_team]['Games in Last 4 Days']
    matchups_df.at[index, 'GF_avg_home'] = latest_stats[home_team]['GF_avg']
    matchups_df.at[index, 'GA_avg_home'] = latest_stats[home_team]['GA_avg']
    matchups_df.at[index, 'Rolling Goals Above Expected_home'] = latest_stats[home_team]['Rolling Goals Above Expected']
    
    # Add away team features
    matchups_df.at[index, 'Games in Last 4 Days_away'] = latest_stats[away_team]['Games in Last 4 Days']
    matchups_df.at[index, 'GF_avg_away'] = latest_stats[away_team]['GF_avg']
    matchups_df.at[index, 'GA_avg_away'] = latest_stats[away_team]['GA_avg']
    matchups_df.at[index, 'Rolling Goals Above Expected_away'] = latest_stats[away_team]['Rolling Goals Above Expected']

# Check the updated matchups_df
print(matchups_df.head())

               Team_home  CF%_home  FF%_home  SF%_home  GF_home  GA_home  \
0        Ottawa Senators    0.4903    0.4819    0.4800       40       36   
1      New Jersey Devils    0.4812    0.4749    0.4803       31       36   
2       New York Rangers    0.4931    0.4917    0.4584       40       38   
3    Washington Capitals    0.5318    0.5318    0.5310       51       36   
4  Columbus Blue Jackets    0.4896    0.4910    0.4650       34       37   

   xGF_home  xGF%_home  SCF%_home  HDCF%_home  ...  HDSH%_away  HDSV%_away  \
0     38.80     0.5215     0.4943      0.5057  ...      0.1892      0.8101   
1     32.61     0.4533     0.4860      0.4596  ...      0.1667      0.8462   
2     43.45     0.4965     0.4904      0.4441  ...      0.1818      0.7789   
3     39.23     0.5156     0.5292      0.5302  ...      0.1687      0.6957   
4     37.03     0.4813     0.4684      0.5000  ...      0.1569      0.8615   

   Games in Last 4 Days_home GF_avg_home  GA_avg_home  \
0                

In [38]:
X = matchups_df[features]
X

Unnamed: 0,CF%_home,FF%_home,SF%_home,xGF%_home,SCF%_home,HDCF%_home,CF%_away,FF%_away,SF%_away,xGF%_away,SCF%_away,HDCF%_away,GF_avg_home,GA_avg_home,GF_avg_away,GA_avg_away,Rolling Goals Above Expected_home,Rolling Goals Above Expected_away,Games in Last 4 Days_home,Games in Last 4 Days_away
0,0.4903,0.4819,0.48,0.5215,0.4943,0.5057,0.5684,0.5721,0.5623,0.5786,0.5567,0.5251,2.4,2.2,1.6,1.0,4.07,-3.17,1.0,1.0
1,0.4812,0.4749,0.4803,0.4533,0.486,0.4596,0.4821,0.4775,0.4894,0.438,0.4708,0.438,2.4,2.4,1.2,1.6,3.64,-3.05,1.0,2.0
2,0.4931,0.4917,0.4584,0.4965,0.4904,0.4441,0.4508,0.4573,0.4627,0.4928,0.4738,0.4876,1.4,0.8,1.4,1.4,-1.85,-2.43,1.0,2.0
3,0.5318,0.5318,0.531,0.5156,0.5292,0.5302,0.4918,0.4955,0.5201,0.4769,0.4862,0.5217,3.6,2.0,1.2,2.4,5.34,-0.92,1.0,1.0
4,0.4896,0.491,0.465,0.4813,0.4684,0.5,0.5614,0.5451,0.5651,0.5863,0.5657,0.5709,2.2,2.4,1.4,1.8,-0.22,-2.55,1.0,0.0
5,0.4962,0.496,0.516,0.4696,0.4939,0.4795,0.4391,0.4444,0.4785,0.4634,0.4223,0.4449,1.8,1.4,2.0,1.4,-0.11,-0.22,1.0,1.0
6,0.5115,0.5256,0.535,0.5614,0.5357,0.5476,0.4898,0.5038,0.4915,0.4841,0.5085,0.4781,2.4,1.8,2.6,1.8,1.29,2.15,1.0,1.0
7,0.5064,0.5122,0.5152,0.5478,0.5083,0.5101,0.4584,0.4476,0.4337,0.4536,0.4578,0.4582,0.6,1.8,2.0,2.6,-7.11,0.17,1.0,1.0
8,0.4725,0.4524,0.4545,0.4894,0.5029,0.5189,0.5126,0.5199,0.5046,0.5613,0.5229,0.5424,2.2,2.6,1.4,2.2,1.35,-3.59,1.0,1.0
9,0.4332,0.4226,0.4342,0.417,0.411,0.4412,0.5313,0.5313,0.5172,0.5111,0.513,0.4808,1.2,3.4,1.6,0.8,-3.82,-1.64,1.0,1.0


In [39]:
# Make predictions
predictions = best_model.predict(X)

In [41]:
# Format predictions into a DataFrame
predictions_df = pd.DataFrame(predictions, columns=["X Goals Home", "X Goals Away"])

# Add team names and other metadata to the predictions
predictions_df = pd.concat([matchups_df[["Team_home", "Team_away"]], predictions_df], axis=1)

# Calculate additional columns (e.g., spread, winner)
predictions_df["X Spread"] = round(predictions_df["X Goals Home"] - predictions_df["X Goals Away"], 2)
predictions_df['X Winner'] = predictions_df.apply(
    lambda row: row['Team_home'] if row['X Goals Home'] > row['X Goals Away'] else row['Team_away'] if row['X Goals Away'] > row['X Goals Home'] else 'Draw', axis=1
)

# Add today's date
predictions_df.insert(0, "Date", date.today())

# Display the predictions
print(predictions_df)

# Save predictions to a CSV file
predictions_df.to_csv("predictions.csv", index=False)

          Date              Team_home            Team_away  X Goals Home  \
0   2025-03-20        Ottawa Senators   Colorado Avalanche      3.164138   
1   2025-03-20      New Jersey Devils       Calgary Flames      3.252948   
2   2025-03-20       New York Rangers  Toronto Maple Leafs      0.972744   
3   2025-03-20    Washington Capitals  Philadelphia Flyers      4.409255   
4   2025-03-20  Columbus Blue Jackets     Florida Panthers      2.037913   
5   2025-03-20     New York Islanders   Montreal Canadiens      1.572592   
6   2025-03-20         St Louis Blues    Vancouver Canucks      2.505303   
7   2025-03-20    Nashville Predators        Anaheim Ducks      0.949005   
8   2025-03-20           Dallas Stars  Tampa Bay Lightning      2.408899   
9   2025-03-20     Chicago Blackhawks    Los Angeles Kings      0.347955   
10  2025-03-20        Edmonton Oilers        Winnipeg Jets      2.032744   
11  2025-03-20       Utah Hockey Club       Buffalo Sabres      1.504629   
12  2025-03-