In [1]:
import os
from datetime import date, timedelta
from random import randint
from time import sleep
from urllib.parse import quote_plus

import joblib
import numpy as np
import pandas as pd
import pymysql
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LassoCV
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sqlalchemy import create_engine

In [2]:
# Database connection settings.
# Each value is taken from an environment variable when one is set, so real
# credentials do not have to be stored in the notebook. The fallbacks are the
# original local-development values, so behaviour is unchanged by default.
username = os.environ.get("NHL_DB_USER", "root")
password = os.environ.get("NHL_DB_PASSWORD", "root")
host = os.environ.get("NHL_DB_HOST", "localhost")
port = os.environ.get("NHL_DB_PORT", "3307")
database = os.environ.get("NHL_DB_NAME", "nhl_optimizer")

# User and password are URL-escaped: characters such as '@', ':' or '/' in a
# credential would otherwise be parsed as part of the URL structure.
engine = create_engine(
    f"mysql+pymysql://{quote_plus(username)}:{quote_plus(password)}@{host}:{port}/{database}"
)

In [3]:
# Read every row and column of the player_data table into a DataFrame.
query = "SELECT * FROM player_data"
df = pd.read_sql(sql=query, con=engine)


## Goals Model

In [6]:
# --- Features and target -----------------------------------------------------
# The player name and all per-60 scoring-rate columns are removed from the
# predictors; the target for this model is the goals-per-60 rate.
excluded_columns = ['Player', 'Goals/60', 'Total Assists/60', 'First Assists/60', 'Second Assists/60', 'Total Points/60']
X = df.drop(columns=excluded_columns)
y = df['Goals/60']

# 80/20 train/test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Preprocessing -----------------------------------------------------------
# Numeric columns (int64/float64) are standardised; Team and Position are
# one-hot encoded, with categories unseen during fit ignored at predict time.
categorical_features = ['Team', 'Position']
numeric_features = list(X.select_dtypes(include=['int64', 'float64']).columns)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# --- Model -------------------------------------------------------------------
# Ridge regression with alpha=10, chained after the preprocessor so the
# scaler and encoder are fitted on the training split only.
ridge = Ridge(alpha=10)
goals_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', ridge),
    ]
)

# Fit on the training split, then score the held-out split.
y_pred = goals_pipeline.fit(X_train, y_train).predict(X_test)

# --- Evaluation --------------------------------------------------------------
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")
print(f"R^2 Value: {r2}")

Mean Absolute Error: 0.06439601606150593
R^2 Value: 0.9580436657773154


## Assists Model

In [7]:
# --- Features and target -----------------------------------------------------
# Same predictor set as the goals model (player name and per-60 scoring rates
# removed); the target here is the total-assists-per-60 rate.
excluded_columns = ['Player', 'Goals/60','Total Assists/60', 'First Assists/60', 'Second Assists/60', 'Total Points/60']
X = df.drop(columns=excluded_columns)
y = df['Total Assists/60']

# 80/20 train/test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Preprocessing -----------------------------------------------------------
# Numeric columns (int64/float64) are standardised; Team and Position are
# one-hot encoded, with categories unseen during fit ignored at predict time.
categorical_features = ['Team', 'Position']
numeric_features = list(X.select_dtypes(include=['int64', 'float64']).columns)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# --- Model -------------------------------------------------------------------
# A fresh Ridge(alpha=10) and preprocessor are built so this pipeline shares
# no fitted state with the goals pipeline.
ridge = Ridge(alpha=10)
assists_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', ridge),
    ]
)

# Fit on the training split, then score the held-out split.
y_pred = assists_pipeline.fit(X_train, y_train).predict(X_test)

# --- Evaluation --------------------------------------------------------------
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")
print(f"R^2 Value: {r2}")

Mean Absolute Error: 0.22165411977203472
R^2 Value: 0.6659799919158238


In [8]:
# Persist both fitted pipelines (preprocessing + regressor) to disk.
# The assists dump stays last so the cell still displays its returned file list.
joblib.dump(value=goals_pipeline, filename='goals_model.pkl')
joblib.dump(value=assists_pipeline, filename='assists_model.pkl')

['assists_model.pkl']

In [9]:
# Reload the persisted pipelines from disk.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that this notebook (or another trusted source) produced.
loaded_goals_pipeline = joblib.load('goals_model.pkl')
loaded_assists_pipeline = joblib.load('assists_model.pkl')

# Read the season totals, using the first CSV column as the index, and remove
# the two per-60 rate columns that are about to be predicted. The drop is
# chained (not in-place) so the frame is built in a single expression.
season_totals = (
    pd.read_csv('season_totals_23.csv', index_col=0)
    .drop(columns=["Goals/60", "Total Assists/60"])
)

# Predict per-60 rates for every row. A linear model can produce slightly
# negative values for low-scoring players; a scoring rate cannot be negative,
# so predictions are clipped at zero before they are used downstream.
goal_predictions = np.clip(loaded_goals_pipeline.predict(season_totals), 0, None)
assist_predictions = np.clip(loaded_assists_pipeline.predict(season_totals), 0, None)

In [10]:
# Build the projection table from the predicted per-60 rates.

num_games = 82  # Standard NHL season

# TOI/GP divided by 60, i.e. the fraction of a 60-unit rate period played per
# game (presumably TOI/GP is minutes per game — confirm against the CSV).
toi_fraction = season_totals['TOI/GP'] / 60

result_df = (
    pd.DataFrame({
        'Team': season_totals['Team'].values,
        'Position': season_totals['Position'].values,
        'Player': season_totals['Player'].values,
        'Projected Goals/60': goal_predictions,
        'Projected Assists/60': assist_predictions,
        # Per-game projections: per-60 rate scaled by the TOI fraction.
        'Projected Goals (TOI)': toi_fraction * goal_predictions,
        'Projected Assists (TOI)': toi_fraction * assist_predictions,
    })
    # Season projections: per-game value times the number of games, to 1 decimal.
    # Points are computed from the unrounded per-game values, so they can differ
    # slightly from the sum of the rounded goals and assists columns.
    .assign(**{
        'Projected Goals (Season)': lambda frame: (frame['Projected Goals (TOI)'] * num_games).round(1),
        'Projected Assists (Season)': lambda frame: (frame['Projected Assists (TOI)'] * num_games).round(1),
        'Projected Pts (Season)': lambda frame: (
            (frame['Projected Assists (TOI)'] + frame['Projected Goals (TOI)']) * num_games
        ).round(1),
    })
    # Finally round every numeric column to 2 decimals.
    .round(2)
)

# Keep the identifying columns plus the three season projections, ranked by
# projected points (highest first).
summary_columns = [
    'Team', 'Position', 'Player',
    'Projected Goals (Season)', 'Projected Assists (Season)', 'Projected Pts (Season)',
]
selected = result_df[summary_columns]
selected = selected.sort_values(by='Projected Pts (Season)', ascending=False)

# Show the top 30, rounded to whole numbers for display only.
selected.head(30).round()

Unnamed: 0,Team,Position,Player,Projected Goals (Season),Projected Assists (Season),Projected Pts (Season)
380,EDM,C,Connor McDavid,54.0,65.0,119.0
293,COL,C,Nathan MacKinnon,46.0,64.0,110.0
340,BOS,R,David Pastrnak,51.0,58.0,109.0
323,EDM,C,Leon Draisaitl,48.0,60.0,107.0
387,COL,R,Mikko Rantanen,48.0,59.0,106.0
435,MIN,L,Kirill Kaprizov,45.0,60.0,106.0
654,N.J,C,Jack Hughes,42.0,54.0,97.0
446,FLA,L,Matthew Tkachuk,41.0,53.0,94.0
10,WSH,L,Alex Ovechkin,42.0,50.0,92.0
551,COL,D,Cale Makar,26.0,66.0,92.0


In [11]:
# Write the full projection table to CSV; the row index is not written.
result_df.to_csv(path_or_buf='nhl_predictions.csv', index=False)