In [348]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
import joblib
import os

In [349]:
# Load the goalie table that the following cells clean and fit models on.
goalies_lr_path = '/Users/blairjdaniel/lighthouse/lighthouse/NHL/files/linear_reg_csv_files/goalies_lr.csv'
goalies = pd.read_csv(goalies_lr_path)

In [363]:
# Load the goalies whose salary is filled in / predicted further down.
goalies_to_predict = pd.read_csv('/Users/blairjdaniel/lighthouse/lighthouse/NHL/files/linear_reg_csv_files/goalies_pred.csv')
# NOTE(review): the result of this drop is discarded (it is neither assigned nor
# displayed), so goalies_to_predict still carries 'Unnamed: 0'. Later cells call
# .drop(columns=['Unnamed: 0']) on frames derived from it and therefore rely on
# the column still being present.
goalies_to_predict.drop(columns=['Unnamed: 0'])
# Load the master goalie table that the salary column is merged into later.
goalies_rec = pd.read_csv('/Users/blairjdaniel/lighthouse/lighthouse/NHL/files/master_copies/goalies_rec.csv')

In [351]:
# Preview up to the first 50 rows of the loaded goalie table.
goalies.head(50)

Unnamed: 0.1,Unnamed: 0,name,GP,SV%_x,W,L,GA,SOG,SO,TIME,CAP HIT
0,1,"Allen, Jake",454.0,0.908,206.0,181.0,1176.0,12729.0,28.0,25626.0,"$5,850,000"
1,2,"Andersen, Frederik",509.0,0.916,304.0,132.0,1248.0,14802.0,28.0,29401.0,"$5,400,000"
2,3,"Annunen, Justus",44.0,0.901,24.0,14.0,113.0,1142.0,2.0,2375.0,"$837,500"
3,4,"Askarov, Yaroslav",16.0,0.899,5.0,7.0,43.0,427.0,0.0,857.0,"$925,000"
4,5,"Binnington, Jordan",326.0,0.906,165.0,115.0,880.0,9392.0,18.0,18868.0,"$6,000,000"
5,9,"Blackwood, Mackenzie",243.0,0.906,99.0,98.0,676.0,7188.0,13.0,13691.0,"$4,350,000"
6,11,"Bobrovsky, Sergei",745.0,0.914,424.0,240.0,1849.0,21594.0,48.0,42978.0,"$10,000,000"
7,20,"Comrie, Eric",73.0,0.898,31.0,34.0,216.0,2110.0,3.0,4093.0,"$3,000,00"
8,21,"Copley, Pheonix",77.0,0.898,44.0,17.0,202.0,1988.0,4.0,4261.0,"$1,000,000"
9,24,"Daccord, Joey",114.0,0.908,44.0,45.0,293.0,3165.0,4.0,6436.0,"$1,200,000"


In [352]:
# Print column names, non-null counts and dtypes (info() writes to stdout)
goalies.info()

# Summary statistics for the numeric columns; as the last expression this is
# what the cell renders as its output
goalies.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69 entries, 0 to 68
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  69 non-null     int64  
 1   name        69 non-null     object 
 2   GP          69 non-null     float64
 3   SV%_x       69 non-null     float64
 4   W           69 non-null     float64
 5   L           69 non-null     float64
 6   GA          69 non-null     float64
 7   SOG         69 non-null     float64
 8   SO          69 non-null     float64
 9   TIME        69 non-null     float64
 10  CAP HIT     69 non-null     object 
dtypes: float64(8), int64(1), object(2)
memory usage: 6.1+ KB


Unnamed: 0.1,Unnamed: 0,GP,SV%_x,W,L,GA,SOG,SO,TIME
count,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0
mean,62.043478,223.217391,0.905159,108.724638,78.826087,579.26087,6377.884058,12.898551,12639.144928
std,35.111769,200.26386,0.008759,106.530427,69.275443,498.76296,5774.962145,14.753011,11557.487403
min,1.0,15.0,0.877,5.0,7.0,43.0,427.0,0.0,857.0
25%,33.0,84.0,0.901,39.0,32.0,227.0,2288.0,3.0,4655.0
50%,61.0,164.0,0.906,72.0,55.0,431.0,4615.0,7.0,9011.0
75%,94.0,281.0,0.91,148.0,97.0,725.0,8078.0,16.0,15953.0
max,121.0,1047.0,0.918,573.0,337.0,2622.0,29745.0,76.0,60531.0


In [353]:
# Keep only the rows that have a cap hit; rows without one are meant to be
# predicted once the model is built.
has_cap_hit = goalies['CAP HIT'].notna()
goalies = goalies[has_cap_hit]

In [354]:
# Rename CAP HIT to salary and convert it from text such as "$5,850,000" to float.
goalies = goalies.rename(columns={'CAP HIT': 'salary'})

# Strip the currency symbol and thousands separators as LITERAL characters.
# regex=False is explicit because '$' is a regex anchor (end of string): under a
# regex interpretation it would match nothing and the astype(float) would fail.
# NOTE(review): malformed source values are converted as-is, e.g. "$3,000,00"
# becomes 300000.0 - confirm such rows against the source data.
goalies['salary'] = (
    goalies['salary']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)


In [355]:
# Summary statistics again, now that salary is numeric and therefore included.
goalies.describe()

Unnamed: 0.1,Unnamed: 0,GP,SV%_x,W,L,GA,SOG,SO,TIME,salary
count,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0
mean,62.043478,223.217391,0.905159,108.724638,78.826087,579.26087,6377.884058,12.898551,12639.144928,3393394.0
std,35.111769,200.26386,0.008759,106.530427,69.275443,498.76296,5774.962145,14.753011,11557.487403,2428647.0
min,1.0,15.0,0.877,5.0,7.0,43.0,427.0,0.0,857.0,300000.0
25%,33.0,84.0,0.901,39.0,32.0,227.0,2288.0,3.0,4655.0,1000000.0
50%,61.0,164.0,0.906,72.0,55.0,431.0,4615.0,7.0,9011.0,2900000.0
75%,94.0,281.0,0.91,148.0,97.0,725.0,8078.0,16.0,15953.0,5000000.0
max,121.0,1047.0,0.918,573.0,337.0,2622.0,29745.0,76.0,60531.0,10000000.0


In [356]:
# Discard the leftover 'Unnamed: 0' column, then preview the first rows.
goalies = goalies.drop('Unnamed: 0', axis=1)
goalies.head()

Unnamed: 0,name,GP,SV%_x,W,L,GA,SOG,SO,TIME,salary
0,"Allen, Jake",454.0,0.908,206.0,181.0,1176.0,12729.0,28.0,25626.0,5850000.0
1,"Andersen, Frederik",509.0,0.916,304.0,132.0,1248.0,14802.0,28.0,29401.0,5400000.0
2,"Annunen, Justus",44.0,0.901,24.0,14.0,113.0,1142.0,2.0,2375.0,837500.0
3,"Askarov, Yaroslav",16.0,0.899,5.0,7.0,43.0,427.0,0.0,857.0,925000.0
4,"Binnington, Jordan",326.0,0.906,165.0,115.0,880.0,9392.0,18.0,18868.0,6000000.0


In [357]:
# Index the table by (GP, name); the remaining columns are summed per pair.
games_played = goalies.groupby(by=['GP', 'name']).sum()

# Order by games played, fewest first (ascending=True).
games_played_sorted = games_played.sort_values(by='GP', ascending=True)

# Show the sorted frame (previously the sorted result was computed but the
# unsorted frame was the one displayed).
games_played_sorted.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,SV%_x,W,L,GA,SOG,SO,TIME,salary
GP,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
15.0,"Greaves, Jet",0.91,5.0,8.0,48.0,535.0,0.0,877.0,850000.0
16.0,"Askarov, Yaroslav",0.899,5.0,7.0,43.0,427.0,0.0,857.0,925000.0
19.0,"Silovs, Arturs",0.88,8.0,8.0,57.0,474.0,0.0,1092.0,850000.0
24.0,"Fedotov, Ivan",0.877,5.0,12.0,70.0,569.0,0.0,1289.0,950000.0
39.0,"Levi, Devon",0.894,17.0,17.0,121.0,1146.0,0.0,2205.0,925000.0


In [None]:
# Define the function to assign salary based on games played
def assign_salary(gp):
    """Map a games-played count to a fixed salary tier.

    Tiers (upper bound inclusive):
        gp <= 50   ->    900,000
        gp <= 100  ->  2,000,000
        gp <= 150  ->  3,000,000
        gp <= 200  ->  5,000,000
        gp <= 300  ->  8,000,000
        gp <= 400  ->  9,000,000
        gp >  400  -> 10,000,000

    Parameters:
        gp: number of games played.

    Returns:
        The salary for the tier `gp` falls in, or 0 when `gp` does not satisfy
        any comparison (e.g. NaN).
    """
    salary_tiers = (
        (50, 900000),
        (100, 2000000),
        (150, 3000000),
        (200, 5000000),
        (300, 8000000),
        (400, 9000000),
    )
    for max_gp, tier_salary in salary_tiers:
        if gp <= max_gp:
            return tier_salary
    # Was `gp > 401`, which sent exactly 401 games to the 0 default below.
    if gp > 400:
        return 10000000
    return 0  # Only reached when gp is not comparable (e.g. NaN)

# Compute the GP-based tier for every goalie and store it as 'salary'.
# This replaces whatever values the 'salary' column held before.
salary_by_gp = goalies['GP'].apply(assign_salary)
goalies['salary'] = salary_by_gp
goalies.head(50)

In [360]:
# Left-join the known salaries onto the prediction candidates by player name;
# candidates without a match keep a missing salary.
known_salaries = goalies[['name', 'salary']]
goalies_df = pd.merge(
    goalies_to_predict,
    known_salaries,
    how='left',
    on='name',
    suffixes=('', '_drop'),
)

# Columns that clashed in the merge carry the '_drop' suffix; keep everything else.
is_merge_leftover = goalies_df.columns.str.endswith('_drop')
goalies_df = goalies_df.loc[:, ~is_merge_leftover]

# Show the merged frame without the 'Unnamed: 0' column (display only; goalies_df
# itself is not modified).
goalies_df.drop(columns=['Unnamed: 0'])

Unnamed: 0,name,GP,W,L,GA,SOG,SO,TIME,salary
0,"Dell, Aaron",130,50,50,335,3518.0,5,6873,
1,"Hill, Adin",176,88,62,431,4744.0,11,9782,5000000.0
2,"Schmid, Akira",44,14,18,99,991.0,1,2084,900000.0
3,"Montoya, Al",168,67,49,405,4418.0,7,9169,
4,"Auld, Alex",237,91,88,606,6293.0,6,12986,
...,...,...,...,...,...,...,...,...,...
204,"Fasth, Viktor",63,26,26,161,1682.0,4,3465,
205,"Husso, Ville",143,70,46,408,4108.0,7,8009,3000000.0
206,"Vanecek, Vitek",183,95,53,478,4930.0,10,10231,5000000.0
207,"Danis, Yann",55,17,22,127,1381.0,3,2732,


In [362]:
# Define the function to assign salary based on games played
def assign_salary(gp):
    """Return the fixed salary tier for a games-played count.

    Upper bounds are inclusive: up to 50 games -> 900,000; up to 100 ->
    2,000,000; up to 150 -> 3,000,000; up to 200 -> 5,000,000; up to 300 ->
    8,000,000; up to 400 -> 9,000,000; more than 400 -> 10,000,000.

    Parameters:
        gp: number of games played.

    Returns:
        The tier salary, or 0 when `gp` satisfies none of the comparisons
        (e.g. NaN).
    """
    if gp <= 50:
        return 900000
    if gp <= 100:
        return 2000000
    if gp <= 150:
        return 3000000
    if gp <= 200:
        return 5000000
    if gp <= 300:
        return 8000000
    if gp <= 400:
        return 9000000
    # Previously `gp > 401`, so a goalie with exactly 401 games got the 0 default.
    if gp > 400:
        return 10000000
    return 0  # Only reached when gp is not comparable (e.g. NaN)

# Assign the GP-based tier to every row of goalies_df. This overwrites the
# 'salary' values that came from the merge, including the missing ones.
tiered_salary = goalies_df['GP'].apply(assign_salary)
goalies_df['salary'] = tiered_salary
goalies_df.drop(columns=['Unnamed: 0'])

Unnamed: 0,name,GP,W,L,GA,SOG,SO,TIME,salary
0,"Dell, Aaron",130,50,50,335,3518.0,5,6873,3000000
1,"Hill, Adin",176,88,62,431,4744.0,11,9782,5000000
2,"Schmid, Akira",44,14,18,99,991.0,1,2084,900000
3,"Montoya, Al",168,67,49,405,4418.0,7,9169,5000000
4,"Auld, Alex",237,91,88,606,6293.0,6,12986,8000000
...,...,...,...,...,...,...,...,...,...
204,"Fasth, Viktor",63,26,26,161,1682.0,4,3465,2000000
205,"Husso, Ville",143,70,46,408,4108.0,7,8009,3000000
206,"Vanecek, Vitek",183,95,53,478,4930.0,10,10231,5000000
207,"Danis, Yann",55,17,22,127,1381.0,3,2732,2000000


In [375]:
# Left-join the salary column from goalies_df onto the master goalie table by
# player name; master rows without a match get a missing salary.
goalies_df_salary = goalies_rec.merge(
    goalies_df[['name', 'salary']], on='name', how='left', suffixes=('', '_drop')
)

# If goalies_rec already had a 'salary' column, the merged copy carries the
# '_drop' suffix and is discarded here.
goalies_df_salary = goalies_df_salary.loc[:, ~goalies_df_salary.columns.str.endswith('_drop')]

# Remove the leftover 'Unnamed: 0' column when present.
goalies_df_salary = goalies_df_salary.drop(columns=['Unnamed: 0'], errors='ignore')

# Save the file. index=False keeps pandas from writing the row index as an
# unnamed first column, which is what comes back as 'Unnamed: 0' on the next
# read_csv and would undo the drop above.
goalies_df_salary.to_csv(
    '/Users/blairjdaniel/lighthouse/lighthouse/NHL/files/master_copies/goalies_rec_two.csv',
    index=False,
)

In [373]:
# List the column names of the merged master table to check the result.
goalies_df_salary.columns.to_list()

['Unnamed: 0',
 'name',
 'icetime',
 'x_goals',
 'goals',
 'unblocked_shot_attempts',
 'x_rebounds',
 'rebounds',
 'x_freeze',
 'freeze',
 'x_on_goal',
 'on_goal',
 'x_play_stopped',
 'play_stopped',
 'x_play_continued_in_zone',
 'play_continued_in_zone',
 'x_play_continued_outside_zone',
 'play_continued_outside_zone',
 'flurry_adjusted_x_goals',
 'low_danger_shots',
 'medium_danger_shots',
 'high_danger_shots',
 'low_danger_x_goals',
 'medium_danger_x_goals',
 'high_danger_x_goals',
 'low_danger_goals',
 'medium_danger_goals',
 'high_danger_goals',
 'blocked_shot_attempts',
 'penalty_minutes',
 'penalties',
 'position',
 'salary']

In [None]:
# One 30-bin histogram per numeric column of goalies
hist_axes = goalies.hist(figsize=(15, 10), bins=30)
plt.tight_layout()
plt.show()

In [None]:
# Columns to plot against salary, one scatter panel each
features = ['GP', 'W', 'L', 'salary']

# Grid layout: three panels per row, with as many rows as the features need
num_features = len(features)
num_cols = 3
num_rows = -(-num_features // num_cols)  # ceiling division

fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(20, 5 * num_rows))

# Work with a flat sequence of axes so panels can be paired with features
axes = axes.ravel()

# Draw one "salary vs feature" scatter per panel
for panel, feature in zip(axes, features):
    sns.scatterplot(x=goalies[feature], y=goalies['salary'], ax=panel)
    panel.set_title(f'Salary vs {feature}')
    panel.set_xlabel(feature)
    panel.set_ylabel('Salary')

# Delete the panels that did not receive a feature
for spare_panel in axes[num_features:]:
    fig.delaxes(spare_panel)

plt.tight_layout()
plt.show()

In [None]:
# Rows whose TIME value exceeds 35000
over_35000 = goalies['TIME'] > 35000
time = goalies[over_35000]
time

In [None]:
# Features to plot: numeric columns only, excluding the target variable 'salary'.
# Text columns such as 'name' cannot be drawn as a box plot, so they are
# filtered out by dtype instead of being passed to seaborn.
features = goalies.select_dtypes(include='number').columns.drop('salary')

# Grid layout: three panels per row, at least one row
num_features = len(features)
num_cols = 3
num_rows = max(1, (num_features + num_cols - 1) // num_cols)

fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, num_rows * 5))

# Flatten the axes array for easy iteration
axes = axes.flatten()

# One box plot per numeric feature
for ax, feature in zip(axes, features):
    sns.boxplot(y=goalies[feature], ax=ax)
    ax.set_title(f'Box Plot of {feature}')
    ax.set_xlabel(feature)

# Remove any unused subplots (slicing also works when nothing was plotted)
for unused_ax in axes[num_features:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
# Modelling frame: everything except the player name
goalies_test = goalies.drop(columns=['name'])

# Target is 'salary'; every other remaining column is a feature
features = goalies_test.columns.drop('salary')
y = goalies_test['salary']
X = goalies_test[features]

# Standardised copy of the features (X itself is left unscaled)
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [None]:
# PCA on the standardised features, keeping as many components as needed to
# retain 95% of the variance
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

# Share of variance explained per component, and its running total
explained_variance = pca.explained_variance_ratio_
cumulative_explained_variance = explained_variance.cumsum()

print("Explained variance ratio:", explained_variance)
print("Cumulative explained variance:", cumulative_explained_variance)

In [None]:
# Bar per component for its own explained variance, step line for the running total
component_numbers = range(1, len(explained_variance) + 1)
cumulative_numbers = range(1, len(cumulative_explained_variance) + 1)

plt.figure(figsize=(10, 6))
plt.bar(
    component_numbers,
    explained_variance,
    align='center',
    alpha=0.5,
    label='Individual explained variance',
)
plt.step(
    cumulative_numbers,
    cumulative_explained_variance,
    where='mid',
    label='Cumulative explained variance',
)
plt.title('Explained Variance Ratio by Principal Components')
plt.xlabel('Principal components')
plt.ylabel('Explained variance ratio')
plt.legend(loc='best')
plt.show()

In [None]:
# Hold out 20% of the rows for evaluation (fixed seed so the split is repeatable).
# The split is taken from X, i.e. the unscaled features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def _fit_and_score(model):
    """Fit `model` on the training split and score it on the held-out split.

    Returns a tuple of (test-set predictions, mean squared error, R2 score).
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return predictions, mean_squared_error(y_test, predictions), r2_score(y_test, predictions)


# Linear Regression
lr_model = LinearRegression()
y_pred_lr, mse_lr, r2_lr = _fit_and_score(lr_model)
print(f'Linear Regression MSE: {mse_lr}')
print(f'Linear Regression R2: {r2_lr}')

# Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
y_pred_rf, mse_rf, r2_rf = _fit_and_score(rf_model)
print(f'Random Forest MSE: {mse_rf}')
print(f'Random Forest R2: {r2_rf}')

# XGBoost
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
y_pred_xgb, mse_xgb, r2_xgb = _fit_and_score(xgb_model)
print(f'XGBoost MSE: {mse_xgb}')
print(f'XGBoost R2: {r2_xgb}')

# Side-by-side table of the three models' test-set scores
models = ['Linear Regression', 'Random Forest', 'XGBoost']
mse_scores = [mse_lr, mse_rf, mse_xgb]
r2_scores = [r2_lr, r2_rf, r2_xgb]
performance_df = pd.DataFrame({'Model': models, 'MSE': mse_scores, 'R2': r2_scores})
print(performance_df)

In [None]:
# Hyper-parameter values to try; GridSearchCV evaluates every combination
param_grid = dict(
    n_estimators=[50, 100, 200, 300],
    max_depth=[3, 4, 5, 6],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
)

# Base XGBoost regressor whose parameters are tuned
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# 3-fold cross-validated search scored by negative MSE, using all CPU cores
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Best-scoring parameter combination
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

In [None]:
# Build the final regressor from the tuned parameters plus the fixed objective
# and seed, then fit it on the training split
tuned_kwargs = dict(best_params, objective='reg:squarederror', random_state=42)
best_xgb_model = xgb.XGBRegressor(**tuned_kwargs)
best_xgb_model.fit(X_train, y_train)

In [None]:
# Score the tuned model on the held-out split
y_pred_xgb = best_xgb_model.predict(X_test)
mse_xgb, r2_xgb = (
    mean_squared_error(y_test, y_pred_xgb),
    r2_score(y_test, y_pred_xgb),
)

print(f'XGBoost MSE: {mse_xgb}')
print(f'XGBoost R2: {r2_xgb}')

In [None]:
# Directory the fitted model is written to
output_dir = '/Users/blairjdaniel/lighthouse/lighthouse/NHL/NHL_points_projection/models'

# Create the directory (and any missing parents); no error if it already exists
os.makedirs(output_dir, exist_ok=True)

# Serialize the tuned XGBoost model with joblib and report where it was written
model_path = os.path.join(output_dir, 'best_xgb_model_goalie.pkl')
joblib.dump(best_xgb_model, model_path)
print(f"Model saved to {model_path}")

In [None]:
# Path of the model file written by the save cell (same directory and file name)
model_path = '/Users/blairjdaniel/lighthouse/lighthouse/NHL/NHL_points_projection/models/best_xgb_model_goalie.pkl'

# Load the trained XGBoost model; this rebinds best_xgb_model to the object read from disk.
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
best_xgb_model = joblib.load(model_path)

In [None]:
# Attach the known salaries by player name; the left join keeps every candidate
# row and leaves 'salary' missing where the name is not in goalies.
goalies_to_predict = goalies_to_predict.merge(
    goalies[['name', 'salary']], on='name', how='left', suffixes=('', '_drop')
)

# If a 'salary' column was already present, the merged copy carries the '_drop'
# suffix and is discarded here.
goalies_to_predict = goalies_to_predict.loc[:, ~goalies_to_predict.columns.str.endswith('_drop')]

# Keep only the goalies that still have no salary: these are the rows to predict.
# (A bare `.drop(columns=['Unnamed: 0'])` used to sit here; its result was neither
# assigned nor displayed, so it had no effect and has been removed.)
goalies_to_predict = goalies_to_predict[goalies_to_predict['salary'].isnull()]


In [None]:
# Keep a copy that still has the 'name' column; names are dropped from
# goalies_to_predict before prediction and are needed to label the results.
goalies_to_predict_copy = goalies_to_predict.copy()

In [None]:
# Drop the identifier and the (empty) target so only stat columns remain.
# errors='ignore' lets this cell be re-run after the columns are already gone.
goalies_to_predict = goalies_to_predict.drop(columns=['name', 'salary'], errors='ignore')

# Feature columns: everything left, except a 'predicted_salary' column that an
# earlier run of the prediction cell may have added to this frame.
features = goalies_to_predict.columns.drop('predicted_salary', errors='ignore')

# Standardise the prediction features with a scaler fitted on THESE rows.
# This is not the scaler fitted on the training features, so it gets its own
# name instead of overwriting `scaler`.
# NOTE(review): the models above are fitted on the unscaled X_train, so confirm
# whether standardised inputs are what downstream cells should consume.
predict_scaler = StandardScaler()
X_to_predict_scaled = predict_scaler.fit_transform(goalies_to_predict[features])

In [None]:
# PCA on the standardised prediction features, keeping enough components to
# retain 95% of the variance (rebinds pca and X_pca)
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_to_predict_scaled)

# Per-component explained variance and its running total
explained_variance = pca.explained_variance_ratio_
cumulative_explained_variance = explained_variance.cumsum()

print("Explained variance ratio:", explained_variance)
print("Cumulative explained variance:", cumulative_explained_variance)

In [None]:
# Predict salaries using the loaded model.
# The model was fitted on the raw (unscaled) training columns, so it must be fed
# raw values in the same column order. Passing a freshly standardised array of
# this frame's columns only lined up by column COUNT, not by meaning.
model_features = best_xgb_model.get_booster().feature_names
if model_features is None:
    # The booster stored no feature names; fall back to this frame's own
    # columns, leaving out a prediction column added by an earlier run.
    model_features = [col for col in goalies_to_predict.columns if col != 'predicted_salary']

# Report features the model expects but this frame lacks. reindex fills them
# with NaN, which XGBoost treats as missing values.
missing_features = [col for col in model_features if col not in goalies_to_predict.columns]
if missing_features:
    print(f"Warning: features missing from goalies_to_predict, treated as NaN: {missing_features}")

X_to_predict = goalies_to_predict.reindex(columns=model_features)
predicted_salaries = best_xgb_model.predict(X_to_predict)

# Add the predicted salaries to the DataFrame
goalies_to_predict['predicted_salary'] = predicted_salaries


In [None]:
# Pair each goalie's name (taken from the copy made before names were dropped)
# with the predicted salary, then preview up to 50 rows.
goalies_predicted = goalies_to_predict_copy[['name']].assign(predicted_salary=predicted_salaries)

goalies_predicted.head(50)