In [101]:
import pandas as pd 
import numpy as np 
import os 
import requests
import joblib
from pprint import pprint
import matplotlib.pyplot as plt
import seaborn as sns
import glob
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from scipy import stats 
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from xgboost import XGBRegressor

In [102]:
# Specify the folder where the CSV files are located
directory = '/Users/blairjdaniel/lighthouse/lighthouse/NHL/files/skaters'

# Use glob to find all CSV files in the dir
csv_files = glob.glob(os.path.join(directory, '*.csv'))

# Initialize an empty dataframe
dataframes = []

# Loop through the CSV file and read it into a DF
for csv_file in csv_files:
    goalies = pd.read_csv(csv_file)
    dataframes.append(goalies)

    # # Check if the file is goalies_2010.csv and print a message
    # if 'goalies_2010.csv' in csv_file:
    #     print(f"goalies_2010.csv file loaded successfully.")
    #     print(goalies.head())  # Display the first few rows of the DataFrame


# Concatenate all DF into one master DataFrame
skaters_df = pd.concat(dataframes, ignore_index=True)
len(skaters_df)

72055

In [103]:
# Check for non nulls and Dtypes
skaters_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72055 entries, 0 to 72054
Columns: 154 entries, playerId to fenwickAgainstAfterShifts
dtypes: float64(147), int64(3), object(4)
memory usage: 84.7+ MB


In [104]:
# Check for any outliers or interesting stats
skaters_df.describe()

Unnamed: 0,playerId,season,games_played,icetime,shifts,gameScore,onIce_xGoalsPercentage,offIce_xGoalsPercentage,onIce_corsiPercentage,offIce_corsiPercentage,...,OffIce_F_xGoals,OffIce_A_xGoals,OffIce_F_shotAttempts,OffIce_A_shotAttempts,xGoalsForAfterShifts,xGoalsAgainstAfterShifts,corsiForAfterShifts,corsiAgainstAfterShifts,fenwickForAfterShifts,fenwickAgainstAfterShifts
count,72055.0,72055.0,72055.0,72055.0,72055.0,72055.0,72055.0,72055.0,72055.0,72055.0,...,72055.0,72055.0,72055.0,72055.0,72055.0,72055.0,72055.0,72055.0,72055.0,72055.0
mean,8474283.0,2015.616057,47.389564,18771.946263,404.597224,17.800659,0.437685,0.470318,0.445538,0.470248,...,36.055972,36.456045,740.613767,747.698577,0.351195,0.17794,7.825592,4.208799,6.095677,3.364846
std,4799.788,4.634268,28.269251,27500.810949,584.208266,21.382969,0.293517,0.265651,0.281146,0.25758,...,49.747079,49.835209,1049.660543,1052.400857,0.996172,0.455104,21.71353,10.327724,16.893392,8.276483
min,8445550.0,2008.0,1.0,0.0,0.0,-12.4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8471263.0,2012.0,20.0,505.0,16.0,1.07,0.16,0.33,0.2,0.35,...,2.15,1.95,34.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,8475162.0,2016.0,53.0,4217.0,90.0,9.32,0.47,0.49,0.48,0.49,...,9.87,10.32,128.0,132.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,8477479.0,2020.0,74.0,28970.5,659.5,28.19,0.58,0.57,0.58,0.55,...,62.68,63.62,1399.5,1416.5,0.11,0.06,2.0,2.0,2.0,1.0
max,8484911.0,2023.0,85.0,144666.0,2730.0,154.73,1.0,1.0,1.0,1.0,...,271.99,246.92,4797.0,4499.0,11.64,5.67,232.0,100.0,174.0,78.0


In [105]:
# Initialize an empty list to store DataFrames
pivoted_dfs = []
cols_to_drop = ['team', 'games_played', 'icetime', 'shifts', 'gameScore', 'iceTimeRank', 'I_F_xPlayStopped', 'I_F_xPlayContinuedInZone',
 'I_F_xPlayContinuedOutsideZone', 'I_F_flurryAdjustedxGoals','OnIce_F_flurryAdjustedxGoals','OnIce_F_scoreVenueAdjustedxGoals',
 'OnIce_F_flurryScoreVenueAdjustedxGoals','I_F_scoreVenueAdjustedxGoals','I_F_flurryScoreVenueAdjustedxGoals', 'I_F_freeze',
 'I_F_playStopped','I_F_playContinuedInZone','I_F_playContinuedOutsideZone', 'penalties','I_F_penalityMinutes','I_F_faceOffsWon',
 'I_F_hits','I_F_takeaways','I_F_giveaways', 'I_F_dZoneGiveaways', 'I_F_xGoals_with_earned_rebounds_scoreAdjusted',
 'I_F_xGoals_with_earned_rebounds_scoreFlurryAdjusted','I_F_shifts','I_F_oZoneShiftStarts','I_F_dZoneShiftStarts',
 'I_F_neutralZoneShiftStarts','I_F_flyShiftStarts','I_F_oZoneShiftEnds','I_F_dZoneShiftEnds','I_F_neutralZoneShiftEnds',
 'I_F_flyShiftEnds','faceoffsWon','faceoffsLost','timeOnBench','penalityMinutes','penalityMinutesDrawn','penaltiesDrawn',
 'shotsBlockedByPlayer', 'OnIce_A_xGoals_with_earned_rebounds_scoreAdjusted','OnIce_A_xGoals_with_earned_rebounds_scoreFlurryAdjusted',
 'OffIce_F_xGoals','OffIce_A_xGoals','OffIce_F_shotAttempts','OffIce_A_shotAttempts','xGoalsForAfterShifts','xGoalsAgainstAfterShifts',
 'corsiForAfterShifts','corsiAgainstAfterShifts','fenwickForAfterShifts','fenwickAgainstAfterShifts','ncentage',
 'fenwickAgainstAfterShiftsame','offIce_corsiPer', 'I_F_blockedShotAttempts', 'I_F_primaryAssists', 'I_F_secondaryAssists',
 'I_F_xFreeze', 'OnIce_A_blockedShotAttempts', 'OnIce_A_flurryAdjustedxGoals','OnIce_A_flurryScoreVenueAdjustedxGoals',
 'OnIce_F_blockedShotAttempts', 'offIce_corsiPercentage','offIce_fenwickPercentage','offIce_xGoalsPercentage','onIce_corsiPercentage',
 'onIce_fenwickPercentage', 'I_F_highDangerxGoals', 'I_F_lowDangerxGoals', 'I_F_mediumDangerxGoals', 'I_F_missedShots', 
 'I_F_points', 'I_F_points','I_F_reboundGoals','I_F_rebounds','I_F_reboundxGoals', 'I_F_xGoals',
 'I_F_xGoalsFromActualReboundsOfShots','I_F_xGoalsFromxReboundsOfShots','I_F_xGoals_with_earned_rebounds','I_F_xOnGoal',
 'I_F_xRebounds', 'OnIce_A_highDangerxGoals', 'OnIce_A_lowDangerxGoals', 'OnIce_A_mediumDangerxGoals',
 'OnIce_A_missedShots','OnIce_A_reboundGoals','OnIce_A_rebounds','OnIce_A_reboundxGoals', 'OnIce_A_scoreAdjustedShotsAttempts',
 'OnIce_A_scoreAdjustedUnblockedShotAttempts','OnIce_A_scoreVenueAdjustedxGoals', 'OnIce_A_goals',
 'OnIce_A_highDangerGoals','OnIce_A_highDangerShots','OnIce_A_highDangerxGoals','OnIce_A_lowDangerGoals',
 'OnIce_A_lowDangerShots','OnIce_A_lowDangerxGoals','OnIce_A_mediumDangerGoals','OnIce_A_mediumDangerShots',
 'OnIce_A_mediumDangerxGoals','OnIce_A_missedShots','OnIce_A_reboundGoals','OnIce_A_rebounds',
 'OnIce_A_reboundxGoals','OnIce_A_scoreAdjustedShotsAttempts','OnIce_A_scoreAdjustedUnblockedShotAttempts',
 'OnIce_A_scoreVenueAdjustedxGoals','OnIce_A_shotAttempts','OnIce_A_shotsOnGoal','OnIce_A_unblockedShotAttempts',
 'OnIce_A_xGoals','OnIce_A_xGoalsFromActualReboundsOfShots','OnIce_A_xGoalsFromxReboundsOfShots',
 'OnIce_A_xGoals_with_earned_rebounds','OnIce_A_xOnGoal','OnIce_F_goals','OnIce_F_highDangerGoals',
 'OnIce_F_highDangerShots','OnIce_F_highDangerxGoals','OnIce_F_lowDangerGoals','OnIce_F_lowDangerShots',
 'OnIce_F_lowDangerxGoals','OnIce_F_mediumDangerGoals','OnIce_F_mediumDangerShots','OnIce_F_mediumDangerxGoals',
 'OnIce_F_missedShots','OnIce_F_reboundGoals','OnIce_F_rebounds','OnIce_F_reboundxGoals','OnIce_F_scoreAdjustedShotsAttempts',
 'OnIce_F_scoreAdjustedUnblockedShotAttempts','OnIce_F_shotAttempts','OnIce_F_shotsOnGoal','OnIce_F_unblockedShotAttempts',
 'OnIce_F_xGoals','OnIce_F_xGoalsFromActualReboundsOfShots','OnIce_F_xGoalsFromxReboundsOfShots',
 'OnIce_F_xGoals_with_earned_rebounds','OnIce_F_xGoals_with_earned_rebounds_scoreAdjusted',
 'OnIce_F_xGoals_with_earned_rebounds_scoreFlurryAdjusted','OnIce_F_xOnGoal','onIce_xGoalsPercentage', 'I_F_scoreAdjustedShotsAttempts',
 'I_F_scoreAdjustedUnblockedShotAttempts', 'I_F_savedUnblockedShotAttempts', 'I_F_unblockedShotAttempts', 'I_F_savedShotsOnGoal']

# List of situations
situations = ['all']

# Loop through each situation and pivot the DataFrame
for situation in situations:
    # Filter the DataFrame for the specific situation
    situation_df = skaters_df[skaters_df['situation'] == situation]
    
    # Drop specific columns
    situation_df = situation_df.drop(columns=cols_to_drop)

    # Pivot the DataFrame
    pivoted_df = situation_df.pivot_table(index=['name', 'season'], aggfunc='first')
    
    # Flatten the columns
    pivoted_df.columns = [f'{col}_{situation}' for col in pivoted_df.columns]
    
    # Reset the index
    pivoted_df = pivoted_df.reset_index()
    
    # Append the pivoted DataFrame to the list
    pivoted_dfs.append(pivoted_df)

# Merge all pivoted DataFrames into one master DataFrame
df_all = pivoted_dfs[0]
for pivoted_df in pivoted_dfs[1:]:
    df_all = df_all.merge(pivoted_df, on=['name', 'season'], how='outer')

# Display the first few rows of the master DataFrame
print("Master DataFrame:")
df_all

KeyError: "['ncentage', 'fenwickAgainstAfterShiftsame', 'offIce_corsiPer'] not found in axis"

In [None]:
# Group the DataFrame by 'name' and 'season'
df_all_grouped = df_all.groupby(by=['playerId_all'])

# Convert the grouped DataFrame to a regular DataFrame
df_all_reset = df_all_grouped.sum().reset_index()


df_all = df_all_reset.drop(columns=['situation_all', 'name', 'season'])
df_all.head()

In [None]:
# Change the values in the 'position_all' column based on the position
df_all['position_all'] = np.where(df_all['position_all'].str.contains('L'), 'L', df_all['position_all'])
df_all['position_all'] = np.where(df_all['position_all'].str.contains('D'), 'D', df_all['position_all'])
df_all['position_all'] = np.where(df_all['position_all'].str.contains('C'), 'C', df_all['position_all'])
df_all['position_all'] = np.where(df_all['position_all'].str.contains('R'), 'R', df_all['position_all'])

# Display the updated 'position_all' column
print(df_all['position_all'])

In [None]:
# Use encoding to give the positions a numeric value
le = LabelEncoder()
df_all['position_all'] = df_all['position_all'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x) 

# Fit the encoder and transform the 'position_all' column
df_all['PositionEn_encoded'] = le.fit_transform(df_all['position_all'])
df_all = df_all.drop(columns=['position_all'])

In [None]:
# Histogram for each variable
df_all.hist(bins=30, figsize=(20, 15))
plt.suptitle('Histograms of all variables')
plt.show()

In [None]:
# Scatter plot between "Fresh" and "Milk"
plt.figure(figsize=(10, 6))
sns.scatterplot(x='I_F_goals_all', y='I_F_highDangerShots_all', data=df_all)
plt.title('Scatter plot between Goals and High Danger Shots')
plt.show()

In [None]:
# Heatmap to show correlation between variables
numeric_df = df_all.drop(columns=['PositionEn_encoded'])


plt.figure(figsize=(12, 8))
correlation_matrix = numeric_df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Heatmap of correlation between variables')
plt.show()

In [None]:
# Create X, y
X = df_all.drop(columns=['I_F_goals_all', 'playerId_all'])
y = df_all['I_F_goals_all']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Initialize the RandomForestRegressor
model = RandomForestRegressor(random_state=42)

# Train the model on the training data
model.fit(X_train, y_train)

In [None]:
# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate the mean squared error on the test data
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (Test): {mse}")

# Calculate the R^2 score on the test data
r2 = r2_score(y_test, y_pred)
print(f"R^2 Score (Test): {r2}")

# Calculate the number of observations (samples) and the number of predictors (features) for the test data
n = X_test.shape[0]
p = X_test.shape[1]

# Calculate the adjusted R-squared score on the test data
r2_adj = 1 - ((1 - r2) * (n - 1)) / (n - p - 1)
print(f"Adjusted R^2 Score (Test): {r2_adj}")

# Make predictions on the training data
y_train_pred = model.predict(X_train)

# Calculate the mean squared error on the training data
mse_train = mean_squared_error(y_train, y_train_pred)
print(f"Mean Squared Error (Train): {mse_train}")

# Calculate the R^2 score on the training data
r2_train = r2_score(y_train, y_train_pred)
print(f"R^2 Score (Train): {r2_train}")

# Calculate the number of observations (samples) and the number of predictors (features) for the training data
n_train = X_train.shape[0]
p_train = X_train.shape[1]

# Calculate the adjusted R-squared score on the training data
r2_adj_train = 1 - ((1 - r2_train) * (n_train - 1)) / (n_train - p_train - 1)
print(f"Adjusted R^2 Score (Train): {r2_adj_train}")

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Define the model with specified parameters
model = GradientBoostingRegressor(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.6,
    min_samples_split=10,
    min_samples_leaf=1,
    random_state=42
)

# Train the model on the training data
model.fit(X_train, y_train)

In [None]:
# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
adjusted_r2 = 1 - (1 - r2) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)

# Print metrics
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R²: {r2}")
print(f"Adjusted R²: {adjusted_r2}")

In [None]:
# Save the trained model to a file
joblib.dump(model, 'skater_gradient_boosting_model.pkl')

In [None]:
df_all