In [1]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.decomposition import PCA
from sklearn.exceptions import NotFittedError
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler

# Display preference: do not cap the number of columns shown for a DataFrame
pd.options.display.max_columns = None

In [3]:
# Read the five MP_*_stats_bios_new_features.csv files, in the order listed,
# and bind each resulting DataFrame to its matching mod_* name.
mod_AS, mod_5on5, mod_4on5, mod_5on4, mod_OS = [
    pd.read_csv(csv_name)
    for csv_name in (
        'MP_AS_stats_bios_new_features.csv',
        'MP_5on5_stats_bios_new_features.csv',
        'MP_4on5_stats_bios_new_features.csv',
        'MP_5on4_stats_bios_new_features.csv',
        'MP_OS_stats_bios_new_features.csv',
    )
]

## Random Forest Regressor:

#### Using the random forest regressor to see how accurately my model can predict the gameScore for future seasons 


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [5]:
# Building blocks of the column transformer, named so each can be read on its own.

# Picks every int64 / float64 column of the frame handed to fit()
numeric_columns = make_column_selector(dtype_include=['int64', 'float64'])

# age_group: ordinal-encode with the explicit category order, then standardize
age_group_steps = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(categories=[['New Pro', 'Young Pro', 'Prime Age', 'Vet', 'Old Vet']])),
    ('scaler', StandardScaler()),
])

# position: one-hot encode, then scale without centering (with_mean=False)
position_steps = Pipeline(steps=[
    ('onehot', OneHotEncoder()),
    ('scaler', StandardScaler(with_mean=False)),
])

# Column transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_columns),
    ('age_group', age_group_steps, ['age_group']),
    ('position', position_steps, ['position']),
])

# Current pipeline: preprocessing followed by a seeded 100-tree Random Forest Regressor
MP_RFR_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
])

In [8]:
# Both exclusion lists are assembled from the same two pieces so they cannot
# drift apart; the second list only adds 'I_F_points' between the pieces.
_cols_up_to_age = [
    'playerId', 'season', 'name', 'team', 'situation', 'iceTimeRank', 'I_F_shifts',
    'nationality', 'birthDate', 'weight', 'height', 'shoots', 'age',
]
_score_and_rating_cols = ['gameScore', 'ZR_gameScore', 'playerRating', 'ZR_playerRating']

# Columns dropped from the feature matrix (gameScore is the target variable)
col_not_processed = _cols_up_to_age + _score_and_rating_cols

# Same exclusions, plus 'I_F_points'
col_not_processed_without_points = _cols_up_to_age + ['I_F_points'] + _score_and_rating_cols

In [9]:
# Feature matrix X: mod_AS minus every column listed in col_not_processed
MP_AS_X = mod_AS.drop(col_not_processed, axis=1)
# Target variable
MP_AS_y = mod_AS['gameScore']

# 80/20 train/test split with a fixed seed
(MP_AS_X_train,
 MP_AS_X_test,
 MP_AS_y_train,
 MP_AS_y_test) = train_test_split(
    MP_AS_X,
    MP_AS_y,
    test_size=0.2,
    random_state=42,
)

# Fit the pipeline on the training portion
AS_model = MP_RFR_pipeline.fit(MP_AS_X_train, MP_AS_y_train)

# Pull the fitted steps back out of the pipeline by step name:
# the preprocessor (needed for the transformed feature names) ...
preprocessor = AS_model['preprocessor']
# ... and the trained Random Forest model
rf_model = AS_model['regressor']

# Get feature names after the transformation
def get_feature_names(column_transformer):
    """Collect the output feature names of a fitted column transformer.

    Walks ``column_transformer.transformers_`` and gathers, in order, the names
    produced by each entry. Names are returned without any transformer prefix.

    Parameters
    ----------
    column_transformer : object with a ``transformers_`` attribute
        Iterable of ``(name, transformer, features)`` triples, as exposed by a
        fitted ``ColumnTransformer``.

    Returns
    -------
    list
        Feature names. Entries whose transformer is ``'drop'`` or ``None``
        contribute nothing. Entries without ``get_feature_names_out`` (or whose
        transformer reports it is not fitted) contribute their input
        ``features`` unchanged.

    Notes
    -----
    ``NotFittedError`` must be importable at module level
    (``from sklearn.exceptions import NotFittedError``); previously the name
    was undefined, so reaching the ``except`` clause raised ``NameError``.
    """
    output_features = []
    for _name, transformer, features in column_transformer.transformers_:
        # Dropped column groups produce no output columns.
        if transformer == 'drop' or transformer is None:
            continue

        # For a pipeline that contains a 'onehot' step, that step determines
        # the expanded column names; other pipelines are queried as a whole.
        if isinstance(transformer, Pipeline) and 'onehot' in transformer.named_steps:
            transformer = transformer.named_steps['onehot']

        # Transformers that cannot report names keep their input column names.
        if not hasattr(transformer, 'get_feature_names_out'):
            output_features.extend(features)
            continue

        try:
            output_features.extend(transformer.get_feature_names_out(features))
        except NotFittedError:
            # Unfitted transformer: fall back to the input column names.
            output_features.extend(features)
    return output_features

# Feature names after the preprocessing step
transformed_feature_names = get_feature_names(preprocessor)

# Label each Random Forest importance with its feature name and rank them,
# largest first (chained sort instead of an in-place sort)
feature_importances = pd.Series(
    rf_model.feature_importances_,
    index=transformed_feature_names,
).sort_values(ascending=False)

# Show the ten highest-ranked features
print(feature_importances.head(10))

I_F_points                                    0.904845
onIce_fenwickPercentage                       0.013960
onIce_corsiPercentage                         0.009082
OnIce_F_scoreAdjustedUnblockedShotAttempts    0.004477
I_F_scoreAdjustedShotsAttempts                0.003807
offIce_xGoalsPercentage                       0.003801
offIce_corsiPercentage                        0.003559
offIce_fenwickPercentage                      0.003338
onIce_xGoalsPercentage                        0.003032
I_F_oZoneShiftEnds                            0.001937
dtype: float64


### AS Model -  Comparing the accuracy of the model with and without the I_F_points column:

In [10]:
# Re assign the variables so the comparison starts from the full feature set
MP_AS_X = mod_AS.drop(columns=col_not_processed)
MP_AS_y = mod_AS['gameScore']  # Target variable

# Split the data into training and testing sets
MP_AS_X_train, MP_AS_X_test, MP_AS_y_train, MP_AS_y_test = train_test_split(MP_AS_X, MP_AS_y, test_size=0.2, random_state=42)

# NOTE: Pipeline.fit() returns the pipeline itself. Fitting MP_RFR_pipeline
# twice would therefore leave both model names pointing at ONE object holding
# only the last fit. clone() gives each variant its own unfitted copy with the
# same parameters (including random_state), so the two models stay independent
# and MP_RFR_pipeline itself is left as it was.

# Step 1: Train and evaluate with I_F_points included
AS_model_with_points = clone(MP_RFR_pipeline).fit(MP_AS_X_train, MP_AS_y_train)
predictions_with_points = AS_model_with_points.predict(MP_AS_X_test)

# Evaluate the model
mse_with_points = mean_squared_error(MP_AS_y_test, predictions_with_points)
r2_with_points = r2_score(MP_AS_y_test, predictions_with_points)

print("Model with I_F_points:")
print(f"Mean Squared Error: {mse_with_points}")
print(f"R2 Score: {r2_with_points}")

# Step 2: Train and evaluate with I_F_points removed
# Remove the I_F_points column from the training and testing sets
MP_AS_X_train_no_points = MP_AS_X_train.drop(columns=['I_F_points'])
MP_AS_X_test_no_points = MP_AS_X_test.drop(columns=['I_F_points'])

AS_model_without_points = clone(MP_RFR_pipeline).fit(MP_AS_X_train_no_points, MP_AS_y_train)
predictions_without_points = AS_model_without_points.predict(MP_AS_X_test_no_points)

# Evaluate the model
mse_without_points = mean_squared_error(MP_AS_y_test, predictions_without_points)
r2_without_points = r2_score(MP_AS_y_test, predictions_without_points)

print("Model without I_F_points:")
print(f"Mean Squared Error: {mse_without_points}")
# "\n" (not "/n") so a blank line separates this section from the comparison
print(f"R2 Score: {r2_without_points}\n")

# Step 3: Compare the two models (without minus with)
print("Comparison of Model Performance:")
print(f"Difference in MSE: {mse_without_points - mse_with_points}")
print(f"Difference in R2 Score: {r2_without_points - r2_with_points}")

Model with I_F_points:
Mean Squared Error: 16.62153429479167
R2 Score: 0.9723422374579082
Model without I_F_points:
Mean Squared Error: 22.02977477850695
R2 Score: 0.9633430783901441/n
Comparison of Model Performance:
Difference in MSE: 5.408240483715282
Difference in R2 Score: -0.008999159067764051


## Exploring how to weight xGoals for each scoring chance category low, med, high:

In [11]:
# Totals over all rows of mod_AS: overall xGoals, then low / medium / high danger
AS_total_xGoals = mod_AS['I_F_xGoals'].sum()
AS_total_lowDangerxGoals = mod_AS['I_F_lowDangerxGoals'].sum()
AS_total_medDangerxGoals = mod_AS['I_F_mediumDangerxGoals'].sum()
AS_total_highDangerxGoals = mod_AS['I_F_highDangerxGoals'].sum()

# One total per line, in the order computed above
print(
    AS_total_xGoals,
    AS_total_lowDangerxGoals,
    AS_total_medDangerxGoals,
    AS_total_highDangerxGoals,
    sep='\n',
)

# Share of total xGoals contributed by each danger category, as a percentage
# (candidate weights for the scoring-chance categories)
AS_LDxG_percent = 100 * (AS_total_lowDangerxGoals / AS_total_xGoals)
AS_MDxG_percent = 100 * (AS_total_medDangerxGoals / AS_total_xGoals)
AS_HDxG_percent = 100 * (AS_total_highDangerxGoals / AS_total_xGoals)

print('the AS LDxG_percentage is:  ', AS_LDxG_percent)
print('the AS MDxG_percentage is:  ', AS_MDxG_percent)
print('the AS HDxG_percentage is:  ', AS_HDxG_percent)


24634.84
7256.17
8331.32
9047.81
the AS LDxG_percentage is:   29.454910200350398
the AS MDxG_percentage is:   33.81925760427102
the AS HDxG_percentage is:   36.727699469531764
