# xG Model without Seconds and Period

This is a full model construction and evaluation for a model without `Seconds Elapsed` and `Period`. Even though shots in overtime are probably more likely to go in, the `Strength` attribute should account for that. Using the output will inform whether to include it in the final model. 

In [1]:
import pandas as pd
import hockey_scraper
import numpy as np
import math
import seaborn as sns
from tqdm.notebook import tqdm # This displays a loading bar for monitoring progress of for loops

## ML Workflow

In [2]:
import matplotlib.pyplot as plt
from sklearn.compose import make_column_transformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import cross_val_predict
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression, Ridge, LinearRegression, RidgeCV
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
    ConfusionMatrixDisplay,
    precision_recall_curve,
    average_precision_score,
    roc_curve,
    roc_auc_score,
    mean_absolute_percentage_error,
    mean_squared_error,
    log_loss,
)
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [3]:
shots2022 = pd.read_csv('shots2022.csv')

In [4]:
# Creating target variable column

for i in range(len(shots2022)):
    if shots2022.at[i,'Event'] == 'GOAL':
        shots2022.at[i,'Target'] = 1
    else:
        shots2022.at[i,'Target'] = 0
        
shots2022

Unnamed: 0,Game_Id,Date,Period,Event,Description,Time_Elapsed,Seconds_Elapsed,Strength,Ev_Zone,Type,...,xC,yC,Home_Coach,Away_Coach,Distance,Shot_Angle,Score_Diff,Is_Rebound,Change_of_Angle,Target
0,20001,2022-10-07,1,SHOT,"SJS ONGOAL - #28 MEIER, Wrist, Off. Zone, 45 ft.",0:23,23,5x5,Off,WRIST SHOT,...,44,8,John Hynes,David Quinn,45.705580,10.080598,0,0,,0.0
1,20001,2022-10-07,1,MISS,"SJS #44 VLASIC, Wrist, Wide of Net, Off. Zone,...",0:36,36,5x5,Off,WRIST SHOT,...,44,27,John Hynes,David Quinn,52.478567,30.963757,0,0,,0.0
2,20001,2022-10-07,1,BLOCK,"NSH #27 MCDONAGH BLOCKED BY SJS #62 LABANC, W...",0:56,56,5x5,Def,WRIST SHOT,...,-55,3,John Hynes,David Quinn,34.132096,5.042451,0,0,,0.0
3,20001,2022-10-07,1,SHOT,"NSH ONGOAL - #14 EKHOLM, Slap, Off. Zone, 56 ft.",0:59,59,5x5,Off,SLAP SHOT,...,-33,8,John Hynes,David Quinn,56.568542,8.130102,0,1,3.087651,0.0
4,20001,2022-10-07,1,GOAL,"NSH #44 SHERWOOD(1), Wrist, Off. Zone, 15 ft.A...",1:01,61,5x5,Off,WRIST SHOT,...,-74,-5,John Hynes,David Quinn,15.811388,18.434949,0,1,26.565051,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152504,21312,2023-04-13,3,SHOT,"SEA ONGOAL - #67 GEEKIE, Wrist, Off. Zone, 10 ft.",17:14,1034,5x5,Off,WRIST SHOT,...,81,7,Dave Hakstol,Bruce Cassidy,10.630146,41.185925,-1,1,10.535257,0.0
152505,21312,2023-04-13,3,MISS,"VGK #9 EICHEL, Wrist, Wide of Net, Def. Zone, ...",18:33,1113,5x5,Def,WRIST SHOT,...,55,41,Dave Hakstol,Bruce Cassidy,149.723078,15.892831,1,0,,0.0
152506,21312,2023-04-13,3,BLOCK,"SEA #4 SCHULTZ BLOCKED BY VGK #3 MCNABB, Wris...",18:43,1123,5x5,Def,WRIST SHOT,...,75,-1,Dave Hakstol,Bruce Cassidy,14.035669,4.085617,-1,0,,0.0
152507,21312,2023-04-13,3,GOAL,"VGK #20 STEPHENSON(16), Poke, Def. Zone, 137 ft.",19:22,1162,5x5,Def,POKE,...,47,19,Dave Hakstol,Bruce Cassidy,137.320792,7.953082,1,0,,1.0


In [5]:
train_df, test_df = train_test_split(shots2022, test_size=0.25, random_state=17)

In [6]:
model_columns = ['Strength','Type','Distance','Shot_Angle','Score_Diff',
                 'Is_Rebound','Change_of_Angle']
eval_columns = ['Game_Id','Date','Period','Seconds_Elapsed','Event','Description','Time_Elapsed','Ev_Zone','Ev_Team','Home_Zone','Away_Team',
                'Home_Team','p1_name','p1_ID','p2_name','p2_ID','p3_name','p3_ID','awayPlayer1','awayPlayer1_id',
                'awayPlayer2','awayPlayer2_id','awayPlayer3','awayPlayer3_id','awayPlayer4','awayPlayer4_id','awayPlayer5',
                'awayPlayer5_id','awayPlayer6','awayPlayer6_id','homePlayer1','homePlayer1_id','homePlayer2',
                'homePlayer2_id','homePlayer3','homePlayer3_id','homePlayer4','homePlayer4_id','homePlayer5',
                'homePlayer5_id','homePlayer6','homePlayer6_id','Away_Players','Home_Players','Away_Score','Home_Score',
                'Away_Goalie','Away_Goalie_Id','Home_Goalie','Home_Goalie_Id','xC','yC','Home_Coach','Away_Coach']

In [7]:
# Reset the y index since the index is reset during column transformations

X_train = train_df[model_columns]
X_train_eval = train_df[eval_columns]
y_train = train_df['Target'].reset_index(drop=True)


X_test = test_df[model_columns]
X_test_eval = test_df[eval_columns]
y_test = test_df['Target'].reset_index(drop=True)

In [8]:
# Choosing feature types

numeric_feats = ['Distance','Shot_Angle','Change_of_Angle']  # apply scaling
categorical_feats = ['Strength','Type','Score_Diff','Is_Rebound']  # apply one-hot encoding

In [9]:
# Column Transformation

ct = make_column_transformer(
    (StandardScaler(), numeric_feats),  # scaling on numeric features
    (OneHotEncoder(sparse_output=False), categorical_feats), # one-hot encoding on categorical features
)

# Train Data

transformed = ct.fit_transform(X_train)

column_names = (numeric_feats+ct.named_transformers_['onehotencoder'].get_feature_names_out().tolist())

transformed_train_df = pd.DataFrame(transformed, columns=column_names)
transformed_train_df

Unnamed: 0,Distance,Shot_Angle,Change_of_Angle,Strength_3x3,Strength_3x4,Strength_3x5,Strength_4x3,Strength_4x4,Strength_4x5,Strength_5x3,...,Type_WRIST SHOT,Score_Diff_-1,Score_Diff_-2,Score_Diff_-3-,Score_Diff_0,Score_Diff_1,Score_Diff_2,Score_Diff_3+,Is_Rebound_0,Is_Rebound_1
0,1.241468,-0.083752,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.178112,-0.824571,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.475877,0.180684,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.744140,-0.255019,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.364910,-0.706403,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114376,-0.592175,1.388798,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
114377,3.136045,-0.551975,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
114378,-0.973507,-1.028666,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
114379,-0.520808,0.879279,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [10]:
imp = SimpleImputer(strategy='constant',fill_value=0)
imp.fit(transformed_train_df)
test = imp.transform(transformed_train_df)
df = pd.DataFrame(test)
transformed_train_df['Change_of_Angle'] = df[3]
transformed_train_df

Unnamed: 0,Distance,Shot_Angle,Change_of_Angle,Strength_3x3,Strength_3x4,Strength_3x5,Strength_4x3,Strength_4x4,Strength_4x5,Strength_5x3,...,Type_WRIST SHOT,Score_Diff_-1,Score_Diff_-2,Score_Diff_-3-,Score_Diff_0,Score_Diff_1,Score_Diff_2,Score_Diff_3+,Is_Rebound_0,Is_Rebound_1
0,1.241468,-0.083752,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.178112,-0.824571,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.475877,0.180684,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.744140,-0.255019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.364910,-0.706403,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114376,-0.592175,1.388798,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
114377,3.136045,-0.551975,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
114378,-0.973507,-1.028666,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
114379,-0.520808,0.879279,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [11]:
# Test Data

transformed = ct.fit_transform(X_test)

column_names = (numeric_feats+ct.named_transformers_['onehotencoder'].get_feature_names_out().tolist())

transformed_test_df = pd.DataFrame(transformed, columns=column_names)
transformed_test_df

Unnamed: 0,Distance,Shot_Angle,Change_of_Angle,Strength_3x3,Strength_3x4,Strength_3x5,Strength_4x3,Strength_4x4,Strength_4x5,Strength_5x3,...,Type_WRIST SHOT,Score_Diff_-1,Score_Diff_-2,Score_Diff_-3-,Score_Diff_0,Score_Diff_1,Score_Diff_2,Score_Diff_3+,Is_Rebound_0,Is_Rebound_1
0,0.123974,-0.294521,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.233246,-1.215393,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.212786,0.638550,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-0.679193,-0.485540,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.535155,-1.167614,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38123,-0.665240,1.913587,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
38124,-0.325514,-1.284821,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
38125,-0.826101,-0.686323,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
38126,1.945798,0.059939,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [12]:
imp = SimpleImputer(strategy='constant',fill_value=0)
imp.fit(transformed_test_df)
test = imp.transform(transformed_test_df)
df = pd.DataFrame(test)
transformed_test_df['Change_of_Angle'] = df[3]
transformed_test_df['Strength_5x6'] = 0
transformed_test_df['Strength_6x5'] = 0
transformed_test_df = transformed_test_df[transformed_train_df.columns]
transformed_test_df.columns

Index(['Distance', 'Shot_Angle', 'Change_of_Angle', 'Strength_3x3',
       'Strength_3x4', 'Strength_3x5', 'Strength_4x3', 'Strength_4x4',
       'Strength_4x5', 'Strength_5x3', 'Strength_5x4', 'Strength_5x5',
       'Strength_5x6', 'Strength_6x5', 'Type_BACKHAND', 'Type_BAT',
       'Type_BETWEEN LEGS', 'Type_CRADLE', 'Type_DEFLECTED', 'Type_POKE',
       'Type_SLAP SHOT', 'Type_SNAP SHOT', 'Type_TIP-IN', 'Type_WRAP-AROUND',
       'Type_WRIST SHOT', 'Score_Diff_-1', 'Score_Diff_-2', 'Score_Diff_-3-',
       'Score_Diff_0', 'Score_Diff_1', 'Score_Diff_2', 'Score_Diff_3+',
       'Is_Rebound_0', 'Is_Rebound_1'],
      dtype='object')

<br>

### Metrics

Custom log loss function is below. RMSE and MAPE are also used.

In [13]:
def my_custom_loss_func(y_true, y_pred):
    return log_loss(y_true, y_pred)
custom_logloss = make_scorer(my_custom_loss_func, greater_is_better=False)

<br>

### Attempt 1: Logistic Regression

In [14]:
lr = LogisticRegression(max_iter=1000)
lr.fit(transformed_train_df, y_train)

logloss = log_loss(y_train, lr.predict_proba(transformed_train_df))

print('The log loss of the Logistic Regression is',logloss)

The log loss of the Logistic Regression is 0.2011326672193592


In [15]:
%%time

# Hyperparameter Optimization

# param_grid = {
#     "max_iter": [100, 1000, 10000],
#     "C": [0.01, 0.1, 1, 10, 100, 1000]
# }
# random_search = RandomizedSearchCV(
#     lr, param_distributions=param_grid, n_jobs=-1, n_iter=17, scoring=custom_logloss,cv=5, random_state=17
# )
# random_search.fit(transformed_train_df, y_train)

# random_search.best_params_

CPU times: total: 0 ns
Wall time: 0 ns


In [16]:
# The best params are input below

lr = LogisticRegression(max_iter=1000,C=100)
lr.fit(transformed_test_df, y_test)

logloss = log_loss(y_test, lr.predict_proba(transformed_test_df))

print('The log loss of the optimized Logistic Regression is',logloss)

The log loss of the optimized Logistic Regression is 0.19936385815417376


<br>

### Attempt 2: Gradient Boosting Classifier

In [17]:
gbc = GradientBoostingClassifier(loss='log_loss')
gbc.fit(transformed_train_df, y_train)

logloss = log_loss(y_train, gbc.predict_proba(transformed_train_df))

print('The log loss of the Gradient Boosting Classifier is',logloss)

The log loss of the Gradient Boosting Classifier is 0.1942338452201572


In [18]:
%%time

# Hyperparameter Optimization

# param_grid = {
#     "n_estimators": [50,100,200,250],
#     "criterion": ['friedman_mse','squared_error'],
#     "max_depth": [3,5,7,10],
#     "max_features": [None, 'sqrt'],
#     "min_samples_split": [2,5,7,10]
# }
# random_search = RandomizedSearchCV(
#     gbc, param_distributions=param_grid, n_jobs=-1, scoring=custom_logloss, n_iter=17, cv=5, random_state=17
# )
# random_search.fit(transformed_train_df, y_train)

# random_search.best_params_

CPU times: total: 0 ns
Wall time: 0 ns


In [19]:
gbc = GradientBoostingClassifier(loss='log_loss',n_estimators=200,min_samples_split=10,max_features='sqrt',
                                 max_depth=3,criterion='friedman_mse')
gbc.fit(transformed_test_df, y_test)

logloss = log_loss(y_test, gbc.predict_proba(transformed_test_df))

print('The log loss of the optimized Gradient Boosting Classifier is',logloss)

The log loss of the optimized Gradient Boosting Classifier is 0.19098792263119566


<br>

### Attempt 3: Hist Gradient Boosting Classifier without NaN

In [20]:
# With imputed values

no_nan_hgbc = HistGradientBoostingClassifier(loss='log_loss')
no_nan_hgbc.fit(transformed_train_df, y_train)

logloss = log_loss(y_train, no_nan_hgbc.predict_proba(transformed_train_df))

print('The log loss of the Hist Gradient Boosting Classifier with imputed values is',logloss)

The log loss of the Hist Gradient Boosting Classifier with imputed values is 0.19033908729118026


In [21]:
%%time

# Hyperparameter Optimization

# param_grid = {
#     "max_iter": [50,100,250,500],
#     "max_depth": [None,3,5,7,10],
#     "min_samples_leaf": [10,15,20,25,30]
# }
# random_search = RandomizedSearchCV(
#     no_nan_hgbc, param_distributions=param_grid, n_jobs=-1, scoring=custom_logloss, n_iter=17, cv=5, random_state=17
# )
# random_search.fit(transformed_train_df, y_train)

# random_search.best_params_

CPU times: total: 0 ns
Wall time: 0 ns


In [22]:
no_nan_hgbc = HistGradientBoostingClassifier(loss='log_loss',min_samples_leaf=25,max_iter=100,max_depth=3)
no_nan_hgbc.fit(transformed_test_df, y_test)

logloss = log_loss(y_test, no_nan_hgbc.predict_proba(transformed_test_df))

print('The log loss of the optimized Hist Gradient Boosting Classifier with imputed values is',logloss)

The log loss of the optimized Hist Gradient Boosting Classifier with imputed values is 0.19220754185399136


<br>

### Attempt 4: Hist Gradient Boosting Classifier with NaN 

In [23]:
transformed = ct.fit_transform(X_train)

column_names = (numeric_feats+ct.named_transformers_['onehotencoder'].get_feature_names_out().tolist())

train_no_impute = pd.DataFrame(transformed, columns=column_names)

transformed = ct.fit_transform(X_test)

column_names = (numeric_feats+ct.named_transformers_['onehotencoder'].get_feature_names_out().tolist())

test_no_impute = pd.DataFrame(transformed, columns=column_names)
test_no_impute['Strength_5x6'] = 0
test_no_impute['Strength_6x5'] = 0
test_no_impute = test_no_impute[transformed_train_df.columns]

In [24]:
# With imputed values

with_nan_hgbc = HistGradientBoostingClassifier(loss='log_loss')
with_nan_hgbc.fit(train_no_impute, y_train)

logloss = log_loss(y_train, with_nan_hgbc.predict_proba(train_no_impute))

print('The log loss of the Hist Gradient Boosting Classifier without imputed values is',logloss)

The log loss of the Hist Gradient Boosting Classifier without imputed values is 0.18937514064593278


In [25]:
%%time

# Hyperparameter Optimization

param_grid = {
    "max_iter": [50,100,250,500],
    "max_depth": [None,3,5,7,10],
    "min_samples_leaf": [10,15,20,25,30]
}
random_search = RandomizedSearchCV(
    with_nan_hgbc, param_distributions=param_grid, n_jobs=-1, scoring=custom_logloss, n_iter=17, cv=5, random_state=17
)
random_search.fit(train_no_impute, y_train)

random_search.best_params_

CPU times: total: 6.59 s
Wall time: 1min 10s


{'min_samples_leaf': 15, 'max_iter': 500, 'max_depth': 7}

In [26]:
# With imputed values

with_nan_hgbc = HistGradientBoostingClassifier(loss='log_loss',min_samples_leaf=10,max_iter=250,max_depth=7)
with_nan_hgbc.fit(test_no_impute, y_test)

logloss = log_loss(y_test, with_nan_hgbc.predict_proba(test_no_impute))

print('The log loss of the Hist Gradient Boosting Classifier without imputed values is',logloss)

The log loss of the Hist Gradient Boosting Classifier without imputed values is 0.18052421163379745


<br>

## 2021 Season Evaluation

In [27]:
shots2021 = pd.read_csv('shots2021.csv')

shots2021 = shots2021.drop(index = shots2021.loc[shots2021.Strength == '2x5'].index)
shots2021 = shots2021.drop(index = shots2021.loc[shots2021.Strength == '5x1'].index)
shots2021 = shots2021.drop(index = shots2021.loc[shots2021.Strength == '6x4'].index).reset_index(drop=True)

shots2021.at[1,'Type'] = 'BAT'
shots2021.at[4,'Type'] = 'POKE'
shots2021.at[148349,'Type'] = 'BETWEEN LEGS'
shots2021.at[148352,'Type'] = 'CRADLE'

In [28]:
y_2021 = []

for i in range(len(shots2021)):
    if shots2021.at[i,'Event'] == 'GOAL':
        y_2021.append(1)
    else:
        y_2021.append(0)

In [29]:
model_columns = ['Strength','Type','Distance','Shot_Angle','Score_Diff',
                 'Is_Rebound','Change_of_Angle']
eval_columns = ['Game_Id','Date','Period','Seconds_Elapsed','Event','Description','Time_Elapsed','Ev_Zone','Ev_Team','Home_Zone','Away_Team',
                'Home_Team','p1_name','p1_ID','p2_name','p2_ID','p3_name','p3_ID','awayPlayer1','awayPlayer1_id',
                'awayPlayer2','awayPlayer2_id','awayPlayer3','awayPlayer3_id','awayPlayer4','awayPlayer4_id','awayPlayer5',
                'awayPlayer5_id','awayPlayer6','awayPlayer6_id','homePlayer1','homePlayer1_id','homePlayer2',
                'homePlayer2_id','homePlayer3','homePlayer3_id','homePlayer4','homePlayer4_id','homePlayer5',
                'homePlayer5_id','homePlayer6','homePlayer6_id','Away_Players','Home_Players','Away_Score','Home_Score',
                'Away_Goalie','Away_Goalie_Id','Home_Goalie','Home_Goalie_Id','xC','yC','Home_Coach','Away_Coach']

In [30]:
model_2021 = shots2021[model_columns]
eval_2021 = shots2021[eval_columns]

In [31]:
transformed = ct.fit_transform(model_2021)

column_names = (numeric_feats+ct.named_transformers_['onehotencoder'].get_feature_names_out().tolist())

transformed_2021 = pd.DataFrame(transformed, columns=column_names)
transformed_2021['Strength_5x6'] = 0
transformed_2021['Strength_6x5'] = 0
transformed_2021 = transformed_2021[transformed_train_df.columns]
transformed_2021

Unnamed: 0,Distance,Shot_Angle,Change_of_Angle,Strength_3x3,Strength_3x4,Strength_3x5,Strength_4x3,Strength_4x4,Strength_4x5,Strength_5x3,...,Type_WRIST SHOT,Score_Diff_-1,Score_Diff_-2,Score_Diff_-3-,Score_Diff_0,Score_Diff_1,Score_Diff_2,Score_Diff_3+,Is_Rebound_0,Is_Rebound_1
0,0.419960,1.027750,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.028532,0.134528,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,-0.103555,0.521585,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,2.976722,-0.583762,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,-0.162275,-0.956908,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148356,-1.132085,1.390944,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
148357,-1.043803,-0.730368,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
148358,2.266421,-0.079650,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
148359,0.810276,1.226071,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [32]:
frame = [transformed_2021,eval_2021]
eval_df = pd.concat(frame,axis=1)
eval_df

Unnamed: 0,Distance,Shot_Angle,Change_of_Angle,Strength_3x3,Strength_3x4,Strength_3x5,Strength_4x3,Strength_4x4,Strength_4x5,Strength_5x3,...,Away_Score,Home_Score,Away_Goalie,Away_Goalie_Id,Home_Goalie,Home_Goalie_Id,xC,yC,Home_Coach,Away_Coach
0,0.419960,1.027750,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,TRISTAN JARRY,8477465.0,ANDREI VASILEVSKIY,8476883.0,61,-32,Jon Cooper,Mike Sullivan
1,0.028532,0.134528,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,TRISTAN JARRY,8477465.0,ANDREI VASILEVSKIY,8476883.0,60,-17,Jon Cooper,Mike Sullivan
2,-0.103555,0.521585,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,TRISTAN JARRY,8477465.0,ANDREI VASILEVSKIY,8476883.0,-65,19,Jon Cooper,Mike Sullivan
3,2.976722,-0.583762,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,TRISTAN JARRY,8477465.0,ANDREI VASILEVSKIY,8476883.0,-8,-27,Jon Cooper,Mike Sullivan
4,-0.162275,-0.956908,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,TRISTAN JARRY,8477465.0,ANDREI VASILEVSKIY,8476883.0,-60,-4,Jon Cooper,Mike Sullivan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148356,-1.132085,1.390944,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,2,,,CHRIS DRIEDGER,8476904.0,85,6,Dave Hakstol,Bob Boughner
148357,-1.043803,-0.730368,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,2,,,CHRIS DRIEDGER,8476904.0,80,2,Dave Hakstol,Bob Boughner
148358,2.266421,-0.079650,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,2,,,CHRIS DRIEDGER,8476904.0,13,-37,Dave Hakstol,Bob Boughner
148359,0.810276,1.226071,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,3,KAAPO KAHKONEN,8478039.0,CHRIS DRIEDGER,8476904.0,-58,41,Dave Hakstol,Bob Boughner


<br>

#### Season Evaluation with NaN

In [33]:
xg_list = []

for i in tqdm(range(len(transformed_2021))):
    xg = with_nan_hgbc.predict_proba(transformed_2021.iloc[[i]])[0,1]
    xg_list.append(xg)

hgbc_with_na_league_eval = sum(xg_list)-sum(y_2021)


teams = list(set(eval_df.Ev_Team))
df = pd.DataFrame(columns=['Team','xG','GF'])
df['Team'] = teams

for i in tqdm(range(len(teams))):
    team = teams[i]
    xg_list = []
    g_list = []
    for j in range(len(transformed_2021)):
        if eval_df.at[j,'Ev_Team'] == team:
            xg = with_nan_hgbc.predict_proba(transformed_2021.iloc[[j]])[0,1]
            xg_list.append(xg)
            g_list.append(y_2021[j])
    df.at[i,'xG'] = sum(xg_list)
    df.at[i,'GF'] =sum(g_list)
    
hgbc_with_na_team_eval_mape = mean_absolute_percentage_error(df['GF'],df['xG'])
hgbc_with_na_team_eval_rmse = math.sqrt(mean_squared_error(df['GF'],df['xG']))


players = list(set(list(set(eval_df.p1_name)) + list(set(eval_df.p2_name))))
df = pd.DataFrame(columns = ['Player','xG','GF'])
df['Player'] = players
df['xG'] = 0
df['GF'] = 0
df = df.set_index('Player')

for i in tqdm(range(len(eval_df))):
    p1 = eval_df.at[i,'p1_name']
    if eval_df.at[i,'Event'] == 'BLOCK':
        p1 = eval_df.at[i,'p2_name']
    xg = with_nan_hgbc.predict_proba(transformed_2021.iloc[[i]])[0,1]
    g = y_2021[i]
    df.at[p1,'xG'] = df.at[p1,'xG'] + xg
    df.at[p1,'GF'] = df.at[p1,'GF'] + g

hgbc_with_na_skater_eval_mape = mean_absolute_percentage_error(df['GF'],df['xG'])
hgbc_with_na_skater_eval_rmse = math.sqrt(mean_squared_error(df['GF'],df['xG']))

print('The HGBC with NaN overestimates the number of goals by:',hgbc_with_na_league_eval)
print('The HGBC with NaN has a MAPE of xG for all teams is:',hgbc_with_na_team_eval_mape)
print('The HGBC with NaN has a RMSE of xG for all teams is:',hgbc_with_na_team_eval_rmse)
print('The HGBC with NaN has a MAPE of xG for all players is:',hgbc_with_na_skater_eval_mape)
print('The HGBC with NaN has a RMSE of xG for all players is:',hgbc_with_na_skater_eval_rmse)
df

  0%|          | 0/148361 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/148361 [00:00<?, ?it/s]

The HGBC with NaN overestimates the number of goals by: -323.811214283106
The HGBC with NaN has a MAPE of xG for all teams is: 0.07406086274200116
The HGBC with NaN has a RMSE of xG for all teams is: 27.101454671654096
The HGBC with NaN has a MAPE of xG for all players is: 694364248826417.9
The HGBC with NaN has a RMSE of xG for all players is: 4.079861571494925


Unnamed: 0_level_0,xG,GF
Player,Unnamed: 1_level_1,Unnamed: 2_level_1
NIKITA ZADOROV,7.112107,4
RYAN GETZLAF,10.344931,3
BRIAN BOYLE,6.753452,11
KURTIS MACDERMID,3.702143,2
BRANDON CARLO,6.312378,6
...,...,...
ALEC REGULA,1.412512,1
ROSS COLTON,16.293895,22
TY DELLANDREA,0.033966,0
JACK EICHEL,10.957092,14


#### Season Evaluation without NaN

In [34]:
imp = SimpleImputer(strategy='constant',fill_value=0)
imp.fit(transformed_2021)
test = imp.transform(transformed_2021)
df = pd.DataFrame(test)
transformed_2021['Change_of_Angle'] = df[3]
transformed_2021

Unnamed: 0,Distance,Shot_Angle,Change_of_Angle,Strength_3x3,Strength_3x4,Strength_3x5,Strength_4x3,Strength_4x4,Strength_4x5,Strength_5x3,...,Type_WRIST SHOT,Score_Diff_-1,Score_Diff_-2,Score_Diff_-3-,Score_Diff_0,Score_Diff_1,Score_Diff_2,Score_Diff_3+,Is_Rebound_0,Is_Rebound_1
0,0.419960,1.027750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.028532,0.134528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,-0.103555,0.521585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,2.976722,-0.583762,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,-0.162275,-0.956908,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148356,-1.132085,1.390944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
148357,-1.043803,-0.730368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
148358,2.266421,-0.079650,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
148359,0.810276,1.226071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [35]:
model_list = [lr,gbc,no_nan_hgbc]

In [36]:
xg_model = []
for model in model_list:
    xg_list = []
    for i in tqdm(range(len(transformed_2021))):
        xg = model.predict_proba(transformed_2021.iloc[[i]])[0,1]
        xg_list.append(xg)
    xg_model.append(sum(xg_list))
    
lr_without_na_league_eval = xg_model[0]-sum(y_2021)
gbc_without_na_league_eval = xg_model[1]-sum(y_2021)
hgbc_without_na_league_eval = xg_model[2]-sum(y_2021)

print('The LR overestimates the number of goals by:',lr_without_na_league_eval)
print('The GBC overestimates the number of goals by:',gbc_without_na_league_eval)
print('The HGBC overestimates the number of goals by:',hgbc_without_na_league_eval)

  0%|          | 0/148361 [00:00<?, ?it/s]

  0%|          | 0/148361 [00:00<?, ?it/s]

  0%|          | 0/148361 [00:00<?, ?it/s]

The LR overestimates the number of goals by: -395.21438019911784
The GBC overestimates the number of goals by: -244.72753987145916
The HGBC overestimates the number of goals by: -313.433411758223


In [37]:
teams = list(set(eval_df.Ev_Team))
df = pd.DataFrame(columns=['Team','lr_xG','gbc_xG','hgbc_xG','GF'])
df['Team'] = teams
df['lr_xG'] = 0
df['gbc_xG'] = 0
df['hgbc_xG'] = 0
df['GF'] = 0
df = df.set_index('Team')

for j in range(len(model_list)):
    model = model_list[j]
    for i in tqdm(range(len(transformed_2021))):
        team = eval_df.at[i,'Ev_Team']
        xg = model.predict_proba(transformed_2021.iloc[[i]])[0,1]
        df.at[team,df.columns[j]] = df.at[team,df.columns[j]] + xg
        if j == 1: # So that goals are only counted once
            if eval_df.at[i,'Event'] == 'GOAL':
                df.at[team,df.columns[3]] = df.at[team,df.columns[3]] + 1
                
lr_team_eval_mape = mean_absolute_percentage_error(df['GF'],df['lr_xG'])
lr_team_eval_rmse = math.sqrt(mean_squared_error(df['GF'],df['lr_xG']))
gbc_team_eval_mape = mean_absolute_percentage_error(df['GF'],df['gbc_xG'])
gbc_team_eval_rmse = math.sqrt(mean_squared_error(df['GF'],df['gbc_xG']))
hgbc_without_na_team_eval_mape = mean_absolute_percentage_error(df['GF'],df['hgbc_xG'])
hgbc_without_na_team_eval_rmse = math.sqrt(mean_squared_error(df['GF'],df['hgbc_xG']))
df.sort_values(by='Team')

  0%|          | 0/148361 [00:00<?, ?it/s]

  0%|          | 0/148361 [00:00<?, ?it/s]

  0%|          | 0/148361 [00:00<?, ?it/s]

Unnamed: 0_level_0,lr_xG,gbc_xG,hgbc_xG,GF
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ANA,227.072406,227.344073,225.800742,226
ARI,191.545461,194.049514,195.35197,206
BOS,248.033948,250.220277,245.119294,250
BUF,199.202559,199.009333,202.313959,228
CAR,279.326781,284.321114,279.253452,277
CBJ,232.641843,236.554101,235.359683,258
CGY,272.008505,278.320942,274.72132,291
CHI,211.935813,211.546453,212.304794,213
COL,269.815001,273.695581,271.870984,308
DAL,248.314636,256.676283,253.108619,233


In [38]:
players = list(set(list(set(eval_df.p1_name)) + list(set(eval_df.p2_name))))
df = pd.DataFrame(columns = ['Player','lr_xG','gbc_xG','hgbc_xG','GF'])
df['Player'] = players
df['lr_xG'] = 0
df['gbc_xG'] = 0
df['hgbc_xG'] = 0
df['GF'] = 0
df = df.set_index('Player')

for j in range(len(model_list)):
    model = model_list[j]
    for i in tqdm(range(len(eval_df))):
        p1 = eval_df.at[i,'p1_name']
        if eval_df.at[i,'Event'] == 'BLOCK':
            p1 = eval_df.at[i,'p2_name']
        xg = model.predict_proba(transformed_2021.iloc[[i]])[0,1]
        g = y_2021[i]
        df.at[p1,df.columns[j]] = df.at[p1,df.columns[j]] + xg
        if j == 1:
            df.at[p1,df.columns[3]] = df.at[p1,df.columns[3]] + g
    
df = df.sort_values(by='GF',ascending=False)

lr_skater_eval_mape = mean_absolute_percentage_error(df['GF'],df['lr_xG'])
lr_skater_eval_rmse = math.sqrt(mean_squared_error(df['GF'],df['lr_xG']))
gbc_skater_eval_mape = mean_absolute_percentage_error(df['GF'],df['gbc_xG'])
gbc_skater_eval_rmse = math.sqrt(mean_squared_error(df['GF'],df['gbc_xG']))
hgbc_without_na_skater_eval_mape = mean_absolute_percentage_error(df['GF'],df['hgbc_xG'])
hgbc_without_na_skater_eval_rmse = math.sqrt(mean_squared_error(df['GF'],df['hgbc_xG']))
df

  0%|          | 0/148361 [00:00<?, ?it/s]

  0%|          | 0/148361 [00:00<?, ?it/s]

  0%|          | 0/148361 [00:00<?, ?it/s]

Unnamed: 0_level_0,lr_xG,gbc_xG,hgbc_xG,GF
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AUSTON MATTHEWS,35.496451,37.916472,37.016079,59
LEON DRAISAITL,33.611041,34.496751,34.112138,55
CHRIS KREIDER,29.755817,31.642998,30.741894,52
ALEX OVECHKIN,33.036185,33.713325,33.919331,50
KIRILL KAPRIZOV,28.583816,29.112491,27.585282,47
...,...,...,...,...
CONNOR BUNNAMAN,0.977417,0.985771,1.000918,0
STEVEN FOGARTY,0.093513,0.083862,0.093333,0
CARL DAHLSTROM,0.147828,0.132399,0.146821,0
WILLIAM LOCKWOOD,1.449263,1.836746,1.711976,0


<br>

### Summary

In [39]:
print('League-level:')
print('The Logistic Regression overestimated the true number of goals by:',lr_without_na_league_eval)
print('The Gradient Boosting Classifier overestimated the true number of goals by:',gbc_without_na_league_eval)
print('The Hist Gradient Boosting Classifier without NaN overestimated the true number of goals by:',hgbc_without_na_league_eval)
print('The Hist Gradient Boosting Classifier with NaN overestimated the true number of goals by:',hgbc_with_na_league_eval)
print('\n')
print('Team-level:')
print('The Logistic Regression had a MAPE of team goals of:',lr_team_eval_mape)
print('The Gradient Boosting Classifier had a MAPE of team goals of:',gbc_team_eval_mape)
print('The Hist Gradient Boosting Classifier without NaN had a MAPE of team goals of:',hgbc_without_na_team_eval_mape)
print('The Hist Gradient Boosting Classifier with NaN had a MAPE of team goals of:',hgbc_with_na_team_eval_mape)
print('\n')
print('Player-level:')
print('The Logistic Regression had a RMSE of player goals of:',lr_skater_eval_rmse)
print('The Gradient Boosting Classifier had a RMSE of player goals of:',gbc_skater_eval_rmse)
print('The Hist Gradient Boosting Classifier without NaN had RMSE of player goals of:',hgbc_without_na_skater_eval_rmse)
print('The Hist Gradient Boosting Classifier with NaN had a RMSE of player goals of:',hgbc_with_na_skater_eval_rmse)

League-level:
The Logistic Regression overestimated the true number of goals by: -395.21438019911784
The Gradient Boosting Classifier overestimated the true number of goals by: -244.72753987145916
The Hist Gradient Boosting Classifier without NaN overestimated the true number of goals by: -313.433411758223
The Hist Gradient Boosting Classifier with NaN overestimated the true number of goals by: -323.811214283106


Team-level:
The Logistic Regression had a MAPE of team goals of: 0.07144009872803538
The Gradient Boosting Classifier had a MAPE of team goals of: 0.07394243357024313
The Hist Gradient Boosting Classifier without NaN had a MAPE of team goals of: 0.07208369145750898
The Hist Gradient Boosting Classifier with NaN had a MAPE of team goals of: 0.07406086274200116


Player-level:
The Logistic Regression had a RMSE of player goals of: 4.28344522064702
The Gradient Boosting Classifier had a RMSE of player goals of: 4.078645690773094
The Hist Gradient Boosting Classifier without NaN 