# xG Model with Seconds and Period

This model should improve on the first attempt and give a full evaluation of the 2021-22 NHL season. From the first attempt, the best models to use are a `Logistic Regression` or a `Gradient Boosting Classifier`. This script will focus on engineering and testing the best features to use. Of particular interest is investigating the importance of including time elapsed and period as well as coding a method for identifying rebound shots and change of angle on those rebound shots. 

In [1]:
import pandas as pd
import hockey_scraper
import numpy as np
import math
import seaborn as sns
from tqdm.notebook import tqdm # This displays a loading bar for monitoring progress of for loops

In [2]:
pbp2022 = pd.read_csv('pbp2022.csv')
pbp2022

Unnamed: 0,Game_Id,Date,Period,Event,Description,Time_Elapsed,Seconds_Elapsed,Strength,Ev_Zone,Type,...,Away_Score,Home_Score,Away_Goalie,Away_Goalie_Id,Home_Goalie,Home_Goalie_Id,xC,yC,Home_Coach,Away_Coach
0,20001,2022-10-07,1,PSTR,Period Start- Local time: 8:14 CEST,0:00,0,5x5,,,...,0,0,JAMES REIMER,8473503.0,JUUSE SAROS,8477424.0,0.0,0.0,John Hynes,David Quinn
1,20001,2022-10-07,1,FAC,NSH won Neu. Zone - SJS #48 HERTL vs NSH #64 G...,0:00,0,5x5,Neu,,...,0,0,JAMES REIMER,8473503.0,JUUSE SAROS,8477424.0,,,John Hynes,David Quinn
2,20001,2022-10-07,1,HIT,"SJS #11 KUNIN HIT NSH #59 JOSI, Def. Zone",0:11,11,5x5,Def,,...,0,0,JAMES REIMER,8473503.0,JUUSE SAROS,8477424.0,-31.0,-36.0,John Hynes,David Quinn
3,20001,2022-10-07,1,HIT,"SJS #4 HARRINGTON HIT NSH #95 DUCHENE, Def. Zone",0:17,17,5x5,Def,,...,0,0,JAMES REIMER,8473503.0,JUUSE SAROS,8477424.0,-80.0,37.0,John Hynes,David Quinn
4,20001,2022-10-07,1,SHOT,"SJS ONGOAL - #28 MEIER, Wrist, Off. Zone, 45 ft.",0:23,23,5x5,Off,WRIST SHOT,...,0,0,JAMES REIMER,8473503.0,JUUSE SAROS,8477424.0,44.0,8.0,John Hynes,David Quinn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
408865,20166,2022-11-03,3,STOP,ICING,19:41,1181,5x5,,,...,5,2,ILYA SOROKIN,8478009.0,JORDAN BINNINGTON,8476412.0,,,Craig Berube,Lane Lambert
408866,20166,2022-11-03,3,FAC,NYI won Off. Zone - NYI #53 CIZIKAS vs STL #90...,19:41,1181,5x5,Off,,...,5,2,ILYA SOROKIN,8478009.0,JORDAN BINNINGTON,8476412.0,-69.0,-22.0,Craig Berube,Lane Lambert
408867,20166,2022-11-03,3,MISS,"NYI #17 MARTIN, Tip-In, Over Net, Off. Zone, 1...",19:56,1196,5x5,Off,TIP-IN,...,5,2,ILYA SOROKIN,8478009.0,JORDAN BINNINGTON,8476412.0,-75.0,-7.0,Craig Berube,Lane Lambert
408868,20166,2022-11-03,3,PEND,Period End- Local time: 9:41 CDT,20:00,1200,5x5,,,...,5,2,ILYA SOROKIN,8478009.0,JORDAN BINNINGTON,8476412.0,,,Craig Berube,Lane Lambert


First, the same process of filtering shots as in *Attempt 1*. Shots with NaN shot coordinates will be removed from the dataset. There are 126 shots with NaN coordinates so this should not have a great effect on the dataset.

In [3]:
# # Selecting only regular season games
# shots2022 = pbp2022.loc[pbp2022.Game_Id < 30000].sort_values(by=['Game_Id','Period'],ascending=[True,True]).reset_index(drop=True)
# # Selecting only shots from regulation and OT
# shots2022 = shots2022.loc[(shots2022.Period == 1) | (shots2022.Period == 2) | (shots2022.Period == 3) | (shots2022.Period == 4)]
# # Selecting shot events
# shots2022 = shots2022.loc[(shots2022.Event == 'SHOT') | (shots2022.Event == 'MISS') | (shots2022.Event == 'BLOCK') | (shots2022.Event == 'GOAL')]
# # Removing penalty shots
# shots2022 = shots2022.drop(index = shots2022.loc[shots2022.Strength == '0x0'].index).reset_index(drop=True)

# # Removing shots with NaN shot coordinates (126 shots)
# array = []

# for i in range(len(shots2022)):
#     if math.isnan(shots2022.at[i,'xC']) == True:
#         array.append(i)
        
# shots2022 = shots2022.drop(index = array).reset_index(drop = True)
# shots2022

Then the `Distance`, `Shot_Angle`, and `Score_Diff` categories will be calculated. To achieve this, functions from *Attempt 1* will be copied here.

In [4]:
def get_net_locations(df,game_id):

    temp = df.loc[(df.Game_Id == game_id) & (df.Event == 'SHOT')].reset_index(drop=True)
    for i in range(len(temp)):
        if (temp.at[i,'Ev_Zone'] == 'Neu') | (temp.at[i,'Ev_Zone'] == 'Def'):
            continue
        if temp.at[i,'Ev_Zone'] == 'Off':
            if temp.at[i,'Ev_Team'] == temp.at[i,'Away_Team']:
                if temp.at[i,'xC'] > 0:
                    home_short_change_net_coord = (89,0)
                    away_short_change_net_coord = (-89,0)
                    home_long_change_net_coord = (-89,0)
                    away_long_change_net_coord = (89,0)
                    break
                if temp.at[i,'xC'] < 0:
                    home_short_change_net_coord = (-89,0)
                    away_short_change_net_coord = (89,0)
                    home_long_change_net_coord = (89,0)
                    away_long_change_net_coord = (-89,0)
                    break
            if temp.at[i,'Ev_Team'] == temp.at[i,'Home_Team']:
                if temp.at[i,'xC'] > 0:
                    home_short_change_net_coord = (-89,0)
                    away_short_change_net_coord = (89,0)
                    home_long_change_net_coord = (89,0)
                    away_long_change_net_coord = (-89,0)
                    break
                if temp.at[i,'xC'] < 0:
                    home_short_change_net_coord = (89,0)
                    away_short_change_net_coord = (-89,0)
                    home_long_change_net_coord = (-89,0)
                    away_long_change_net_coord = (89,0)
                    break
                    
    return home_short_change_net_coord,away_short_change_net_coord,home_long_change_net_coord,away_long_change_net_coord

In [5]:
def get_shot_distances(df,game_id, period, team, away_team, home_team, x_coord, y_coord):
    
    hscn_coord, ascn_coord, hlcn_coord, alcn_coord = get_net_locations(df,game_id)
    
    if (period == 1) | (period == 3):
        if team == away_team:
            distance = math.sqrt((hscn_coord[0]-x_coord)**2 + (hscn_coord[1]-y_coord)**2) # yC for the net is always zero
        if team == home_team:
            distance = math.sqrt((ascn_coord[0]-x_coord)**2 + (ascn_coord[1]-y_coord)**2) # yC for the net is always zero
            
    if (period == 2) | (period == 4):
        if team == away_team:
            distance = math.sqrt((hlcn_coord[0]-x_coord)**2 + (hlcn_coord[1]-y_coord)**2) # yC for the net is always zero
        if team == home_team:
            distance = math.sqrt((alcn_coord[0]-x_coord)**2 + (alcn_coord[1]-y_coord)**2) # yC for the net is always zero
            
    return distance

In [6]:
def get_shot_angles(df,game_id, period, team, away_team, home_team, x_coord, y_coord):
    
    hscn_coord, ascn_coord, hlcn_coord, alcn_coord = get_net_locations(df,game_id)
    
    if (period == 1) | (period == 3):
        if hscn_coord[0] > 0:
            if team == away_team:
                if x_coord == 89: # To deal with divide by zero issues
                    angle = 90
                else:
                    angle = math.degrees(math.atan(abs(y_coord)/(hscn_coord[0]-x_coord))) 
            if team == home_team:
                if x_coord == -89: # To deal with divide by zero issues
                    angle = 90
                else:
                    angle = math.degrees(math.atan(abs(y_coord)/(x_coord-ascn_coord[0])))
        else:
            if team == away_team:
                if x_coord == -89: # To deal with divide by zero issues
                    angle = 90
                else:
                    angle = math.degrees(math.atan(abs(y_coord)/(x_coord-hscn_coord[0]))) 
            if team == home_team:
                if x_coord == 89: # To deal with divide by zero issues
                    angle = 90
                else:    
                    angle = math.degrees(math.atan(abs(y_coord)/(ascn_coord[0]-x_coord)))
            
    if (period == 2) | (period == 4):
        if hlcn_coord[0] > 0:
            if team == away_team:
                if x_coord == 89: # To deal with divide by zero issues
                    angle = 90
                else:
                    angle = math.degrees(math.atan(abs(y_coord)/(hlcn_coord[0]-x_coord))) 
            if team == home_team:
                if x_coord == -89: # To deal with divide by zero issues
                    angle = 90
                else:
                    angle = math.degrees(math.atan(abs(y_coord)/(x_coord-alcn_coord[0])))
        else:
            if team == away_team:
                if x_coord == -89: # To deal with divide by zero issues
                    angle = 90
                else:
                    angle = math.degrees(math.atan(abs(y_coord)/(x_coord-hlcn_coord[0]))) 
            if team == home_team:
                if x_coord == 89: # To deal with divide by zero issues
                    angle = 90
                else:
                    angle = math.degrees(math.atan(abs(y_coord)/(alcn_coord[0]-x_coord)))
            
    return angle

In [7]:
def get_score_diff(team, away_team, home_team, away_score, home_score):
    
    if team == home_team:
        if abs(home_score-away_score) >= 3:
            if home_score > away_score:
                diff = '3+'
            else:
                diff = '-3-'
        else:
            diff = str(home_score-away_score)
            
    if team == away_team:
        if abs(away_score-home_score) >= 3:
            if away_score > home_score:
                diff = '3+'
            else:
                diff = '-3-'
        else:
            diff = str(away_score-home_score)
                       
    return diff

The calculations are below.

In [8]:
# for i in tqdm(range(len(shots2022))):
#     shots2022.at[i,'Distance'] = get_shot_distances(shots2022,shots2022.at[i,'Game_Id'],shots2022.at[i,'Period'],
#                                                     shots2022.at[i,'Ev_Team'],shots2022.at[i,'Away_Team'],
#                                                     shots2022.at[i,'Home_Team'],shots2022.at[i,'xC'],shots2022.at[i,'yC'])
#     shots2022.at[i,'Shot_Angle'] = get_shot_angles(shots2022,shots2022.at[i,'Game_Id'],shots2022.at[i,'Period'],
#                                                    shots2022.at[i,'Ev_Team'],shots2022.at[i,'Away_Team'],
#                                                    shots2022.at[i,'Home_Team'],shots2022.at[i,'xC'],shots2022.at[i,'yC'])
#     shots2022.at[i,'Score_Diff'] = get_score_diff(shots2022.at[i,'Ev_Team'],shots2022.at[i,'Away_Team'],
#                                                   shots2022.at[i,'Home_Team'],shots2022.at[i,'Away_Score'],
#                                                   shots2022.at[i,'Home_Score'])
    
# shots2022

Then, rebound shots need to be identified. **A rebound will be a shot that occurs after another shot within three seconds.** Some studies corroborate this assumption such as this article: https://towardsdatascience.com/nhl-analytics-shots-rebounds-and-weak-signals-c293ba8c635f. 

The function below takes a play-by-play dataset and returns a binary value of whether the shot is a rebound where a rebound shot is represented as a 1 and a non-rebound shot is represented as a 0.

In [9]:
def get_rebound_status(df,diff):
    
    shot_events = ['SHOT','GOAL','BLOCK','MISS']
    df.at[0,'Is_Rebound'] = 0
    
    for i in tqdm(range(1,len(df))):
        if df.at[i,'Event'] in shot_events:
            if df.at[i-1,'Event'] in shot_events:
                if df.at[i,'Seconds_Elapsed']-df.at[i-1,'Seconds_Elapsed'] <= diff:
                    df.at[i,'Is_Rebound'] = 1
                else:
                    df.at[i,'Is_Rebound'] = 0
            else:
                df.at[i,'Is_Rebound'] = 0
                
    return df

Now to get the change of angle for each rebound. 

In [10]:
def get_change_of_angle(df):
    
    for i in tqdm(range(len(df.index))):
        if df.at[i,'Is_Rebound'] == 1:
            if (df.at[i-1,'yC'] > 0) & (df.at[i,'yC'] > 0):
                df.at[i,'Change_of_Angle'] = abs(df.at[i,'Shot_Angle']-df.at[i-1,'Shot_Angle'])
            if (df.at[i-1,'yC'] < 0) & (df.at[i,'yC'] < 0):
                df.at[i,'Change_of_Angle'] = abs(df.at[i,'Shot_Angle']-df.at[i-1,'Shot_Angle'])
            if ((df.at[i-1,'yC'] < 0) & (df.at[i,'yC'] > 0)) | ((df.at[i-1,'yC'] > 0) & (df.at[i,'yC'] < 0)):
                df.at[i,'Change_of_Angle'] = df.at[i,'Shot_Angle']+df.at[i-1,'Shot_Angle']
                
    return df

In [11]:
# shots2022 = get_rebound_status(shots2022,3)
# shots2022 = get_change_of_angle(shots2022)
# shots2022

In [12]:
# Getting rid of NaN shot types. 

# for i in tqdm(range(len(shots2022))):
#     if pd.isnull(shots2022.at[i,'Type']) == True:
#         string1 = shots2022.at[i,'Description'].split(',')[1].lstrip().upper()
#         if (string1 == 'OFF. ZONE') | (string1 == 'DEF. ZONE'):
#             shots2022.at[i,'Type'] = 'WRIST SHOT'
#         else:
#             shots2022.at[i,'Type'] = string1
            
# set(shots2022.Type)

Again, the above dataset will be saved in a CSV file so I don't have to redo these calculations every single time.

In [13]:
# shots2022.to_csv('shots2022.csv')

<br>

## ML Workflow

From *Attempt 1*, the best models to use are `Logistic Regression` and `Gradient Boosting Classifier`. Now there are an addifition al two features that I want to add into the models. The goal for this model is to train both models and test which performs better with the additional features as well as test removing certain features such as `Seconds_Elapsed`. 

In [14]:
import matplotlib.pyplot as plt
from sklearn.compose import make_column_transformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import cross_val_predict
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression, Ridge, LinearRegression, RidgeCV
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
    ConfusionMatrixDisplay,
    precision_recall_curve,
    average_precision_score,
    roc_curve,
    roc_auc_score,
    mean_absolute_percentage_error,
    mean_squared_error,
    log_loss,
)
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [15]:
shots2022 = pd.read_csv('shots2022.csv')

In [16]:
# Creating target variable column

for i in range(len(shots2022)):
    if shots2022.at[i,'Event'] == 'GOAL':
        shots2022.at[i,'Target'] = 1
    else:
        shots2022.at[i,'Target'] = 0
        
shots2022

Unnamed: 0,Game_Id,Date,Period,Event,Description,Time_Elapsed,Seconds_Elapsed,Strength,Ev_Zone,Type,...,xC,yC,Home_Coach,Away_Coach,Distance,Shot_Angle,Score_Diff,Is_Rebound,Change_of_Angle,Target
0,20001,2022-10-07,1,SHOT,"SJS ONGOAL - #28 MEIER, Wrist, Off. Zone, 45 ft.",0:23,23,5x5,Off,WRIST SHOT,...,44,8,John Hynes,David Quinn,45.705580,10.080598,0,0,,0.0
1,20001,2022-10-07,1,MISS,"SJS #44 VLASIC, Wrist, Wide of Net, Off. Zone,...",0:36,36,5x5,Off,WRIST SHOT,...,44,27,John Hynes,David Quinn,52.478567,30.963757,0,0,,0.0
2,20001,2022-10-07,1,BLOCK,"NSH #27 MCDONAGH BLOCKED BY SJS #62 LABANC, W...",0:56,56,5x5,Def,WRIST SHOT,...,-55,3,John Hynes,David Quinn,34.132096,5.042451,0,0,,0.0
3,20001,2022-10-07,1,SHOT,"NSH ONGOAL - #14 EKHOLM, Slap, Off. Zone, 56 ft.",0:59,59,5x5,Off,SLAP SHOT,...,-33,8,John Hynes,David Quinn,56.568542,8.130102,0,1,3.087651,0.0
4,20001,2022-10-07,1,GOAL,"NSH #44 SHERWOOD(1), Wrist, Off. Zone, 15 ft.A...",1:01,61,5x5,Off,WRIST SHOT,...,-74,-5,John Hynes,David Quinn,15.811388,18.434949,0,1,26.565051,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152504,21312,2023-04-13,3,SHOT,"SEA ONGOAL - #67 GEEKIE, Wrist, Off. Zone, 10 ft.",17:14,1034,5x5,Off,WRIST SHOT,...,81,7,Dave Hakstol,Bruce Cassidy,10.630146,41.185925,-1,1,10.535257,0.0
152505,21312,2023-04-13,3,MISS,"VGK #9 EICHEL, Wrist, Wide of Net, Def. Zone, ...",18:33,1113,5x5,Def,WRIST SHOT,...,55,41,Dave Hakstol,Bruce Cassidy,149.723078,15.892831,1,0,,0.0
152506,21312,2023-04-13,3,BLOCK,"SEA #4 SCHULTZ BLOCKED BY VGK #3 MCNABB, Wris...",18:43,1123,5x5,Def,WRIST SHOT,...,75,-1,Dave Hakstol,Bruce Cassidy,14.035669,4.085617,-1,0,,0.0
152507,21312,2023-04-13,3,GOAL,"VGK #20 STEPHENSON(16), Poke, Def. Zone, 137 ft.",19:22,1162,5x5,Def,POKE,...,47,19,Dave Hakstol,Bruce Cassidy,137.320792,7.953082,1,0,,1.0


In [17]:
train_df, test_df = train_test_split(shots2022, test_size=0.25, random_state=17)

## Model That Includes Seconds Elapsed and Period

In [18]:
model_columns = ['Period','Seconds_Elapsed','Strength','Type','Distance','Shot_Angle','Score_Diff',
                 'Is_Rebound','Change_of_Angle']
eval_columns = ['Game_Id','Date','Period','Event','Description','Time_Elapsed','Ev_Zone','Ev_Team','Home_Zone','Away_Team',
                'Home_Team','p1_name','p1_ID','p2_name','p2_ID','p3_name','p3_ID','awayPlayer1','awayPlayer1_id',
                'awayPlayer2','awayPlayer2_id','awayPlayer3','awayPlayer3_id','awayPlayer4','awayPlayer4_id','awayPlayer5',
                'awayPlayer5_id','awayPlayer6','awayPlayer6_id','homePlayer1','homePlayer1_id','homePlayer2',
                'homePlayer2_id','homePlayer3','homePlayer3_id','homePlayer4','homePlayer4_id','homePlayer5',
                'homePlayer5_id','homePlayer6','homePlayer6_id','Away_Players','Home_Players','Away_Score','Home_Score',
                'Away_Goalie','Away_Goalie_Id','Home_Goalie','Home_Goalie_Id','xC','yC','Home_Coach','Away_Coach']

In [19]:
# Reset the y index since the index is reset during column transformations

X_train = train_df[model_columns]
X_train_eval = train_df[eval_columns]
y_train = train_df['Target'].reset_index(drop=True)


X_test = test_df[model_columns]
X_test_eval = test_df[eval_columns]
y_test = test_df['Target'].reset_index(drop=True)

In [20]:
# Choosing feature types

numeric_feats = ['Seconds_Elapsed','Distance','Shot_Angle','Change_of_Angle']  # apply scaling
categorical_feats = ['Period','Strength','Type','Score_Diff','Is_Rebound']  # apply one-hot encoding

In [21]:
# Column Transformation

ct = make_column_transformer(
    (StandardScaler(), numeric_feats),  # scaling on numeric features
    (OneHotEncoder(sparse_output=False), categorical_feats), # one-hot encoding on categorical features
)

# Train Data

transformed = ct.fit_transform(X_train)

column_names = (numeric_feats+ct.named_transformers_['onehotencoder'].get_feature_names_out().tolist())

transformed_train_df = pd.DataFrame(transformed, columns=column_names)
transformed_train_df

Unnamed: 0,Seconds_Elapsed,Distance,Shot_Angle,Change_of_Angle,Period_1,Period_2,Period_3,Period_4,Strength_3x3,Strength_3x4,...,Type_WRIST SHOT,Score_Diff_-1,Score_Diff_-2,Score_Diff_-3-,Score_Diff_0,Score_Diff_1,Score_Diff_2,Score_Diff_3+,Is_Rebound_0,Is_Rebound_1
0,0.324808,1.241468,-0.083752,,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.051682,-0.178112,-0.824571,,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,1.080412,0.475877,0.180684,,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.092094,0.744140,-0.255019,,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,-0.430796,0.364910,-0.706403,,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114376,-0.040065,-0.592175,1.388798,,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
114377,-1.643210,3.136045,-0.551975,,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
114378,-1.094463,-0.973507,-1.028666,,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
114379,-0.577320,-0.520808,0.879279,,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [22]:
imp = SimpleImputer(strategy='constant',fill_value=0)
imp.fit(transformed_train_df)
test = imp.transform(transformed_train_df)
df = pd.DataFrame(test)
transformed_train_df['Change_of_Angle'] = df[3]

In [23]:
# Test Data

transformed = ct.fit_transform(X_test)

column_names = (numeric_feats+ct.named_transformers_['onehotencoder'].get_feature_names_out().tolist())

transformed_test_df = pd.DataFrame(transformed, columns=column_names)
transformed_test_df

Unnamed: 0,Seconds_Elapsed,Distance,Shot_Angle,Change_of_Angle,Period_1,Period_2,Period_3,Period_4,Strength_3x3,Strength_3x4,...,Type_WRIST SHOT,Score_Diff_-1,Score_Diff_-2,Score_Diff_-3-,Score_Diff_0,Score_Diff_1,Score_Diff_2,Score_Diff_3+,Is_Rebound_0,Is_Rebound_1
0,-0.017593,0.123974,-0.294521,,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.980770,0.233246,-1.215393,,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.243473,0.212786,0.638550,,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.530359,-0.679193,-0.485540,,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,-0.872513,0.535155,-1.167614,,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38123,1.399624,-0.665240,1.913587,,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
38124,1.405361,-0.325514,-1.284821,,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
38125,0.645113,-0.826101,-0.686323,,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
38126,-1.231121,1.945798,0.059939,,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [24]:
imp = SimpleImputer(strategy='constant',fill_value=0)
imp.fit(transformed_test_df)
test = imp.transform(transformed_test_df)
df = pd.DataFrame(test)
transformed_test_df['Change_of_Angle'] = df[3]

In [25]:
transformed_test_df['Strength_5x6'] = 0
transformed_test_df['Strength_6x5'] = 0
transformed_test_df = transformed_test_df[transformed_train_df.columns]
transformed_test_df.columns

Index(['Seconds_Elapsed', 'Distance', 'Shot_Angle', 'Change_of_Angle',
       'Period_1', 'Period_2', 'Period_3', 'Period_4', 'Strength_3x3',
       'Strength_3x4', 'Strength_3x5', 'Strength_4x3', 'Strength_4x4',
       'Strength_4x5', 'Strength_5x3', 'Strength_5x4', 'Strength_5x5',
       'Strength_5x6', 'Strength_6x5', 'Type_BACKHAND', 'Type_BAT',
       'Type_BETWEEN LEGS', 'Type_CRADLE', 'Type_DEFLECTED', 'Type_POKE',
       'Type_SLAP SHOT', 'Type_SNAP SHOT', 'Type_TIP-IN', 'Type_WRAP-AROUND',
       'Type_WRIST SHOT', 'Score_Diff_-1', 'Score_Diff_-2', 'Score_Diff_-3-',
       'Score_Diff_0', 'Score_Diff_1', 'Score_Diff_2', 'Score_Diff_3+',
       'Is_Rebound_0', 'Is_Rebound_1'],
      dtype='object')

<br>

### Metrics

From *Attempt 1*, I now know that accuracy and f1 score are not as descriptive towards the performance of the model as log loss. So log loss will be the metric I use to evaluate model performance. The function that follows will be used in the `RandomizedSearchCV`.

In addition, RMSE and MAPE will be used to evaluate the model season-long predictions.

In [26]:
def my_custom_loss_func(y_true, y_pred):
    return log_loss(y_true, y_pred)
custom_logloss = make_scorer(my_custom_loss_func, greater_is_better=False)

<br>

### Attempt 0: Dummy Classifier

This is to provide a baseline for model comparison.

In [27]:
goals = sum(shots2022['Target'])
shots = len(shots2022['Target'])

dummy = DummyClassifier(strategy='stratified')
dummy.fit(transformed_train_df, y_train)

predictions = dummy.predict(transformed_train_df)

y_pred_dummy = []
for i in range(len(y_train)):
    y_pred_dummy.append(goals/shots)
    
logloss = log_loss(y_train, y_pred_dummy)

print('The log loss of the Dummy Classifier is',logloss)

The log loss of the Dummy Classifier is 0.2101393853741702


<br>

### Attempt 1: Logistic Regression

After the imputation in the `Change_of_Angle` feature, I don't know whether that will affect the fitting of a Logistic Regression as a linear model. 

In [28]:
lr = LogisticRegression(max_iter=1000)
lr.fit(transformed_train_df, y_train)

logloss = log_loss(y_train, lr.predict_proba(transformed_train_df))

print('The log loss of the Logistic Regression is',logloss)

The log loss of the Logistic Regression is 0.20061750515304427


In [29]:
%%time

# Hyperparameter Optimization

# param_grid = {
#     "max_iter": [100, 1000, 10000],
#     "C": [0.01, 0.1, 1, 10, 100, 1000]
# }
# random_search = RandomizedSearchCV(
#     lr, param_distributions=param_grid, n_jobs=-1, n_iter=17, scoring=custom_logloss,cv=5, random_state=17
# )
# random_search.fit(transformed_train_df, y_train)

# random_search.best_params_

CPU times: total: 0 ns
Wall time: 0 ns


In [30]:
# The best params are input below

lr = LogisticRegression(max_iter=1000,C=100)
lr.fit(transformed_test_df, y_test)

logloss = log_loss(y_test, lr.predict_proba(transformed_test_df))

print('The log loss of the optimized Logistic Regression is',logloss)

The log loss of the optimized Logistic Regression is 0.1990673071323891


<br>

### Attempt 2: Gradient Boosting Classifier

In [31]:
gbc = GradientBoostingClassifier(loss='log_loss')
gbc.fit(transformed_train_df, y_train)

logloss = log_loss(y_train, gbc.predict_proba(transformed_train_df))

print('The log loss of the Gradient Boosting Classifier is',logloss)

The log loss of the Gradient Boosting Classifier is 0.1914460990264785


In [32]:
%%time

# Hyperparameter Optimization

# param_grid = {
#     "n_estimators": [50,100,200,250],
#     "criterion": ['friedman_mse','squared_error'],
#     "max_depth": [3,5,7,10],
#     "max_features": [None, 'sqrt'],
#     "min_samples_split": [2,5,7,10]
# }
# random_search = RandomizedSearchCV(
#     gbc, param_distributions=param_grid, n_jobs=-1, scoring=custom_logloss, n_iter=17, cv=5, random_state=17
# )
# random_search.fit(transformed_train_df, y_train)

# random_search.best_params_

CPU times: total: 0 ns
Wall time: 0 ns


In [33]:
gbc = GradientBoostingClassifier(loss='log_loss',n_estimators=200,min_samples_split=10,max_features='sqrt',
                                 max_depth=3,criterion='friedman_mse')
gbc.fit(transformed_test_df, y_test)

logloss = log_loss(y_test, gbc.predict_proba(transformed_test_df))

print('The log loss of the optimized Gradient Boosting Classifier is',logloss)

The log loss of the optimized Gradient Boosting Classifier is 0.18732924559249498


<br>

### Attempt 3: Hist Gradient Boosting Classifier

This is a (supposedly) faster GBC than can natively deal with NaN values. Perhaps it is a good option. I will pass this through with and without the imputed `Change of Angle` value and compare performace.

In [34]:
# With imputed values

no_nan_hgbc = HistGradientBoostingClassifier(loss='log_loss')
no_nan_hgbc.fit(transformed_train_df, y_train)

logloss = log_loss(y_train, no_nan_hgbc.predict_proba(transformed_train_df))

print('The log loss of the Hist Gradient Boosting Classifier with imputed values is',logloss)

The log loss of the Hist Gradient Boosting Classifier with imputed values is 0.18422960237481445


In [35]:
%%time

# Hyperparameter Optimization

# param_grid = {
#     "max_iter": [50,100,250,500],
#     "max_depth": [None,3,5,7,10],
#     "min_samples_leaf": [10,15,20,25,30]
# }
# random_search = RandomizedSearchCV(
#     hgbc, param_distributions=param_grid, n_jobs=-1, scoring=custom_logloss, n_iter=17, cv=5, random_state=17
# )
# random_search.fit(transformed_train_df, y_train)

# random_search.best_params_

CPU times: total: 0 ns
Wall time: 0 ns


In [36]:
no_nan_hgbc = HistGradientBoostingClassifier(loss='log_loss',min_samples_leaf=25,max_iter=500,max_depth=None)
no_nan_hgbc.fit(transformed_test_df, y_test)

logloss = log_loss(y_test, no_nan_hgbc.predict_proba(transformed_test_df))

print('The log loss of the optimized Hist Gradient Boosting Classifier with imputed values is',logloss)

The log loss of the optimized Hist Gradient Boosting Classifier with imputed values is 0.18010798711813544


Now to try it with the NaN values.

In [37]:
transformed = ct.fit_transform(X_train)

column_names = (numeric_feats+ct.named_transformers_['onehotencoder'].get_feature_names_out().tolist())

train_no_impute = pd.DataFrame(transformed, columns=column_names)
train_no_impute

Unnamed: 0,Seconds_Elapsed,Distance,Shot_Angle,Change_of_Angle,Period_1,Period_2,Period_3,Period_4,Strength_3x3,Strength_3x4,...,Type_WRIST SHOT,Score_Diff_-1,Score_Diff_-2,Score_Diff_-3-,Score_Diff_0,Score_Diff_1,Score_Diff_2,Score_Diff_3+,Is_Rebound_0,Is_Rebound_1
0,0.324808,1.241468,-0.083752,,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.051682,-0.178112,-0.824571,,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,1.080412,0.475877,0.180684,,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.092094,0.744140,-0.255019,,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,-0.430796,0.364910,-0.706403,,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114376,-0.040065,-0.592175,1.388798,,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
114377,-1.643210,3.136045,-0.551975,,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
114378,-1.094463,-0.973507,-1.028666,,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
114379,-0.577320,-0.520808,0.879279,,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [38]:
transformed = ct.fit_transform(X_test)

column_names = (numeric_feats+ct.named_transformers_['onehotencoder'].get_feature_names_out().tolist())

test_no_impute = pd.DataFrame(transformed, columns=column_names)

In [39]:
# With imputed values

with_nan_hgbc = HistGradientBoostingClassifier(loss='log_loss')
with_nan_hgbc.fit(train_no_impute, y_train)

logloss = log_loss(y_train, with_nan_hgbc.predict_proba(train_no_impute))

print('The log loss of the Hist Gradient Boosting Classifier without imputed values is',logloss)

The log loss of the Hist Gradient Boosting Classifier without imputed values is 0.18449813429095796


In [40]:
%%time

# Hyperparameter Optimization

# param_grid = {
#     "max_iter": [50,100,250,500],
#     "max_depth": [None,3,5,7,10],
#     "min_samples_leaf": [10,15,20,25,30]
# }
# random_search = RandomizedSearchCV(
#     hgbc, param_distributions=param_grid, n_jobs=-1, scoring=custom_logloss, n_iter=17, cv=5, random_state=17
# )
# random_search.fit(train_no_impute, y_train)

# random_search.best_params_

CPU times: total: 0 ns
Wall time: 0 ns


In [41]:
# With imputed values

hgbc = HistGradientBoostingClassifier(loss='log_loss',min_samples_leaf=25,max_iter=500,max_depth=None)
hgbc.fit(test_no_impute, y_test)

logloss = log_loss(y_test, hgbc.predict_proba(test_no_impute))

print('The log loss of the Hist Gradient Boosting Classifier without imputed values is',logloss)

The log loss of the Hist Gradient Boosting Classifier without imputed values is 0.17635375366844508


The performance of the two Hist Gradient Boosting Classifiers are very similar but the `HGBC` without imputed values seems to perform just a little better.

<br>

## 2021 Season Evaluations

This part will run through the calculations for evaluating the 2021 NHL season on the league, team, and skater level. Goalie-level will be included later since I feel that this is less important than evaluating these three since evaluating goalies is a measurement of GSAx. The following section relates to the model using `Seconds_Elapsed` and `Period` as features. Following the calculations will be a comparison and conclusion. First, the data prep on the 2021-22 season.

In [42]:
shots2021 = pd.read_csv('shots2021.csv')

shots2021 = shots2021.drop(index = shots2021.loc[shots2021.Strength == '2x5'].index)
shots2021 = shots2021.drop(index = shots2021.loc[shots2021.Strength == '5x1'].index)
shots2021 = shots2021.drop(index = shots2021.loc[shots2021.Strength == '6x4'].index).reset_index(drop=True)

In [43]:
shots2021.at[1,'Type'] = 'BAT'
shots2021.at[4,'Type'] = 'POKE'
shots2021.at[148349,'Type'] = 'BETWEEN LEGS'
shots2021.at[148352,'Type'] = 'CRADLE'

In [44]:
y_2021 = []

for i in range(len(shots2021)):
    if shots2021.at[i,'Event'] == 'GOAL':
        y_2021.append(1)
    else:
        y_2021.append(0)

In [45]:
model_columns = ['Period','Seconds_Elapsed','Strength','Type','Distance','Shot_Angle','Score_Diff',
                 'Is_Rebound','Change_of_Angle']
eval_columns = ['Game_Id','Date','Period','Event','Description','Time_Elapsed','Ev_Zone','Ev_Team','Home_Zone','Away_Team',
                'Home_Team','p1_name','p1_ID','p2_name','p2_ID','p3_name','p3_ID','awayPlayer1','awayPlayer1_id',
                'awayPlayer2','awayPlayer2_id','awayPlayer3','awayPlayer3_id','awayPlayer4','awayPlayer4_id','awayPlayer5',
                'awayPlayer5_id','awayPlayer6','awayPlayer6_id','homePlayer1','homePlayer1_id','homePlayer2',
                'homePlayer2_id','homePlayer3','homePlayer3_id','homePlayer4','homePlayer4_id','homePlayer5',
                'homePlayer5_id','homePlayer6','homePlayer6_id','Away_Players','Home_Players','Away_Score','Home_Score',
                'Away_Goalie','Away_Goalie_Id','Home_Goalie','Home_Goalie_Id','xC','yC','Home_Coach','Away_Coach']

In [46]:
model_2021 = shots2021[model_columns]
eval_2021 = shots2021[eval_columns]

In [47]:
transformed = ct.fit_transform(model_2021)

column_names = (numeric_feats+ct.named_transformers_['onehotencoder'].get_feature_names_out().tolist())

transformed_2021 = pd.DataFrame(transformed, columns=column_names)
transformed_2021

Unnamed: 0,Seconds_Elapsed,Distance,Shot_Angle,Change_of_Angle,Period_1,Period_2,Period_3,Period_4,Strength_3x3,Strength_3x4,...,Type_WRIST SHOT,Score_Diff_-1,Score_Diff_-2,Score_Diff_-3-,Score_Diff_0,Score_Diff_1,Score_Diff_2,Score_Diff_3+,Is_Rebound_0,Is_Rebound_1
0,-1.543086,0.419960,1.027750,,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,-1.479842,0.028532,0.134528,,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,-1.425223,-0.103555,0.521585,,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,-1.376354,2.976722,-0.583762,,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,-1.244118,-0.162275,-0.956908,,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148356,1.489713,-1.132085,1.390944,,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
148357,1.532834,-1.043803,-0.730368,,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
148358,1.575954,2.266421,-0.079650,,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
148359,1.708190,0.810276,1.226071,,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [48]:
frame = [transformed_2021,eval_2021]
eval_df = pd.concat(frame,axis=1)
eval_df

Unnamed: 0,Seconds_Elapsed,Distance,Shot_Angle,Change_of_Angle,Period_1,Period_2,Period_3,Period_4,Strength_3x3,Strength_3x4,...,Away_Score,Home_Score,Away_Goalie,Away_Goalie_Id,Home_Goalie,Home_Goalie_Id,xC,yC,Home_Coach,Away_Coach
0,-1.543086,0.419960,1.027750,,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,TRISTAN JARRY,8477465.0,ANDREI VASILEVSKIY,8476883.0,61,-32,Jon Cooper,Mike Sullivan
1,-1.479842,0.028532,0.134528,,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,TRISTAN JARRY,8477465.0,ANDREI VASILEVSKIY,8476883.0,60,-17,Jon Cooper,Mike Sullivan
2,-1.425223,-0.103555,0.521585,,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,TRISTAN JARRY,8477465.0,ANDREI VASILEVSKIY,8476883.0,-65,19,Jon Cooper,Mike Sullivan
3,-1.376354,2.976722,-0.583762,,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,TRISTAN JARRY,8477465.0,ANDREI VASILEVSKIY,8476883.0,-8,-27,Jon Cooper,Mike Sullivan
4,-1.244118,-0.162275,-0.956908,,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,TRISTAN JARRY,8477465.0,ANDREI VASILEVSKIY,8476883.0,-60,-4,Jon Cooper,Mike Sullivan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148356,1.489713,-1.132085,1.390944,,0.0,0.0,1.0,0.0,0.0,0.0,...,0,2,,,CHRIS DRIEDGER,8476904.0,85,6,Dave Hakstol,Bob Boughner
148357,1.532834,-1.043803,-0.730368,,0.0,0.0,1.0,0.0,0.0,0.0,...,0,2,,,CHRIS DRIEDGER,8476904.0,80,2,Dave Hakstol,Bob Boughner
148358,1.575954,2.266421,-0.079650,,0.0,0.0,1.0,0.0,0.0,0.0,...,0,2,,,CHRIS DRIEDGER,8476904.0,13,-37,Dave Hakstol,Bob Boughner
148359,1.708190,0.810276,1.226071,,0.0,0.0,1.0,0.0,0.0,0.0,...,0,3,KAAPO KAHKONEN,8478039.0,CHRIS DRIEDGER,8476904.0,-58,41,Dave Hakstol,Bob Boughner


<br>

#### HGBC with NaN Evaluation

In [49]:
xg_list = []

for i in tqdm(range(len(transformed_2021))):
    xg = with_nan_hgbc.predict_proba(transformed_2021.iloc[[i]])[0,1]
    xg_list.append(xg)

hgbc_with_na_league_eval = sum(xg_list)-sum(y_2021)
print('The HGBC overestimates the number of goals by:',hgbc_with_na_league_eval)

teams = list(set(eval_df.Ev_Team))
df = pd.DataFrame(columns=['Team','xG','GF'])
df['Team'] = teams

for i in tqdm(range(len(teams))):
    team = teams[i]
    xg_list = []
    g_list = []
    for j in range(len(transformed_2021)):
        if eval_df.at[j,'Ev_Team'] == team:
            xg = with_nan_hgbc.predict_proba(transformed_2021.iloc[[j]])[0,1]
            xg_list.append(xg)
            g_list.append(y_2021[j])
    df.at[i,'xG'] = sum(xg_list)
    df.at[i,'GF'] =sum(g_list)
    
hgbc_with_na_team_eval_mape = mean_absolute_percentage_error(df['GF'],df['xG'])
hgbc_with_na_team_eval_rmse = math.sqrt(mean_squared_error(df['GF'],df['xG']))
    
print('The MAPE of xG for all teams is:',hgbc_with_na_team_eval_mape)
print('The RMSE of xG for all teams is:',hgbc_with_na_team_eval_rmse)
df.sort_values(by='Team')

players = list(set(list(set(eval_df.p1_name)) + list(set(eval_df.p2_name))))
df = pd.DataFrame(columns = ['Player','xG','GF'])
df['Player'] = players
df['xG'] = 0
df['GF'] = 0
df = df.set_index('Player')

for i in tqdm(range(len(eval_df))):
    p1 = eval_df.at[i,'p1_name']
    if eval_df.at[i,'Event'] == 'BLOCK':
        p1 = eval_df.at[i,'p2_name']
    xg = with_nan_hgbc.predict_proba(transformed_2021.iloc[[i]])[0,1]
    g = y_2021[i]
    df.at[p1,'xG'] = df.at[p1,'xG'] + xg
    df.at[p1,'GF'] = df.at[p1,'GF'] + g
    
df = df.sort_values(by='xG',ascending=False)

hgbc_with_na_skater_eval_mape = mean_absolute_percentage_error(df['GF'],df['xG'])
hgbc_with_na_skater_eval_rmse = math.sqrt(mean_squared_error(df['GF'],df['xG']))

print('The MAPE of xG for all players is:',hgbc_with_na_skater_eval_mape)
print('The RMSE of xG for all players is:',hgbc_with_na_skater_eval_rmse)
df

  0%|          | 0/148361 [00:00<?, ?it/s]

The HGBC overestimates the number of goals by: -326.66914936354897


  0%|          | 0/32 [00:00<?, ?it/s]

The MAPE of xG for all teams is: 0.07180282085830009
The RMSE of xG for all teams is: 26.422009044588577


  0%|          | 0/148361 [00:00<?, ?it/s]

The MAPE of xG for all players is: 664890862036461.1
The RMSE of xG for all players is: 3.9497752538612554


Unnamed: 0_level_0,xG,GF
Player,Unnamed: 1_level_1,Unnamed: 2_level_1
ALEX OVECHKIN,38.301048,50
CONNOR MCDAVID,37.132854,43
AUSTON MATTHEWS,37.031336,59
LEON DRAISAITL,36.584081,55
SEBASTIAN AHO,34.934827,39
...,...,...
JACK CAMPBELL,0.000000,0
,0.000000,0
JUSTUS ANNUNEN,0.000000,0
ANTHONY STOLARZ,0.000000,0


#### Skater Evaluation

In [50]:
players = list(set(list(set(eval_df.p1_name)) + list(set(eval_df.p2_name))))
df = pd.DataFrame(columns = ['Player','xG','GF'])
df['Player'] = players
df['xG'] = 0
df['GF'] = 0
df = df.set_index('Player')

In [51]:
for i in tqdm(range(len(eval_df))):
    p1 = eval_df.at[i,'p1_name']
    if eval_df.at[i,'Event'] == 'BLOCK':
        p1 = eval_df.at[i,'p2_name']
    xg = with_nan_hgbc.predict_proba(transformed_2021.iloc[[i]])[0,1]
    g = y_2021[i]
    df.at[p1,'xG'] = df.at[p1,'xG'] + xg
    df.at[p1,'GF'] = df.at[p1,'GF'] + g
    
df = df.sort_values(by='xG',ascending=False)

hgbc_with_na_skater_eval_mape = mean_absolute_percentage_error(df['GF'],df['xG'])
hgbc_with_na_skater_eval_rmse = math.sqrt(mean_squared_error(df['GF'],df['xG']))

print('The MAPE of xG for all players is:',hgbc_with_na_skater_eval_mape)
print('The RMSE of xG for all players is:',hgbc_with_na_skater_eval_rmse)
df

  0%|          | 0/148361 [00:00<?, ?it/s]

The MAPE of xG for all players is: 664890862036461.1
The RMSE of xG for all players is: 3.9497752538612554


Unnamed: 0_level_0,xG,GF
Player,Unnamed: 1_level_1,Unnamed: 2_level_1
ALEX OVECHKIN,38.301048,50
CONNOR MCDAVID,37.132854,43
AUSTON MATTHEWS,37.031336,59
LEON DRAISAITL,36.584081,55
SEBASTIAN AHO,34.934827,39
...,...,...
JACK CAMPBELL,0.000000,0
,0.000000,0
JUSTUS ANNUNEN,0.000000,0
ANTHONY STOLARZ,0.000000,0


Below is making the imputed version of `transformed_2021`.

In [52]:
transformed_2021

Unnamed: 0,Seconds_Elapsed,Distance,Shot_Angle,Change_of_Angle,Period_1,Period_2,Period_3,Period_4,Strength_3x3,Strength_3x4,...,Type_WRIST SHOT,Score_Diff_-1,Score_Diff_-2,Score_Diff_-3-,Score_Diff_0,Score_Diff_1,Score_Diff_2,Score_Diff_3+,Is_Rebound_0,Is_Rebound_1
0,-1.543086,0.419960,1.027750,,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,-1.479842,0.028532,0.134528,,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,-1.425223,-0.103555,0.521585,,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,-1.376354,2.976722,-0.583762,,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,-1.244118,-0.162275,-0.956908,,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148356,1.489713,-1.132085,1.390944,,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
148357,1.532834,-1.043803,-0.730368,,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
148358,1.575954,2.266421,-0.079650,,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
148359,1.708190,0.810276,1.226071,,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [53]:
imp = SimpleImputer(strategy='constant',fill_value=0)
imp.fit(transformed_2021)
test = imp.transform(transformed_2021)
df = pd.DataFrame(test)
transformed_2021['Change_of_Angle'] = df[3]
transformed_2021

Unnamed: 0,Seconds_Elapsed,Distance,Shot_Angle,Change_of_Angle,Period_1,Period_2,Period_3,Period_4,Strength_3x3,Strength_3x4,...,Type_WRIST SHOT,Score_Diff_-1,Score_Diff_-2,Score_Diff_-3-,Score_Diff_0,Score_Diff_1,Score_Diff_2,Score_Diff_3+,Is_Rebound_0,Is_Rebound_1
0,-1.543086,0.419960,1.027750,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,-1.479842,0.028532,0.134528,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,-1.425223,-0.103555,0.521585,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,-1.376354,2.976722,-0.583762,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,-1.244118,-0.162275,-0.956908,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148356,1.489713,-1.132085,1.390944,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
148357,1.532834,-1.043803,-0.730368,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
148358,1.575954,2.266421,-0.079650,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
148359,1.708190,0.810276,1.226071,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


Now to evaluate `Logistic Regression`, `Gradient Boosting Classifier`, and `Hist Gradient Boosting Classifier`. 

#### League Evaluation

In [54]:
model_list = [lr,gbc,no_nan_hgbc]

In [55]:
xg_model = []
for model in model_list:
    xg_list = []
    for i in tqdm(range(len(transformed_2021))):
        xg = model.predict_proba(transformed_2021.iloc[[i]])[0,1]
        xg_list.append(xg)
    xg_model.append(sum(xg_list))
    
lr_without_na_league_eval = xg_model[0]-sum(y_2021)
gbc_without_na_league_eval = xg_model[1]-sum(y_2021)
hgbc_without_na_league_eval = xg_model[2]-sum(y_2021)

print('The LR overestimates the number of goals by:',lr_without_na_league_eval)
print('The GBC overestimates the number of goals by:',gbc_without_na_league_eval)
print('The HGBC overestimates the number of goals by:',hgbc_without_na_league_eval)

  0%|          | 0/148361 [00:00<?, ?it/s]

  0%|          | 0/148361 [00:00<?, ?it/s]

  0%|          | 0/148361 [00:00<?, ?it/s]

The LR overestimates the number of goals by: -407.44090380773287
The GBC overestimates the number of goals by: -303.208851500136
The HGBC overestimates the number of goals by: -314.86279669611395


#### Team Evaluation

In [56]:
teams = list(set(eval_df.Ev_Team))
df = pd.DataFrame(columns=['Team','lr_xG','gbc_xG','hgbc_xG','GF'])
df['Team'] = teams
df['lr_xG'] = 0
df['gbc_xG'] = 0
df['hgbc_xG'] = 0
df['GF'] = 0
df = df.set_index('Team')

for j in range(len(model_list)):
    model = model_list[j]
    for i in tqdm(range(len(transformed_2021))):
        team = eval_df.at[i,'Ev_Team']
        xg = model.predict_proba(transformed_2021.iloc[[i]])[0,1]
        df.at[team,df.columns[j]] = df.at[team,df.columns[j]] + xg
        if j == 1: # So that goals are only counted once
            if eval_df.at[i,'Event'] == 'GOAL':
                df.at[team,df.columns[3]] = df.at[team,df.columns[3]] + 1

  0%|          | 0/148361 [00:00<?, ?it/s]

  0%|          | 0/148361 [00:00<?, ?it/s]

  0%|          | 0/148361 [00:00<?, ?it/s]

In [57]:
lr_team_eval_mape = mean_absolute_percentage_error(df['GF'],df['lr_xG'])
lr_team_eval_rmse = math.sqrt(mean_squared_error(df['GF'],df['lr_xG']))
gbc_team_eval_mape = mean_absolute_percentage_error(df['GF'],df['gbc_xG'])
gbc_team_eval_rmse = math.sqrt(mean_squared_error(df['GF'],df['gbc_xG']))
hgbc_without_na_team_eval_mape = mean_absolute_percentage_error(df['GF'],df['hgbc_xG'])
hgbc_without_na_team_eval_rmse = math.sqrt(mean_squared_error(df['GF'],df['hgbc_xG']))
    
print('The LR MAPE for all teams is:',lr_team_eval_mape)
print('The LR RMSE for all teams is:',lr_team_eval_rmse)
print('The GBC MAPE for all teams is:',gbc_team_eval_mape)
print('The GBC RMSE for all teams is:',gbc_team_eval_rmse)
print('The HGBC MAPE for all teams is:',hgbc_without_na_team_eval_mape)
print('The HGBC RMSE for all teams is:',hgbc_without_na_team_eval_rmse)
df.sort_values(by='Team')

The LR MAPE for all teams is: 0.07214249669555078
The LR RMSE for all teams is: 27.33952623861439
The GBC MAPE for all teams is: 0.07496985615584917
The GBC RMSE for all teams is: 26.948328678589533
The HGBC MAPE for all teams is: 0.0745493132481856
The HGBC RMSE for all teams is: 27.433396703723417


Unnamed: 0_level_0,lr_xG,gbc_xG,hgbc_xG,GF
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ANA,226.611057,223.931558,225.871896,226
ARI,190.749383,192.143222,200.57988,206
BOS,247.981699,246.030706,243.174705,250
BUF,198.997856,198.719738,202.980099,228
CAR,278.823474,282.627366,280.847074,277
CBJ,231.400685,234.028231,234.761445,258
CGY,271.047684,276.8172,275.023681,291
CHI,212.709337,209.69504,213.954565,213
COL,269.244081,272.581051,268.706927,308
DAL,248.186973,255.421889,257.761061,233


#### Skater Evaluation

In [58]:
players = list(set(list(set(eval_df.p1_name)) + list(set(eval_df.p2_name))))
df = pd.DataFrame(columns = ['Player','lr_xG','gbc_xG','hgbc_xG','GF'])
df['Player'] = players
df['lr_xG'] = 0
df['gbc_xG'] = 0
df['hgbc_xG'] = 0
df['GF'] = 0
df = df.set_index('Player')

In [59]:
for j in range(len(model_list)):
    model = model_list[j]
    for i in tqdm(range(len(eval_df))):
        p1 = eval_df.at[i,'p1_name']
        if eval_df.at[i,'Event'] == 'BLOCK':
            p1 = eval_df.at[i,'p2_name']
        xg = model.predict_proba(transformed_2021.iloc[[i]])[0,1]
        g = y_2021[i]
        df.at[p1,df.columns[j]] = df.at[p1,df.columns[j]] + xg
        if j == 1:
            df.at[p1,df.columns[3]] = df.at[p1,df.columns[3]] + g
    
df = df.sort_values(by='GF',ascending=False)

lr_skater_eval_mape = mean_absolute_percentage_error(df['GF'],df['lr_xG'])
lr_skater_eval_rmse = math.sqrt(mean_squared_error(df['GF'],df['lr_xG']))
gbc_skater_eval_mape = mean_absolute_percentage_error(df['GF'],df['gbc_xG'])
gbc_skater_eval_rmse = math.sqrt(mean_squared_error(df['GF'],df['gbc_xG']))
hgbc_without_na_skater_eval_mape = mean_absolute_percentage_error(df['GF'],df['hgbc_xG'])
hgbc_without_na_skater_eval_rmse = math.sqrt(mean_squared_error(df['GF'],df['hgbc_xG']))

print('The LR MAPE of xG for all players is:',lr_skater_eval_mape)
print('The LR RMSE of xG for all players is:',lr_skater_eval_rmse)
print('The GBC MAPE of xG for all players is:',gbc_skater_eval_mape)
print('The GBC RMSE of xG for all players is:',gbc_skater_eval_rmse)
print('The HGBC MAPE of xG for all players is:',hgbc_without_na_skater_eval_mape)
print('The HGBC RMSE of xG for all players is:',hgbc_without_na_skater_eval_rmse)
df

  0%|          | 0/148361 [00:00<?, ?it/s]

  0%|          | 0/148361 [00:00<?, ?it/s]

  0%|          | 0/148361 [00:00<?, ?it/s]

The LR MAPE of xG for all players is: 691830562065176.6
The LR RMSE of xG for all players is: 4.2803357886260365
The GBC MAPE of xG for all players is: 693797750329244.6
The GBC RMSE of xG for all players is: 4.091530578487648
The HGBC MAPE of xG for all players is: 677444899665334.2
The HGBC RMSE of xG for all players is: 4.014467409990703


Unnamed: 0_level_0,lr_xG,gbc_xG,hgbc_xG,GF
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AUSTON MATTHEWS,35.246223,37.279357,38.416884,59
LEON DRAISAITL,33.558333,33.662873,34.333280,55
CHRIS KREIDER,30.360573,32.774695,30.627111,52
ALEX OVECHKIN,33.301670,34.369116,37.177889,50
KIRILL KAPRIZOV,28.665349,28.444902,28.281302,47
...,...,...,...,...
RYAN MACINNIS,0.051800,0.048662,0.048003,0
ADAM RASKA,0.384602,0.350374,0.531323,0
DARREN RADDYSH,0.171843,0.130112,0.142691,0
JOSH BROWN,2.407437,2.506932,2.334125,0


<br>

### Concluding Remarks about including `Seconds Elapsed` and `Period`

While including `Seconds_Elapsed` and `Period`, there is an apparent trend of underestimating xG. Perhaps this is a product of the model. Another possible reason is that in optimizing the models for log loss, it is skewing the model since there is a definite class imbalance between shots and goals. 

The models that include `Seconds Elapsed` and `Period` performed ok on the league-level, pretty good on the team-level, and unsatisfactorily on the player-level. 

The best models appear to be `Gradient Boosting Classifier` and `Hist Gradient Boosting Classifier`.

In [60]:
print('League-level:')
print('The Logistic Regression overestimated the true number of goals by:',lr_without_na_league_eval)
print('The Gradient Boosting Classifier overestimated the true number of goals by:',gbc_without_na_league_eval)
print('The Hist Gradient Boosting Classifier without NaN overestimated the true number of goals by:',hgbc_without_na_league_eval)
print('The Hist Gradient Boosting Classifier with NaN overestimated the true number of goals by:',hgbc_with_na_league_eval)
print('\n')
print('Team-level:')
print('The Logistic Regression had a MAPE of team goals of:',lr_team_eval_mape)
print('The Gradient Boosting Classifier had a MAPE of team goals of:',gbc_team_eval_mape)
print('The Hist Gradient Boosting Classifier without NaN had a MAPE of team goals of:',hgbc_without_na_team_eval_mape)
print('The Hist Gradient Boosting Classifier with NaN had a MAPE of team goals of:',hgbc_with_na_team_eval_mape)
print('\n')
print('Player-level:')
print('The Logistic Regression had a RMSE of player goals of:',lr_skater_eval_rmse)
print('The Gradient Boosting Classifier had a RMSE of player goals of:',gbc_skater_eval_rmse)
print('The Hist Gradient Boosting Classifier without NaN had RMSE of player goals of:',hgbc_without_na_skater_eval_rmse)
print('The Hist Gradient Boosting Classifier with NaN had a RMSE of player goals of:',hgbc_with_na_skater_eval_rmse)

League-level:
The Logistic Regression overestimated the true number of goals by: -407.44090380773287
The Gradient Boosting Classifier overestimated the true number of goals by: -303.208851500136
The Hist Gradient Boosting Classifier without NaN overestimated the true number of goals by: -314.86279669611395
The Hist Gradient Boosting Classifier with NaN overestimated the true number of goals by: -326.66914936354897


Team-level:
The Logistic Regression had a MAPE of team goals of: 0.07214249669555078
The Gradient Boosting Classifier had a MAPE of team goals of: 0.07496985615584917
The Hist Gradient Boosting Classifier without NaN had a MAPE of team goals of: 0.0745493132481856
The Hist Gradient Boosting Classifier with NaN had a MAPE of team goals of: 0.07180282085830009


Player-level:
The Logistic Regression had a RMSE of player goals of: 4.2803357886260365
The Gradient Boosting Classifier had a RMSE of player goals of: 4.091530578487648
The Hist Gradient Boosting Classifier without N