# xG Model 1.0

During my investigations into including `Seconds Elapsed` and `Period`, I found that the best results are found using a `Gradient Boosting Classifier` without including `Seconds Elapsed` or `Period`. This model along with all the other models I tested showed a tendency to underestimate actual goal rates. The goal for this notebook is to train the xG model to predict xG. 

In [1]:
import pandas as pd
import hockey_scraper
import numpy as np
import math
import seaborn as sns
from tqdm.notebook import tqdm # This displays a loading bar for monitoring progress of for loops

## ML Workflow

In [2]:
import matplotlib.pyplot as plt
from sklearn.compose import make_column_transformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import cross_val_predict
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression, Ridge, LinearRegression, RidgeCV
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
    ConfusionMatrixDisplay,
    precision_recall_curve,
    average_precision_score,
    roc_curve,
    roc_auc_score,
    mean_absolute_percentage_error,
    mean_squared_error,
    log_loss,
)
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer

The 2021-22 and 2022-23 NHL season shot data is imported here. 

In [None]:
shots2021 = pd.read_csv('shots2021.csv')
shots2022 = pd.read_csv('shots2022.csv')

In [4]:
allshots = pd.concat([shots2021,shots2022])
allshots = allshots.reset_index(drop=True)
allshots

Unnamed: 0,Game_Id,Date,Period,Event,Description,Time_Elapsed,Seconds_Elapsed,Strength,Ev_Zone,Type,...,Home_Goalie_Id,xC,yC,Home_Coach,Away_Coach,Distance,Shot_Angle,Score_Diff,Is_Rebound,Change_of_Angle
0,20001,2021-10-12,1,SHOT,"TBL ONGOAL - #91 STAMKOS, Wrist, Off. Zone, 42...",1:03,63,5x5,Off,WRIST SHOT,...,8476883.0,61,-32,Jon Cooper,Mike Sullivan,42.520583,48.814075,0,0,
1,20001,2021-10-12,1,BLOCK,"TBL #24 BOGOSIAN BLOCKED BY PIT #23 MCGINN, W...",1:25,85,5x5,Def,WRIST SHOT,...,8476883.0,60,-17,Jon Cooper,Mike Sullivan,33.615473,30.379126,0,0,
2,20001,2021-10-12,1,SHOT,"PIT ONGOAL - #23 MCGINN, Wrist, Off. Zone, 30 ft.",1:44,104,5x5,Off,WRIST SHOT,...,8476883.0,-65,19,Jon Cooper,Mike Sullivan,30.610456,38.367485,0,0,
3,20001,2021-10-12,1,SHOT,"TBL ONGOAL - #44 RUTTA, Wrist, Neu. Zone, 100 ft.",2:01,121,5x5,Neu,WRIST SHOT,...,8476883.0,-8,-27,Jon Cooper,Mike Sullivan,100.687636,15.554571,0,0,
4,20001,2021-10-12,1,SHOT,"PIT ONGOAL - #43 HEINEN, Wrist, Off. Zone, 29 ft.",2:47,167,5x5,Off,WRIST SHOT,...,8476883.0,-60,-4,Jon Cooper,Mike Sullivan,29.274562,7.853313,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300869,21312,2023-04-13,3,SHOT,"SEA ONGOAL - #67 GEEKIE, Wrist, Off. Zone, 10 ft.",17:14,1034,5x5,Off,WRIST SHOT,...,8475831.0,81,7,Dave Hakstol,Bruce Cassidy,10.630146,41.185925,-1,1,10.535257
300870,21312,2023-04-13,3,MISS,"VGK #9 EICHEL, Wrist, Wide of Net, Def. Zone, ...",18:33,1113,5x5,Def,WRIST SHOT,...,,55,41,Dave Hakstol,Bruce Cassidy,149.723078,15.892831,1,0,
300871,21312,2023-04-13,3,BLOCK,"SEA #4 SCHULTZ BLOCKED BY VGK #3 MCNABB, Wris...",18:43,1123,5x5,Def,WRIST SHOT,...,,75,-1,Dave Hakstol,Bruce Cassidy,14.035669,4.085617,-1,0,
300872,21312,2023-04-13,3,GOAL,"VGK #20 STEPHENSON(16), Poke, Def. Zone, 137 ft.",19:22,1162,5x5,Def,POKE,...,,47,19,Dave Hakstol,Bruce Cassidy,137.320792,7.953082,1,0,


In [5]:
# Creating target variable column

for i in range(len(allshots)):
    if allshots.at[i,'Event'] == 'GOAL':
        allshots.at[i,'Target'] = 1
    else:
        allshots.at[i,'Target'] = 0

Now to split the data into training and testing sets.

In [6]:
train_df, test_df = train_test_split(allshots, test_size=0.25, random_state=17)

Below are features that will be included into the model. `eval_columns` will be separated but are stored so that the descriptive data can be used to evaluate the model later.

In [7]:
model_columns = ['Strength','Period','Type','Distance','Shot_Angle','Score_Diff',
                 'Is_Rebound','Change_of_Angle']
eval_columns = ['Game_Id','Date','Period','Seconds_Elapsed','Event','Description','Time_Elapsed','Ev_Zone','Ev_Team','Home_Zone','Away_Team',
                'Home_Team','p1_name','p1_ID','p2_name','p2_ID','p3_name','p3_ID','awayPlayer1','awayPlayer1_id',
                'awayPlayer2','awayPlayer2_id','awayPlayer3','awayPlayer3_id','awayPlayer4','awayPlayer4_id','awayPlayer5',
                'awayPlayer5_id','awayPlayer6','awayPlayer6_id','homePlayer1','homePlayer1_id','homePlayer2',
                'homePlayer2_id','homePlayer3','homePlayer3_id','homePlayer4','homePlayer4_id','homePlayer5',
                'homePlayer5_id','homePlayer6','homePlayer6_id','Away_Players','Home_Players','Away_Score','Home_Score',
                'Away_Goalie','Away_Goalie_Id','Home_Goalie','Home_Goalie_Id','xC','yC','Home_Coach','Away_Coach']

In [8]:
X_train = train_df[model_columns]
X_train_eval = train_df[eval_columns]
y_train = train_df['Target'].reset_index(drop=True)


X_test = test_df[model_columns]
X_test_eval = test_df[eval_columns]
y_test = test_df['Target'].reset_index(drop=True)

Feature encoding types are shown below. 

In [9]:
numeric_feats = ['Distance','Shot_Angle','Change_of_Angle']  # apply scaling
categorical_feats = ['Strength','Type','Score_Diff','Is_Rebound']  # apply one-hot encoding

Now to transform the feature columns for training and testing ensuring that both training and testing sets have the same columns. 

In [10]:
ct = make_column_transformer(
    (StandardScaler(), numeric_feats),  # scaling on numeric features
    (OneHotEncoder(sparse_output=False), categorical_feats), # one-hot encoding on categorical features
)

In [11]:
# Train Data

transformed = ct.fit_transform(X_train)
column_names = (numeric_feats+ct.named_transformers_['onehotencoder'].get_feature_names_out().tolist())
transformed_train_df = pd.DataFrame(transformed, columns=column_names)

imp = SimpleImputer(strategy='constant',fill_value=0)
imp.fit(transformed_train_df)
temp = imp.transform(transformed_train_df)
df = pd.DataFrame(temp)
transformed_train_df['Change_of_Angle'] = df[3]
transformed_train_df

Unnamed: 0,Distance,Shot_Angle,Change_of_Angle,Strength_2x5,Strength_3x3,Strength_3x4,Strength_3x5,Strength_4x3,Strength_4x4,Strength_4x5,...,Type_WRIST SHOT,Score_Diff_-1,Score_Diff_-2,Score_Diff_-3-,Score_Diff_0,Score_Diff_1,Score_Diff_2,Score_Diff_3+,Is_Rebound_0,Is_Rebound_1
0,-1.120874,0.813339,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.172454,-0.089716,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.160496,2.184637,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.426058,0.072815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,-0.979996,1.565919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225650,3.640549,-0.433090,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
225651,0.836971,-0.010120,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
225652,-0.416446,-0.959796,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
225653,0.615657,-0.367390,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [12]:
# Test Data

transformed = ct.fit_transform(X_test)

column_names = (numeric_feats+ct.named_transformers_['onehotencoder'].get_feature_names_out().tolist())

transformed_test_df = pd.DataFrame(transformed, columns=column_names)
imp = SimpleImputer(strategy='constant',fill_value=0)
imp.fit(transformed_test_df)
temp = imp.transform(transformed_test_df)
df = pd.DataFrame(temp)
transformed_test_df['Change_of_Angle'] = df[3]
transformed_test_df

Unnamed: 0,Distance,Shot_Angle,Change_of_Angle,Strength_3x3,Strength_3x4,Strength_3x5,Strength_4x3,Strength_4x4,Strength_4x5,Strength_5x3,...,Type_WRIST SHOT,Score_Diff_-1,Score_Diff_-2,Score_Diff_-3-,Score_Diff_0,Score_Diff_1,Score_Diff_2,Score_Diff_3+,Is_Rebound_0,Is_Rebound_1
0,-0.599625,-0.444477,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.916550,-0.555753,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,-0.209471,-1.222485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-0.293205,-1.110775,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.726558,2.782932,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75214,-1.107046,0.279680,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
75215,-0.012968,-0.470882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
75216,-1.073014,-1.319580,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
75217,0.017817,0.234354,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


There are some shots that are poorly recorded as they have impossible strength states (or maybe just one team had a terrible line change). I get rid of these shots below. 

In [13]:
transformed_train_df['Strength_6x4'] = 0
transformed_train_df = transformed_train_df[transformed_test_df.columns]

<br>

### Metric

Custom log loss function is below. RMSE and MAPE are also used.

In [14]:
def my_custom_loss_func(y_true, y_pred):
    return log_loss(y_true, y_pred)
custom_logloss = make_scorer(my_custom_loss_func, greater_is_better=False)

<br>

### Training the GBC

In [15]:
gbc = GradientBoostingClassifier(loss='log_loss')
gbc.fit(transformed_train_df, y_train)

logloss = log_loss(y_train, gbc.predict_proba(transformed_train_df))

In [16]:
# Hyperparameter Optimization

# param_grid = {
#     "n_estimators": [50,100,200,250],
#     "criterion": ['friedman_mse','squared_error'],
#     "max_depth": [3,5,7,10],
#     "max_features": [None, 'sqrt'],
#     "min_samples_split": [2,5,7,10]
# }
# random_search = RandomizedSearchCV(
#     gbc, param_distributions=param_grid, n_jobs=-1, scoring=custom_logloss, n_iter=17, cv=5, random_state=17
# )
# random_search.fit(transformed_train_df, y_train)

# random_search.best_params_

In [17]:
gbc = GradientBoostingClassifier(loss='log_loss',n_estimators=200,min_samples_split=10,max_features='sqrt',
                                 max_depth=3,criterion='friedman_mse')
gbc.fit(transformed_test_df, y_test)

logloss = log_loss(y_test, gbc.predict_proba(transformed_test_df))

<br>

### Functions

In [18]:
def return_feature_cols():
    return list(transformed_train_df.columns)

In [19]:
def get_xG(array):
    xG = gbc.predict_proba(array)[0,1] 
    return xG

### Additional Functions

In [20]:
def get_net_locations(df,game_id):

    temp = df.loc[(df.Game_Id == game_id) & (df.Event == 'SHOT')].reset_index(drop=True)
    for i in range(len(temp)):
        if (temp.at[i,'Ev_Zone'] == 'Neu') | (temp.at[i,'Ev_Zone'] == 'Def'):
            continue
        if temp.at[i,'Ev_Zone'] == 'Off':
            if temp.at[i,'Ev_Team'] == temp.at[i,'Away_Team']:
                if temp.at[i,'xC'] > 0:
                    home_short_change_net_coord = (89,0)
                    away_short_change_net_coord = (-89,0)
                    home_long_change_net_coord = (-89,0)
                    away_long_change_net_coord = (89,0)
                    break
                if temp.at[i,'xC'] < 0:
                    home_short_change_net_coord = (-89,0)
                    away_short_change_net_coord = (89,0)
                    home_long_change_net_coord = (89,0)
                    away_long_change_net_coord = (-89,0)
                    break
            if temp.at[i,'Ev_Team'] == temp.at[i,'Home_Team']:
                if temp.at[i,'xC'] > 0:
                    home_short_change_net_coord = (-89,0)
                    away_short_change_net_coord = (89,0)
                    home_long_change_net_coord = (89,0)
                    away_long_change_net_coord = (-89,0)
                    break
                if temp.at[i,'xC'] < 0:
                    home_short_change_net_coord = (89,0)
                    away_short_change_net_coord = (-89,0)
                    home_long_change_net_coord = (-89,0)
                    away_long_change_net_coord = (89,0)
                    break
                    
    return home_short_change_net_coord,away_short_change_net_coord,home_long_change_net_coord,away_long_change_net_coord

In [21]:
def get_shot_distances(df,game_id, period, team, away_team, home_team, x_coord, y_coord):
    
    hscn_coord, ascn_coord, hlcn_coord, alcn_coord = get_net_locations(df,game_id)
    
    if (period == 1) | (period == 3):
        if team == away_team:
            distance = math.sqrt((hscn_coord[0]-x_coord)**2 + (hscn_coord[1]-y_coord)**2) # yC for the net is always zero
        if team == home_team:
            distance = math.sqrt((ascn_coord[0]-x_coord)**2 + (ascn_coord[1]-y_coord)**2) # yC for the net is always zero
            
    if (period == 2) | (period == 4):
        if team == away_team:
            distance = math.sqrt((hlcn_coord[0]-x_coord)**2 + (hlcn_coord[1]-y_coord)**2) # yC for the net is always zero
        if team == home_team:
            distance = math.sqrt((alcn_coord[0]-x_coord)**2 + (alcn_coord[1]-y_coord)**2) # yC for the net is always zero
            
    return distance

In [22]:
def get_shot_angles(df,game_id, period, team, away_team, home_team, x_coord, y_coord):
    
    hscn_coord, ascn_coord, hlcn_coord, alcn_coord = get_net_locations(df,game_id)
    
    if (period == 1) | (period == 3):
        if hscn_coord[0] > 0:
            if team == away_team:
                if x_coord == 89: # To deal with divide by zero issues
                    angle = 90
                else:
                    angle = math.degrees(math.atan(abs(y_coord)/(hscn_coord[0]-x_coord))) 
            if team == home_team:
                if x_coord == -89: # To deal with divide by zero issues
                    angle = 90
                else:
                    angle = math.degrees(math.atan(abs(y_coord)/(x_coord-ascn_coord[0])))
        else:
            if team == away_team:
                if x_coord == -89: # To deal with divide by zero issues
                    angle = 90
                else:
                    angle = math.degrees(math.atan(abs(y_coord)/(x_coord-hscn_coord[0]))) 
            if team == home_team:
                if x_coord == 89: # To deal with divide by zero issues
                    angle = 90
                else:    
                    angle = math.degrees(math.atan(abs(y_coord)/(ascn_coord[0]-x_coord)))
            
    if (period == 2) | (period == 4):
        if hlcn_coord[0] > 0:
            if team == away_team:
                if x_coord == 89: # To deal with divide by zero issues
                    angle = 90
                else:
                    angle = math.degrees(math.atan(abs(y_coord)/(hlcn_coord[0]-x_coord))) 
            if team == home_team:
                if x_coord == -89: # To deal with divide by zero issues
                    angle = 90
                else:
                    angle = math.degrees(math.atan(abs(y_coord)/(x_coord-alcn_coord[0])))
        else:
            if team == away_team:
                if x_coord == -89: # To deal with divide by zero issues
                    angle = 90
                else:
                    angle = math.degrees(math.atan(abs(y_coord)/(x_coord-hlcn_coord[0]))) 
            if team == home_team:
                if x_coord == 89: # To deal with divide by zero issues
                    angle = 90
                else:
                    angle = math.degrees(math.atan(abs(y_coord)/(alcn_coord[0]-x_coord)))
            
    return angle

In [23]:
def get_score_diff(team, away_team, home_team, away_score, home_score):
    
    if team == home_team:
        if abs(home_score-away_score) >= 3:
            if home_score > away_score:
                diff = '3+'
            else:
                diff = '-3-'
        else:
            diff = str(home_score-away_score)
            
    if team == away_team:
        if abs(away_score-home_score) >= 3:
            if away_score > home_score:
                diff = '3+'
            else:
                diff = '-3-'
        else:
            diff = str(away_score-home_score)
                       
    return diff

In [24]:
def get_rebound_status(df,diff):
    
    shot_events = ['SHOT','GOAL','BLOCK','MISS']
    df.at[0,'Is_Rebound'] = 0
    
    for i in range(1,len(df)):
        if df.at[i,'Event'] in shot_events:
            if df.at[i-1,'Event'] in shot_events:
                if df.at[i,'Seconds_Elapsed']-df.at[i-1,'Seconds_Elapsed'] <= diff:
                    df.at[i,'Is_Rebound'] = 1
                else:
                    df.at[i,'Is_Rebound'] = 0
            else:
                df.at[i,'Is_Rebound'] = 0
                
    return df

In [25]:
def get_change_of_angle(df):
    
    for i in range(len(df.index)):
        if df.at[i,'Is_Rebound'] == 1:
            if (df.at[i-1,'yC'] > 0) & (df.at[i,'yC'] > 0):
                df.at[i,'Change_of_Angle'] = abs(df.at[i,'Shot_Angle']-df.at[i-1,'Shot_Angle'])
            if (df.at[i-1,'yC'] < 0) & (df.at[i,'yC'] < 0):
                df.at[i,'Change_of_Angle'] = abs(df.at[i,'Shot_Angle']-df.at[i-1,'Shot_Angle'])
            if ((df.at[i-1,'yC'] < 0) & (df.at[i,'yC'] > 0)) | ((df.at[i-1,'yC'] > 0) & (df.at[i,'yC'] < 0)):
                df.at[i,'Change_of_Angle'] = df.at[i,'Shot_Angle']+df.at[i-1,'Shot_Angle']
                
    return df