# xG Model v1.1

This xG model is built with the same processes as v1.0 but using data scraped from the new NHL API as well as an improvement for penalty shots and shootouts. 

<br>

In [1]:
import pandas as pd
import numpy as np
import math
from datetime import datetime

## ML Workflow

In [2]:
import matplotlib.pyplot as plt
from sklearn.compose import make_column_transformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import cross_val_predict
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression, Ridge, LinearRegression, RidgeCV
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
    ConfusionMatrixDisplay,
    precision_recall_curve,
    average_precision_score,
    roc_curve,
    roc_auc_score,
    mean_absolute_percentage_error,
    mean_squared_error,
    log_loss,
)
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer

The 2021-22 and 2022-23 NHL season shot data is imported here. 

In [3]:
shots2021 = pd.read_csv('shots2021.csv')
shots2022 = pd.read_csv('shots2022.csv')

In [4]:
allshots = pd.concat([shots2021,shots2022])
allshots = allshots.reset_index(drop=True)
for i in range(len(allshots)):
    val = int(allshots.at[i,'Strength'])
    allshots.at[i,'Strength'] = str(val)
allshots

Unnamed: 0,Game_Id,Date,Period,Event_tc,Event,Time_Elapsed,Strength,Ev_Zone,Type,Ev_Team,...,Home_Goalie,Home_Goalie_Id,xC,yC,Distance,Shot_Angle,Score_Diff,Is_Rebound,Seconds_Elapsed,Change_of_Angle
0,2021020001,2021-10-12,1,506,SHOT_ON_GOAL,01:03,1551,O,WRIST,TBL,...,,,61.0,-32.0,42.520583,48.814075,0,0.0,63.0,0.000000
1,2021020001,2021-10-12,1,508,BLOCKED_SHOT,01:25,1551,D,,PIT,...,,,60.0,-17.0,149.966663,6.508956,0,0.0,85.0,0.000000
2,2021020001,2021-10-12,1,506,SHOT_ON_GOAL,01:44,1551,O,WRIST,PIT,...,ANDREI VASILEVSKIY,8476883.0,-65.0,19.0,30.610456,38.367485,0,0.0,104.0,0.000000
3,2021020001,2021-10-12,1,506,SHOT_ON_GOAL,02:01,1551,N,WRIST,TBL,...,,,-8.0,-27.0,100.687636,15.554571,0,0.0,121.0,0.000000
4,2021020001,2021-10-12,1,506,SHOT_ON_GOAL,02:47,1551,O,WRIST,PIT,...,ANDREI VASILEVSKIY,8476883.0,-60.0,-4.0,29.274562,7.853313,0,0.0,167.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301363,2022021312,2023-04-13,3,506,SHOT_ON_GOAL,17:14,1551,O,WRIST,SEA,...,,,81.0,7.0,10.630146,41.185925,-1,1.0,1034.0,35.137419
301364,2022021312,2023-04-13,3,507,MISSED_SHOT,18:33,1560,D,WRIST,VGK,...,,,55.0,41.0,149.723078,15.892831,1,0.0,1113.0,0.000000
301365,2022021312,2023-04-13,3,508,BLOCKED_SHOT,18:43,1560,D,,VGK,...,,,75.0,-1.0,164.003049,0.349360,1,0.0,1123.0,0.000000
301366,2022021312,2023-04-13,3,505,GOAL,19:22,1560,D,POKE,VGK,...,,,47.0,19.0,137.320792,7.953082,2,0.0,1162.0,0.000000


In [5]:
# Creating target variable column

for i in range(len(allshots)):
    if allshots.at[i,'Event'] == 'GOAL':
        allshots.at[i,'Target'] = 1
    else:
        allshots.at[i,'Target'] = 0

Now to split the data into training and testing sets.

In [6]:
train_df, test_df = train_test_split(allshots, test_size=0.25, random_state=17)

Below are features that will be included into the model. `eval_columns` will be separated but are stored so that the descriptive data can be used to evaluate the model later.

In [7]:
model_columns = ['Strength','Period','Type','Distance','Shot_Angle','Score_Diff','Is_Rebound','Change_of_Angle']
eval_columns = ['Game_Id','Date','Period','Seconds_Elapsed','Event_tc','Event','Time_Elapsed','Ev_Zone','Ev_Team','Away_Team',
                'Home_Team','p1_name','p1_ID','p2_name','p2_ID','p3_name','p3_ID','awayPlayer1','awayPlayer1_id',
                'awayPlayer2','awayPlayer2_id','awayPlayer3','awayPlayer3_id','awayPlayer4','awayPlayer4_id','awayPlayer5',
                'awayPlayer5_id','awayPlayer6','awayPlayer6_id','homePlayer1','homePlayer1_id','homePlayer2',
                'homePlayer2_id','homePlayer3','homePlayer3_id','homePlayer4','homePlayer4_id','homePlayer5',
                'homePlayer5_id','homePlayer6','homePlayer6_id','Away_Score','Home_Score',
                'Away_Goalie','Away_Goalie_Id','Home_Goalie','Home_Goalie_Id','xC','yC']

In [8]:
X_train = train_df[model_columns]
X_train_eval = train_df[eval_columns]
y_train = train_df['Target'].reset_index(drop=True)


X_test = test_df[model_columns]
X_test_eval = test_df[eval_columns]
y_test = test_df['Target'].reset_index(drop=True)

Feature encoding types are shown below. 

In [9]:
numeric_feats = ['Distance','Shot_Angle','Change_of_Angle']  # apply scaling
categorical_feats = ['Strength','Type','Score_Diff','Is_Rebound']  # apply one-hot encoding

Now to transform the feature columns for training and testing ensuring that both training and testing sets have the same columns. 

In [10]:
ct = make_column_transformer(
    (StandardScaler(), numeric_feats),  # scaling on numeric features
    (OneHotEncoder(sparse_output=False,drop='if_binary'), categorical_feats), # one-hot encoding on categorical features
)

In [11]:
# Train Data

transformed = ct.fit_transform(X_train)
column_names = (numeric_feats+ct.named_transformers_['onehotencoder'].get_feature_names_out().tolist())
transformed_train_df = pd.DataFrame(transformed, columns=column_names)

imp = SimpleImputer(strategy='constant',fill_value=0)
imp.fit(transformed_train_df)
temp = imp.transform(transformed_train_df)
df = pd.DataFrame(temp)
transformed_train_df['Change_of_Angle'] = df[3]
transformed_train_df

Unnamed: 0,Distance,Shot_Angle,Change_of_Angle,Strength_101,Strength_1010,Strength_1331,Strength_1340,Strength_1341,Strength_1350,Strength_1351,...,Type_WRIST,Type_nan,Score_Diff_-1,Score_Diff_-2,Score_Diff_-3-,Score_Diff_0,Score_Diff_1,Score_Diff_2,Score_Diff_3+,Is_Rebound_1.0
0,1.549747,-0.839370,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,-0.094440,0.431814,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-0.807312,-0.889646,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.570077,0.479610,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.817690,0.788087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226021,-0.376265,0.692231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
226022,-0.200418,-0.444115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
226023,1.755659,-0.961758,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
226024,1.761309,-0.852847,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [12]:
def return_feature_cols():
    return list(transformed_train_df.columns)

In [13]:
# Test Data

transformed = ct.fit_transform(X_test)

column_names = (numeric_feats+ct.named_transformers_['onehotencoder'].get_feature_names_out().tolist())

transformed_test_df = pd.DataFrame(transformed, columns=column_names)
imp = SimpleImputer(strategy='constant',fill_value=0)
imp.fit(transformed_test_df)
temp = imp.transform(transformed_test_df)
df = pd.DataFrame(temp)
transformed_test_df['Change_of_Angle'] = df[3]
transformed_test_df

# Making sure all columns are present
add_cols = []
for feat in return_feature_cols():
    if feat not in transformed_test_df.columns:
        add_cols.append(feat)
for col in add_cols:
    transformed_test_df[col] = 0
transformed_test_df = transformed_test_df[return_feature_cols()]
transformed_test_df

Unnamed: 0,Distance,Shot_Angle,Change_of_Angle,Strength_101,Strength_1010,Strength_1331,Strength_1340,Strength_1341,Strength_1350,Strength_1351,...,Type_WRIST,Type_nan,Score_Diff_-1,Score_Diff_-2,Score_Diff_-3-,Score_Diff_0,Score_Diff_1,Score_Diff_2,Score_Diff_3+,Is_Rebound_1.0
0,-0.775181,-0.284533,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.588787,1.341978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.152145,-0.337529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.851086,-0.945941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1.742946,-1.021788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75337,1.235276,-0.670986,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75338,-0.891460,-0.492187,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75339,1.613977,-0.788357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75340,-0.618349,2.134852,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


<br>

### Metric

Custom log loss function is below. RMSE and MAPE are also used.

In [14]:
def my_custom_loss_func(y_true, y_pred):
    return log_loss(y_true, y_pred)
custom_logloss = make_scorer(my_custom_loss_func, greater_is_better=False)

<br>

### Training the GBC

In [15]:
gbc = GradientBoostingClassifier(loss='log_loss')
gbc.fit(transformed_train_df, y_train)

logloss = log_loss(y_train, gbc.predict_proba(transformed_train_df))

In [16]:
# Hyperparameter Optimization

# param_grid = {
#     "n_estimators": [50,100,200,250],
#     "criterion": ['friedman_mse','squared_error'],
#     "max_depth": [3,5,7,10],
#     "max_features": [None, 'sqrt'],
#     "min_samples_split": [2,5,7,10]
# }
# random_search = RandomizedSearchCV(
#     gbc, param_distributions=param_grid, n_jobs=-1, scoring=custom_logloss, n_iter=17, cv=5, random_state=17
# )
# random_search.fit(transformed_train_df, y_train)

# random_search.best_params_

In [17]:
gbc = GradientBoostingClassifier(loss='log_loss',n_estimators=200,min_samples_split=10,max_features='sqrt',
                                 max_depth=3,criterion='friedman_mse')
gbc.fit(transformed_test_df, y_test)

logloss = log_loss(y_test, gbc.predict_proba(transformed_test_df))

<br>

### Functions

In [18]:
def get_xG(array):
    
    xG = gbc.predict_proba(array)[0,1] 
    
    return xG

### Additional Functions

In [19]:
def get_net_locations(df,game_id):

    temp = df.loc[(df.Game_Id == game_id) & ((df.Event_tc == 505) | (df.Event_tc == 506) | (df.Event_tc == 507) | (df.Event_tc == 508))].reset_index(drop=True)
    for i in range(len(temp)):
        if (temp.at[i,'Ev_Zone'] == 'N') | (temp.at[i,'Ev_Zone'] == 'D'):
            continue
        if temp.at[i,'Ev_Zone'] == 'O':
            if temp.at[i,'Ev_Team'] == temp.at[i,'Away_Team']:
                if temp.at[i,'xC'] > 0:
                    home_short_change_net_coord = (89,0)
                    away_short_change_net_coord = (-89,0)
                    home_long_change_net_coord = (-89,0)
                    away_long_change_net_coord = (89,0)
                    break
                if temp.at[i,'xC'] < 0:
                    home_short_change_net_coord = (-89,0)
                    away_short_change_net_coord = (89,0)
                    home_long_change_net_coord = (89,0)
                    away_long_change_net_coord = (-89,0)
                    break
            if temp.at[i,'Ev_Team'] == temp.at[i,'Home_Team']:
                if temp.at[i,'xC'] > 0:
                    home_short_change_net_coord = (-89,0)
                    away_short_change_net_coord = (89,0)
                    home_long_change_net_coord = (89,0)
                    away_long_change_net_coord = (-89,0)
                    break
                if temp.at[i,'xC'] < 0:
                    home_short_change_net_coord = (89,0)
                    away_short_change_net_coord = (-89,0)
                    home_long_change_net_coord = (-89,0)
                    away_long_change_net_coord = (89,0)
                    break
                    
    return home_short_change_net_coord,away_short_change_net_coord,home_long_change_net_coord,away_long_change_net_coord

In [20]:
def get_shot_distance(df,game_id, period, team, away_team, home_team, x_coord, y_coord):
    
    hscn_coord, ascn_coord, hlcn_coord, alcn_coord = get_net_locations(df,game_id)
    
    if (period == 1) | (period == 3):
        if team == away_team:
            distance = math.sqrt((hscn_coord[0]-x_coord)**2 + (hscn_coord[1]-y_coord)**2) # yC for the net is always zero
        if team == home_team:
            distance = math.sqrt((ascn_coord[0]-x_coord)**2 + (ascn_coord[1]-y_coord)**2) # yC for the net is always zero
            
    if (period == 2) | (period == 4):
        if team == away_team:
            distance = math.sqrt((hlcn_coord[0]-x_coord)**2 + (hlcn_coord[1]-y_coord)**2) # yC for the net is always zero
        if team == home_team:
            distance = math.sqrt((alcn_coord[0]-x_coord)**2 + (alcn_coord[1]-y_coord)**2) # yC for the net is always zero
            
    return distance

In [21]:
def get_shot_angle(df,game_id, period, team, away_team, home_team, x_coord, y_coord):
    
    hscn_coord, ascn_coord, hlcn_coord, alcn_coord = get_net_locations(df,game_id)
    
    if (period == 1) | (period == 3):
        if hscn_coord[0] > 0:
            if team == away_team:
                if x_coord == 89: # To deal with divide by zero issues
                    angle = 90
                else:
                    angle = math.degrees(math.atan(abs(y_coord)/(hscn_coord[0]-x_coord))) 
            if team == home_team:
                if x_coord == -89: # To deal with divide by zero issues
                    angle = 90
                else:
                    angle = math.degrees(math.atan(abs(y_coord)/(x_coord-ascn_coord[0])))
        else:
            if team == away_team:
                if x_coord == -89: # To deal with divide by zero issues
                    angle = 90
                else:
                    angle = math.degrees(math.atan(abs(y_coord)/(x_coord-hscn_coord[0]))) 
            if team == home_team:
                if x_coord == 89: # To deal with divide by zero issues
                    angle = 90
                else:    
                    angle = math.degrees(math.atan(abs(y_coord)/(ascn_coord[0]-x_coord)))
            
    if (period == 2) | (period == 4):
        if hlcn_coord[0] > 0:
            if team == away_team:
                if x_coord == 89: # To deal with divide by zero issues
                    angle = 90
                else:
                    angle = math.degrees(math.atan(abs(y_coord)/(hlcn_coord[0]-x_coord))) 
            if team == home_team:
                if x_coord == -89: # To deal with divide by zero issues
                    angle = 90
                else:
                    angle = math.degrees(math.atan(abs(y_coord)/(x_coord-alcn_coord[0])))
        else:
            if team == away_team:
                if x_coord == -89: # To deal with divide by zero issues
                    angle = 90
                else:
                    angle = math.degrees(math.atan(abs(y_coord)/(x_coord-hlcn_coord[0]))) 
            if team == home_team:
                if x_coord == 89: # To deal with divide by zero issues
                    angle = 90
                else:
                    angle = math.degrees(math.atan(abs(y_coord)/(alcn_coord[0]-x_coord)))
            
    return angle

In [22]:
def get_score_diff(team, away_team, home_team, away_score, home_score):
    
    if team == home_team:
        if abs(home_score-away_score) >= 3:
            if home_score > away_score:
                diff = '3+'
            else:
                diff = '-3-'
        else:
            diff = str(home_score-away_score)
            
    if team == away_team:
        if abs(away_score-home_score) >= 3:
            if away_score > home_score:
                diff = '3+'
            else:
                diff = '-3-'
        else:
            diff = str(away_score-home_score)
                       
    return diff

In [23]:
def get_seconds_elapsed(df):
    
    epochtime = datetime(1900,1,1)
    
    for i in range(len(df)):
        
        delta = datetime.strptime(allshots.at[i,'Time_Elapsed'],'%M:%S') - epochtime
        df.at[i,'Seconds_Elapsed'] = delta.total_seconds()
        
    return df

In [24]:
def get_rebound_status(df,tdelta):
    
    shot_events = [505,506,507,508]
    df.at[0,'Is_Rebound'] = 0
    
    for i in range(1,len(df)):
        if (df.at[i,'Event_tc'] in shot_events) & (df.at[i-1,'Event_tc'] in shot_events):
            if (df.at[i,'Period'] == df.at[i-1,'Period']):
                if df.at[i,'Seconds_Elapsed']-df.at[i-1,'Seconds_Elapsed'] <= tdelta:
                    df.at[i,'Is_Rebound'] = 1
                else:
                    df.at[i,'Is_Rebound'] = 0
            else:
                df.at[i,'Is_Rebound'] = 0
        else:
            df.at[i,'Is_Rebound'] = 0
                
    return df

In [25]:
def get_change_of_angle(df):
    
    for i in range(len(df.index)):
        if df.at[i,'Is_Rebound'] == 1:
            if (df.at[i-1,'yC'] > 0) & (df.at[i,'yC'] > 0):
                df.at[i,'Change_of_Angle'] = abs(df.at[i,'Shot_Angle']-df.at[i-1,'Shot_Angle'])
            if (df.at[i-1,'yC'] < 0) & (df.at[i,'yC'] < 0):
                df.at[i,'Change_of_Angle'] = abs(df.at[i,'Shot_Angle']-df.at[i-1,'Shot_Angle'])
            if ((df.at[i-1,'yC'] < 0) & (df.at[i,'yC'] > 0)) | ((df.at[i-1,'yC'] > 0) & (df.at[i,'yC'] < 0)):
                df.at[i,'Change_of_Angle'] = df.at[i,'Shot_Angle']+df.at[i-1,'Shot_Angle']
        if df.at[i,'Is_Rebound'] == 0:
            df.at[i,'Change_of_Angle'] = 0
                
    return df