<a href="https://colab.research.google.com/github/ceb263/nhl/blob/main/xG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Imports and input data
import os
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier

%matplotlib inline

pd.set_option('display.max_rows', 150)

In [2]:
# read data
plays = pd.read_pickle('pbp_2012.pkl')

In [36]:
# data preprocessing
def preprocess_plays(df):
    # remove null location data
    df = df.loc[~df['xC'].isnull()]
    df = df.loc[~df['yC'].isnull()]

    # get previous event time and location
    df = df.sort_values(by=['Game_Id','Period','Seconds_Elapsed'])
    df['prev_Event'] = df['Event'].shift(1)
    df['prev_Seconds_Elapsed'] = df['Seconds_Elapsed'].shift(1)
    df['prev_xC'] = df['xC'].shift(1)
    df['prev_yC'] = df['yC'].shift(1)

    # get time elapsed, and distance from previous event
    df['timeSincePrev'] = df['Seconds_Elapsed'] - df['prev_Seconds_Elapsed']
    df['distanceSincePrev'] = np.sqrt(np.square(df['xC']-df['prev_xC']) + np.square(df['yC']-df['prev_yC']))
    df['yDistanceSincePrev'] = np.abs(df['yC'] - df['prev_yC'])

    # remove invalid data
    df = df.loc[df['timeSincePrev']<0]

    # filter for only shots and shot attempts
    df = df.loc[df['Event'].isin(['BLOCK','MISS','SHOT','GOAL'])]

    # get previous shot time and location, and then calculate derived metrics
    df['prevShot_Seconds_Elapsed'] = df['Seconds_Elapsed'].shift(1)
    df['prevShot_xC'] = df['xC'].shift(1)
    df['prevShot_yC'] = df['yC'].shift(1)
    df['prevShot_Ev_Team'] = df['Ev_Team'].shift(1)
    df['prevShot_sameTeam'] = (df['prevShot_Ev_Team']==df['Ev_Team']).astype(int)
    df['timeSincePrevShot'] = df['Seconds_Elapsed'] - df['prevShot_Seconds_Elapsed']
    df['distanceSincePrevShot'] = np.sqrt(np.square(df['xC']-df['prevShot_xC']) + np.square(df['yC']-df['prevShot_yC']))
    df['yDistanceSincePrevShot'] = np.abs(df['yC'] - df['prevShot_yC'])

    # adjust shot locations so everything is on the same side of the ice
    df['loc_adjust_factor'] = (((df['xC']>0).astype(int).astype(float)) - 0.5) * 2
    df['x_adj'] = df['xC']*df['loc_adjust_factor']
    df['y_adj'] = df['yC']*df['loc_adjust_factor']
    df['prev_loc_adjust_factor'] = (((df['prev_xC']>0).astype(int).astype(float)) - 0.5) * 2
    df['prev_x_adj'] = df['prev_xC']*df['prev_loc_adjust_factor']
    df['prev_y_adj'] = df['prev_yC']*df['prev_loc_adjust_factor']
    df['prevShot_loc_adjust_factor'] = (((df['prevShot_xC']>0).astype(int).astype(float)) - 0.5) * 2
    df['prevShot_x_adj'] = df['prevShot_xC']*df['prevShot_loc_adjust_factor']
    df['prevShot_y_adj'] = df['prevShot_yC']*df['prevShot_loc_adjust_factor']

    # add target variable
    df['goal'] = (df['Event']=='GOAL').astype(int)

    return df

In [37]:
shots = preprocess_plays(plays)

In [31]:
shots['x_adj'].describe()

count    98.000000
mean     75.336735
std       7.323094
min      43.000000
25%      70.000000
50%      76.000000
75%      81.000000
max      87.000000
Name: x_adj, dtype: float64

In [38]:
plays.loc[plays['Event']=='BLOCK'].head()

Unnamed: 0,Game_Id,Date,Period,Event,Description,Time_Elapsed,Seconds_Elapsed,Strength,Ev_Zone,Type,Ev_Team,Home_Zone,Away_Team,Home_Team,p1_name,p1_ID,p2_name,p2_ID,p3_name,p3_ID,awayPlayer1,awayPlayer1_id,awayPlayer2,awayPlayer2_id,awayPlayer3,awayPlayer3_id,awayPlayer4,awayPlayer4_id,awayPlayer5,awayPlayer5_id,awayPlayer6,awayPlayer6_id,homePlayer1,homePlayer1_id,homePlayer2,homePlayer2_id,homePlayer3,homePlayer3_id,homePlayer4,homePlayer4_id,homePlayer5,homePlayer5_id,homePlayer6,homePlayer6_id,Away_Players,Home_Players,Away_Score,Home_Score,Away_Goalie,Away_Goalie_Id,Home_Goalie,Home_Goalie_Id,xC,yC,Home_Coach,Away_Coach
4,20003,2013-01-19,1,BLOCK,"CHI #81 HOSSA BLOCKED BY L.A #8 DOUGHTY, Wris...",0:26,26.0,5x5,Def,WRIST SHOT,CHI,Def,CHI,L.A,DREW DOUGHTY,8474563.0,MARIAN HOSSA,8466148.0,,,JONATHAN TOEWS,8473604.0,MARIAN HOSSA,8466148.0,DAN CARCILLO,8470666.0,DUNCAN KEITH,8470281.0,BRENT SEABROOK,8470607.0,COREY CRAWFORD,8470645.0,TREVOR LEWIS,8473453.0,JARRET STOLL,8468526.0,DWIGHT KING,8474100.0,ROB SCUDERI,8467452.0,DREW DOUGHTY,8474563.0,JONATHAN QUICK,8471734.0,6,6,0,0,COREY CRAWFORD,8470645.0,JONATHAN QUICK,8471734.0,72.0,12.0,DARRYL SUTTER,JOEL QUENNEVILLE
9,20003,2013-01-19,1,BLOCK,"L.A #10 RICHARDS BLOCKED BY CHI #27 ODUYA, Ti...",1:14,74.0,5x5,Def,TIP-IN,L.A,Off,CHI,L.A,JOHNNY ODUYA,8469665.0,MIKE RICHARDS,8470617.0,,,DAVE BOLLAND,8471245.0,PATRICK SHARP,8469544.0,PATRICK KANE,8474141.0,NIKLAS HJALMARSSON,8471769.0,JOHNNY ODUYA,8469665.0,COREY CRAWFORD,8470645.0,MIKE RICHARDS,8470617.0,SIMON GAGNE,8467346.0,DUSTIN BROWN,8470606.0,SLAVA VOYNOV,8474594.0,ALEC MARTINEZ,8474166.0,JONATHAN QUICK,8471734.0,6,6,0,0,COREY CRAWFORD,8470645.0,JONATHAN QUICK,8471734.0,-75.0,4.0,DARRYL SUTTER,JOEL QUENNEVILLE
30,20003,2013-01-19,1,BLOCK,"CHI #29 BICKELL BLOCKED BY L.A #2 GREENE, Wri...",5:12,312.0,5x5,Def,WRIST SHOT,CHI,Def,CHI,L.A,MATT GREENE,8470121.0,BRYAN BICKELL,8471254.0,,,MARCUS KRUGER,8475323.0,MICHAEL FROLIK,8473564.0,BRYAN BICKELL,8471254.0,NIKLAS HJALMARSSON,8471769.0,JOHNNY ODUYA,8469665.0,COREY CRAWFORD,8470645.0,COLIN FRASER,8470662.0,JORDAN NOLAN,8475325.0,KYLE CLIFFORD,8475160.0,MATT GREENE,8470121.0,DAVIS DREWISKE,8474518.0,JONATHAN QUICK,8471734.0,6,6,1,0,COREY CRAWFORD,8470645.0,JONATHAN QUICK,8471734.0,74.0,-33.0,DARRYL SUTTER,JOEL QUENNEVILLE
32,20003,2013-01-19,1,BLOCK,CHI #4 HJALMARSSON BLOCKED BY L.A #13 CLIFFOR...,5:25,325.0,5x5,Def,SLAP SHOT,CHI,Def,CHI,L.A,KYLE CLIFFORD,8475160.0,NIKLAS HJALMARSSON,8471769.0,,,MARCUS KRUGER,8475323.0,MICHAEL FROLIK,8473564.0,BRYAN BICKELL,8471254.0,NIKLAS HJALMARSSON,8471769.0,JOHNNY ODUYA,8469665.0,COREY CRAWFORD,8470645.0,COLIN FRASER,8470662.0,JORDAN NOLAN,8475325.0,KYLE CLIFFORD,8475160.0,MATT GREENE,8470121.0,DAVIS DREWISKE,8474518.0,JONATHAN QUICK,8471734.0,6,6,1,0,COREY CRAWFORD,8470645.0,JONATHAN QUICK,8471734.0,35.0,-17.0,DARRYL SUTTER,JOEL QUENNEVILLE
42,20003,2013-01-19,1,BLOCK,"L.A #7 SCUDERI BLOCKED BY CHI #17 BROOKBANK, ...",7:54,474.0,5x5,Def,WRIST SHOT,L.A,Off,CHI,L.A,SHELDON BROOKBANK,8469992.0,ROB SCUDERI,8467452.0,,,JONATHAN TOEWS,8473604.0,MARIAN HOSSA,8466148.0,DAN CARCILLO,8470666.0,NIKLAS HJALMARSSON,8471769.0,SHELDON BROOKBANK,8469992.0,COREY CRAWFORD,8470645.0,MIKE RICHARDS,8470617.0,SIMON GAGNE,8467346.0,DUSTIN BROWN,8470606.0,ROB SCUDERI,8467452.0,DREW DOUGHTY,8474563.0,JONATHAN QUICK,8471734.0,6,6,1,0,COREY CRAWFORD,8470645.0,JONATHAN QUICK,8471734.0,-71.0,-6.0,DARRYL SUTTER,JOEL QUENNEVILLE
