#### Overall Scoring and Shootout Statistics 

In [41]:
# imports
import pandas as pd
import numpy as np
import statistics
from sklearn.impute import SimpleImputer
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

**Scoring Dataframe**

In [5]:
# read scoring csv file
scoring = pd.read_csv('data/Scoring.csv')
scoring.head()

Unnamed: 0,playerID,year,stint,tmID,lgID,pos,GP,G,A,Pts,...,PostA,PostPts,PostPIM,Post+/-,PostPPG,PostPPA,PostSHG,PostSHA,PostGWG,PostSOG
0,aaltoan01,1997,1,ANA,NHL,C,3.0,0.0,0.0,0.0,...,,,,,,,,,,
1,aaltoan01,1998,1,ANA,NHL,C,73.0,3.0,5.0,8.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,aaltoan01,1999,1,ANA,NHL,C,63.0,7.0,11.0,18.0,...,,,,,,,,,,
3,aaltoan01,2000,1,ANA,NHL,C,12.0,1.0,1.0,2.0,...,,,,,,,,,,
4,abbeybr01,1975,1,CIN,WHA,D,17.0,1.0,0.0,1.0,...,,,,,,,,,,


In [7]:
scoring.shape

(45967, 31)

In [8]:
scoring.isnull().sum()

playerID        0
year            0
stint           0
tmID            0
lgID            0
pos           582
GP            268
G             268
A             268
Pts           268
PIM           268
+/-          9702
PPG          8219
PPA         22927
SHG          8223
SHA         22753
GWG          9400
GTG         17861
SOG          9603
PostGP      26814
PostG       26814
PostA       26873
PostPts     26873
PostPIM     26873
Post+/-     34963
PostPPG     31475
PostPPA     37113
PostSHG     31475
PostSHA     37112
PostGWG     30310
PostSOG     35575
dtype: int64

**Imputing missing values**

In [9]:
# drop columns with over 26k missing values out of the total 46k values 
scoring = scoring.drop(columns = ['PostGP', 'PostG', 'PostA', 'PostPts', 'PostPIM', 'Post+/-', 'PostPPG', 'PostSHG', 'PostSHA', 'PostGWG', 'PostSOG'])

In [10]:
# simple imputer missing values with the mean 
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
scoring['GP'] = imputer.fit_transform(scoring['GP'].values.reshape(-1,1))
scoring['G'] = imputer.fit_transform(scoring['G'].values.reshape(-1,1))
scoring['A'] = imputer.fit_transform(scoring['A'].values.reshape(-1,1))
scoring['Pts'] = imputer.fit_transform(scoring['Pts'].values.reshape(-1,1))
scoring['PIM'] = imputer.fit_transform(scoring['PIM'].values.reshape(-1,1))
scoring['+/-'] = imputer.fit_transform(scoring['+/-'].values.reshape(-1,1))

In [11]:
# scoring stats starts in 1909
scoring['year'].min()

1909

In [12]:
# scoring stats ends in 2011
scoring['year'].max()

2011

**Engineering Features**

In [48]:
# display format
pd.options.display.float_format = '{:.1f}'.format

In [49]:
# engineer a column of number of goals out of games played
pd.options.mode.chained_assignment = None  # default='warn'
scoring['G/GP'] = scoring.apply(lambda x: x['G'] if x['G'] < 1 else x['G']/x['GP'], axis=1)
scoring.head()

Unnamed: 0,playerID,year,stint,tmID,lgID,pos,GP,G,A,Pts,...,+/-,PPG,PPA,SHG,SHA,GWG,GTG,SOG,PostPPA,G/GP
0,aaltoan01,1997,1,ANA,NHL,C,3.0,0.0,0.0,0.0,...,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,0.0
1,aaltoan01,1998,1,ANA,NHL,C,73.0,3.0,5.0,8.0,...,-12.0,2.0,1.0,0.0,0.0,0.0,0.0,61.0,0.0,0.0
2,aaltoan01,1999,1,ANA,NHL,C,63.0,7.0,11.0,18.0,...,-13.0,1.0,0.0,0.0,0.0,1.0,0.0,102.0,,0.1
3,aaltoan01,2000,1,ANA,NHL,C,12.0,1.0,1.0,2.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,,0.1
4,abbeybr01,1975,1,CIN,WHA,D,17.0,1.0,0.0,1.0,...,-3.0,0.0,,0.0,,0.0,,2.0,,0.1


In [61]:
# see what players scored 2 or more goals per game played 
scoring.loc[scoring['G/GP'] >= 2].sort_values(by='year')

Unnamed: 0,playerID,year,stint,tmID,lgID,pos,GP,G,A,Pts,...,PPA,SHG,SHA,GWG,GTG,SOG,PostPPA,G/GP,A/GP,Pts/GP
1884,bawlfni01,1909,1,HAI,NHA,,4.0,10.0,0.0,10.0,...,,,,,,,,2.5,0.0,2.5
39150,smithha01,1909,2,COB,NHA,,10.0,28.0,0.0,28.0,...,,,,,,,,2.8,0.0,2.8
36465,russeer01,1909,1,MOW,NHA,,12.0,32.0,0.0,32.0,...,,,,,,,,2.7,0.0,2.7
32177,patrile01,1909,1,REN,NHA,,11.0,23.0,0.0,23.0,...,,,,,,,,2.1,0.0,2.1
40635,stuarbr02,1909,1,OT1,NHA,,7.0,14.0,0.0,14.0,...,,,,,,,,2.0,0.0,2.0
22250,lalonne01,1909,2,REN,NHA,,5.0,22.0,0.0,22.0,...,,,,,,,,4.4,0.0,4.4
22249,lalonne01,1909,1,LES,NHA,,6.0,16.0,0.0,16.0,...,,,,,,,,2.7,0.0,2.7
25529,malleke01,1909,1,OT1,NHA,,1.0,2.0,0.0,2.0,...,,,,,,,,2.0,0.0,2.0
18497,hylanha01,1909,1,MOW,NHA,,10.0,20.0,0.0,20.0,...,,,,,,,,2.0,0.0,2.0
8474,currial01,1909,1,HAI,NHA,,7.0,14.0,0.0,14.0,...,,,,,,,,2.0,0.0,2.0


In [54]:
# engineer a column of number of assists out of games played
pd.options.mode.chained_assignment = None  # default='warn'
scoring['A/GP'] = scoring.apply(lambda x: x['A'] if x['A'] < 1 else x['A']/x['GP'], axis=1)
scoring.head()

Unnamed: 0,playerID,year,stint,tmID,lgID,pos,GP,G,A,Pts,...,PPG,PPA,SHG,SHA,GWG,GTG,SOG,PostPPA,G/GP,A/GP
0,aaltoan01,1997,1,ANA,NHL,C,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,0.0,0.0
1,aaltoan01,1998,1,ANA,NHL,C,73.0,3.0,5.0,8.0,...,2.0,1.0,0.0,0.0,0.0,0.0,61.0,0.0,0.0,0.1
2,aaltoan01,1999,1,ANA,NHL,C,63.0,7.0,11.0,18.0,...,1.0,0.0,0.0,0.0,1.0,0.0,102.0,,0.1,0.2
3,aaltoan01,2000,1,ANA,NHL,C,12.0,1.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,18.0,,0.1,0.1
4,abbeybr01,1975,1,CIN,WHA,D,17.0,1.0,0.0,1.0,...,0.0,,0.0,,0.0,,2.0,,0.1,0.0


In [60]:
# see what players had 2 or more assists per game played 
scoring.loc[scoring['A/GP'] >= 2].sort_values(by='year')

Unnamed: 0,playerID,year,stint,tmID,lgID,pos,GP,G,A,Pts,...,PPA,SHG,SHA,GWG,GTG,SOG,PostPPA,G/GP,A/GP,Pts/GP
32202,patrimu01,1937,1,NYR,NHL,D,1.0,0.0,2.0,2.0,...,,,,,,,,0.0,2.0,2.0
26699,mcateju01,1943,1,DET,NHL,L,1.0,0.0,2.0,2.0,...,,,,,,,,0.0,2.0,2.0
21849,kylebi01,1950,1,NYR,NHL,C,1.0,0.0,3.0,3.0,...,,,,,,,,0.0,3.0,3.0
29574,murdobo01,1970,1,MTL,NHL,D,1.0,0.0,2.0,2.0,...,,0.0,,0.0,0.0,2.0,,0.0,2.0,2.0
39671,sprinfr01,1975,1,CLF,NHL,R,1.0,0.0,2.0,2.0,...,,0.0,,0.0,0.0,1.0,,0.0,2.0,2.0
42565,tuhoiha01,1978,1,FIN,WHA,,1.0,0.0,2.0,2.0,...,,0.0,,0.0,,,,0.0,2.0,2.0
40782,summara01,1983,1,EDM,NHL,L,2.0,1.0,4.0,5.0,...,,0.0,,0.0,0.0,3.0,,0.5,2.0,2.5
14888,gretzwa01,1985,1,EDM,NHL,C,80.0,52.0,163.0,215.0,...,,3.0,,6.0,1.0,350.0,,0.7,2.0,2.7
23681,leroufr01,1990,1,EDM,NHL,D,1.0,0.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,1.0,,0.0,2.0,2.0
35215,richato01,1990,1,HAR,NHL,D,2.0,0.0,4.0,4.0,...,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,2.0,2.0


In [56]:
# engineer a column of number of assists out of games played
pd.options.mode.chained_assignment = None  # default='warn'
scoring['Pts/GP'] = scoring.apply(lambda x: x['Pts'] if x['Pts'] < 1 else x['Pts']/x['GP'], axis=1)
scoring.head()

Unnamed: 0,playerID,year,stint,tmID,lgID,pos,GP,G,A,Pts,...,PPA,SHG,SHA,GWG,GTG,SOG,PostPPA,G/GP,A/GP,Pts/GP
0,aaltoan01,1997,1,ANA,NHL,C,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,,0.0,0.0,0.0
1,aaltoan01,1998,1,ANA,NHL,C,73.0,3.0,5.0,8.0,...,1.0,0.0,0.0,0.0,0.0,61.0,0.0,0.0,0.1,0.1
2,aaltoan01,1999,1,ANA,NHL,C,63.0,7.0,11.0,18.0,...,0.0,0.0,0.0,1.0,0.0,102.0,,0.1,0.2,0.3
3,aaltoan01,2000,1,ANA,NHL,C,12.0,1.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,18.0,,0.1,0.1,0.2
4,abbeybr01,1975,1,CIN,WHA,D,17.0,1.0,0.0,1.0,...,,0.0,,0.0,,2.0,,0.1,0.0,0.1


In [59]:
# see what players had 2 or more assists per game played 
scoring.loc[scoring['Pts/GP'] >= 2].sort_values(by='year')

Unnamed: 0,playerID,year,stint,tmID,lgID,pos,GP,G,A,Pts,...,PPA,SHG,SHA,GWG,GTG,SOG,PostPPA,G/GP,A/GP,Pts/GP
22249,lalonne01,1909,1,LES,NHA,,6.0,16.0,0.0,16.0,...,,,,,,,,2.7,0.0,2.7
36465,russeer01,1909,1,MOW,NHA,,12.0,32.0,0.0,32.0,...,,,,,,,,2.7,0.0,2.7
18497,hylanha01,1909,1,MOW,NHA,,10.0,20.0,0.0,20.0,...,,,,,,,,2.0,0.0,2.0
40635,stuarbr02,1909,1,OT1,NHA,,7.0,14.0,0.0,14.0,...,,,,,,,,2.0,0.0,2.0
20466,kerral02,1909,1,OT1,NHA,,4.0,8.0,0.0,8.0,...,,,,,,,,2.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23613,lemiema01,1992,1,PIT,NHL,C,60.0,69.0,91.0,160.0,...,39.0,6.0,3.0,10.0,0.0,286.0,6.0,1.1,1.5,2.7
11676,fedotan01,1992,1,WIN,NHL,D,1.0,0.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,1.0,,0.0,2.0,2.0
4669,browncu01,1994,1,BUF,NHL,C,1.0,1.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,4.0,,1.0,1.0,2.0
23615,lemiema01,1995,1,PIT,NHL,C,70.0,69.0,92.0,161.0,...,48.0,8.0,1.0,8.0,0.0,338.0,9.0,1.0,1.3,2.3


In [70]:
# filter scoring stats to extract center stats
scoring_center = scoring.loc[scoring['pos'] == 'C']

In [71]:
# average goals in a year for center
statistics.mean(scoring_center['G'])

10.52025608882313

In [72]:
# average +/- score in a year
statistics.mean(scoring_center['+/-'])

-0.5284266249469673

In [73]:
# average assists in a year
statistics.mean(scoring_center['A'])

16.30504034412419

In [74]:
# average pts / game for centerman historically
statistics.mean(scoring_center['Pts/GP'])

0.4972524653709728

In [75]:
# average goals / game for centerman historically
statistics.mean(scoring_center['G/GP'])

0.19761462325840776

**Scoring Shootouts**

In [69]:
# read scoring shootouts csv file
shoot = pd.read_csv('data/ScoringShootout.csv')
shoot.head()

Unnamed: 0,playerID,year,stint,tmID,S,G,GDG
0,adamske01,2006,1,PHO,1,0,0
1,afanadm01,2005,1,TBL,1,0,0
2,afanadm01,2006,1,TBL,2,1,1
3,afinoma01,2005,1,BUF,5,3,2
4,afinoma01,2006,1,BUF,6,2,1


In [32]:
# engineer a column of number of goals out of the number of shots
pd.options.mode.chained_assignment = None  # default='warn'
shoot['G/S'] = shoot.apply(lambda x: x['G'] if x['G'] == 0 or x['S'] == 0 else x['G']/x['S'], axis=1)
shoot.head()

Unnamed: 0,playerID,year,stint,tmID,S,G,GDG,G/S
0,adamske01,2006,1,PHO,1,0,0,0.0
1,afanadm01,2005,1,TBL,1,0,0,0.0
2,afanadm01,2006,1,TBL,2,1,1,0.5
3,afinoma01,2005,1,BUF,5,3,2,0.6
4,afinoma01,2006,1,BUF,6,2,1,0.333333


In [33]:
# average g/s
statistics.mean(shoot['G/S'])

0.5466528948409201

In [22]:
# median of game deciding goals
statistics.median(shoot['GDG'])

0.0

In [23]:
# average game deciding goals
statistics.mean(shoot['GDG'])

0.5492277992277992