In [28]:
# imports
import pandas as pd
import numpy as np
import statistics
from sklearn.impute import SimpleImputer

In [8]:
# read scoring csv file
scoring = pd.read_csv('data/Scoring.csv')
scoring.head()

Unnamed: 0,playerID,year,stint,tmID,lgID,pos,GP,G,A,Pts,...,PostA,PostPts,PostPIM,Post+/-,PostPPG,PostPPA,PostSHG,PostSHA,PostGWG,PostSOG
0,aaltoan01,1997,1,ANA,NHL,C,3.0,0.0,0.0,0.0,...,,,,,,,,,,
1,aaltoan01,1998,1,ANA,NHL,C,73.0,3.0,5.0,8.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,aaltoan01,1999,1,ANA,NHL,C,63.0,7.0,11.0,18.0,...,,,,,,,,,,
3,aaltoan01,2000,1,ANA,NHL,C,12.0,1.0,1.0,2.0,...,,,,,,,,,,
4,abbeybr01,1975,1,CIN,WHA,D,17.0,1.0,0.0,1.0,...,,,,,,,,,,


In [11]:
scoring.columns

Index(['playerID', 'year', 'stint', 'tmID', 'lgID', 'pos', 'GP', 'G', 'A',
       'Pts', 'PIM', '+/-', 'PPG', 'PPA', 'SHG', 'SHA', 'GWG', 'GTG', 'SOG',
       'PostGP', 'PostG', 'PostA', 'PostPts', 'PostPIM', 'Post+/-', 'PostPPG',
       'PostPPA', 'PostSHG', 'PostSHA', 'PostGWG', 'PostSOG'],
      dtype='object')

In [14]:
scoring.shape

(45967, 31)

In [13]:
scoring.isnull().sum()

playerID        0
year            0
stint           0
tmID            0
lgID            0
pos           582
GP            268
G             268
A             268
Pts           268
PIM           268
+/-          9702
PPG          8219
PPA         22927
SHG          8223
SHA         22753
GWG          9400
GTG         17861
SOG          9603
PostGP      26814
PostG       26814
PostA       26873
PostPts     26873
PostPIM     26873
Post+/-     34963
PostPPG     31475
PostPPA     37113
PostSHG     31475
PostSHA     37112
PostGWG     30310
PostSOG     35575
dtype: int64

In [29]:
# drop columns with over 26k missing values out of the total 46k values 
scoring = scoring.drop(columns = ['PostGP', 'PostG', 'PostA', 'PostPts', 'PostPIM', 'Post+/-', 'PostPPG', 'PostSHG', 'PostSHA', 'PostGWG', 'PostSOG'])

In [46]:
# simple imputer missing values with the mean 
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
scoring['GP'] = imputer.fit_transform(scoring['GP'].values.reshape(-1,1))
scoring['G'] = imputer.fit_transform(scoring['G'].values.reshape(-1,1))
scoring['A'] = imputer.fit_transform(scoring['A'].values.reshape(-1,1))
scoring['Pts'] = imputer.fit_transform(scoring['Pts'].values.reshape(-1,1))
scoring['PIM'] = imputer.fit_transform(scoring['PIM'].values.reshape(-1,1))
scoring['+/-'] = imputer.fit_transform(scoring['+/-'].values.reshape(-1,1))

In [17]:
# scoring stats starts in 1909
scoring['year'].min()

1909

In [18]:
# scoring stats ends in 2011
scoring['year'].max()

2011

In [21]:
# groupby scoring stats up to 1920
scoring_1910s = scoring.loc[scoring['year'] < 1920]
scoring_1910

Unnamed: 0,playerID,year,stint,tmID,lgID,pos,GP,G,A,Pts,...,PostA,PostPts,PostPIM,Post+/-,PostPPG,PostPPA,PostSHG,PostSHA,PostGWG,PostSOG
80,adamsbi01,1919,1,VML,PCHA,R,16.0,0.0,0.0,0.0,...,0.0,0.0,0.0,,,,,,,
129,adamsja01,1917,1,TOA,NHL,C,8.0,0.0,0.0,0.0,...,0.0,1.0,6.0,,,,,,,
130,adamsja01,1918,1,TOA,NHL,C,17.0,3.0,3.0,6.0,...,,,,,,,,,,
131,adamsja01,1919,1,VML,PCHA,C,22.0,9.0,4.0,13.0,...,0.0,0.0,0.0,,,,,,,
723,arbouam01,1915,1,MOC,NHA,,20.0,5.0,0.0,5.0,...,,,,,,,,,,


In [48]:
# filter scoring stats to extract left-winger stats
scoring_left_winger = scoring.loc[scoring['pos'] == 'L']

In [49]:
# average goals in a year for left-wingers
statistics.mean(scoring_left_winger['G'])

9.952905446128636

In [50]:
# average +/- score in a year
statistics.mean(scoring_left_winger['+/-'])

-0.526567380805169

In [51]:
# average assists in a year
statistics.mean(scoring_left_winger['A'])

12.21738277398811