In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## LOADING

In [3]:
basedir = '/Users/sahara/Documents/GW/ML1/nhlgames/'

In [4]:
game = pd.read_csv(f'{basedir}/game_data/game.csv')
team_stats = pd.read_csv(f'{basedir}/game_data/game_teams_stats.csv')
team_info = pd.read_csv(f'{basedir}/game_data/team_info.csv')

In [223]:
game = game[game['type'].isin(['R','P'])]
team_stats = team_stats[team_stats['settled_in'].isin(['REG', 'OT'])]
datgame = game[['game_id', 'season', 'date_time_GMT']]
datteam = team_stats[['game_id', 'team_id', 'HoA', 'won', 'settled_in', 'head_coach', 'shots', 'hits', 'pim', 'powerPlayOpportunities', 'faceOffWinPercentage', 'giveaways', 'takeaways', 'blocked']]

dat = datteam.merge(datgame, on = 'game_id', how = 'inner')
dat = dat.drop_duplicates(subset = ['game_id', 'team_id'])

dat = dat.sort_values(by = ['team_id', 'date_time_GMT'])

In [224]:
print(dat.shape)
print(datteam.shape)
print(datgame.shape)

(47432, 16)
(52562, 14)
(26295, 3)


## Feature Engineering

In [225]:
dat['pts'] = dat.apply(lambda x: 2 if x.won else 1 if not x.won and x.settled_in == 'OT' else 0, axis = 1)

In [226]:
seasonsorted = sorted(dat['season'].unique())
dat['season_num'] = dat['season'].apply(lambda x: seasonsorted.index(x))
grouped = dat.groupby(['team_id', 'season_num']).agg({'game_id':'count', 'pts':'sum'})
grouped = grouped.reset_index()
grouped['pts_perc_last_szn'] = grouped['pts']/(grouped['game_id']*2)
grouped['season_num'] = grouped['season_num']+1

to_merge = grouped[['team_id', 'season_num', 'pts_perc_last_szn']]
dat = dat.merge(to_merge, on = ['season_num', 'team_id'], how = 'left')

In [227]:
dat['L10_pts_perc'] = dat.groupby(['team_id', 'season_num'])['pts'].transform(lambda x: x.rolling(10,10).sum()/20)
dat['rolling_pts%'] = dat.groupby(['team_id', 'season_num']).apply(lambda x: x.pts.expanding().sum()/(x.pts.expanding().count()*2)).values


In [228]:
dat

Unnamed: 0,game_id,team_id,HoA,won,settled_in,head_coach,shots,hits,pim,powerPlayOpportunities,...,giveaways,takeaways,blocked,season,date_time_GMT,pts,season_num,pts_perc_last_szn,L10_pts_perc,rolling_pts%
0,2000020007,1,home,True,REG,Larry Robinson,34.0,,26.0,4.0,...,,,,20002001,2000-10-06T23:00:00Z,2,0,,,1.000000
1,2000020046,1,away,False,REG,Larry Robinson,31.0,,14.0,4.0,...,,,,20002001,2000-10-13T23:00:00Z,0,0,,,0.500000
2,2000020055,1,home,True,REG,Larry Robinson,31.0,,9.0,5.0,...,,,,20002001,2000-10-14T23:30:00Z,2,0,,,0.666667
3,2000020075,1,away,False,OT,Larry Robinson,37.0,,8.0,3.0,...,,,,20002001,2000-10-17T23:30:00Z,1,0,,,0.625000
4,2000020088,1,away,False,REG,Larry Robinson,36.0,,16.0,4.0,...,,,,20002001,2000-10-19T23:00:00Z,0,0,,,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47427,2019030321,54,home,False,REG,Peter DeBoer,25.0,47.0,4.0,4.0,...,22.0,7.0,18.0,20192020,2020-09-07T00:00:00Z,0,18,0.567416,0.50,0.620690
47428,2019030322,54,home,True,REG,Peter DeBoer,32.0,55.0,4.0,6.0,...,10.0,8.0,11.0,20192020,2020-09-09T00:00:00Z,2,18,0.567416,0.60,0.625000
47429,2019030323,54,away,False,OT,Peter DeBoer,40.0,42.0,2.0,4.0,...,12.0,6.0,16.0,20192020,2020-09-11T00:00:00Z,1,18,0.567416,0.55,0.623596
47430,2019030324,54,away,False,REG,Peter DeBoer,33.0,31.0,8.0,5.0,...,22.0,7.0,14.0,20192020,2020-09-13T00:00:00Z,0,18,0.567416,0.45,0.616667


## Transforming

In [229]:
dat['home'] = dat['HoA'].apply(lambda x: 1 if x=='home' else 0)

In [239]:
dat['date_time_GMT']

0        2000-10-06T23:00:00Z
1        2000-10-13T23:00:00Z
2        2000-10-14T23:30:00Z
3        2000-10-17T23:30:00Z
4        2000-10-19T23:00:00Z
                 ...         
47427    2020-09-07T00:00:00Z
47428    2020-09-09T00:00:00Z
47429    2020-09-11T00:00:00Z
47430    2020-09-13T00:00:00Z
47431    2020-09-15T00:00:00Z
Name: date_time_GMT, Length: 47432, dtype: object

In [230]:
coach_group = dat.groupby(['head_coach']).apply(lambda x: x.won.expanding().sum()/x.won.expanding().count())

In [233]:
coach_group.reset_index()

Unnamed: 0,head_coach,level_1,won
0,Adam Oates,1092,0.000000
1,Adam Oates,1093,0.500000
2,Adam Oates,1094,0.333333
3,Adam Oates,1095,0.250000
4,Adam Oates,1096,0.400000
...,...,...,...
47427,Willie Desjardins,40089,0.429022
47428,Willie Desjardins,40090,0.427673
47429,Willie Desjardins,40091,0.429467
47430,Willie Desjardins,40092,0.428125


In [214]:
coach_group

head_coach              
Adam Oates         1092     0.000000
                   1093     0.500000
                   1094     0.333333
                   1095     0.250000
                   1096     0.400000
                              ...   
Willie Desjardins  40107    0.429022
                   40108    0.427673
                   40109    0.429467
                   40110    0.428125
                   40111    0.429907
Name: won, Length: 47432, dtype: float64

In [218]:
dat[dat['head_coach'].isna()]

Unnamed: 0,game_id,team_id,HoA,won,settled_in,head_coach,shots,hits,pim,powerPlayOpportunities,...,season,date_time_GMT,pts,season_num,pts_perc_last,L10_pts_perc,rolling_pts%,pts_perc_last_szn,Home,home
4480,2016030217,3,away,False,tbc,,,,,,...,20162017,2017-05-11T23:30:00Z,0,15,0.591954,0.6,0.615789,0.591954,0,0
7659,2016030417,5,home,False,tbc,,0.0,0.0,0.0,0.0,...,20162017,2017-06-15T00:00:00Z,0,15,0.65566,0.6,0.671296,0.65566,1,1
7754,2017030227,5,away,False,tbc,,0.0,0.0,0.0,0.0,...,20172018,2018-05-09T23:30:00Z,0,16,0.671296,0.45,0.594737,0.671296,0,0
13948,2016030217,9,home,False,tbc,,,,,,...,20162017,2017-05-11T23:30:00Z,0,15,0.518293,0.65,0.605263,0.518293,1,1
15482,2016030137,10,away,False,tbc,,0.0,0.0,0.0,0.0,...,20162017,2017-04-25T23:30:00Z,0,15,0.420732,0.45,0.573034,0.420732,0,0
22580,2016030137,15,home,False,tbc,,0.0,0.0,0.0,0.0,...,20162017,2017-04-25T23:30:00Z,0,15,0.712766,0.7,0.719101,0.712766,1,1
22682,2017030227,15,home,False,tbc,,0.0,0.0,0.0,0.0,...,20172018,2018-05-09T23:30:00Z,0,16,0.697917,0.7,0.647368,0.697917,1,1
22695,2017030416,15,home,False,tbc,,0.0,0.0,0.0,0.0,...,20172018,2018-06-11T00:00:00Z,0,16,0.697917,0.6,0.643519,0.697917,1,1
22696,2017030417,15,away,False,tbc,,0.0,0.0,0.0,0.0,...,20172018,2018-06-14T00:00:00Z,0,16,0.697917,0.6,0.637615,0.697917,0,0
24230,2016030155,16,home,False,tbc,,0.0,0.0,0.0,0.0,...,20162017,2017-04-23T00:00:00Z,0,15,0.617978,0.25,0.632184,0.617978,1,1
