In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## LOADING

In [3]:
# Root folder that contains the `game_data/` directory read by the loading cell.
# Set the NHL_DATA_DIR environment variable to run the notebook on another
# machine; the original location is kept as the fallback so existing runs are
# unaffected.
basedir = os.environ.get('NHL_DATA_DIR', '/Users/sahara/Documents/GW/ML1/nhlgames/')

In [4]:
# Read the three raw tables in one pass; the tuple order matches the order of
# the names on the left-hand side.
game, team_stats, team_info = map(
    pd.read_csv,
    (
        f'{basedir}/game_data/game.csv',
        f'{basedir}/game_data/game_teams_stats.csv',
        f'{basedir}/game_data/team_info.csv',
    ),
)

In [308]:
# Keep only games whose `type` is 'R' or 'P'
# (presumably regular season and playoffs -- confirm against the dataset docs).
game = game.loc[game['type'].isin(['R', 'P'])]
# Keep only team rows settled in 'REG' or 'OT'; any other `settled_in` value is dropped.
team_stats = team_stats.loc[team_stats['settled_in'].isin(['REG', 'OT'])]

game_cols = ['game_id', 'season', 'date_time_GMT']
team_cols = [
    'game_id', 'team_id', 'HoA', 'won', 'settled_in', 'head_coach',
    'shots', 'hits', 'pim', 'powerPlayOpportunities', 'faceOffWinPercentage',
    'giveaways', 'takeaways', 'blocked',
]
datgame = game[game_cols]
datteam = team_stats[team_cols]

# One row per (game, team): attach season/date to each team row, drop repeated
# (game_id, team_id) pairs, then order each team's games by date.
dat = (
    datteam
    .merge(datgame, on='game_id', how='inner')
    .drop_duplicates(subset=['game_id', 'team_id'])
    .sort_values(by=['team_id', 'date_time_GMT'])
)

In [309]:
# Sanity-check the (rows, columns) of the merged frame and of its two inputs.
for frame in (dat, datteam, datgame):
    print(frame.shape)

(47432, 16)
(52562, 14)
(26295, 3)


## Feature Engineering

In [310]:
# Points earned in each game: 2 for a win, 1 for a loss settled in 'OT',
# 0 for any other loss. np.select evaluates the conditions in order, so the
# 'OT' branch only applies to rows that were not won. This is the vectorised
# equivalent of the previous row-by-row `apply(axis=1)`.
dat['pts'] = np.select(
    [dat['won'].astype(bool), dat['settled_in'].eq('OT')],
    [2, 1],
    default=0,
)

In [311]:
# season_num: position of each season among the sorted distinct seasons (0-based).
seasonsorted = sorted(dat['season'].unique())
season_position = {season: idx for idx, season in enumerate(seasonsorted)}
dat['season_num'] = dat['season'].map(season_position)

# Per team and season: games played and total points.
grouped = (
    dat.groupby(['team_id', 'season_num'])
    .agg({'game_id': 'count', 'pts': 'sum'})
    .reset_index()
)
# Share of the maximum attainable points (2 per game) the team collected.
grouped['pts_perc_last_szn'] = grouped['pts'] / (2 * grouped['game_id'])
# Shift the key forward by one season so that the merge below attaches each
# season's figure to the rows of the *following* season in the data.
grouped['season_num'] += 1

to_merge = grouped[['team_id', 'season_num', 'pts_perc_last_szn']]
# Left merge: rows with no preceding season for that team get NaN.
dat = dat.merge(to_merge, on=['season_num', 'team_id'], how='left')

In [312]:
# Both features are computed per team and season on the `pts` column.
# `transform` returns values aligned on dat's index, so the result does not
# depend on how the rows of `dat` are currently ordered (the previous
# `groupby.apply(...).values` assignment was positional and only correct while
# `dat` happened to be sorted by team and season).
season_pts = dat.groupby(['team_id', 'season_num'])['pts']

# Points share over the last 10 games (20 attainable points); NaN until the
# team has played 10 games in the season.
dat['L10_pts_perc'] = season_pts.transform(lambda x: x.rolling(10, 10).sum() / 20)

# Season-to-date points share: cumulative points / (2 * games played so far).
dat['rolling_pts%'] = season_pts.transform(
    lambda x: x.expanding().sum() / (x.expanding().count() * 2)
)

# NOTE(review): both windows include the current game's own points. If these
# columns are used to predict that same game's outcome they leak the target;
# a `.shift(1)` inside each group would be needed -- confirm intended use.


In [228]:
# Display the frame to eyeball the engineered columns
# (pts, season_num, pts_perc_last_szn, L10_pts_perc, rolling_pts%).
dat

Unnamed: 0,game_id,team_id,HoA,won,settled_in,head_coach,shots,hits,pim,powerPlayOpportunities,...,giveaways,takeaways,blocked,season,date_time_GMT,pts,season_num,pts_perc_last_szn,L10_pts_perc,rolling_pts%
0,2000020007,1,home,True,REG,Larry Robinson,34.0,,26.0,4.0,...,,,,20002001,2000-10-06T23:00:00Z,2,0,,,1.000000
1,2000020046,1,away,False,REG,Larry Robinson,31.0,,14.0,4.0,...,,,,20002001,2000-10-13T23:00:00Z,0,0,,,0.500000
2,2000020055,1,home,True,REG,Larry Robinson,31.0,,9.0,5.0,...,,,,20002001,2000-10-14T23:30:00Z,2,0,,,0.666667
3,2000020075,1,away,False,OT,Larry Robinson,37.0,,8.0,3.0,...,,,,20002001,2000-10-17T23:30:00Z,1,0,,,0.625000
4,2000020088,1,away,False,REG,Larry Robinson,36.0,,16.0,4.0,...,,,,20002001,2000-10-19T23:00:00Z,0,0,,,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47427,2019030321,54,home,False,REG,Peter DeBoer,25.0,47.0,4.0,4.0,...,22.0,7.0,18.0,20192020,2020-09-07T00:00:00Z,0,18,0.567416,0.50,0.620690
47428,2019030322,54,home,True,REG,Peter DeBoer,32.0,55.0,4.0,6.0,...,10.0,8.0,11.0,20192020,2020-09-09T00:00:00Z,2,18,0.567416,0.60,0.625000
47429,2019030323,54,away,False,OT,Peter DeBoer,40.0,42.0,2.0,4.0,...,12.0,6.0,16.0,20192020,2020-09-11T00:00:00Z,1,18,0.567416,0.55,0.623596
47430,2019030324,54,away,False,REG,Peter DeBoer,33.0,31.0,8.0,5.0,...,22.0,7.0,14.0,20192020,2020-09-13T00:00:00Z,0,18,0.567416,0.45,0.616667


## Transforming

In [313]:
# Binary home indicator: 1 where HoA is 'home', 0 for every other value.
dat['home'] = dat['HoA'].eq('home').astype('int64')

In [314]:
# Parse the game start strings into pandas datetimes (column is replaced in place).
dat['date_time_GMT'] = dat['date_time_GMT'].pipe(pd.to_datetime)

In [315]:
# Game start as a POSIX timestamp (float seconds), one value per row.
dat['gametime_unix'] = [ts.timestamp() for ts in dat['date_time_GMT']]

In [316]:
def _coach_running_pts_pct(games):
    """Running points share of one coach, game by game.

    Parameters
    ----------
    games : pandas.DataFrame
        All rows of `dat` for a single head coach.

    Returns
    -------
    pandas.Series
        cumulative points / (2 * games coached so far), indexed by the
        original row labels of `games`.
    """
    # `dat` is ordered by team first, so a coach who worked for several teams
    # must be re-ordered by date for the running total to be chronological.
    pts = games.sort_values('date_time_GMT')['pts']
    running = pts.expanding()
    # Use points rather than wins: with `won` in the numerator and 2 * games in
    # the denominator the value could never exceed 0.5.
    # The series keeps the name 'won' because the following cell renames that
    # column to 'coach_pts%'.
    return (running.sum() / (running.count() * 2)).rename('won')


# Result is indexed by (head_coach, original row label); the row label is what
# the later merge uses to attach the value back onto `dat`.
# NOTE(review): the running value includes the current game's own points --
# confirm this is intended if the column feeds a model predicting that game.
coach_group = dat.groupby(['head_coach']).apply(_coach_running_pts_pct)

In [317]:
# Flatten the (head_coach, row label) index into ordinary columns, then give
# the value column its final feature name.
coach_flat = coach_group.reset_index()
coach_flat = coach_flat.rename(columns={'won': 'coach_pts%'})

In [318]:
# Attach the coach feature by matching dat's row labels to `level_1`, the
# original row label carried in coach_flat. Both frames have a `head_coach`
# column, so the result holds head_coach_x / head_coach_y.
dat = pd.merge(dat, coach_flat, left_index=True, right_on='level_1')

In [319]:
# List the columns after the coach merge, including the merge leftovers
# (head_coach_x, head_coach_y, level_1) that the next cell drops.
dat.columns

Index(['game_id', 'team_id', 'HoA', 'won', 'settled_in', 'head_coach_x',
       'shots', 'hits', 'pim', 'powerPlayOpportunities',
       'faceOffWinPercentage', 'giveaways', 'takeaways', 'blocked', 'season',
       'date_time_GMT', 'pts', 'season_num', 'pts_perc_last_szn',
       'L10_pts_perc', 'rolling_pts%', 'home', 'gametime_unix', 'head_coach_y',
       'level_1', 'coach_pts%'],
      dtype='object')

In [320]:
# Columns already turned into derived features above (HoA, date_time_GMT,
# season, settled_in) or left over from the coach merge; they are not carried
# into `df`.
unused_columns = [
    'HoA', 'head_coach_x', 'date_time_GMT', 'season',
    'head_coach_y', 'level_1', 'settled_in',
]
df = dat.drop(columns=unused_columns)

In [321]:
# Summarise dtypes and non-null counts of `df` to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47432 entries, 28296 to 37926
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   game_id                 47432 non-null  int64  
 1   team_id                 47432 non-null  int64  
 2   won                     47432 non-null  bool   
 3   shots                   47432 non-null  float64
 4   hits                    42512 non-null  float64
 5   pim                     47432 non-null  float64
 6   powerPlayOpportunities  47432 non-null  float64
 7   faceOffWinPercentage    25292 non-null  float64
 8   giveaways               42512 non-null  float64
 9   takeaways               42512 non-null  float64
 10  blocked                 42512 non-null  float64
 11  pts                     47432 non-null  int64  
 12  season_num              47432 non-null  int64  
 13  pts_perc_last_szn       44706 non-null  float64
 14  L10_pts_perc            42275 non-