# Predicting NHL Playoff Games from Event Data

Kaggle data set: https://www.kaggle.com/martinellis/nhl-game-data/

In [120]:
import pandas as pd
import framequery as fq
import featuretools as ft
import hashlib
from featuretools import Feature 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [121]:
game_df = pd.read_csv("game.csv")

In [122]:
game_df.head()

Unnamed: 0,game_id,season,type,date_time,away_team_id,home_team_id,away_goals,home_goals,outcome,home_rink_side_start,venue,venue_link,venue_time_zone_id,venue_time_zone_offset,venue_time_zone_tz
0,2012030221,20122013,P,2013-05-16,3,6,2,3,home win OT,left,TD Garden,/api/v1/venues/null,America/New_York,-4,EDT
1,2012030222,20122013,P,2013-05-19,3,6,2,5,home win REG,left,TD Garden,/api/v1/venues/null,America/New_York,-4,EDT
2,2012030223,20122013,P,2013-05-21,6,3,2,1,away win REG,right,Madison Square Garden,/api/v1/venues/null,America/New_York,-4,EDT
3,2012030224,20122013,P,2013-05-23,6,3,3,4,home win OT,right,Madison Square Garden,/api/v1/venues/null,America/New_York,-4,EDT
4,2012030225,20122013,P,2013-05-25,3,6,1,3,home win REG,left,TD Garden,/api/v1/venues/null,America/New_York,-4,EDT


In [123]:
plays_df = pd.read_csv("game_plays.csv")

# select a ~10% sample of the games
plays_df = fq.execute("""
select *
FROM plays_df
where game_id % 10 = 1
""")


In [124]:
# drop some of the string type fields
plays_df = plays_df.drop(['secondaryType', 'periodType', 'dateTime', 'rink_side'], axis=1).fillna(0)

# convert the remaining strings to integer types via hashing
plays_df['event'] = plays_df['event'].apply(hash)
plays_df['description'] = plays_df['description'].apply(hash)

plays_df.head()


Unnamed: 0,play_id,game_id,play_num,team_id_for,team_id_against,event,x,y,period,periodTime,periodTimeRemaining,goals_away,goals_home,description,st_x,st_y
1,2012030221_45,2012030221,45,0.0,0.0,-8402842510844108862,0.0,0.0,1,501,699,0,0,-1912551273318350847,0.0,0.0
3,2012030221_1,2012030221,1,0.0,0.0,6197313212850204841,0.0,0.0,1,0,1200,0,0,6197313212850204841,0.0,0.0
4,2012030221_2,2012030221,2,0.0,0.0,-1274977264371641193,0.0,0.0,1,0,1200,0,0,-1274977264371641193,0.0,0.0
5,2012030221_3,2012030221,3,0.0,0.0,8193249223154536421,0.0,0.0,1,0,1200,0,0,8193249223154536421,0.0,0.0
6,2012030221_4,2012030221,4,6.0,3.0,4788985380621854529,0.0,0.0,1,0,1200,0,0,8479129636391762706,0.0,0.0


In [125]:

# create feature encodings for the event and description fields
es = ft.EntitySet(id="plays")
es = es.entity_from_dataframe(entity_id="plays", dataframe=plays_df, index="play_id",
           variable_types = { "event": ft.variable_types.Categorical, "description": ft.variable_types.Categorical })       
         
f1 = Feature(es["plays"]["event"])
f2 = Feature(es["plays"]["description"])

encoded, _= ft.encode_features(plays_df, [f1, f2], top_n=10)
encoded.reset_index(inplace=True)

# create an entry set of the encoded play data and games
es = ft.EntitySet(id="plays")
es = es.entity_from_dataframe(entity_id="plays", dataframe=encoded, index="play_id")
es = es.normalize_entity(base_entity_id="plays",new_entity_id="games", index="game_id")

In [126]:
es

Entityset: plays
  Entities:
    plays [Rows: 250250, Columns: 37]
    games [Rows: 764, Columns: 1]
  Relationships:
    plays.game_id -> games.game_id

In [127]:
features, defs = ft.dfs(entityset=es, target_entity="games", max_depth = 2)
features.reset_index(inplace=True)
features.shape

(764, 212)

In [128]:
# assign labels to the generated features
features = fq.execute("""
SELECT f.*, case when g.type = 'P' then 1 else 0 end as label
FROM features f 
JOIN game_df g
  on f.game_id = g.game_id
""")

In [129]:
# count the labels
fq.execute("""
select label, sum(1) as plays
FROM features
group by label
""")

Unnamed: 0,label,plays
0,0,674
1,1,90


In [130]:
y = features['label']
X = features.drop(['label', 'game_id'], axis=1).fillna(0)

In [131]:
lr = LogisticRegression()

model = lr.fit(X, y)
model.score(X, y)

0.9345549738219895

In [132]:

roc_auc_score(y, model.predict_proba(X)[:, 1] )
    

0.9803824596109463