# NHL Game Predictor - Feature Extraction

In [1]:
import pandas as pd
import csv
import os
from pathlib import Path
import glob

### Load files from the data/games/ directory
* `game_data_<season>.csv` is retrieved first and holds the game results for the whole season
* `feature_data_<season>.csv` is then populated from the game_data, adding more data scraped from nhl.com

In [4]:
# Locate the CSV inputs. The notebook is expected to run from <repo>/model,
# with the data living in the sibling directory <repo>/data/games/.
predictor_path = os.getcwd()
# Build the path structurally instead of with str.replace("model", ...):
# replace() rewrites *every* "model" substring in the path (e.g. a user or
# repo name containing it) and silently does nothing when it is absent.
# The trailing "" keeps the trailing separator the previous value had, so
# string concatenation onto data_path keeps working.
data_path = os.path.join(os.path.dirname(predictor_path), "data", "games", "")
# glob returns matches in arbitrary order; sort so the listing (and anything
# built from it) is reproducible across machines and runs.
data_files = sorted(glob.glob(os.path.join(data_path, "*.csv")))
data_files

['/Users/bwhitlock/repos/NHL_Predictor/data/games/game_data_2018.csv',
 '/Users/bwhitlock/repos/NHL_Predictor/data/games/feature_data_2018.csv']

### Read data into DataFrame

In [8]:
# Keep just the paths that mention 'feature_data'; everything else is dropped
feature_data = list(filter(lambda csv_path: 'feature_data' in csv_path, data_files))
feature_data

['/Users/bwhitlock/repos/NHL_Predictor/data/games/feature_data_2018.csv']

In [11]:
# Read each season's feature file (first CSV row is the header) and stack
# them into a single frame.
seasons = []
for season_file in feature_data:
    data = pd.read_csv(season_file, header=0)
    seasons.append(data)
# ignore_index=True renumbers the rows 0..n-1. Without it every season keeps
# its own 0-based labels, so the combined frame would carry duplicate index
# values as soon as more than one season file is present.
df = pd.concat(seasons, ignore_index=True)

print("Data for {} games.".format(df.shape[0]))
df.head()

Data for 1271 games.


Unnamed: 0,game_id,visitor_team,visitor_gp,visitor_w,visitor_l,visitor_t,visitor_ot,visitor_p,visitor_gf,visitor_ga,...,home_pp%,home_ts,home_ppga,home_pk%,home_fow,home_fol,home_fow%,home_goals,visitor_goals,extra_time
0,0,Calgary Flames,0,0,0,0,0,0,0,0,...,0.0,0,0,0.0,0,0,0.0,0,3,
1,1,St. Louis Blues,0,0,0,0,0,0,0,0,...,0.0,0,0,0.0,0,0,0.0,5,4,OT
2,2,Philadelphia Flyers,0,0,0,0,0,0,0,0,...,0.0,0,0,0.0,0,0,0.0,5,3,
3,3,Toronto Maple Leafs,0,0,0,0,0,0,0,0,...,0.0,0,0,0.0,0,0,0.0,7,2,
4,4,Arizona Coyotes,0,0,0,0,0,0,0,0,...,0.0,0,0,0.0,0,0,0.0,4,5,


## Features to Extract:
| Feature | Description | Status |
| --- | --- | --- |
| h_days_since_game | home days since last game played | |
| v_days_since_game | visitor days since last game played | |
| h_points | home points thus far this season | Done |
| v_points | visitor points thus far this season | Done |
| h_wins_hth | home wins head to head | |
| v_wins_hth | visitor wins head to head | |
| division_game | Binary - is the game within the division | |
| h_gf_ga | Home goals for to goals against ratio | |
| v_gf_ga | Visitor goals for to goals against ratio |  |

#### Future Considerations - Player considerations
| Feature | Description |
| --- | --- |
| Injuries | Are there any players injured on each team? |
| Star players | Does one team have star players, may be good for early season games | 
| Depth | Depth of team |

In [73]:
# Target label: True when the home side scored more goals than the visitors
df['home_win'] = df['home_goals'].gt(df['visitor_goals'])
df.head()

Unnamed: 0,date_game,visitor_team_name,home_team_name,visitor_goals,home_goals,overtimes,home_win
0,2017-10-04,Calgary Flames,Edmonton Oilers,0,3,False,True
1,2017-10-04,St. Louis Blues,Pittsburgh Penguins,5,4,True,False
2,2017-10-04,Philadelphia Flyers,San Jose Sharks,5,3,False,False
3,2017-10-04,Toronto Maple Leafs,Winnipeg Jets,7,2,False,False
4,2017-10-05,Arizona Coyotes,Anaheim Ducks,4,5,False,True


## Calculate the number of points each team has entering each game

In [87]:
# Independent snapshot of the games; get_teams_points looks up earlier games
# in it while df is being rebuilt by the apply calls.
# NOTE(review): the cells from here on read date_game / *_team_name /
# overtimes, which are not among the columns visible in the feature_data
# preview above -- confirm df holds the intended file on a fresh run.
table = df.copy(deep=True)

def get_teams_points(df, team, games_table=None):
    """Add the standings points a team holds going into one game.

    Written for row-wise use: ``frame.apply(get_teams_points, team=..., axis=1)``.
    Points are tallied over every game in ``games_table`` dated strictly
    before this row's ``date_game``: 2 per win, 1 per loss in a game whose
    ``overtimes`` flag is set, 0 otherwise.

    Parameters
    ----------
    df : pandas.Series
        A single game row (the name is kept for keyword compatibility; it is
        not a DataFrame). Must provide ``date_game`` and ``<team>_team_name``.
    team : str
        ``"home"`` or ``"visitor"`` -- selects which side of the row to score
        and names the output column ``<team>_points``.
    games_table : pandas.DataFrame, optional
        Games to tally from. Needs the columns ``date_game``,
        ``home_team_name``, ``visitor_team_name``, ``home_win`` and
        ``overtimes``. Defaults to the notebook-level ``table`` snapshot.

    Returns
    -------
    pandas.Series
        A copy of the row with ``<team>_points`` set.

    Notes
    -----
    Only the date is used to select earlier games, so if ``games_table``
    spans several seasons the tally carries across them.
    """
    if games_table is None:
        # Fall back to the snapshot created in the notebook before apply().
        games_table = table

    points_col = "{}_points".format(team)
    team_name = df["{}_team_name".format(team)]

    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by apply().
    row = df.copy()

    # Same-day games are excluded on purpose (strict <).
    prior = games_table[games_table['date_game'] < df['date_game']]
    if prior.empty:
        row[points_col] = 0
        return row

    played_at_home = prior["home_team_name"] == team_name
    played_away = prior["visitor_team_name"] == team_name
    home_won = prior["home_win"]
    home_lost = prior["home_win"].eq(False)

    # Count with boolean masks rather than stitching frames together with
    # DataFrame.append, which no longer exists in pandas >= 2.0.
    wins = (played_at_home & home_won) | (played_away & home_lost)
    overtime_losses = (
        ((played_at_home & home_lost) | (played_away & home_won))
        & prior["overtimes"]
    )

    row[points_col] = int(wins.sum()) * 2 + int(overtime_losses.sum())
    return row
    
# Score both sides of every game: home first, then visitor
for side in ("home", "visitor"):
    df = df.apply(get_teams_points, axis=1, team=side)

df.head()

Unnamed: 0,date_game,visitor_team_name,home_team_name,visitor_goals,home_goals,overtimes,home_win,home_points,visitor_win,visitor_points
7,2017-10-05,Pittsburgh Penguins,Chicago Blackhawks,1,10,False,True,0,False,1
19,2017-10-07,Columbus Blue Jackets,Chicago Blackhawks,1,5,False,True,2,False,2
37,2017-10-09,Chicago Blackhawks,Toronto Maple Leafs,3,4,True,True,4,False,4
40,2017-10-10,Chicago Blackhawks,Montreal Canadiens,3,1,False,False,2,True,5
51,2017-10-12,Minnesota Wild,Chicago Blackhawks,5,2,False,False,7,True,1
63,2017-10-14,Nashville Predators,Chicago Blackhawks,1,2,True,True,7,False,4
92,2017-10-18,Chicago Blackhawks,St. Louis Blues,2,5,False,True,8,False,9
98,2017-10-19,Edmonton Oilers,Chicago Blackhawks,2,1,True,False,9,True,2
109,2017-10-21,Chicago Blackhawks,Arizona Coyotes,4,2,False,False,1,True,10
134,2017-10-24,Chicago Blackhawks,Vegas Golden Knights,2,4,False,True,12,False,12


## Get the number of days since the last game

In [90]:
# TODO: no days-since-last-game computation appears in this cell yet.
# Disabled scratch snippet, kept for reference: it selects every game in which
# one team appears (as home or visitor) and previews the first 10 rows.
#team = "Chicago Blackhawks"
#filtered = df[(df["home_team_name"] == team) | (df["visitor_team_name"] == team)]
#filtered.head(10)