# NHL Game Predictor
* How it works:
    * This will create the model based on a season of data
    * This will not make estimates for the games in a season

In [1]:
import pandas as pd
import numpy as np
import csv
import os
from pathlib import Path
import glob
from datetime import datetime

### Load files from data/games/*.csv directory
* The game_data is retrieved first for all the game data of the season
* Using game_data, the feature_data is populated which contains more scraped data from nhl.com

## Load the feature data

In [2]:
# Locate the season feature files. Assumes the notebook is run from the
# repository's ``model`` directory, with the CSVs in the sibling ``data/games``
# directory. Only the final path component is swapped, so a "model" substring
# elsewhere in the path (user name, parent folder, ...) is left untouched.
predictor_path = os.getcwd()
data_path = os.path.join(os.path.dirname(predictor_path), "data", "games", "")
# glob returns files in arbitrary order; sort so seasons always load in the
# same order and the concatenated frame is reproducible.
data_files = sorted(glob.glob(os.path.join(data_path, "feat*_prod.csv")))
data_files

['/Users/bwhitlock/repos/NHL_Predictor/data/games/feature_data_2018_prod.csv']

In [3]:
# Read each season file into its own frame, then stack them into one table.
seasons = []
for season_file in data_files:
    data = pd.read_csv(season_file, header=0)
    seasons.append(data)
# ignore_index=True gives every game a unique row label. Without it each
# season restarts at 0, and the later ``df.loc[index, col] = value`` writes
# would hit one row per season instead of a single game.
df = pd.concat(seasons, ignore_index=True)

print("Data for {} games.".format(df.shape[0]))
df.head()

Data for 1271 games.


Unnamed: 0,game_id,visitor_team,visitor_gp,visitor_w,visitor_l,visitor_t,visitor_ot,visitor_p,visitor_gf,visitor_ga,...,home_ts,home_ppga,home_pk%,home_fow,home_fol,home_fow%,home_goals,visitor_goals,extra_time,game_date
0,0,Calgary Flames,0,0,0,0,0,0,0,0,...,0,0,0.0,0,0,0.0,0,3,,2017-10-04
1,1,St. Louis Blues,0,0,0,0,0,0,0,0,...,0,0,0.0,0,0,0.0,5,4,OT,2017-10-04
2,2,Philadelphia Flyers,0,0,0,0,0,0,0,0,...,0,0,0.0,0,0,0.0,5,3,,2017-10-04
3,3,Toronto Maple Leafs,0,0,0,0,0,0,0,0,...,0,0,0.0,0,0,0.0,7,2,,2017-10-04
4,4,Arizona Coyotes,0,0,0,0,0,0,0,0,...,0,0,0.0,0,0,0.0,4,5,,2017-10-05


# List of possible features scraped from NHL.com

In [4]:
# Display every column name currently present in the loaded frame.
df.columns

Index(['game_id', 'visitor_team', 'visitor_gp', 'visitor_w', 'visitor_l',
       'visitor_t', 'visitor_ot', 'visitor_p', 'visitor_gf', 'visitor_ga',
       'visitor_s/o win', 'visitor_s/o loss', 'visitor_sf', 'visitor_sa',
       'visitor_ppg', 'visitor_pp opp', 'visitor_pp%', 'visitor_ts',
       'visitor_ppga', 'visitor_pk%', 'visitor_fow', 'visitor_fol',
       'visitor_fow%', 'home_team', 'home_gp', 'home_w', 'home_l', 'home_t',
       'home_ot', 'home_p', 'home_gf', 'home_ga', 'home_s/o win',
       'home_s/o loss', 'home_sf', 'home_sa', 'home_ppg', 'home_pp opp',
       'home_pp%', 'home_ts', 'home_ppga', 'home_pk%', 'home_fow', 'home_fol',
       'home_fow%', 'home_goals', 'visitor_goals', 'extra_time', 'game_date'],
      dtype='object')

# Define the Target Variable

In [5]:
# Target: True when the home side scored more goals than the visitor.
df["h_wins"] = df["home_goals"].gt(df["visitor_goals"])
# Preview the score columns next to the derived target.
df.loc[:, ["home_goals", "visitor_goals", "h_wins"]].head()

Unnamed: 0,home_goals,visitor_goals,h_wins
0,0,3,False
1,5,4,True
2,5,3,True
3,7,2,True
4,4,5,False


## Map the extra_time string to boolean

In [6]:
# Map the extra_time marker to a flag: shoot-out ('SO') and overtime ('OT')
# become True, a missing value (game decided in regulation) becomes False.
# ``np.nan`` is used because the ``np.NaN`` alias was removed in NumPy 2.0.
df = df.replace({'extra_time': {'SO': True, 'OT': True, np.nan: False}})
df[["home_goals", "visitor_goals", "h_wins", "extra_time"]].head()


Unnamed: 0,home_goals,visitor_goals,h_wins,extra_time
0,0,3,False,False
1,5,4,True,True
2,5,3,True,False
3,7,2,True,False
4,4,5,False,False


# Remove columns that are not relevant

In [7]:
# Per-team statistics that are not used as model inputs; each suffix exists
# once with a "visitor_" prefix and once with a "home_" prefix.
suffix = ["gp", "w", "l", "t", "ot", "s/o win", "s/o loss", "pp opp", "ppg", "ts", "ppga", "fow", "fol"]
cols = [prefix + x for prefix in ["visitor_", "home_"] for x in suffix]
# errors="ignore" keeps the cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=cols, errors="ignore")


In [9]:
# Display the column names that remain after the drop above.
df.columns

Index(['game_id', 'visitor_team', 'visitor_p', 'visitor_gf', 'visitor_ga',
       'visitor_sf', 'visitor_sa', 'visitor_pp%', 'visitor_pk%',
       'visitor_fow%', 'home_team', 'home_p', 'home_gf', 'home_ga', 'home_sf',
       'home_sa', 'home_pp%', 'home_pk%', 'home_fow%', 'home_goals',
       'visitor_goals', 'extra_time', 'game_date', 'h_wins'],
      dtype='object')

## Features
| Feature | Description | Status |
| --- | --- | --- |
| h_days_since_game | home days since last game played | |
| v_days_since_game | visitor days since last game played | |
| h_points | home points thus far this season | Done |
| v_points | visitor points thus far this season | Done |
| h_wins_hth | home wins head to head | |
| v_wins_hth | visitor wins head to head | |
| division_game | Binary - is the game within the division | |
| h_gf_ga | Home goals for to goals against ratio | |
| v_gf_ga | Visitor goals for to goals against ratio |  |

#### Future Considerations - Player considerations
| Feature | Description |
| --- | --- |
| Injuries | Are there any players injured on each team? |
| Star players | Does one team have star players, may be good for early season games | 
| Depth | Depth of team |

## Get the number of days since the last game for Visitor and Home team
* This is a way of defining momentum

In [10]:
def get_num_days(df, game_date, team_name):
    """Return the number of days between ``game_date`` and the team's previous game.

    Parameters
    ----------
    df : DataFrame with ``game_date``, ``home_team`` and ``visitor_team`` columns.
        Dates are compared as values of the ``game_date`` column and parsed
        with the ``%Y-%m-%d`` format.
    game_date : str
        Date of the game of interest, formatted ``%Y-%m-%d``.
    team_name : str
        Team to look up on either the home or the visitor side.

    Returns
    -------
    int
        Whole days since the latest earlier game involving ``team_name``,
        or 25 when no earlier game is found.
    """
    involves_team = (df["home_team"] == team_name) | (df["visitor_team"] == team_name)
    earlier_dates = df.loc[(df["game_date"] < game_date) & involves_team, "game_date"]

    if not earlier_dates.empty:
        current_day = datetime.strptime(game_date, '%Y-%m-%d')
        previous_day = datetime.strptime(earlier_dates.max(), '%Y-%m-%d')
        return (current_day - previous_day).days

    # No earlier game on record for this team: fall back to a fixed gap.
    return 25
    
# For every game, record the days since each side's previous game in a
# ``days_rest_<team column>`` column (one cell written per game and side).
for index, row in df.iterrows():
    for team_col in ("visitor_team", "home_team"):
        rest_days = get_num_days(df, row["game_date"], row[team_col])
        df.loc[index, "days_rest_" + team_col] = rest_days


# Get the record of each team's last 5 games

In [11]:
def get_recent_record(df, row, is_home):
    """Return the points a team earned in its most recent games (at most five).

    Parameters
    ----------
    df : DataFrame with ``game_date``, ``home_team``, ``visitor_team``,
        ``h_wins`` and ``extra_time`` columns.
    row : Series
        The game being described; supplies ``game_date`` and the team names.
    is_home : bool
        Selects which participant of ``row`` to evaluate: the home team when
        True, the visitor otherwise.

    Returns
    -------
    int
        Sum over the team's last five games played before ``row["game_date"]``:
        2 for a win, 1 for a loss in a game flagged ``extra_time``, 0 for any
        other loss. 0 when the team has no earlier game.
    """
    team = row["home_team" if is_home else "visitor_team"]

    involves_team = (df["home_team"] == team) | (df["visitor_team"] == team)
    earlier = df[(df["game_date"] < row["game_date"]) & involves_team]
    # Stable sort so "last five" means the five latest dates even if the frame
    # is not in chronological order; same-day games keep their original order.
    recent = earlier.sort_values("game_date", kind="mergesort").iloc[-5:]

    points = 0
    # Whether the team was at home must be decided per past game. ``is_home``
    # only describes the current game and must not be used to read ``h_wins``
    # of earlier games, where the team may have been on the other side.
    was_home_flags = recent["home_team"] == team
    for was_home, home_won, extra_time in zip(was_home_flags, recent["h_wins"], recent["extra_time"]):
        team_won = bool(was_home) == bool(home_won)
        if team_won:
            points += 2
        elif extra_time:
            points += 1

    return points
    
# For every game, store each side's recent-form points in a
# ``last_5_<team column>`` column.
for index, row in df.iterrows():
    for team_col in ("visitor_team", "home_team"):
        is_home = team_col == "home_team"
        df.loc[index, "last_5_" + team_col] = get_recent_record(df, row, is_home)


In [103]:
# Preview the first six games together with the newly added feature columns.
df.head(6)

Unnamed: 0,game_id,visitor_team,visitor_gp,visitor_w,visitor_l,visitor_t,visitor_ot,visitor_p,visitor_gf,visitor_ga,...,home_fow%,home_goals,visitor_goals,extra_time,game_date,days_rest_visitor_team,days_rest_home_team,last_5_visitor_team,last_5_home_team,h_wins
0,0,Calgary Flames,0,0,0,0,0,0,0,0,...,0.0,0,3,False,2017-10-04,25.0,25.0,0.0,0.0,False
1,1,St. Louis Blues,0,0,0,0,0,0,0,0,...,0.0,5,4,True,2017-10-04,25.0,25.0,0.0,0.0,True
2,2,Philadelphia Flyers,0,0,0,0,0,0,0,0,...,0.0,5,3,False,2017-10-04,25.0,25.0,0.0,0.0,True
3,3,Toronto Maple Leafs,0,0,0,0,0,0,0,0,...,0.0,7,2,False,2017-10-04,25.0,25.0,0.0,0.0,True
4,4,Arizona Coyotes,0,0,0,0,0,0,0,0,...,0.0,4,5,False,2017-10-05,25.0,25.0,0.0,0.0,False
5,5,Nashville Predators,0,0,0,0,0,0,0,0,...,0.0,3,4,False,2017-10-05,25.0,25.0,0.0,0.0,False
