# NHL Game Predictor
* How it works:
    * This will create the model based on the season of data
    * This will not make the estimates for a games in a season

In [119]:
import pandas as pd
import numpy as np
import csv
import os
from pathlib import Path
import glob
from datetime import datetime

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn import tree

### Load files from data/games/*.csv directory
* The game_data is retrieved first for all the game data of the season
* Using game_data, the feature_data is populated which contains more scraped data from nhl.com

## Load the feature data

In [94]:
predictor_path = os.getcwd()
data_path = predictor_path.replace("model", "data/games/")
data_files = glob.glob(os.path.join(data_path,"feat*_prod.csv"))
data_files

['/Users/bwhitlock/repos/NHL_Predictor/data/games/feature_data_2018_prod.csv']

In [95]:
seasons = []
for season_file in data_files:
    data = pd.read_csv(season_file, header=0)
    seasons.append(data)
df = pd.concat(seasons)

print ("Data for {} games.".format(df.shape[0]))
df.head()

Data for 1271 games.


Unnamed: 0,game_id,visitor_team,visitor_gp,visitor_w,visitor_l,visitor_t,visitor_ot,visitor_p,visitor_gf,visitor_ga,...,home_ts,home_ppga,home_pk%,home_fow,home_fol,home_fow%,home_goals,visitor_goals,extra_time,game_date
0,0,Calgary Flames,0,0,0,0,0,0,0,0,...,0,0,0.0,0,0,0.0,0,3,,2017-10-04
1,1,St. Louis Blues,0,0,0,0,0,0,0,0,...,0,0,0.0,0,0,0.0,5,4,OT,2017-10-04
2,2,Philadelphia Flyers,0,0,0,0,0,0,0,0,...,0,0,0.0,0,0,0.0,5,3,,2017-10-04
3,3,Toronto Maple Leafs,0,0,0,0,0,0,0,0,...,0,0,0.0,0,0,0.0,7,2,,2017-10-04
4,4,Arizona Coyotes,0,0,0,0,0,0,0,0,...,0,0,0.0,0,0,0.0,4,5,,2017-10-05


# Load the points data

In [96]:
season = 2017 # season being 2017 contains data for the 16-17 season
predictor_path = os.getcwd()
data_path = predictor_path.replace("model", "data/points/")
points_files = glob.glob(os.path.join(data_path,"*.csv"))

print (points_files)
points = []
for file in points_files:
    data = pd.read_csv(file, header=0)
    points.append(data)
points_df = pd.concat(points)

points_df = points_df[points_df["year"] == season]
points_df = points_df.drop(columns=["year"])

print ("Data for {} games.".format(points_df.shape[0]))
points_df.head()

['/Users/bwhitlock/repos/NHL_Predictor/data/points/2018.csv', '/Users/bwhitlock/repos/NHL_Predictor/data/points/2016.csv', '/Users/bwhitlock/repos/NHL_Predictor/data/points/2017.csv']
Data for 30 games.


Unnamed: 0,team_name,points
0,Washington Capitals,118
1,Pittsburgh Penguins,111
2,Chicago Blackhawks,109
3,Columbus Blue Jackets,108
4,Minnesota Wild,106


In [97]:
# home_points
df = pd.merge(df, points_df, left_on = 'home_team', right_on = 'team_name')
df = df.drop(columns=["team_name"])
df = df.rename(columns={"points": "home_points"})

# visitor_points
df = pd.merge(df, points_df, left_on = 'visitor_team', right_on = 'team_name')
df = df.drop(columns=["team_name"])
df = df.rename(columns={"points": "visitor_points"})
df.head()

Unnamed: 0,game_id,visitor_team,visitor_gp,visitor_w,visitor_l,visitor_t,visitor_ot,visitor_p,visitor_gf,visitor_ga,...,home_pk%,home_fow,home_fol,home_fow%,home_goals,visitor_goals,extra_time,game_date,home_points,visitor_points
0,0,Calgary Flames,0,0,0,0,0,0,0,0,...,0.0,0,0,0.0,0,3,,2017-10-04,103,94
1,748,Calgary Flames,48,25,16,0,7,57,132,127,...,71.4,1440,1501,49.0,3,4,SO,2018-01-25,103,94
2,1016,Calgary Flames,66,32,25,0,9,73,183,190,...,81.6,2065,2052,50.1,3,4,OT,2018-03-05,111,94
3,571,Calgary Flames,36,18,15,0,3,39,99,104,...,86.5,1041,988,51.3,2,3,SO,2017-12-28,99,94
4,1159,Calgary Flames,75,35,30,0,10,80,202,221,...,84.4,2302,2201,51.1,1,5,,2018-03-24,99,94


# List of possible features scraped from NHL.com

In [98]:
df.columns

Index(['game_id', 'visitor_team', 'visitor_gp', 'visitor_w', 'visitor_l',
       'visitor_t', 'visitor_ot', 'visitor_p', 'visitor_gf', 'visitor_ga',
       'visitor_s/o win', 'visitor_s/o loss', 'visitor_sf', 'visitor_sa',
       'visitor_ppg', 'visitor_pp opp', 'visitor_pp%', 'visitor_ts',
       'visitor_ppga', 'visitor_pk%', 'visitor_fow', 'visitor_fol',
       'visitor_fow%', 'home_team', 'home_gp', 'home_w', 'home_l', 'home_t',
       'home_ot', 'home_p', 'home_gf', 'home_ga', 'home_s/o win',
       'home_s/o loss', 'home_sf', 'home_sa', 'home_ppg', 'home_pp opp',
       'home_pp%', 'home_ts', 'home_ppga', 'home_pk%', 'home_fow', 'home_fol',
       'home_fow%', 'home_goals', 'visitor_goals', 'extra_time', 'game_date',
       'home_points', 'visitor_points'],
      dtype='object')

# Define the Target Variable

In [99]:
df["h_wins"] = df["home_goals"] > df["visitor_goals"]
df[["home_goals", "visitor_goals", "h_wins"]].head()

Unnamed: 0,home_goals,visitor_goals,h_wins
0,0,3,False
1,3,4,False
2,3,4,False
3,2,3,False
4,1,5,False


## Map the extra_time string to boolean

In [100]:
df = df.replace({'extra_time': {'SO': True, 'OT': True, np.NaN:False}})
df[["home_goals", "visitor_goals", "h_wins", "extra_time"]].head()


Unnamed: 0,home_goals,visitor_goals,h_wins,extra_time
0,0,3,False,False
1,3,4,False,True
2,3,4,False,True
3,2,3,False,True
4,1,5,False,False


# Remove some of the not relevant columns

In [101]:
suffix = ["gp", "w", "l", "t", "ot", "s/o win", "s/o loss", "pp opp", "ppg", "ppga", "fow", "fol"]
for prefix in ["visitor_", "home_"]:
    cols = [prefix+x for x in suffix]
    df = df.drop(columns=cols)


In [102]:
df.columns

Index(['game_id', 'visitor_team', 'visitor_p', 'visitor_gf', 'visitor_ga',
       'visitor_sf', 'visitor_sa', 'visitor_pp%', 'visitor_ts', 'visitor_pk%',
       'visitor_fow%', 'home_team', 'home_p', 'home_gf', 'home_ga', 'home_sf',
       'home_sa', 'home_pp%', 'home_ts', 'home_pk%', 'home_fow%', 'home_goals',
       'visitor_goals', 'extra_time', 'game_date', 'home_points',
       'visitor_points', 'h_wins'],
      dtype='object')

## Get the number of days since the last game for Visitor and Home team
* This is a way of defining momentum

In [103]:
df.columns

Index(['game_id', 'visitor_team', 'visitor_p', 'visitor_gf', 'visitor_ga',
       'visitor_sf', 'visitor_sa', 'visitor_pp%', 'visitor_ts', 'visitor_pk%',
       'visitor_fow%', 'home_team', 'home_p', 'home_gf', 'home_ga', 'home_sf',
       'home_sa', 'home_pp%', 'home_ts', 'home_pk%', 'home_fow%', 'home_goals',
       'visitor_goals', 'extra_time', 'game_date', 'home_points',
       'visitor_points', 'h_wins'],
      dtype='object')

In [104]:
def get_num_days(df, game_date, team_name):
    df = df[(df["game_date"] < game_date) & ((df["home_team"] == team_name) | 
                                             (df["visitor_team"] == team_name))]["game_date"]
    if df.empty:
        return 25
    else:
        d1 = datetime.strptime(game_date, '%Y-%m-%d')
        d0 = datetime.strptime(df.max(), '%Y-%m-%d')
        delta = d1 - d0
        return delta.days
    
for index, row in df.iterrows():
    for team_name in ["visitor", "home"]:
        df.loc[index, team_name+"_days_rest"] = get_num_days(df, row["game_date"], row[team_name+"_team"])


# Get the record of the teams last 5 games

In [105]:
def get_recent_record(df, row, team_name):
    is_home = team_name == "home"
    team_name = team_name + "_team"
    
    df = df[(df["game_date"] < row["game_date"]) & ((df["home_team"] == row[team_name]) | 
                                             (df["visitor_team"] == row[team_name]))][["h_wins", "extra_time"]]
    if df.empty:
        return 0
    else:
        df = df.iloc[-5:]
        points = 0
        for index, row in df.iterrows():
            if is_home and row["h_wins"]:
                points += 2
            elif is_home and not row["h_wins"] and row["extra_time"]:
                points += 1
            elif not is_home and not row["h_wins"]:
                points += 2
            elif not is_home and row["h_wins"] and row["extra_time"]:
                points += 1
        
        return points
    
for index, row in df.iterrows():
    for team_name in ["visitor", "home"]:
        df.loc[index, team_name+"_last_5"] = get_recent_record(df, row, team_name)


## Features
### Ratio is Home / Away
| Feature | Description |
| --- | --- |
| gf_r | Goals Ratio |
| ga_r | Goals Against Ratio |
| pp_r | Power Play Ratio |
| pk_r | Penalty Kill Ratio |
| shot_per_r | Shooting Percent Ratio |
| save_per_r | Save Percent Ratio |
| streak_r | Points in Last 5 Ratio |
| rest_r | Days Rest Ratio |
| last_pts_r | Last Seasons Points Ratio |

#### Future Considerations - Player considerations
| Feature | Description |
| --- | --- |
| Injuries | Are there any players injured on each team? |
| Star players | Does one team have star players, may be good for early season games | 
| Depth | Depth of team |
| Randomness | Hockey is a game of luck how can we include this in some way |

In [106]:
df.columns

Index(['game_id', 'visitor_team', 'visitor_p', 'visitor_gf', 'visitor_ga',
       'visitor_sf', 'visitor_sa', 'visitor_pp%', 'visitor_ts', 'visitor_pk%',
       'visitor_fow%', 'home_team', 'home_p', 'home_gf', 'home_ga', 'home_sf',
       'home_sa', 'home_pp%', 'home_ts', 'home_pk%', 'home_fow%', 'home_goals',
       'visitor_goals', 'extra_time', 'game_date', 'home_points',
       'visitor_points', 'h_wins', 'visitor_days_rest', 'home_days_rest',
       'visitor_last_5', 'home_last_5'],
      dtype='object')

# Remove rest of non relevant columns

#### Feature Columns

In [107]:
X = pd.DataFrame()
X["gf_r"] = df["home_gf"] / df["visitor_gf"]
X["ga_r"] = df["home_ga"] / df["visitor_ga"]
X["pp_r"] = df["home_pp%"] / df["visitor_pp%"]
X["pk_r"] = df["home_pk%"] / df["visitor_pk%"]
X["shot_per_r"] = (df["home_gf"] / df["home_sf"]) / (df["visitor_gf"] / df["visitor_sf"])
X["save_per_r"] = (df["home_ga"] / df["home_sa"]) / (df["visitor_ga"] / df["visitor_sa"])
X["streak_r"] = df["home_last_5"] / df["visitor_last_5"]
X["rest_r"] = df["home_days_rest"] / df["visitor_days_rest"]
X["last_pts_r"] = df["home_points"] / df["visitor_points"]
X = X.fillna(0)
X = X.replace([np.inf, -np.inf], 10)

#### Target Column

In [108]:
y = pd.DataFrame()
y["h_wins"] = df["h_wins"]

# Modelling

In [109]:
rfc = RandomForestClassifier(
            n_estimators=20,
            criterion="gini",
            max_features=None, # Use all features to make the cut
        )

In [110]:
model = rfc.fit(X, y["h_wins"])
model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [111]:
rfc.feature_importances_

array([0.1148123 , 0.12617947, 0.12236806, 0.11426122, 0.11302944,
       0.10764074, 0.10434235, 0.06053038, 0.13683603])

In [112]:
v_results = cross_validate(model, X, y["h_wins"], return_train_score=False, cv=5)
v_results

{'fit_time': array([0.1057632 , 0.0873158 , 0.08587909, 0.08717322, 0.08448219]),
 'score_time': array([0.00270581, 0.00200725, 0.00206304, 0.00198197, 0.0025301 ]),
 'test_score': array([0.59192825, 0.52017937, 0.52702703, 0.54751131, 0.53846154])}

# Export to DOT file to view tree

In [122]:
tree_1 = rfc.estimators_[0]
tree.export_graphviz(tree_1,out_file='tree.dot') 