# 5) Random Forest Classifier

How would a Random Forest Classifier compare to the Logistic Regression? What does it tell us about which fields are most important?

In [1]:
#Imports
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
import numpy as np
from pprint import pprint

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn import preprocessing
from sklearn import utils
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import export_graphviz

In [2]:
# Load nfl_game_logs_df.csv; the file's first column becomes the row index.
df = pd.read_csv(
    r'nfl_game_logs_df.csv',
    index_col=0,
)

In [3]:
# Keep only the columns that are usable as Random Forest Classifier features.
# The columns dropped below fall into three categories:
#   * they don't provide useful information about the game,
#   * they would only be known after the game has been played, or
#   * they duplicate a field already in the DataFrame, seen from the
#     opponent's perspective.
# Index.drop raises a KeyError if any listed label is missing from df.
feature_cols = df.columns.drop([
    'day', 'team_name', 'season_year', 'week', 'date', 'game_time',
    'stadium_name', 'location', 'latitude', 'longitude', 'opp', 'result',
    'points_for', 'points_allowed',
    'tot_yds', 'pass_yds', 'rush_yds', 'first_downs', 'turnovers',
    'opp_tot_yds', 'opp_pass_yds', 'opp_rush_yds', 'opp_first_downs',
    'opp_turnovers',
    'exp_pts_off', 'exp_pts_def', 'exp_pts_st',
    'opp_index', 'result_value', 'mov_winner', 'elo_winner_diff',
    'mov_mult', 'elo_change', 'elo_end',
    'team_name_opp', 'season_year_opp', 'week_opp', 'day_opp', 'date_opp',
    'game_time_opp', 'home_team_opp', 'stadium_name_opp', 'location_opp',
    'latitude_opp', 'longitude_opp', 'opp_opp', 'result_opp',
    'points_for_opp', 'points_allowed_opp',
    'tot_yds_opp', 'pass_yds_opp', 'rush_yds_opp', 'first_downs_opp',
    'turnovers_opp',
    'opp_tot_yds_opp', 'opp_pass_yds_opp', 'opp_rush_yds_opp',
    'opp_first_downs_opp', 'opp_turnovers_opp',
    'exp_pts_off_opp', 'exp_pts_def_opp', 'exp_pts_st_opp',
    'opp_index_opp',
    'distance_travelled_opp.1', 'distance_travelled_opp_opp',
    'distance_travelled_opp_diff_opp',
    'elo_start_opp.1', 'elo_start_opp_opp', 'elo_opp_diff_team_opp',
    'exp_win_prob_opp', 'result_value_opp', 'mov_winner_opp',
    'elo_winner_diff_opp', 'mov_mult_opp', 'elo_change_opp', 'elo_end_opp',
])

In [4]:
# Feature matrix: every column listed in feature_cols.
X = df.loc[:, feature_cols]
# Target: result_value with each entry converted to its string form.
y = df['result_value'].apply(str)

In [5]:
# Split features and target into train/test partitions; the fixed seed keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=39,
)

In [6]:
# Random forest with trees capped at depth 5 and a fixed seed; every other
# hyperparameter is left at its library default.
rf = RandomForestClassifier(
    max_depth=5,
    random_state=39,
)

In [7]:
# Learn the label -> integer mapping from the training labels, encode them
# with it, and train the forest on the encoded target.
lab_enc = preprocessing.LabelEncoder()
lab_enc.fit(y_train)
encoded = lab_enc.transform(y_train)
rf.fit(X_train, encoded)

In [8]:
# Encode the held-out labels with the mapping already learned from y_train.
# The encoder must NOT be refit here (fit_transform): refitting on y_test alone
# would renumber the classes whenever the test split does not contain exactly
# the same set of labels, so the integer codes would stop matching the ones
# the forest was trained on and the accuracy below would be silently wrong.
# transform() raises ValueError if y_test contains a label unseen in training.
encoded_test = lab_enc.transform(y_test)

In [9]:
# Predict an integer-encoded class for every row of the held-out features.
y_pred = rf.predict(X_test)

In [10]:
# Share of held-out rows whose encoded label matches the forest's prediction
print(metrics.accuracy_score(y_true=encoded_test, y_pred=y_pred))

# List every hyperparameter the current forest was built with
print('Parameters currently in use:\n')
pprint(rf.get_params())

0.6557705136757839
Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 5,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 39,
 'verbose': 0,
 'warm_start': False}


In [11]:
# Pick one individual tree (position 7) out of the fitted forest so it can be
# exported and inspected on its own.
estimator = rf.estimators_[7]

In [12]:
# Write the selected tree to tree.dot in Graphviz format.
# export_graphviz expects class_names in ascending order of the classes the
# tree was trained on. The forest was fit on LabelEncoder codes of y_train,
# and LabelEncoder numbers labels in sorted order, so the sorted unique
# y_train labels give the correct name for each code. A hard-coded
# ['W', 'L', 'T'] is not tied to that order and can mislabel the nodes.
# Nodes are therefore labelled with the raw result_value strings.
export_graphviz(
    estimator,
    out_file='tree.dot',
    feature_names=list(feature_cols),
    class_names=sorted(y_train.unique()),
)

In [13]:
# Render tree.dot to tree_7.png at 600 dpi with the Graphviz `dot` executable.
# The process exit status is the value of the cell.
from subprocess import call
call([
    'dot', '-Tpng', 'tree.dot',
    '-o', 'tree_7.png',
    '-Gdpi=600',
])

dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.927587 to fit


0

In [14]:
# Pair each feature name with the forest's importance score for it and show
# the 25 highest-scoring features.
(
    pd.DataFrame({
        'feature': list(feature_cols),
        'importance': rf.feature_importances_,
    })
    .sort_values(by='importance', ascending=False)
    .head(25)
)

Unnamed: 0,feature,importance
127,elo_opp_diff_team,0.07536
128,exp_win_prob,0.074342
125,elo_start,0.037375
126,elo_start_opp,0.031951
107,7_game_avg_exp_pts_off,0.023331
104,4_game_avg_exp_pts_off,0.019871
231,6_game_avg_exp_pts_off_opp,0.017756
108,8_game_avg_exp_pts_off,0.016937
233,8_game_avg_exp_pts_off_opp,0.016492
137,8_game_avg_points_for_opp,0.016265
