# Random Forest
A Random Forest Classifier is a decision-tree-based ensemble method that uses a collection of decision trees to determine the class of our input observations. Each tree considers a random subset of the features at each node, and the predictions of the individual trees are then averaged together. This helps reduce overfitting by not emphasizing any one particular feature over another. We felt this was our next best choice after Logistic Regression because decision trees make no assumptions about the spread of the data and are very quick to train. Though this was not our production model, we did spend a good amount of time searching through hyperparameters to find an optimal fit. Our results with this model exhibited overfitting, with little predictive power on unseen data.


In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier

# Importing the Time Series split assets

In [2]:
# Read the pre-split feature frames and targets from the assets folder,
# features first and targets second (same read order as before).
X_train, X_test = (
    pd.read_pickle(pkl_path)
    for pkl_path in ('../assets/X_train.pkl', '../assets/X_test.pkl')
)
y_train, y_test = (
    pd.read_pickle(pkl_path)
    for pkl_path in ('../assets/y_train.pkl', '../assets/y_test.pkl')
)


In [3]:
# List every feature column available in the training frame before any are dropped.
X_train.columns

Index(['Day_length', 'Tmax', 'Tmin', 'Tavg', 'ResultSpeed', 'ResultDir',
       'AvgSpeed', 'Sunset', 'Sunrise', 'Heat', 'Depart', 'DewPoint',
       'WetBulb', 'Cool', 'PrecipTotal', 'StnPressure', 'Latitude',
       'Longitude', 'Month', 'Day_length_shift', 'Tavg_shift', 'Heat_shift',
       'Cool_shift', 'Tmax_shift', 'Tmin_shift', 'Depart_shift',
       'ResultSpeed_shift', 'ResultDir_shift', 'WetBulb_shift',
       'Species_CULEX ERRATICUS', 'Species_CULEX PIPIENS',
       'Species_CULEX PIPIENS/RESTUANS', 'Species_CULEX RESTUANS',
       'Species_CULEX SALINARIUS', 'Species_CULEX TARSALIS',
       'Species_CULEX TERRITANS'],
      dtype='object')

In [4]:
# Remove the un-shifted weather columns from the training features, leaving
# location, month, the *_shift weather columns and the species dummies.
# The frame is rebound instead of mutated with inplace=True, and
# errors='ignore' makes re-running this cell a no-op rather than a KeyError
# once the columns are already gone.
X_train = X_train.drop(
    columns=[
        'Day_length', 'Tmax', 'Tmin', 'Tavg', 'ResultSpeed', 'ResultDir',
        'AvgSpeed', 'Sunset', 'Sunrise', 'Heat', 'Depart', 'DewPoint',
        'WetBulb', 'Cool', 'PrecipTotal', 'StnPressure',
    ],
    errors='ignore',
)

In [5]:
# Remove the same un-shifted weather columns from the test features so the
# test frame keeps the same columns as the training frame.
# The frame is rebound instead of mutated with inplace=True, and
# errors='ignore' makes re-running this cell a no-op rather than a KeyError
# once the columns are already gone.
X_test = X_test.drop(
    columns=[
        'Day_length', 'Tmax', 'Tmin', 'Tavg', 'ResultSpeed', 'ResultDir',
        'AvgSpeed', 'Sunset', 'Sunrise', 'Heat', 'Depart', 'DewPoint',
        'WetBulb', 'Cool', 'PrecipTotal', 'StnPressure',
    ],
    errors='ignore',
)

In [6]:
# Confirm which feature columns remain in the training frame after the drop.
X_train.columns

Index(['Latitude', 'Longitude', 'Month', 'Day_length_shift', 'Tavg_shift',
       'Heat_shift', 'Cool_shift', 'Tmax_shift', 'Tmin_shift', 'Depart_shift',
       'ResultSpeed_shift', 'ResultDir_shift', 'WetBulb_shift',
       'Species_CULEX ERRATICUS', 'Species_CULEX PIPIENS',
       'Species_CULEX PIPIENS/RESTUANS', 'Species_CULEX RESTUANS',
       'Species_CULEX SALINARIUS', 'Species_CULEX TARSALIS',
       'Species_CULEX TERRITANS'],
      dtype='object')

# Running a pipeline
With a standard scaler and a Random Forest Classifier

In [7]:
# Two-step pipeline: standardize the features ('ss'), then fit a
# 100-tree random forest ('rfc') with a fixed seed for repeatable results.
pipe = Pipeline(steps=[
    ('ss', StandardScaler()),
    ('rfc', RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=3)),
])

# Setting the range of hyperparameters 

In [8]:
# Candidate values for the forest's min_samples_split; the 'rfc__' prefix
# routes the parameter to the pipeline step named 'rfc'.
param_grid = dict(rfc__min_samples_split=[2, 7, 10, 20])

# Grid searching with a time-series split
We used roc_auc as our scoring metric, paired with a time-series split for cross-validation.

In [9]:
# Grid search over param_grid, scored by ROC AUC on time-ordered CV folds.
# n_splits is pinned to 3, the value used for the recorded run
# ("Fitting 3 folds"), because the TimeSeriesSplit default differs between
# scikit-learn releases and would otherwise silently change the folds.
gs = GridSearchCV(
    pipe,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=TimeSeriesSplit(n_splits=3),
    verbose=1,
)

In [10]:
# Run the grid search: every candidate in param_grid is fitted on each CV fold
# of the training data.
gs.fit(X_train,y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    4.8s finished


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=3),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('rfc', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0....imators=100, n_jobs=3,
            oob_score=False, random_state=42, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'rfc__min_samples_split': [2, 7, 10, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

# Scoring
This model routinely overfit our data: the training score below is far higher than the score on the held-out test set.

In [11]:
# Score the fitted search on the training data; with scoring='roc_auc' this is ROC AUC.
gs.score(X_train,y_train)

0.9782298259420594

In [12]:
# Score the fitted search on the held-out test data (ROC AUC, as above) to
# compare against the training score.
gs.score(X_test,y_test)

0.7034711864071378

In [13]:
# Class predictions of the fitted search on the training features.
X_train_preds = gs.predict(X_train)

In [14]:
# Line up the model's training-set predictions against the true labels.
preds = pd.DataFrame(dict(preds=X_train_preds, truth=y_train))


In [15]:
# Column totals of predictions vs. truth.
# NOTE(review): reads as "predicted positives vs. actual positives" only if the
# labels are 0/1 — confirm against the target encoding.
preds.sum()

preds     16
truth    261
dtype: int64

In [16]:
# Hyperparameter combination from param_grid that the grid search selected.
gs.best_params_

{'rfc__min_samples_split': 20}

In [17]:
# NOTE(review): redundant — GridSearchCV and TimeSeriesSplit are already imported
# in the first code cell, and both are used before this point.
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

# Feature importances
The seven highest-ranked features account for most of the importance in our model (roughly 77% of the total).

In [18]:
# One-column frame of the fitted forest's feature importances, indexed by
# the training feature names.
feat_importances = pd.DataFrame(
    data=gs.best_estimator_.named_steps['rfc'].feature_importances_,
    index=X_train.columns,
    columns=['importance'],
)

In [19]:
# Rank the features from most to least important.
feat_importances.sort_values(by='importance', ascending=False)

Unnamed: 0,importance
Longitude,0.252674
Latitude,0.223221
Day_length_shift,0.100039
WetBulb_shift,0.06352
ResultDir_shift,0.048462
Tmin_shift,0.041383
Tmax_shift,0.040809
ResultSpeed_shift,0.0402
Tavg_shift,0.034856
Month,0.031583


In [20]:
# Persist the entire fitted grid-search object to the assets folder.
with open('../assets/rf_0925_303.pkl','wb+') as model_file:
    pickle.dump(gs, model_file)