In [None]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier

In [None]:
X_train = pd.read_pickle('../assets/X_train.pkl')
X_test = pd.read_pickle('../assets/X_test.pkl')
y_train = pd.read_pickle('../assets/y_train.pkl')
y_test = pd.read_pickle('../assets/y_test.pkl')

In [None]:
X_train.head()

In [3]:
pipe = Pipeline([
    ('ss',StandardScaler()),
    ('rfc',RandomForestClassifier(n_jobs=3,))   
])

In [6]:
X_train.columns

Index(['Day_length', 'Tmax', 'Tmin', 'Tavg', 'ResultSpeed', 'ResultDir',
       'AvgSpeed', 'Sunset', 'Sunrise', 'Heat', 'Depart', 'DewPoint',
       'WetBulb', 'Cool', 'PrecipTotal', 'StnPressure', 'Latitude',
       'Longitude', 'Month', 'Day_length_exp', 'Tavg_shift', 'Heat_exp',
       'Cool_shift', 'Tmax_shift', 'Tmin_shift', 'Depart_shift',
       'ResultSpeed_shift', 'ResultDir_exp', 'PrecipTotal_exp', 'WetBulb_exp',
       'Species_CULEX ERRATICUS', 'Species_CULEX PIPIENS',
       'Species_CULEX PIPIENS/RESTUANS', 'Species_CULEX RESTUANS',
       'Species_CULEX SALINARIUS', 'Species_CULEX TARSALIS',
       'Species_CULEX TERRITANS'],
      dtype='object')

In [52]:
param_grid =  {
    'rfc__n_estimators': [100],
#     'rfc__min_samples_leaf': [2,4,5,6],
#     'rfc__max_features':['log2','sqrt','auto'],
    'rfc__min_samples_split': [2, 7, 10, 20]
        
}

In [53]:
gs = GridSearchCV(pipe, param_grid=param_grid,verbose=1,scoring='roc_auc', cv=TimeSeriesSplit())

In [54]:
gs.fit(X_train,y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    5.3s finished


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=3),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('rfc', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
         ...n_jobs=3,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'rfc__n_estimators': [100], 'rfc__min_samples_split': [2, 7, 10, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [55]:
gs.score(X_train,y_train)

0.996271418560515

In [56]:
gs.score(X_test,y_test)

0.7238152130625248

In [60]:
X_train_preds = gs.predict(X_train)

In [61]:
preds = pd.DataFrame({
    "preds":X_train_preds,
    "truth":y_train
})


In [62]:
preds.sum()

preds     86
truth    261
dtype: int64

In [51]:
gs.best_params_

{'rfc__min_samples_split': 20, 'rfc__n_estimators': 500}

In [36]:
y_train.shape

(6483,)

In [37]:
X_train.shape

(6483, 37)

In [38]:
X_train_preds.shape

(6483,)

In [16]:
preds

Unnamed: 0,preds,truth
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


In [42]:
with open('../assets/random_forest_model_0924_1247.pkl','wb+') as f:
    pickle.dump(gs,f)

## Looking at the feature importances

In [39]:
feat_importances = pd.DataFrame(gs.best_estimator_.named_steps['rfc'].feature_importances_, X_train.columns, columns=['importance'])

In [40]:
feat_importances.sort_values('importance', ascending=False)

Unnamed: 0,importance
Heat_exp,0.123327
PrecipTotal_exp,0.120967
WetBulb_exp,0.114792
Longitude,0.113067
Day_length_exp,0.109708
ResultDir_exp,0.104793
Latitude,0.091445
Species_CULEX PIPIENS,0.017451
ResultSpeed_shift,0.013871
Species_CULEX PIPIENS/RESTUANS,0.013212
