In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier



In [2]:
X_train = pd.read_pickle('../assets/X_train.pkl')
X_test = pd.read_pickle('../assets/X_test.pkl')
y_train = pd.read_pickle('../assets/y_train.pkl')
y_test = pd.read_pickle('../assets/y_test.pkl')

In [3]:
pipe = Pipeline([
    ('ss',StandardScaler()),
    ('rfc',RandomForestClassifier(n_jobs=3))   
])

In [4]:
param_grid =  {
    'rfc__n_estimators':[25,50,75,100],
    'rfc__min_samples_leaf': [2,4,5,6],
    'rfc__max_features':['log2','sqrt','auto'],
    'rfc__min_samples_split': [.05,.1,.5,.7]
        
}

In [5]:
gs = GridSearchCV(pipe, param_grid=param_grid,verbose=1,scoring='roc_auc')

In [6]:
gs.fit(X_train,y_train)

Fitting 3 folds for each of 192 candidates, totalling 576 fits


[Parallel(n_jobs=1)]: Done 576 out of 576 | elapsed:  2.5min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('rfc', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
         ...n_jobs=3,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'rfc__n_estimators': [25, 50, 75, 100], 'rfc__min_samples_leaf': [2, 4, 5, 6], 'rfc__max_features': ['log2', 'sqrt', 'auto'], 'rfc__min_samples_split': [0.05, 0.1, 0.5, 0.7]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=1)

In [7]:
gs.score(X_train,y_train)

0.8222658198383932

In [8]:
gs.score(X_test,y_test)

0.6829317462791049

In [9]:
X_train_preds = gs.predict(X_train)

In [10]:
preds = pd.DataFrame({
    "preds":X_train_preds,
    "truth":y_train
})


In [11]:
preds.sum()

preds      0
truth    261
dtype: int64

In [12]:
gs.best_params_

{'rfc__max_features': 'auto',
 'rfc__min_samples_leaf': 5,
 'rfc__min_samples_split': 0.5,
 'rfc__n_estimators': 50}

In [13]:
y_train.shape

(6483,)

In [14]:
X_train.shape

(6483, 37)

In [15]:
X_train_preds.shape

(6483,)

In [16]:
preds

Unnamed: 0,preds,truth
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


In [17]:
with open('../assets/random_forest_model.pkl','wb+') as f:
    pickle.dump(gs,f)