# FINAL RANDOM FOREST MODEL TO BE PICKLED IN WEB APP

In [2]:
print(__doc__)

import numpy as np
import scipy as sp
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from time import time
from scipy.stats import randint as sp_randint
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import cross_validation, linear_model
from sklearn.externals import joblib
%matplotlib inline 

Automatically created module for IPython interactive environment




In [3]:
#### Import training data
train_gut = pd.read_csv('train_valid.csv')
X = train_gut[['SP1','SP2','SP3','SP4','SP5','SP6','SP7','SP8','SP9','SP10','SP11','SP12','SP13','SP14','SP15','SP16','SP17','SP18','SP19','SP20','SP21','SP22','SP23','SP24','SP25','SP26','SP27']].values
y = train_gut['Group'].values

In [4]:
#### random forest model
rf_final = RandomForestClassifier(max_depth=None, n_estimators=50, max_features= 1, min_samples_split=6,min_samples_leaf = 4, bootstrap=False, criterion='gini') 
RF_final = rf_final.fit(X, y) #model fitting



In [8]:
#### Set leave-one-out validation
#### Cross validation iterator
#http://scikit-learn.org/stable/modules/cross_validation.html#leave-one-out-loo

from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut
#loo = LeaveOneOut()

### eg

loo = cross_validation.LeaveOneOut(X.shape[0])
#for train_i, test_i in loo:
#    score = rf_final.fit(X[train_i], y[train_i]).score(X[test_i], y[test_i])
#    print('Sample %d score: %f' % (test_i[0], score))

In [5]:
### Cross validation
from sklearn import cross_validation, linear_model
cv = cross_validation.ShuffleSplit(len(y), n_iter=50,test_size=0.2) #Set test set as 20% instead

In [6]:
### Use 20% as validation 
scores_RF_final = cross_validation.cross_val_score(RF_final,X, y, cv=cv)
scores_RF_final.mean(), scores_RF_final.std()

(0.73826086956521753, 0.1115249173205918)

In [11]:
### Use leave one out
scores_RF_final_loo = cross_validation.cross_val_score(RF_final,X, y, cv=loo)
scores_RF_final_loo.mean(), scores_RF_final_loo.std()

(0.76315789473684215, 0.4251445900369345)

In [7]:
### Import test data
bulk_test = pd.read_csv('gut_test.csv')
X_bt = bulk_test[['SP1','SP2','SP3','SP4','SP5','SP6','SP7','SP8','SP9','SP10','SP11','SP12','SP13','SP14','SP15','SP16','SP17','SP18','SP19','SP20','SP21','SP22','SP23','SP24','SP25','SP26','SP27']].values
y_bt = bulk_test['Group'].values

In [8]:
### Performance evaluation for test data with the default 0.5 cutoff 
acc = (y_bt == RF_final.predict(X_bt))
print np.mean(acc)

0.777777777778


In [11]:
### Performance evaluation for test data with the turned-up 0.7 cutoff
cancer_chance_test = RF_final.predict_proba(X_bt)[:,0]
print cancer_chance_test
cancer_bin_test_7 = []
for x in np.nditer(cancer_chance_test):
    if x > 0.7:
        z = 0
    else:
        z = 1
    cancer_bin_test_7.append(z)
#print cancer_bin_train_l
import numpy as np
cancer_predict_test_7 = np.asarray(cancer_bin_test_7)
print cancer_predict_test_7

[ 0.6299847   0.69271164  0.79952772  0.78193359  0.74621272  0.75304033
  0.805799    0.86492885  0.62974666  0.63537706  0.69455513  0.75831895
  0.75954819  0.82012647  0.64300117  0.66613569  0.66624276  0.5918645
  0.56022304  0.32124936  0.45017965  0.34697469  0.37250621  0.56293766
  0.53362594  0.54253964  0.70986883]
[1 1 0 0 0 0 0 0 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0]


In [17]:
cancer_chance_train = RF_final.predict_proba(X)[:,0]
print cancer_chance_train
cancer_bin_train_l = []
cancer_chance_test = RF_final.predict_proba(X_bt)[:,0]
print cancer_chance_test
cancer_bin_test_7 = []
for x in np.nditer(cancer_chance_test):
    if x > 0.7:
        z = 0
    else:
        z = 1
    cancer_bin_test_7.append(z)
#print cancer_bin_train_l
import numpy as np
cancer_predict_test_7 = np.asarray(cancer_bin_test_7)
print cancer_predict_test_7

[ 0.78333865  0.81735091  0.76435247  0.779519    0.72919594  0.77413955
  0.8705656   0.88808973  0.86645149  0.66478234  0.7771855   0.80161125
  0.73075751  0.73403068  0.83285913  0.85764457  0.82194773  0.84105839
  0.85398567  0.89156802  0.8124956   0.66946909  0.79362838  0.83532379
  0.77343057  0.72475519  0.87793443  0.79106317  0.86202689  0.85461062
  0.78395713  0.63391342  0.89690328  0.83269078  0.75820183  0.74629608
  0.89121953  0.89897108  0.74508637  0.84284353  0.87811236  0.68192898
  0.72415205  0.72397368  0.89894612  0.70159451  0.83412518  0.90316267
  0.86460048  0.90302836  0.7988611   0.81591485  0.82022986  0.6667792
  0.84511595  0.8293157   0.62190332  0.91968534  0.82833172  0.8308848
  0.80791226  0.76124382  0.80406836  0.64103763  0.77799145  0.87108107
  0.88747241  0.85494051  0.84949059  0.81579549  0.88704175  0.19610761
  0.25703008  0.25639683  0.48827373  0.2917618   0.19884848  0.23207073
  0.27969001  0.47065425  0.41678322  0.25595238  0.1

In [18]:
acc2 = (y_bt == cancer_predict_train)
print np.mean(acc2)

0.851851851852


In [19]:
## Include confusion matrix to see TP, FP, TN, FN
## for the default 0.5
from sklearn.metrics import confusion_matrix
confusion_matrix(y_bt, RF_final.predict(X_bt))

array([[17,  0],
       [ 6,  4]])

In [20]:
## confusion matrix for the 0.7 new table
confusion_matrix(y_bt, cancer_predict_train)

array([[13,  4],
       [ 0, 10]])

In [1]:
## confusion matrix for the 0.6 new table
cancer_chance_train = RF_final.predict_proba(X)[:,0]
print cancer_chance_train
cancer_bin_train_6 = []
for x in np.nditer(cancer_chance_train):
    if x > 0.6:
        z = 0
    else:
        z = 1
    cancer_bin_train_6.append(z)
#print cancer_bin_train_l
import numpy as np
cancer_predict_train_6 = np.asarray(cancer_bin_train_6)
print cancer_predict_train_6

NameError: name 'RF_final' is not defined

In [21]:
## Pickle the final model for web app 
from sklearn.externals import joblib

In [22]:
joblib.dump(RF_final,'gut_app/fit_models/RF_final.pkl')  #pickle related

['gut_app/fit_models/RF_final.pkl']

In [None]:
## Make ROC curve and calculate AUC to compare methods

RF_final_predict_test = RF_final.predict_proba(X_bt)[:,1]  #Proba prediction  
fpr_t, tpr_t, thresholds_t = roc_curve(y_bt, RF_final_predict_test)

from sklearn.metrics import roc_auc_score ##roc auc##
roc_auc_score_t = roc_auc_score(y_bt, RF_predict_test)
roc_auc_score_t

In [11]:
rf_final.fit(X,y)
importances = rf_final.feature_importances_

In [13]:
importances

array([ 0.04062714,  0.07128638,  0.08709663,  0.03518391,  0.05396193,
        0.02563821,  0.02183593,  0.04362975,  0.05204962,  0.06330605,
        0.0184215 ,  0.03394493,  0.03492832,  0.03029722,  0.02310572,
        0.02749652,  0.04537184,  0.02834343,  0.03760688,  0.03134208,
        0.03944701,  0.02503717,  0.0284081 ,  0.01968092,  0.02476821,
        0.0345748 ,  0.02260978])