In [144]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn import cross_validation
import math

%pylab inline

Populating the interactive namespace from numpy and matplotlib


### Import Data, EDA, Munging, and Test/Train Split

In [126]:
# import data
data = pd.read_csv("../../data/breast_cancer/breast_cancer.csv")

In [127]:
# eda
data.head()

Unnamed: 0.1,Unnamed: 0,id number,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,malignant
0,0,1000025,5,1,1,1,2,1,3,1,1,0
1,1,1002945,5,4,4,5,7,10,3,2,1,0
2,2,1015425,3,1,1,1,2,2,3,1,1,0
3,3,1016277,6,8,8,1,3,4,3,7,1,0
4,4,1017023,4,1,1,3,2,1,3,1,1,0


In [128]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 12 columns):
Unnamed: 0                  699 non-null int64
id number                   699 non-null int64
clump_thickness             699 non-null int64
uniformity_of_cell_size     699 non-null int64
uniformity_of_cell_shape    699 non-null int64
marginal_adhesion           699 non-null int64
epithelial_cell_size        699 non-null int64
bare_nuclei                 699 non-null int64
bland_chromatin             699 non-null int64
normal_nucleoli             699 non-null int64
mitoses                     699 non-null int64
malignant                   699 non-null int64
dtypes: int64(12)
memory usage: 65.6 KB


In [129]:
# drop unused fields
X = data.drop(['Unnamed: 0', 'id number'], axis=1)

In [130]:
# remove dependent variable
y = X.pop('malignant')

In [131]:
# create test/train split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Random Forest with Defaults (Baseline)

In [132]:
# instantiate and train model
rfc_base = RandomForestClassifier(random_state=42)
rfc_base.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [133]:
# evaulate performance
y_pred_base = rfc_base.predict(X_test)
accuracy_base = accuracy_score(y_test, y_pred_base)
auc_base = roc_auc_score(y_test, y_pred_base)
precision_base = precision_score(y_test, y_pred_base)
recall_base = recall_score(y_test, y_pred_base)
confusionX_base = classification_report(y_test, y_pred_base)
print ("Accuracy: ", accuracy_base, "\nAUC: ", auc_base, "\nPrecision: ", precision_base, "\nRecall: ", recall_base, "\n\n", confusionX_base)

Accuracy:  0.971428571429 
AUC:  0.967251461988 
Precision:  0.955555555556 
Recall:  0.955555555556 

              precision    recall  f1-score   support

          0       0.98      0.98      0.98        95
          1       0.96      0.96      0.96        45

avg / total       0.97      0.97      0.97       140



### Random Forest with OOB

In [134]:
# instantiate and train model
rfc_oob = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=42)
rfc_oob.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=42, verbose=0, warm_start=False)

In [135]:
# evaulate performance
y_pred_oob = rfc_oob.predict(X_test)
accuracy_oob = accuracy_score(y_test, y_pred_oob)
auc_oob = roc_auc_score(y_test, y_pred_oob)
precision_oob = precision_score(y_test, y_pred_oob)
recall_oob = recall_score(y_test, y_pred_oob)
confusionX_oob = classification_report(y_test, y_pred_oob)
print ("Accuracy: ", accuracy_oob, "\nAUC: ", auc_oob, "\nPrecision: ", precision_oob, "\nRecall: ", recall_oob, "\n\n", confusionX_oob)

Accuracy:  0.964285714286 
AUC:  0.956140350877 
Precision:  0.954545454545 
Recall:  0.933333333333 

              precision    recall  f1-score   support

          0       0.97      0.98      0.97        95
          1       0.95      0.93      0.94        45

avg / total       0.96      0.96      0.96       140



### Random Forest with Grid Search

In [136]:
# set hyperparameter options
n_estimators = [10,100,300,400,500,1000]
max_features = ['auto', 'sqrt', 'log2']
min_samples_split = [2,3,5,7,20,50,100]
oob_score = [True, False]
min_samples_leaf = [1,3,10]

In [137]:
# instantiate and train model with grid search
rfc_grid = RandomForestClassifier(n_jobs=1, random_state=42)
rfc_grid_estimator = GridSearchCV(rfc_grid, dict(n_estimators=n_estimators, max_features=max_features, min_samples_split=min_samples_split, oob_score=oob_score), cv=None, n_jobs=-1)
rfc_grid_estimator.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_features': ['auto', 'sqrt', 'log2'], 'n_estimators': [10, 100, 300, 400, 500, 1000], 'min_samples_split': [2, 3, 5, 7, 20, 50, 100], 'oob_score': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [138]:
# best hyperparameters
rfc_grid_estimator.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=50,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=True, random_state=42, verbose=0, warm_start=False)

In [139]:
# evaulate performance
rfc_grid_best = rfc_grid_estimator.best_estimator_
y_pred_grid = rfc_grid_best.predict(X_test)
accuracy_grid = accuracy_score(y_test, y_pred_grid)
auc_grid = roc_auc_score(y_test, y_pred_grid)
precision_grid = precision_score(y_test, y_pred_grid)
recall_grid = recall_score(y_test, y_pred_grid)
confusionX_grid = classification_report(y_test, y_pred_grid)
print ("Accuracy: ", accuracy_grid, "\nAUC: ", auc_grid, "\nPrecision: ", precision_grid, "\nRecall: ", recall_grid, "\n\n", confusionX_grid)

Accuracy:  0.964285714286 
AUC:  0.96783625731 
Precision:  0.916666666667 
Recall:  0.977777777778 

              precision    recall  f1-score   support

          0       0.99      0.96      0.97        95
          1       0.92      0.98      0.95        45

avg / total       0.97      0.96      0.96       140



### Random Forest with K-Fold Cross Validation

In [140]:
scores = cross_validation.cross_val_score(rfc_grid_best, X, y, cv=10)

In [141]:
scores

array([ 0.91549296,  0.98571429,  0.97142857,  0.91428571,  0.97142857,
        0.95714286,  0.98571429,  0.98571429,  0.98550725,  0.98550725])

In [145]:
mean_score = scores.mean()
std_dev = scores.std()
std_error = scores.std() / math.sqrt(scores.shape[0])
ci =  2.262 * std_error
lower_bound = mean_score - ci
upper_bound = mean_score + ci

print ("Score is %f +/-  %f" % (mean_score, ci))
print ('95 percent probability that if this experiment were repeated over and over the average score would be between %f and %f' % (lower_bound, upper_bound))

Score is 0.965794 +/-  0.019312
95 percent probability that if this experiment were repeated over and over the average score would be between 0.946481 and 0.985106
