In [184]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

### Import Data, EDA, Munging, and Test/Train Split

In [185]:
# import data
data = pd.read_csv("../../data/breast_cancer/breast_cancer.csv")

In [186]:
# eda
data.head()

Unnamed: 0.1,Unnamed: 0,id number,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,malignant
0,0,1000025,5,1,1,1,2,1,3,1,1,0
1,1,1002945,5,4,4,5,7,10,3,2,1,0
2,2,1015425,3,1,1,1,2,2,3,1,1,0
3,3,1016277,6,8,8,1,3,4,3,7,1,0
4,4,1017023,4,1,1,3,2,1,3,1,1,0


In [187]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 12 columns):
Unnamed: 0                  699 non-null int64
id number                   699 non-null int64
clump_thickness             699 non-null int64
uniformity_of_cell_size     699 non-null int64
uniformity_of_cell_shape    699 non-null int64
marginal_adhesion           699 non-null int64
epithelial_cell_size        699 non-null int64
bare_nuclei                 699 non-null int64
bland_chromatin             699 non-null int64
normal_nucleoli             699 non-null int64
mitoses                     699 non-null int64
malignant                   699 non-null int64
dtypes: int64(12)
memory usage: 65.6 KB


In [188]:
# drop unused fields
X = data.drop(['Unnamed: 0'], axis=1)

In [189]:
# remove dependent variable
y = X.pop('malignant')

In [190]:
# create test/train split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Random Forest with Defaults (Baseline)

In [191]:
# instantiate and train model
rfc_base = RandomForestClassifier(random_state=42)
rfc_base.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)

In [192]:
# evaulate performance
y_pred_base = rfc_base.predict(X_test)
accuracy_base = accuracy_score(y_test, y_pred_base)
auc_base = roc_auc_score(y_test, y_pred_base)
precision_base = precision_score(y_test, y_pred_base)
recall_base = recall_score(y_test, y_pred_base)
confusionX_base = classification_report(y_test, y_pred_base)
print ("Accuracy: ", accuracy_base, "\nAUC: ", auc_base, "\nPrecision: ", precision_base, "\nRecall: ", recall_base, "\n\n", confusionX_base)

Accuracy:  0.957142857143 
AUC:  0.945029239766 
Precision:  0.953488372093 
Recall:  0.911111111111 

              precision    recall  f1-score   support

          0       0.96      0.98      0.97        95
          1       0.95      0.91      0.93        45

avg / total       0.96      0.96      0.96       140



### Random Forest with OOB

In [193]:
# instantiate and train model
rfc_oob = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=42)
rfc_oob.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=True, random_state=42,
            verbose=0, warm_start=False)

In [194]:
# evaulate performance
y_pred_oob = rfc_oob.predict(X_test)
accuracy_oob = accuracy_score(y_test, y_pred_oob)
auc_oob = roc_auc_score(y_test, y_pred_oob)
precision_oob = precision_score(y_test, y_pred_oob)
recall_oob = recall_score(y_test, y_pred_oob)
confusionX_oob = classification_report(y_test, y_pred_oob)
print ("Accuracy: ", accuracy_oob, "\nAUC: ", auc_oob, "\nPrecision: ", precision_oob, "\nRecall: ", recall_oob, "\n\n", confusionX_oob)

Accuracy:  0.971428571429 
AUC:  0.967251461988 
Precision:  0.955555555556 
Recall:  0.955555555556 

              precision    recall  f1-score   support

          0       0.98      0.98      0.98        95
          1       0.96      0.96      0.96        45

avg / total       0.97      0.97      0.97       140



### Random Forest with OOB and Grid Search

In [None]:
# instantiate and train model
rfc_grid = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=42)
rfc_grid.fit(X_train, y_train)

In [None]:
# evaulate performance
y_pred_grid = rfc_grid.predict(X_test)
accuracy_grid = accuracy_score(y_test, y_pred_grid)
auc_grid = roc_auc_score(y_test, y_pred_grid)
precision_grid = precision_score(y_test, y_pred_grid)
recall_grid = recall_score(y_test, y_pred_grid)
confusionX_grid = classification_report(y_test, y_pred_grid)
print ("Accuracy: ", accuracy_grid, "\nAUC: ", auc_grid, "\nPrecision: ", precision_grid, "\nRecall: ", recall_grid, "\n\n", confusionX_grid)