In [1]:
%pylab inline

import math

import pandas as pd
from sklearn import cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

Populating the interactive namespace from numpy and matplotlib


In [3]:
# Read the breast-cancer table from the CSV that sits next to the notebook.
data = pd.read_csv(filepath_or_buffer="breast_cancer.csv")

In [4]:
# Preview the first five rows to check the column layout.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,id number,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,malignant
0,0,1000025,5,1,1,1,2,1,3,1,1,0
1,1,1002945,5,4,4,5,7,10,3,2,1,0
2,2,1015425,3,1,1,1,2,2,3,1,1,0
3,3,1016277,6,8,8,1,3,4,3,7,1,0
4,4,1017023,4,1,1,3,2,1,3,1,1,0


In [5]:
# Remove every column that is not a predictor. The preview above shows two
# 'Unnamed' columns (they appear to be saved row indexes) and the patient
# 'id number'; leaving any of them in `data` would feed row order / identifiers
# to the model as if they were features.
non_feature_columns = ['Unnamed: 0.1', 'Unnamed: 0', 'id number']
# Only drop the columns still present so re-running this cell does not raise.
data = data.drop([col for col in non_feature_columns if col in data.columns],
                 axis=1)

In [6]:
# Detach the label column: `pop` returns it and removes it from `data`,
# so `data` holds only the remaining columns afterwards.
y = data.pop(item="malignant")

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=.2,
    random_state=42,
)

In [8]:
# Hyper-parameter grid searched for the random forest.
n_estimators = [300, 400, 500]
# For RandomForestClassifier 'auto' is an alias of 'sqrt', so listing both
# fitted every such configuration twice; only the distinct settings are kept.
max_features = ['sqrt', 'log2']
min_samples_split = [3, 5, 7]

# Fixed seed so the grid-search ranking and all downstream metrics are
# reproducible. The forest itself stays single-process (n_jobs=1) while the
# grid search below fans out over all cores (n_jobs=-1).
rfc = RandomForestClassifier(n_jobs=1, random_state=42)

param_grid = dict(n_estimators=n_estimators,
                  max_features=max_features,
                  min_samples_split=min_samples_split)

# cv=None selects the library's default cross-validation scheme.
estimator = GridSearchCV(rfc, param_grid, cv=None, n_jobs=-1)

In [None]:
# Run the grid search on the training split only.
estimator.fit(X=X_train, y=y_train)

In [None]:
# Display the estimator the grid search selected as best.
estimator.best_estimator_

In [None]:
# Keep a handle on the selected estimator; every evaluation cell below uses it.
best_rfc = estimator.best_estimator_

## K-Fold Cross Validation

In [None]:
# 10-fold cross-validation of the selected forest over the full data set.
# `cross_validation` is imported with the other dependencies in the first cell
# rather than in the middle of the notebook.
scores = cross_validation.cross_val_score(best_rfc, data, y, cv=10)

In [None]:
# Show the per-fold scores returned by cross_val_score.
scores

In [None]:
mean_score = scores.mean()
std_dev = scores.std()
std_error = scores.std() / math.sqrt(scores.shape[0])
ci =  2.262 * std_error
lower_bound = mean_score - ci
upper_bound = mean_score + ci

print "Score is %f +/-  %f" % (mean_score, ci)
print '95 percent probability that if this experiment were repeated over and over the average score would be between %f and %f' % (lower_bound, upper_bound)

## AUC

In [None]:
auc = roc_auc_score(y_test, best_rfc.predict_proba(X_test)[:,1])
print "AUC Score: ", auc

## Accuracy

In [None]:
accuracy = accuracy_score(y_test, best_rfc.predict(X_test))
print "Accuracy: ", accuracy

## Precision and Recall

In [None]:
print classification_report(y_test, best_rfc.predict(X_test))

In [None]:
print confusion_matrix(y_test, best_rfc.predict(X_test))

**Precision:**  TP / (TP + FP) = 43 / (43 + 2) = .96

**Recall:**   TP / (TP + FN) = 43 / (43 + 2) = .96


#### Accuracy on test data = 0.97  

#### Precision = 96%
#### Interpretation of precision = Of all tumors the model identifies as malignant, 96% are actually malignant; there is a 4% chance that a tumor flagged as malignant is actually benign.

#### Recall is 96%. 
#### Interpretation of recall = Of all tumors that are actually malignant, the model correctly identifies 96%; the remaining 4% are missed (incorrectly identified as benign).

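#### Precision and recall are used to measure the quality of the classifier's output — here the tuned random forest evaluated on the held-out test set (K-Fold cross-validation was used separately above to estimate its score).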