In [19]:
import pandas as pd
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [27]:
X = pd.read_csv("breast_cancer.csv")

In [32]:
X.head()

Unnamed: 0,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,malignant
0,5,1,1,1,2,1,3,1,1,0
1,5,4,4,5,7,10,3,2,1,0
2,3,1,1,1,2,2,3,1,1,0
3,6,8,8,1,3,4,3,7,1,0
4,4,1,1,3,2,1,3,1,1,0


In [31]:
X.drop(['Unnamed: 0','id number'],axis=1,inplace=True)

In [33]:
y = X.pop('malignant')

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [35]:
### Grid Search
n_estimators = [300,400,500]
max_features = ['auto', 'sqrt','log2']
min_samples_split = [3,5,7]


forest = RandomForestClassifier(n_jobs=1)
#Parameters of pipelines can be set using ‘__’ separated parameter names:
estimator = GridSearchCV(forest,
                         dict(n_estimators=n_estimators,
                              max_features=max_features,
                              min_samples_split=min_samples_split
                              ), cv=None, n_jobs=-1)

In [36]:
estimator.fit(X_train,y_train)
forestest = estimator.best_estimator_

### Accuracy

In [37]:
accuracy_score(y_test, forestest.predict(X_test))

0.97857142857142854

### Precision/Recall

In [38]:
print classification_report(y_test, forestest.predict(X_test))

             precision    recall  f1-score   support

          0       1.00      0.97      0.98        95
          1       0.94      1.00      0.97        45

avg / total       0.98      0.98      0.98       140



### AUC Score

In [39]:
roc_auc_score(y_test,forestest.predict(X_test))

0.98421052631578954

### K-Folds

In [40]:
scores = cross_validation.cross_val_score(forestest, X, y, cv=10)

In [42]:
mean_score = scores.mean()
std_dev = scores.std()
std_error = scores.std() / math.sqrt(scores.shape[0])
ci =  2.262 * std_error
lower_bound = mean_score - ci
upper_bound = mean_score + ci

print "Score is %f +/-  %f" % (mean_score, ci)
print "Lower Bound: %f" % (lower_bound)
print "Upper Bound: %f" % (upper_bound)

Score is 0.967202 +/-  0.016992
Lower Bound: 0.950210
Upper Bound: 0.984194


### Discussion

The K-Folds CV score was slightly lower than the single holdout AUC score. The AUC score was just over the upper bound of the CI using K-Folds. The accuracy score was higher than the K-Folds score as well, but it was within the CI.

Without performing any data cleaning, the accuracy, precision, recall, and AUC scores were very high at .979, .98, .98, and .984, respectively. This makes me think the model is overfitting the training data. A similar situation came up in the midterm when I had an R-Squared of .99 and very low RMSE for one of my tests, and I knew immediately I had overfit the training set.