## Advanced Validation

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
import math

%pylab inline

Populating the interactive namespace from numpy and matplotlib




### 0a.  Using breast_cancer.csv

In [2]:
data = pd.read_csv("data/breast_cancer.csv")

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,id number,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,malignant
0,0,1000025,5,1,1,1,2,1,3,1,1,0
1,1,1002945,5,4,4,5,7,10,3,2,1,0
2,2,1015425,3,1,1,1,2,2,3,1,1,0
3,3,1016277,6,8,8,1,3,4,3,7,1,0
4,4,1017023,4,1,1,3,2,1,3,1,1,0


In [4]:
data = data.drop(['Unnamed: 0', 'id number'], axis=1)

### 0b. Create a random forest model that predicts malignant given the other relevant variables.

In [5]:
#Setting the feature we want to predict
y = data.pop("malignant")

### 0c. Use a single holdout (test/train split)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=.2, random_state=42)

### 0d. Create a random forest model that predicts malignant given the other relevant variables.

### 0e.  Use Grid Search to optimize model hyperparameters.

In [7]:
rfc = RandomForestClassifier(n_jobs=1)

In [8]:
### Grid Search
n_estimators = [300,400,500]
max_features = ['auto', 'sqrt','log2']
min_samples_split = [3,5,7]
#Parameters of pipelines can be set using ‘__’ separated parameter names:
estimator = GridSearchCV(rfc,
                         dict(n_estimators=n_estimators,
                              max_features=max_features,
                              min_samples_split=min_samples_split
                              ), cv=None, n_jobs=-1)

In [9]:
estimator.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [300, 400, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'min_samples_split': [3, 5, 7]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [10]:
estimator.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=3, min_weight_fraction_leaf=0.0,
            n_estimators=400, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [11]:
best_rfc = estimator.best_estimator_

### 0f. Accuracy



In [12]:
accuracy = accuracy_score(y_test, best_rfc.predict(X_test))
print ("Accuracy: ", accuracy)

Accuracy:  0.971428571429


### 0g. Precision and Recall

In [13]:
print (classification_report(y_test, best_rfc.predict(X_test)))

             precision    recall  f1-score   support

          0       0.98      0.98      0.98        95
          1       0.96      0.96      0.96        45

avg / total       0.97      0.97      0.97       140



In [14]:
print (confusion_matrix(y_test, best_rfc.predict(X_test)))

[[93  2]
 [ 2 43]]


**Precision:**  43 / 43 + 2 = .96

**Recall:**   43 / 43 + 2 = .96

### 0h. AUC

In [15]:
roc = roc_auc_score(y_test, best_rfc.predict_proba(X_test)[:,1])
print ("AUC Score: ", roc)

AUC Score:  0.99649122807


### 1. Implement K-Fold Cross Validation, with 10 folds

In [16]:
from sklearn import cross_validation
scores = cross_validation.cross_val_score(best_rfc, data, y, cv=10)

In [17]:
scores

array([ 0.91549296,  0.98571429,  0.97142857,  0.91428571,  0.98571429,
        0.98571429,  0.97142857,  0.98571429,  0.98550725,  1.        ])

In [18]:
mean_score = scores.mean()
std_dev = scores.std()
std_error = scores.std() / math.sqrt(scores.shape[0])
ci =  2.262 * std_error
lower_bound = mean_score - ci
upper_bound = mean_score + ci

print ("Score is %f +/-  %f" % (mean_score, ci))
print ('95 percent probability that if this experiment were repeated over and over the average score would be between %f and %f' % (lower_bound, upper_bound))

Score is 0.970100 +/-  0.020492
95 percent probability that if this experiment were repeated over and over the average score would be between 0.949608 and 0.990592


### 2. Comparison between K-Fold CV score compared to your single holdout AUC

Our AUC score is 0.996. It is very close to 1 the perfect model. But our model is tuned against the 20% random samples. In reality, these holdout might not be a good representative of the whole dataset. The model might not perform well in real life as it’s going to overfit to the specific test examples.

In K-Fold CV, we got 10 K scores and then combined it into a single average score. There’s 95% probability that if the experiment is repeated over and over again the average score would be between 0.949 and 0.990. It gives a more general realistic perspective compared to AUC score calculated against a single holdout. 

### 3. Model Performance

* Accuracy: Model accuracy is 0.9714. It means in 97% of the predictions would be correct.
* Precision:  96% percent of malignant predictions were correct. Which also means there is a 4% chance it will identify a non-malignant as malignant.
* Recall:  The model will be able to correctly identify 96% of malignant cases. However, it will fail to correctly predict 4% of malignant cases. In this specific case, it’s probably better to have a higher recall, so that the model don’t consider actual malignant cases as non-malignant.
* AUC: AUC score is 0.99. So it has almost perfect True Positives and a very few False Positives.
