In [40]:
import numpy as np

from sklearn import datasets
from sklearn.model_selection  import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.utils import resample
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

### Load Dataset

In [5]:
data = datasets.load_digits()

X_data = data.images   # load X_data
y_data = data.target   # load y_data

X_data = X_data.reshape(X_data.shape[0], X_data.shape[1] * X_data.shape[2])    # flatten X_data
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size = 0.2, random_state = 7)    # split data into train & test set

### 1. Bootstrapping
- "Given a dataset of size n, a **bootstrap sample** is created by sampling n instances uniformly from the data (with replacement)."
- Create a model with each bootstrap sample and validate it with the test set
- Final result is calculated by averaging the accuracy of models created by each bootstrap sample.

In [20]:
bootstrap_iter = 10    # designate the number of iterations for bootstrapping

In [22]:
clf = SVC()    # create a SVM classifier

In [24]:
accuracy = []

In [25]:
for i in range(bootstrap_iter):
    X_, y_ = resample(X_train, y_train)
    clf.fit(X_, y_)
    y_pred = clf.predict(X_test)
    
    acc = accuracy_score(y_pred, y_test)
    accuracy.append(acc)

In [27]:
accuracy = np.array(accuracy)

In [37]:
print('Accuracy Score')
print('Avearge: ', accuracy.mean())
print('Standard deviation: ', accuracy.std())

Accuracy Score
Avearge:  0.300555555556
Standard deviation:  0.0902910246691


### 2. Naive cross-validation
- k-fold cross validation without stratification 
- Usually k is set as 10-20 in practical settings

In [31]:
k = 10

In [35]:
clf = SVC()    # create a SVM classifier

In [34]:
kfold = KFold(n_splits = k, random_state = 777)

In [36]:
results = cross_val_score(clf, X_train, y_train, cv = kfold)

In [39]:
print('Accuracy Score')
print('Avearge: ', results.mean())
print('Standard deviation: ', results.std())

Accuracy Score
Avearge:  0.392438811189
Standard deviation:  0.0681843965484


### Stratified cross-validation
- k-fold cross validation with stratification
- Stratification is highly recommended in the paper (Kohavi 1995)

In [41]:
k = 10

In [42]:
clf = SVC()    # create a SVM classifier

In [43]:
stratified_kfold = StratifiedKFold(n_splits = k, random_state = 777)

In [44]:
results = cross_val_score(clf, X_train, y_train, cv = stratified_kfold)

In [45]:
print('Accuracy Score')
print('Avearge: ', results.mean())
print('Standard deviation: ', results.std())

Accuracy Score
Avearge:  0.404715502269
Standard deviation:  0.0314864025458
