In [1]:
import numpy as np

from sklearn import datasets
from sklearn.model_selection  import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.utils import resample
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

### Load Dataset

In [2]:
data = datasets.load_digits()

X_data = data.images   # load X_data
y_data = data.target   # load y_data

X_data = X_data.reshape(X_data.shape[0], X_data.shape[1] * X_data.shape[2])    # flatten X_data
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size = 0.2, random_state = 7)    # split data into train & test set

In [3]:
print(y_train.shape)

(1437,)


In [4]:
print(y_test.shape)

(360,)


In [5]:
print(X_train.shape)

(1437, 64)


### 1. Bootstrapping
- "Given a dataset of size n, a **bootstrap sample** is created by sampling n instances uniformly from the data (with replacement)."
- Create a model with each bootstrap sample and validate it with the test set
- Final result is calculated by averaging the accuracy of models created by each bootstrap sample.

In [6]:
bootstrap_iter = 10    # designate the number of iterations for bootstrapping

In [7]:
clf = SVC()    # create a SVM classifier

In [8]:
accuracy = []

In [9]:
for i in range(bootstrap_iter):
    X_, y_ = resample(X_train, y_train,n_samples=100)
    clf.fit(X_, y_)
    y_pred = clf.predict(X_test)
    
    acc = accuracy_score(y_pred, y_test)
    accuracy.append(acc)
    
# Tips: 1) learn resample (with replacement)

In [10]:
accuracy = np.array(accuracy)

In [11]:
print('Accuracy Score')
print('Avearge: ', accuracy.mean())
print('Standard deviation: ', accuracy.std())

Accuracy Score
Avearge:  0.8794444444444445
Standard deviation:  0.02357677241546996


In [12]:
print(accuracy)

[0.82222222 0.85277778 0.88333333 0.88055556 0.89722222 0.89722222
 0.875      0.90277778 0.88611111 0.89722222]


In [13]:
len(accuracy)

10

### 2. Naive cross-validation
- k-fold cross validation without stratification 
- Usually k is set as 10-20 in practical settings

In [14]:
k = 10

In [15]:
clf = SVC()    # create a SVM classifier

In [16]:
kfold = KFold(n_splits = k)

In [17]:
results = cross_val_score(clf, X_train, y_train, cv = kfold)

In [18]:
print('Accuracy Score')
print('Avearge: ', results.mean())
print('Standard deviation: ', results.std())

Accuracy Score
Avearge:  0.9867715617715618
Standard deviation:  0.005790325903845656


### Stratified cross-validation
- k-fold cross validation with stratification
- Stratification is highly recommended in the paper (Kohavi 1995)
- Stratified: The splitting of data into folds may be governed by criteria such as ensuring that each fold has the same proportion of observations with a given categorical value, such as the class outcome value. This is called stratified cross-validation

In [19]:
k = 10

In [20]:
clf = SVC()    # create a SVM classifier

In [22]:
stratified_kfold = StratifiedKFold(n_splits = k)

In [23]:
results = cross_val_score(clf, X_train, y_train, cv = stratified_kfold)

In [24]:
print('Accuracy Score')
print('Avearge: ', results.mean())
print('Standard deviation: ', results.std())

Accuracy Score
Avearge:  0.9874660062160064
Standard deviation:  0.006826120493732847
