See also `../Regression/lr_housing_california_7_cross_validation.ipynb`

# Download dataset

In [1]:
import numpy as np

from sklearn.datasets import fetch_openml

# `fetch_mldata` was deprecated in scikit-learn 0.20 and removed in 0.22, so the
# dataset is fetched from OpenML instead. `as_frame=False` asks for plain NumPy
# arrays, which the positional indexing below relies on.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

X = mnist["data"]
# OpenML delivers the labels as strings ('0'..'9'); convert them to numbers so
# that comparisons such as `y == 5` behave as intended.
y = mnist["target"].astype(np.float64)

# Shuffle samples and labels with the same permutation. A seeded generator is
# used so that re-running the notebook reproduces the same ordering.
shuffle_index = np.random.RandomState(0).permutation(X.shape[0])
X = X[shuffle_index, :]
y = y[shuffle_index]

# Prepare train and test sets

In [2]:
import numpy as np

N = 60000  # the first N samples form the training set, the remainder the test set

# The features are cast to float64 up front to avoid warnings when preprocessing the data.
X_train, X_test = np.float64(X[:N]), np.float64(X[N:])
y_train, y_test = y[:N], y[N:]

In [3]:
# Boolean targets for the binary task "is this sample labelled 5?":
# True where the label equals 5, False everywhere else.
y_train_5, y_test_5 = (y_train == 5), (y_test == 5)

# Binary classifier - SGD Classifier

In [4]:
import time

import sklearn
from sklearn.linear_model import SGDClassifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import MinMaxScaler

# The logistic-regression loss is called 'log' before scikit-learn 1.1 and
# 'log_loss' from 1.1 on (the old name was removed in 1.3). Pick whichever name
# the installed version understands so the cell runs on both.
sklearn_major_minor = tuple(int(part) for part in sklearn.__version__.split('.')[:2])
logistic_loss = 'log_loss' if sklearn_major_minor >= (1, 1) else 'log'

# Standardise the features, then fit a linear classifier with SGD.
model = Pipeline([('scaler', StandardScaler()),
                  ('sgd_clf', SGDClassifier(random_state=0, loss=logistic_loss, alpha=0.00001))])

# Time the fit on the full training set.
start_time = time.time()

model.fit(X_train, y_train_5)

print("Elapsed time %.2f seconds." % (time.time() - start_time))



Elapsed time 1.71 seconds.


In [5]:
# Score the fitted pipeline on the very data it was trained on.
model.score(X=X_train, y=y_train_5)

0.9710666666666666

# Measuring Accuracy using Cross Validation

In [6]:
from sklearn.model_selection import cross_val_score

# Accuracy of the pipeline on each of 3 cross-validation folds of the training set.
scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train_5,
    scoring="accuracy",
    cv=3,
)



In [7]:
# Show the per-fold accuracies followed by their mean.
print(scores, '; mean = %0.3f' % scores.mean())

[0.97065 0.97135 0.96845] ; mean = 0.970


# Final Accuracy over test data

In [8]:
from sklearn.metrics import accuracy_score

# Predict on the held-out test set, then compare against the true binary labels.
test_predictions = model.predict(X_test)
accuracy_score(y_test_5, test_predictions)

0.9691

# Implementing Cross-validation yourself

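If you need more control over the cross-validation process than `cross_val_score()` and similar functions provide, you can implement it yourself. The following code does the same thing as the `cross_val_score()` call above: for each split it fits a fresh clone of the model on the training folds and prints its accuracy on the held-out fold.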

In [9]:
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

# No `random_state` here: it only has an effect together with `shuffle=True`,
# and recent scikit-learn versions raise a ValueError when it is set without
# shuffling. The resulting folds are the same unshuffled stratified folds.
skfolds = StratifiedKFold(n_splits=3)

for train_index, test_index in skfolds.split(X_train, y_train_5):
    # Start every fold from an unfitted copy so nothing learned on one fold
    # carries over to the next.
    clone_model = clone(model)

    # Training part of this split.
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]

    # Held-out part of this split.
    X_test_fold = X_train[test_index]
    y_test_fold = y_train_5[test_index]

    clone_model.fit(X_train_folds, y_train_folds)

    y_pred = clone_model.predict(X_test_fold)

    print('accuracy = %0.3f\n' % accuracy_score(y_test_fold, y_pred))  # the ratio of correct predictions



accuracy = 0.971





accuracy = 0.971





accuracy = 0.968

