# k-fold cross validation

    1. Shuffle the dataset randomly.
    2. Split the dataset into k groups
    3. For each unique group:
        a. Take the group as a hold out or test data set
        b. Take the remaining groups as a training data set
        c. Fit a model on the training set and evaluate it on the test set
        d. Retain the evaluation score and discard the model
    4. Summarize the skill of the model using the sample of model evaluation scores


In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC # Support vector machines seperates categories of plotted points
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.datasets import load_digits
digits = load_digits()

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.3)

In [20]:
logmodel = LogisticRegression(max_iter=3000)
logmodel.fit(X_train, y_train)
logmodel.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9592592592592593

In [21]:
svmmodel = SVC()
svmmodel.fit(X_train, y_train)
svmmodel.score(X_test, y_test)

0.9777777777777777

In [22]:
rfmmodel = RandomForestClassifier(n_estimators=40)
rfmmodel.fit(X_train, y_train)
rfmmodel.score(X_test, y_test)

0.9611111111111111

In [23]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3)

In [24]:
for train_index, test_index in kf.split([1,2,3,4,5,6,7,8,9]):
    print(train_index, test_index)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


In [25]:
def get_score(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

In [26]:
get_score(LogisticRegression(max_iter=3000), X_train, X_test, y_train, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9592592592592593

In [27]:
get_score(SVC(), X_train, X_test, y_train, y_test)

0.9777777777777777

In [28]:
get_score(RandomForestClassifier(n_estimators=40), X_train, X_test, y_train, y_test)

0.9592592592592593

In [29]:
from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=3)

In [30]:
score_log = []
score_svm = []
score_rf = []

for train_index, test_index in folds.split(digits.data, digits.target):
    X_train, X_test, y_train, y_test = digits.data[train_index], digits.data[test_index], \
         digits.target[train_index], digits.target[test_index]
    
    score_log.append(get_score(LogisticRegression(max_iter=4000), X_train, X_test, y_train, y_test))
    score_svm.append(get_score(SVC(), X_train, X_test, y_train, y_test))
    score_rf.append(get_score(RandomForestClassifier(n_estimators=40), X_train, X_test, y_train, y_test))

print("LogisticRegression: %s"%score_log)
print("SVM: %s"%score_svm)
print("Random Forest: %s"%score_rf)

LogisticRegression: [0.9248747913188647, 0.9382303839732888, 0.9232053422370617]
SVM: [0.9649415692821369, 0.9799666110183639, 0.9649415692821369]
Random Forest: [0.9332220367278798, 0.9499165275459098, 0.9282136894824707]


In [33]:
from sklearn.model_selection import cross_val_score
print(cross_val_score(LogisticRegression(max_iter=4000), digits.data, digits.target))
print(cross_val_score(SVC(), digits.data, digits.target))
print(cross_val_score(RandomForestClassifier(n_estimators=40), digits.data, digits.target))

[0.925      0.875      0.93871866 0.93593315 0.89693593]
[0.96111111 0.94444444 0.98328691 0.98885794 0.93871866]
[0.91944444 0.90555556 0.95543175 0.97214485 0.91086351]
