# k-fold cross validation

    1. Shuffle the dataset randomly.
    2. Split the dataset into k groups
    3. For each unique group:
        a. Take the group as a hold out or test data set
        b. Take the remaining groups as a training data set
        c. Fit a model on the training set and evaluate it on the test set
        d. Retain the evaluation score and discard the model
    4. Summarize the skill of the model using the sample of model evaluation scores


In [57]:
# Classifiers compared in this notebook, plus the iris dataset they are evaluated on.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC # Support vector machines separate categories of plotted points
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.datasets import load_iris
# The loaded object exposes the features as iris.data and the class labels as iris.target.
iris = load_iris()

In [58]:
from sklearn.model_selection import train_test_split
# Single hold-out split: 30% of the samples (test_size=0.3) are reserved for testing.
# No random_state is passed, so the split, and every score computed from it,
# may differ from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.3)

In [59]:
# Baseline 1: logistic regression fitted on the training split and scored on the
# held-out test split. max_iter=3000 raises the solver's iteration cap,
# presumably so that it converges without warnings (TODO confirm).
logmodel = LogisticRegression(max_iter=3000)
logmodel.fit(X_train, y_train)
logmodel.score(X_test, y_test)

0.9555555555555556

In [60]:
# Baseline 2: support vector classifier with its default hyper-parameters,
# fitted on the training split and scored on the held-out test split.
svmmodel = SVC()
svmmodel.fit(X_train, y_train)
svmmodel.score(X_test, y_test)

0.9555555555555556

In [61]:
# Baseline 3: random forest built with n_estimators=40, fitted on the training
# split and scored on the held-out test split.
rfmmodel = RandomForestClassifier(n_estimators=40)
rfmmodel.fit(X_train, y_train)
rfmmodel.score(X_test, y_test)

0.9555555555555556

In [62]:
from sklearn.model_selection import KFold
# Splitter that divides the sample indices into 3 folds (n_splits=3).
kf = KFold(n_splits=3)

In [63]:
# Demonstrate the splitter on nine dummy samples: every iteration yields the
# indices to train on followed by the indices held out for testing.
for train_index, test_index in kf.split([1,2,3,4,5,6,7,8,9]):
    print(train_index, test_index)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


In [64]:
def get_score(model, X_train, X_test, y_train, y_test):
    """Train ``model`` on the training split and score it on the test split.

    The model is fitted in place with ``model.fit(X_train, y_train)``; the
    returned value is whatever ``model.score(X_test, y_test)`` produces.
    """
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    return test_score

In [65]:
# Logistic regression evaluated through the get_score helper on the hold-out
# split; a new estimator instance is created for the call.
get_score(LogisticRegression(max_iter=3000), X_train, X_test, y_train, y_test)

0.9555555555555556

In [66]:
# Support vector classifier (default settings) evaluated through the get_score
# helper on the hold-out split.
get_score(SVC(), X_train, X_test, y_train, y_test)

0.9555555555555556

In [67]:
# Random forest (n_estimators=40) evaluated through the get_score helper on the
# hold-out split.
get_score(RandomForestClassifier(n_estimators=40), X_train, X_test, y_train, y_test)

0.9555555555555556

In [68]:
from sklearn.model_selection import StratifiedKFold
# 3-fold stratified splitter. Its split() is given the class labels as well as
# the features; per the scikit-learn documentation the folds are built to
# preserve the class proportions of the full dataset.
folds = StratifiedKFold(n_splits=3)

In [69]:
# Per-fold test scores for each classifier, appended in fold order.
score_log = []
score_svm = []
score_rf = []

# Fold-local names are used on purpose: rebinding X_train / X_test / y_train /
# y_test here would overwrite the hold-out split created earlier in the
# notebook, so re-running an earlier cell afterwards would silently evaluate
# on the last fold instead of the original split.
for train_index, test_index in folds.split(iris.data, iris.target):
    X_fold_train, X_fold_test = iris.data[train_index], iris.data[test_index]
    y_fold_train, y_fold_test = iris.target[train_index], iris.target[test_index]

    # A fresh estimator is built for every fold, so each score comes from a
    # model that was fitted on that fold's training indices only.
    score_log.append(get_score(LogisticRegression(max_iter=4000), X_fold_train, X_fold_test, y_fold_train, y_fold_test))
    score_svm.append(get_score(SVC(), X_fold_train, X_fold_test, y_fold_train, y_fold_test))
    score_rf.append(get_score(RandomForestClassifier(n_estimators=40), X_fold_train, X_fold_test, y_fold_train, y_fold_test))

print("LogisticRegression: %s"%score_log)
print("SVM: %s"%score_svm)
print("Random Forest: %s"%score_rf)

LogisticRegression: [0.98, 0.96, 0.98]
SVM: [0.96, 0.98, 0.94]
Random Forest: [0.98, 0.94, 0.98]


In [None]:
from sklearn.model_selection import cross_val_score
# Same three-model comparison, with one cross_val_score call per estimator
# on the full iris data and labels. No cv argument is passed, so
# the library's default splitting strategy applies
# (NOTE(review): confirm the default fold count for the installed scikit-learn version).
print(cross_val_score(LogisticRegression(max_iter=4000), iris.data, iris.target))
print(cross_val_score(SVC(), iris.data, iris.target))
print(cross_val_score(RandomForestClassifier(n_estimators=40), iris.data, iris.target))