# k-fold cross validation

    1. Shuffle the dataset randomly.
    2. Split the dataset into k groups
    3. For each unique group:
        a. Take the group as the hold-out (test) data set
        b. Take the remaining groups as a training data set
        c. Fit a model on the training set and evaluate it on the test set
        d. Retain the evaluation score and discard the model
    4. Summarize the skill of the model using the sample of model evaluation scores


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC # Support vector classifier: separates the classes with a decision boundary
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.datasets import load_breast_cancer
# Load the breast cancer dataset that ships with scikit-learn; the cells below
# read its feature matrix from `.data` and its class labels from `.target`.
breast_cancer = load_breast_cancer()

In [5]:
from sklearn.model_selection import train_test_split

# Single hold-out split: 30% of the samples are reserved for testing.
# No random_state is passed, so the split (and every score computed from it)
# changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    breast_cancer.data,
    breast_cancer.target,
    test_size=0.3,
)

In [6]:
# Baseline 1: logistic regression scored on the single hold-out split.
# fit() returns the estimator itself, so construction and training can be chained.
logmodel = LogisticRegression(max_iter=3000).fit(X_train, y_train)
logmodel.score(X_test, y_test)

0.9415204678362573

In [7]:
# Baseline 2: support vector classifier (default settings) on the same hold-out split.
# fit() returns the estimator itself, so construction and training can be chained.
svmmodel = SVC().fit(X_train, y_train)
svmmodel.score(X_test, y_test)

0.8888888888888888

In [8]:
# Baseline 3: random forest with 40 trees on the same hold-out split.
# No random_state is passed, so the score can vary between runs.
rfmmodel = RandomForestClassifier(n_estimators=40).fit(X_train, y_train)
rfmmodel.score(X_test, y_test)

0.9415204678362573

In [9]:
from sklearn.model_selection import KFold

# Three-way splitter. shuffle=False is the default, spelled out here because it
# means the folds are taken as contiguous blocks in the original sample order.
kf = KFold(n_splits=3, shuffle=False)

In [10]:
# Toy example: print the train/test index arrays KFold produces for nine items.
sample_values = [1, 2, 3, 4, 5, 6, 7, 8, 9]
for train_idx, test_idx in kf.split(sample_values):
    print(train_idx, test_idx)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


In [11]:
def get_score(model, X_train, X_test, y_train, y_test):
    """Train ``model`` on the training split and score it on the test split.

    Args:
        model: Any object exposing ``fit(X, y)`` and ``score(X, y)``.
            It is fitted in place.
        X_train: Training features, passed to ``model.fit``.
        X_test: Test features, passed to ``model.score``.
        y_train: Training labels, passed to ``model.fit``.
        y_test: Test labels, passed to ``model.score``.

    Returns:
        Whatever ``model.score(X_test, y_test)`` returns.
    """
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    return test_score

In [12]:
# Hold-out score for logistic regression, this time through the shared helper.
get_score(
    LogisticRegression(max_iter=3000),
    X_train,
    X_test,
    y_train,
    y_test,
)

0.9415204678362573

In [13]:
# Hold-out score for the support vector classifier through the shared helper.
get_score(
    SVC(),
    X_train,
    X_test,
    y_train,
    y_test,
)

0.8888888888888888

In [14]:
# Hold-out score for a 40-tree random forest through the shared helper.
get_score(
    RandomForestClassifier(n_estimators=40),
    X_train,
    X_test,
    y_train,
    y_test,
)

0.9415204678362573

In [15]:
from sklearn.model_selection import StratifiedKFold

# Stratified three-way splitter: the labels passed to split() are used to keep
# the class proportions similar in every fold. shuffle=False is the default and
# is spelled out to make clear that samples are not shuffled before splitting.
folds = StratifiedKFold(n_splits=3, shuffle=False)

In [16]:
# Manual 3-fold stratified cross-validation: one score per fold and per model.
score_log = []
score_svm = []
score_rf = []

# The per-fold arrays use their own names (X_fold_* / y_fold_*) on purpose.
# Reusing X_train / X_test / y_train / y_test here would overwrite the hold-out
# split created by train_test_split, so re-running an earlier cell afterwards
# would silently score against the last fold instead.
for train_index, test_index in folds.split(breast_cancer.data, breast_cancer.target):
    X_fold_train = breast_cancer.data[train_index]
    X_fold_test = breast_cancer.data[test_index]
    y_fold_train = breast_cancer.target[train_index]
    y_fold_test = breast_cancer.target[test_index]

    # A fresh estimator is built for every fold so nothing learned on one fold
    # carries over into the next.
    score_log.append(
        get_score(LogisticRegression(max_iter=4000), X_fold_train, X_fold_test, y_fold_train, y_fold_test)
    )
    score_svm.append(
        get_score(SVC(), X_fold_train, X_fold_test, y_fold_train, y_fold_test)
    )
    score_rf.append(
        get_score(RandomForestClassifier(n_estimators=40), X_fold_train, X_fold_test, y_fold_train, y_fold_test)
    )

print(f"LogisticRegression: {score_log}")
print(f"SVM: {score_svm}")
print(f"Random Forest: {score_rf}")

LogisticRegression: [0.9421052631578948, 0.9631578947368421, 0.9417989417989417]
SVM: [0.8526315789473684, 0.9315789473684211, 0.9470899470899471]
Random Forest: [0.9473684210526315, 0.968421052631579, 0.9735449735449735]


In [None]:
from sklearn.model_selection import cross_val_score

# cross_val_score performs the split / fit / score loop in a single call.
# cv=3 is passed explicitly so the result is comparable with the manual
# 3-fold stratified loop: the default number of folds depends on the installed
# scikit-learn version (5 in current releases), and for an integer cv with a
# classifier scikit-learn uses stratified folds.
N_FOLDS = 3
print(cross_val_score(LogisticRegression(max_iter=4000), breast_cancer.data, breast_cancer.target, cv=N_FOLDS))
print(cross_val_score(SVC(), breast_cancer.data, breast_cancer.target, cv=N_FOLDS))
print(cross_val_score(RandomForestClassifier(n_estimators=40), breast_cancer.data, breast_cancer.target, cv=N_FOLDS))