# K Folds Cross Validation

In [None]:
# Sometimes we face a dilemma: which machine learning model should we use to solve our problem?

# For example: we are working on the iris flower dataset and could classify the flowers using SVM,
# Random Forest, Logistic Regression or Decision Tree. Which of these models is the best?

# Cross-validation is the technique that lets us answer that question: it lets us evaluate
# the model performance.

In [1]:
# Ways of training the Model:
# Option 1: Use all available data for training and test (Using 100% of all our data to train the model
# and use the same exact data to test the model)

# Option 2: Split the available dataset into training and test sets.

# Option 3: K-Fold Cross Validation (in this technique we divide our samples into folds, e.g. 100 samples into
# 5 folds of 20 samples each, and then run multiple iterations. In the first iteration we use folds 2 to 5 for
# training and fold 1 for testing and calculate the score; in the second iteration we use fold 2 for testing and
# the rest for training. We repeat the same process until the last fold has been used for testing, and finally
# we average the scores of all iterations.)
# This technique is very useful because we use a variety of samples for testing.

In [3]:
# Importing Libraries

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_digits

# Load scikit-learn's bundled digits dataset; the cells below read its `data` and `target` fields.
digits = load_digits()

## Using train_test_split 

In [13]:
# Splitting the dataset into training and testing

from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; the remainder is used for training.
x_train, x_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    test_size=0.3,
)

In [10]:
# Fit Logistic Regression on the training split, then score it on the held-out split

lr = LogisticRegression(max_iter=500).fit(x_train, y_train)
lr.score(x_test, y_test)

0.9629629629629629

In [11]:
# Fit an SVM classifier on the training split, then score it on the held-out split

svm = SVC().fit(x_train, y_train)
svm.score(x_test, y_test)

0.9851851851851852

In [12]:
# Fit a Random Forest classifier on the training split, then score it on the held-out split

rf = RandomForestClassifier().fit(x_train, y_train)
rf.score(x_test, y_test)

0.9833333333333333

In [None]:
# The scores above were obtained before running train_test_split a second time.

In [None]:
# This is a quick way of measuring the performance of these three models. Comparing the scores,
# the SVM classifier performs best.

# This works in practice, but the samples are not distributed uniformly between x_train and x_test:
# when we run train_test_split again, the samples in each set change and so do the scores.

In [None]:
# The scores below were obtained after running train_test_split a second time.

In [14]:
# Classify Digits using Logistic Regression Classifier, on a freshly drawn train/test split

# Draw a new random split here so the "after re-splitting" scores in this cell and the
# two that follow really come from different samples, even under Restart & Run All.
# Previously this relied on manually re-running the split cell out of order; on a clean
# top-to-bottom run the earlier split would simply be reused.
x_train, x_test, y_train, y_test = train_test_split(digits['data'], digits['target'], test_size=0.3)

lr = LogisticRegression(max_iter=500)

lr.fit(x_train, y_train)
lr.score(x_test, y_test)

0.9555555555555556

In [15]:
# Refit the SVM classifier on the current training split and score it on the current test split

svm = SVC().fit(x_train, y_train)
svm.score(x_test, y_test)

0.9833333333333333

In [16]:
# Refit the Random Forest classifier on the current training split and score it on the current test split

rf = RandomForestClassifier().fit(x_train, y_train)
rf.score(x_test, y_test)

0.9703703703703703

In [17]:
# This is the problem with this technique: because the samples are not distributed uniformly, the scores change
# from split to split, so running train_test_split only once is not enough; we would have to run it again and again.

## Using K-Folds

In [19]:
# Creating K-Fold object

from sklearn.model_selection import KFold

# Build a K-Fold splitter; n_splits=3 is the number of folds.
# Evaluating `kf` on the last line echoes the splitter's configuration.
kf = KFold(n_splits=3)
kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [21]:
# Using K-Folds on Example

# Nine toy samples: each iteration prints the positions used for training, then those held out for testing.
sample = list(range(1, 10))
for train_idx, test_idx in kf.split(sample):
    print(train_idx, test_idx)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


In [22]:
# Creating Method for calculating score

def get_score(model, x_train, x_test, y_train, y_test):
    """Fit ``model`` on the training data and return its score on the test data.

    Parameters:
        model: object exposing ``fit(X, y)`` and ``score(X, y)``.
        x_train, y_train: features and labels passed to ``model.fit``.
        x_test, y_test: features and labels passed to ``model.score``.

    Returns:
        Whatever ``model.score(x_test, y_test)`` returns.
    """
    # fit() is called for its side effect only; its return value is not used.
    model.fit(x_train, y_train)
    test_score = model.score(x_test, y_test)
    return test_score

In [24]:
# Using K-Folds on Digits

from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=3)

# StratifiedKFold is similar to KFold but a little better: when it separates the data into folds,
# it distributes each classification category (label) uniformly across them.
# It will not concentrate one category in a single fold; it spreads every category evenly.

In [34]:
# Using KFolds on Digits dataset

# Creating the scores array for each classifier

# Per-fold scores, one list per classifier
scores_l, scores_svm, scores_rf = [], [], []

for train_index, test_index in folds.split(digits['data'], digits['target']):
    # Select this fold's training rows and its held-out rows
    x_train, y_train = digits.data[train_index], digits.target[train_index]
    x_test, y_test = digits.data[test_index], digits.target[test_index]

    # A fresh, unfitted estimator is created for every fold and every classifier
    scores_l.append(
        get_score(LogisticRegression(max_iter=500), x_train, x_test, y_train, y_test)
    )
    scores_svm.append(
        get_score(SVC(), x_train, x_test, y_train, y_test)
    )
    scores_rf.append(
        get_score(RandomForestClassifier(), x_train, x_test, y_train, y_test)
    )

In [35]:
# Logistic Regression score on each of the 3 stratified folds
scores_l

[0.9198664440734557, 0.9415692821368948, 0.9165275459098498]

In [36]:
# SVM score on each of the 3 stratified folds
scores_svm

[0.9649415692821369, 0.9799666110183639, 0.9649415692821369]

In [37]:
# Random Forest score on each of the 3 stratified folds
scores_rf

[0.9382303839732888, 0.9599332220367279, 0.9332220367278798]

In [None]:
# We can take the average of above scores and check which classifier is best.

In [38]:
# Instead of writing all of that code, we can use the sklearn function cross_val_score, which does the same thing

from sklearn.model_selection import cross_val_score

In [43]:
# Cross-validate Logistic Regression on the full digits data; one score is returned per fold

cross_val_score(
    estimator=LogisticRegression(max_iter=500),
    X=digits.data,
    y=digits.target,
)

array([0.92222222, 0.87222222, 0.94150418, 0.94150418, 0.89693593])

In [45]:
# Cross-validate the SVM classifier on the full digits data; one score is returned per fold

cross_val_score(
    estimator=SVC(),
    X=digits.data,
    y=digits.target,
)

array([0.96111111, 0.94444444, 0.98328691, 0.98885794, 0.93871866])

In [46]:
# Cross-validate Random Forest on the full digits data; one score is returned per fold

cross_val_score(
    estimator=RandomForestClassifier(),
    X=digits.data,
    y=digits.target,
)

array([0.92777778, 0.91111111, 0.95821727, 0.95821727, 0.91922006])

## Parameter Tuning

In [47]:
# Above we compared different classifiers; we can also compare the same classifier with different parameters.
# This is called parameter tuning. Here: Random Forest with n_estimators=50.

cross_val_score(
    estimator=RandomForestClassifier(n_estimators=50),
    X=digits.data,
    y=digits.target,
)

array([0.93055556, 0.89444444, 0.94986072, 0.96100279, 0.9275766 ])

In [48]:
# Random Forest with n_estimators=30
cross_val_score(
    estimator=RandomForestClassifier(n_estimators=30),
    X=digits.data,
    y=digits.target,
)

array([0.94166667, 0.89444444, 0.94428969, 0.95264624, 0.90807799])

In [49]:
# Random Forest with n_estimators=15
cross_val_score(
    estimator=RandomForestClassifier(n_estimators=15),
    X=digits.data,
    y=digits.target,
)

array([0.91388889, 0.875     , 0.94986072, 0.95264624, 0.90529248])

In [50]:
# Random Forest with n_estimators=5
cross_val_score(
    estimator=RandomForestClassifier(n_estimators=5),
    X=digits.data,
    y=digits.target,
)

array([0.85555556, 0.8       , 0.87743733, 0.91643454, 0.86072423])