In [1]:
# Importing required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Importing the digits data set as digit_data
from sklearn.datasets import load_digits
digit_data = load_digits()
dir(digit_data)

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']

In [3]:
# Data is the input and target is the output
X = digit_data.data
y = digit_data.target

In [10]:
# Train test split using 20% test size
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [6]:
import warnings
warnings.filterwarnings('ignore')

In [11]:
# Build the logistic regression model: model object define, train, model score
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
lr_model.score(X_test, y_test)

0.9611111111111111

In [12]:
# Build the SVM model: model object define, train, model score
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_model.score(X_test, y_test)

0.9916666666666667

In [13]:
# Build the Random Forest model: model object define, train, model score
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rf_model.score(X_test, y_test)

0.975

#### The main issue with this setup is as train_test_split function randomly changes the test and train sample, the model score changes very little. However, it is changing. So, by runig train_test_split once, model score can't be predicted accurately. In this situations K-fold Cross Validation is important

In [14]:
# Creating the K-fold cross validation object
from sklearn.model_selection import KFold
kf = KFold(n_splits = 3)

In [15]:
# Supply 1-9 samples and see what happens
for train_index, test_index in kf.split([1,2,3,4,5,6,7,8,9]):
  print(train_index, test_index)

# In first iteration, sample 3-8 is for training and sample 0-2 is for test
# In second iteration, sample 1-2; 6-8 is for trainig and sample 3-5 is for test
# In third iteration, sample 0-5 is for training and sample 6-8 is for test

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


In [16]:
# Define a generic function that helps to get the model score is a simple method
def get_score(model, X_train, X_test, y_train, y_test):
  model.fit(X_train, y_train)
  return model.score(X_test, y_test)

In [17]:
# For model building using K-fold cross validation we use stratified k fold
# As it separates the classes in a uniform way
from sklearn.model_selection import StratifiedKFold
fold_model = StratifiedKFold(n_splits = 3)

In [20]:
digit_data.data

array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ..., 10.,  0.,  0.],
       [ 0.,  0.,  0., ..., 16.,  9.,  0.],
       ...,
       [ 0.,  0.,  1., ...,  6.,  0.,  0.],
       [ 0.,  0.,  2., ..., 12.,  0.,  0.],
       [ 0.,  0., 10., ..., 12.,  1.,  0.]])

In [21]:
# Creating emplt list for storing model scroes
lr_score = []
svm_score = []
rf_score = []

# Creating a for loop with the fold_model as previous
for train_index, test_index in kf.split(digit_data.data):
  X_train, X_test, y_train, y_test = digit_data.data[train_index], digit_data.data[test_index], digit_data.target[train_index], digit_data.target[test_index]
  # Get the score for logistic regression, svm and random foest model and append them in the empty lists
  lr_score.append(get_score(LogisticRegression(),X_train, X_test, y_train, y_test))
  svm_score.append(get_score(SVC(),X_train, X_test, y_train, y_test))
  rf_score.append(get_score(RandomForestClassifier(),X_train, X_test, y_train, y_test))

In [22]:
# Check the scores of Logistic Regression Model for three iterations
lr_score

[0.9232053422370617, 0.9415692821368948, 0.9148580968280468]

In [23]:
# Check the scores of SVM Model for three iterations
svm_score

[0.9666110183639399, 0.9816360601001669, 0.9549248747913188]

In [24]:
# Check the scores of  Random Forest Model for three iterations
rf_score

[0.9365609348914858, 0.9616026711185309, 0.9232053422370617]

In [25]:
# The big code need not be written
# There is altenative approach
from sklearn.model_selection import cross_val_score
cross_val_score(LogisticRegression(), digit_data.data, digit_data.target)

array([0.92222222, 0.86944444, 0.94150418, 0.93871866, 0.89693593])

#### Sample code for runing cross validaion on iris data set

In [None]:
# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Load Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Initialize models
log_reg = LogisticRegression(max_iter=200)
svm = SVC()
random_forest = RandomForestClassifier()

# Initialize KFold with 5 splits
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform K-fold cross-validation for each model
log_reg_scores = cross_val_score(log_reg, X, y, cv=kf)
svm_scores = cross_val_score(svm, X, y, cv=kf)
random_forest_scores = cross_val_score(random_forest, X, y, cv=kf)

# Print the average accuracy for each model
print("Logistic Regression average accuracy:", np.mean(log_reg_scores))
print("SVM average accuracy:", np.mean(svm_scores))
print("Random Forest average accuracy:", np.mean(random_forest_scores))