In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split,KFold,cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
data = datasets.load_digits()

In [3]:
# Hold out 30% of the digits samples as a test set; random_state is fixed so the split is repeatable.
x_train,x_test,y_train,y_test = train_test_split(data.data,data.target,test_size=.3,random_state=20)

In [4]:
lr = LogisticRegression(max_iter=200)

In [5]:
lr.fit(x_train,y_train)
lr.score(x_test,y_test)

0.9574074074074074

In [6]:
svc = SVC()

In [7]:
svc.fit(x_train,y_train)
svc.score(x_test,y_test)

0.987037037037037

In [8]:
rfc = RandomForestClassifier(n_estimators=100)

In [9]:
rfc.fit(x_train,y_train)
rfc.score(x_test,y_test)

0.9833333333333333

In [10]:
# Plain 3-fold splitter; the bare `kf` on the last line displays its settings.
kf = KFold(n_splits=3)
kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [11]:
# Show which positions of a nine-element toy sequence land in the
# train part and the test part of each of the three folds.
for split in kf.split(list(range(1, 10))):
    train_index, test_index = split
    print(train_index, test_index)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


# We create a **function** that fits a model and returns its score on the test data

In [12]:
def get_score(model,x_train,x_test,y_train,y_test):
    """Train ``model`` on the training split and report its score on the test split.

    Args:
        model: Object exposing ``fit(X, y)`` and ``score(X, y)``; it is fitted in place.
        x_train: Features passed to ``model.fit``.
        x_test: Features passed to ``model.score``.
        y_train: Targets passed to ``model.fit``.
        y_test: Targets passed to ``model.score``.

    Returns:
        Whatever ``model.score(x_test, y_test)`` returns.
    """
    model.fit(x_train, y_train)
    test_score = model.score(x_test, y_test)
    return test_score

In [13]:
# Same forest configuration and same split as the earlier `rfc` cell, but a new forest is
# fitted here and no random_state is set, so the score need not match the earlier one.
get_score(RandomForestClassifier(n_estimators=100),x_train,x_test,y_train,y_test)

0.975925925925926

In [14]:
from sklearn.model_selection import StratifiedKFold

In [15]:
# 3-fold StratifiedKFold splitter; the manual cross-validation loop below calls
# folds.split(data.data, data.target), so the labels are supplied to the splitter.
folds = StratifiedKFold(n_splits=3)

In [16]:
# Manual 3-fold stratified cross-validation: score each model on every fold.
# Fold-local names are used so that the 70/30 split stored in
# x_train/x_test/y_train/y_test by the earlier train_test_split cell is not
# overwritten; re-running the earlier fit/score cells then still uses that split.
score_lr = []
score_svc = []
score_rfc = []
for train_index,test_index in folds.split(data.data,data.target):
    fold_x_train, fold_x_test = data.data[train_index], data.data[test_index]
    fold_y_train, fold_y_test = data.target[train_index], data.target[test_index]
    fold_args = (fold_x_train, fold_x_test, fold_y_train, fold_y_test)
    # A new estimator instance is created for every fold and every model.
    score_lr.append(get_score(LogisticRegression(max_iter=200), *fold_args))
    score_svc.append(get_score(SVC(), *fold_args))
    score_rfc.append(get_score(RandomForestClassifier(), *fold_args))

In [17]:
score_lr 

[0.9215358931552587, 0.9432387312186978, 0.9181969949916527]

In [18]:
score_svc

[0.9649415692821369, 0.9799666110183639, 0.9649415692821369]

In [19]:
score_rfc

[0.9382303839732888, 0.9532554257095158, 0.9298831385642737]

In [20]:
# No cv argument is given, so cross_val_score's default splitting is used (five scores are returned).
# NOTE(review): this uses LogisticRegression's default max_iter, whereas the earlier cells pass
# max_iter=200, so these scores are not a like-for-like comparison with those — confirm intended.
cross_val_score(LogisticRegression(),data.data,data.target)

array([0.92222222, 0.86944444, 0.94150418, 0.93871866, 0.89693593])

In [21]:
cross_val_score(SVC(),data.data,data.target)

array([0.96111111, 0.94444444, 0.98328691, 0.98885794, 0.93871866])

In [22]:
cross_val_score(RandomForestClassifier(),data.data,data.target)

array([0.93888889, 0.9       , 0.9637883 , 0.95821727, 0.92200557])

# **Parameter tuning using k-fold cross validation**

### **By default, the k-fold cross validation in `cross_val_score` works the same way as StratifiedKFold (for classifiers)**

In [27]:
# Tune one hyperparameter (n_estimators) of one model, the random forest classifier,
# using the mean of 10-fold cross-validation scores for each candidate value.
# NOTE(review): the forests are created without random_state, so the averages
# can change from run to run.
n_estimators = [10,50,80,100,150]
all_score = {}
for n_estimator in n_estimators:
    score_ = cross_val_score(
        RandomForestClassifier(n_estimators=n_estimator),
        data.data,
        data.target,
        cv=10,
    )
    avg_score = np.average(score_)
    all_score[n_estimator] = avg_score
print(all_score)
print("Max value is: ",max(all_score.values()))

{10: 0.9198448168839232, 50: 0.9504593420235878, 80: 0.9443544382371197, 100: 0.9449099937926754, 150: 0.9499193047796399}
Max value is:  0.9504593420235878


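**Here we used cross_val_score to tune our random forest classifier and found that around 50 trees gave the best mean accuracy in this run. Note that 150 trees scored almost the same (0.9499 vs 0.9505), so the difference between them is very small.**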