In [1]:
# KFold Cross validation: Train/test on various samples and use mean scores to get best scores.
# Source: https://www.youtube.com/watch?v=gJo0uNL-5Qw&list=PLeo1K3hjS3uvCeTYTeyfe0-rN5r8zn9rw&index=13
# Excercise: IRIS and give best model name and score

import pandas as ps
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as ny

from sklearn.datasets import load_iris

iris = load_iris()

# Describing the data set
dir(iris)

['DESCR',
 'data',
 'feature_names',
 'filename',
 'frame',
 'target',
 'target_names']

In [4]:
irisdf = ps.DataFrame(iris.data)
irisdf.head()

# Defining targets

targets = ps.DataFrame(iris.target)
targets.shape #head(50)

(150, 1)

In [25]:
#T-T-S
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(irisdf,targets,test_size=0.3)
# Very important to give 2 dim array above if not setup into DFs.

print(len(X_train))

105


In [42]:
# Import 3 algos for cross validation:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree


lm = LogisticRegression()
lm.fit(X_train,y_train.values.ravel())
lm.score(X_test,y_test)

0.9333333333333333

In [43]:
svcm = SVC()
svcm.fit(X_train,y_train.values.ravel())
svcm.score(X_test,y_test)

0.9555555555555556

In [44]:
rfm = RandomForestClassifier(n_estimators=30)
rfm.fit(X_train,y_train.values.ravel())
rfm.score(X_test,y_test)

0.9333333333333333

In [45]:
dtm = tree.DecisionTreeClassifier()
dtm.fit(X_train,y_train.values.ravel())
dtm.score(X_test,y_test)

0.9333333333333333

In [46]:
# The above samples change and score changes for %test data. How to arrive after confident iterations to a score ?
# Enter Kfold
from sklearn.model_selection import KFold
kf = KFold(n_splits=4)

# Now prepare a iterator set using TTS that will hold split IDs for the models we assign.
for train_ind, test_ind in kf.split(range(0,12)):
    print(train_ind,test_ind)
    


[ 3  4  5  6  7  8  9 10 11] [0 1 2]
[ 0  1  2  6  7  8  9 10 11] [3 4 5]
[ 0  1  2  3  4  5  9 10 11] [6 7 8]
[0 1 2 3 4 5 6 7 8] [ 9 10 11]


In [47]:
# Using a def to call each model 

def fetch_score(model,X_train,y_train,X_test,y_test):
    model.fit(X_train,y_train)
    return model.score(X_test,y_test)

# Lets call above def using SVM
print(fetch_score(svcm,X_train,y_train.values.ravel(),X_test,y_test))
print(fetch_score(rfm,X_train,y_train.values.ravel(),X_test,y_test))
print(fetch_score(lm,X_train,y_train.values.ravel(),X_test,y_test))


0.9555555555555556
0.9333333333333333
0.9333333333333333


In [48]:
# Stratified Kfold; It divides classification in a uniform way say, iron's out bias.
# Build each def/loop for getting the score with splits using stratified kf

from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=3) # and store all scores below into lists.

scores_logistic = []
scores_svm = []
scores_rf = []

#  Other wise, use a pre-set def to get scores to choose best.

from sklearn.model_selection import cross_val_score

print(cross_val_score(rfm,X_test,y_test.values.ravel()))
print(cross_val_score(lm,X_test,y_test.values.ravel()))
print(cross_val_score(svcm,X_test,y_test.values.ravel()))
print(cross_val_score(dtm,X_test,y_test.values.ravel()))

[0.77777778 1.         0.88888889 1.         0.88888889]
[0.88888889 1.         0.88888889 1.         0.88888889]
[0.88888889 1.         0.88888889 1.         0.88888889]
[0.66666667 1.         0.88888889 1.         0.88888889]
