# *--------------- Cross Validation ------------------*

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('iris.csv')
data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [3]:
X = data.iloc[:,0:4].values
Y = data.iloc[:,4].values

## Hold-out cross validation

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
train_x,test_x,train_y,test_y = train_test_split(X,Y,test_size=0.3,random_state=0,stratify=Y)

In [6]:
test_x

array([[6.3, 3.4, 5.6, 2.4],
       [5.8, 2.7, 5.1, 1.9],
       [5.1, 3.4, 1.5, 0.2],
       [5.1, 3.8, 1.9, 0.4],
       [7. , 3.2, 4.7, 1.4],
       [5.1, 3.3, 1.7, 0.5],
       [5.5, 2.6, 4.4, 1.2],
       [5.9, 3. , 5.1, 1.8],
       [5.1, 3.8, 1.6, 0.2],
       [5.7, 2.8, 4.5, 1.3],
       [5.1, 3.7, 1.5, 0.4],
       [6.5, 3. , 5.2, 2. ],
       [4.6, 3.2, 1.4, 0.2],
       [6.3, 2.7, 4.9, 1.8],
       [5.8, 2.7, 4.1, 1. ],
       [6.1, 2.6, 5.6, 1.4],
       [6.2, 2.2, 4.5, 1.5],
       [6.7, 3.1, 4.4, 1.4],
       [6.4, 3.2, 4.5, 1.5],
       [5.1, 3.5, 1.4, 0.2],
       [6.5, 2.8, 4.6, 1.5],
       [6.3, 2.9, 5.6, 1.8],
       [4.8, 3.4, 1.9, 0.2],
       [5.5, 2.4, 3.7, 1. ],
       [7.2, 3. , 5.8, 1.6],
       [7.1, 3. , 5.9, 2.1],
       [5.7, 2.5, 5. , 2. ],
       [6.3, 3.3, 6. , 2.5],
       [5.5, 2.5, 4. , 1.3],
       [6.5, 3.2, 5.1, 2. ],
       [5.7, 2.8, 4.1, 1.3],
       [5.1, 3.8, 1.5, 0.3],
       [4.6, 3.4, 1.4, 0.3],
       [5.6, 3. , 4.1, 1.3],
       [6.7, 3

In [7]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

In [12]:
xgb = XGBClassifier()
score = cross_val_score(xgb,train_x,train_y,cv=10,scoring="f1_micro",verbose=3,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    2.9s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    3.0s finished


In [13]:
score

array([1.        , 0.90909091, 0.90909091, 0.90909091, 1.        ,
       0.8       , 1.        , 1.        , 0.8       , 1.        ])

In [14]:
score.mean()

0.9327272727272728

In [10]:
import sklearn
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

## K-fold cross validation

In [15]:
from sklearn.model_selection import KFold

In [16]:
f = KFold(n_splits=10)

In [17]:
f.get_n_splits(X)

10

In [18]:
for train_index, test_index in f.split(X):
    train_x,test_x = X[train_index],X[test_index]
    train_y,test_y = Y[train_index],Y[test_index]

In [19]:
train_x

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [23]:
xgb = XGBClassifier()
score = cross_val_score(xgb,train_x,train_y,cv=10,scoring="f1_micro",verbose=3,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished


In [24]:
score

array([1.        , 0.92857143, 1.        , 1.        , 0.85714286,
       0.92307692, 0.92307692, 0.92307692, 0.92307692, 0.92307692])

In [25]:
score.mean()

0.9401098901098901

## Stratified K-fold cross validation

In [29]:
from sklearn.model_selection import StratifiedKFold

In [32]:
skf = StratifiedKFold(n_splits=10)

In [33]:
skf.get_n_splits(X,Y)

10

In [34]:
for train_index,test_index in skf.split(X,Y):
    train_x,test_x = X[train_index],X[test_index]
    train_y,test_y = Y[train_index],Y[test_index]    

In [35]:
train_x

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [36]:
xgb = XGBClassifier()
score = cross_val_score(xgb,train_x,train_y,cv=10,scoring="f1_micro",verbose=3,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished


In [37]:
score

array([1.        , 0.92857143, 1.        , 0.92857143, 1.        ,
       0.92307692, 0.92307692, 0.92307692, 0.92307692, 1.        ])

In [38]:
score.mean()

0.9549450549450549

### Repeated K-fold Cross-validation

In [39]:
from sklearn.model_selection import RepeatedKFold

In [40]:
rkf = RepeatedKFold(n_splits=10,n_repeats=5)

In [41]:
skf.get_n_splits(X,Y)

10

In [42]:
for train_index,test_index in rkf.split(X,Y):
    train_x,test_x = X[train_index],X[test_index]
    train_y,test_y = Y[train_index],Y[test_index]    

In [43]:
xgb = XGBClassifier()
score = cross_val_score(xgb,train_x,train_y,cv=10,scoring="f1_micro",verbose=3,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished


In [44]:
score

array([1.        , 0.92857143, 1.        , 0.92857143, 0.92857143,
       0.92307692, 0.92307692, 0.84615385, 1.        , 1.        ])

In [46]:
score.mean()

0.9478021978021978

### Repeated Stratified K-fold Cross-validation

In [47]:
from sklearn.model_selection import RepeatedStratifiedKFold

In [48]:
rskf = RepeatedStratifiedKFold(n_splits=10,n_repeats=5)

In [49]:
rskf.get_n_splits()

50

In [50]:
for train_index,test_index in rskf.split(X,Y):
    train_x,test_x = X[train_index],X[test_index]
    train_y,test_y = Y[train_index],Y[test_index]    

In [51]:
xgb = XGBClassifier()
score = cross_val_score(xgb,train_x,train_y,cv=10,scoring="f1_micro",verbose=3,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    3.1s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    3.1s finished


In [52]:
score

array([0.92857143, 1.        , 1.        , 0.92857143, 0.92857143,
       1.        , 0.92307692, 0.84615385, 1.        , 1.        ])

In [53]:
score.mean()

0.9554945054945054

# LeaveOneOut & LeavePOut cross validtion

In [17]:
from sklearn.model_selection import LeaveOneOut,LeavePOut

In [40]:
loo=LeaveOneOut()
lpo=LeavePOut(p=50)
loo.get_n_splits(X)
lpo.get_n_splits(X)

20128660909731932294240234380929315748140

In [41]:
for train_index,test_index in loo.split(X):
    train_x,test_x = X[train_index],X[test_index]
    train_y,test_y = Y[train_index],Y[test_index]    

In [42]:
print(train_x,train_y,test_x,test_y)

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.1 1.5 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.

In [43]:
# for train_index,test_index in lpo.split(X):
#     train_x,test_x = X[train_index],X[test_index]
#     train_y,test_y = Y[train_index],Y[test_index] 

In [44]:
# print(test_x,test_y)