# Cross Validation

In [2]:
import time

import numpy as np
import pandas as pd
import sklearn.metrics
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, LeaveOneOut,\
RepeatedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler

In [3]:
# Load the income dataset. Column names after the first and the string values
# carry a leading space (e.g. ' income', ' <=50K') — presumably the file uses
# ', ' as its separator — so the missing-value marker to convert to NaN is ' ?'.
df = pd.read_csv('data/income_evaluation.csv', na_values=' ?')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
df.isna().sum()

age                   0
 workclass         1836
 fnlwgt               0
 education            0
 education-num        0
 marital-status       0
 occupation        1843
 relationship         0
 race                 0
 sex                  0
 capital-gain         0
 capital-loss         0
 hours-per-week       0
 native-country     583
 income               0
dtype: int64

In [5]:
# Replace every NaN with the literal category 'missing' (only the three
# categorical columns workclass / occupation / native-country contain NaNs).
# Rebinding instead of `inplace=True` keeps the step chainable and avoids
# mutating the frame object in place.
df = df.fillna('missing')

In [6]:
df.isna().sum()

age                0
 workclass         0
 fnlwgt            0
 education         0
 education-num     0
 marital-status    0
 occupation        0
 relationship      0
 race              0
 sex               0
 capital-gain      0
 capital-loss      0
 hours-per-week    0
 native-country    0
 income            0
dtype: int64

In [7]:
# Separate the label from the predictors.
target_col = ' income'  # the header really does start with a space
y = df[target_col]  # target variable
X = df.drop(columns=[target_col])  # feature matrix

In [8]:
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [9]:
y.head()

0     <=50K
1     <=50K
2     <=50K
3     <=50K
4     <=50K
Name:  income, dtype: object

In [10]:
X.shape

(32561, 14)

In [11]:
y.value_counts()

 <=50K    24720
 >50K      7841
Name:  income, dtype: int64

# KFold

In [12]:
# Plain 5-fold splitter; no shuffling is requested, and the folds printed
# further down are contiguous row ranges.
kf = KFold(n_splits=5)
# Tip: press Shift+Tab inside the call to view its documentation.

In [13]:
# split() is lazy: it returns a generator, which yields one
# (train indices, test indices) pair per fold when iterated.
kf.split(X=X) #This is the method

<generator object _BaseKFold.split at 0x7fe761f0a0b0>

In [14]:
# Show the row positions each fold assigns to train / test, with their sizes.
# enumerate(start=1) replaces the hand-maintained iteration counter.
for i, (train_set, test_set) in enumerate(kf.split(X=X), start=1):
    print("iteration ", i)
    print(train_set, " having :" , len(train_set))
    print(test_set, " having :" , len(test_set))
    print("-------------------------")

iteration  1
[ 6513  6514  6515 ... 32558 32559 32560]  having : 26048
[   0    1    2 ... 6510 6511 6512]  having : 6513
-------------------------
iteration  2
[    0     1     2 ... 32558 32559 32560]  having : 26049
[ 6513  6514  6515 ... 13022 13023 13024]  having : 6512
-------------------------
iteration  3
[    0     1     2 ... 32558 32559 32560]  having : 26049
[13025 13026 13027 ... 19534 19535 19536]  having : 6512
-------------------------
iteration  4
[    0     1     2 ... 32558 32559 32560]  having : 26049
[19537 19538 19539 ... 26046 26047 26048]  having : 6512
-------------------------
iteration  5
[    0     1     2 ... 26046 26047 26048]  having : 26049
[26049 26050 26051 ... 32558 32559 32560]  having : 6512
-------------------------


In [15]:
#let's match if fold value is correct
32561/5

6512.2

In [16]:
#32561 came from shape in above cell

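Note: in iteration 1 the test fold has 6513 rows and the training set has 26048 rows.

32561 is not divisible by 5, so the first fold gets the one leftover row (6513 rows) and the other four folds get 6512 rows each. The training set in iteration 1 is made of those four folds: 4*6512 = 26048 (the remaining fold is the test set). Multiplying the larger fold size instead, 4*6513 = 26052, overshoots by 4, as the next cell shows.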

In [17]:
4*6513

26052

In [18]:
# ------------------------ col transformer ---------------------- 

In [19]:
# robust scaling on num cols, ohe on cat cols

In [20]:
# Numeric feature columns — these are the ones passed to RobustScaler.
num_cols = X.select_dtypes(include=np.number).columns
num_cols

Index(['age', ' fnlwgt', ' education-num', ' capital-gain', ' capital-loss',
       ' hours-per-week'],
      dtype='object')

In [21]:
# Every non-numeric column is treated as categorical and one-hot encoded.
cat_cols = X.select_dtypes(exclude=np.number).columns
cat_cols

Index([' workclass', ' education', ' marital-status', ' occupation',
       ' relationship', ' race', ' sex', ' native-country'],
      dtype='object')

In [22]:
# Preprocessing: robust-scale the numeric columns, one-hot encode the categorical ones.
# handle_unknown='ignore' lets a held-out fold contain categories that were not
# present in that fold's training rows (they encode as all zeros instead of raising).
# NOTE(review): `sparse=` was renamed `sparse_output=` in scikit-learn 1.2 and
# removed in 1.4 — update this argument when upgrading scikit-learn.
ct = ColumnTransformer([
    ('rob', RobustScaler(), num_cols),
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'), cat_cols)
])

In [23]:
# Bundle preprocessing and model so that every fit() re-learns the scaler and
# encoder from the rows it is given, i.e. from the training fold only.
pipe = Pipeline([
    ('ct_step', ct),
    ('model', RandomForestClassifier(n_estimators=10, random_state=0))  # fixed seed -> repeatable scores
])

In [24]:
#let's fit our model

In [25]:
# Manual 5-fold evaluation: fit on four folds, score accuracy on the held-out one.
# kf.split() yields *positional* row indices, so rows are selected with .iloc;
# .loc / y[...] are label-based and only agree with positions while the index
# is the default 0..n-1 RangeIndex.
scores = []
for i, (train_set, test_set) in enumerate(kf.split(X), start=1):
    pipe.fit(X.iloc[train_set], y.iloc[train_set])  # refit the whole pipeline on this fold's training rows
    sco = pipe.score(X.iloc[test_set], y.iloc[test_set])  # accuracy on the held-out fold
    scores.append(sco)
    print("iteration ", i)

iteration  1
iteration  2
iteration  3
iteration  4
iteration  5


In [26]:
np.array(scores)

array([0.84784278, 0.84520885, 0.84613022, 0.84858722, 0.85165848])

In [27]:
np.array(scores).mean()

0.8478855085142512

In [28]:
np.array(scores).std()

0.0022349531977626388

# Stratified K fold

Context: 

In [29]:
y.value_counts()

 <=50K    24720
 >50K      7841
Name:  income, dtype: int64

In [30]:
#each split should have
7841/5 #of a

1568.2

In [31]:
24720/5 #of b

4944.0

Each fold should contain both classes (a = `>50K`, b = `<=50K`) in the same proportion as the full dataset, i.e. about 1568 of a and 4944 of b per fold.

In [32]:
# Stratified 5-fold splitter: its split() also takes y, so it can keep the class ratio in each fold.
skf = StratifiedKFold(n_splits=5)

In [33]:
# Manual stratified 5-fold evaluation: fit on four folds, score on the fifth.
# skf.split() yields *positional* row indices, so rows are selected with .iloc;
# .loc / y[...] are label-based and only agree with positions while the index
# is the default 0..n-1 RangeIndex.
scores_skf = []
for i, (train_set, test_set) in enumerate(skf.split(X, y), start=1):
    pipe.fit(X.iloc[train_set], y.iloc[train_set])
    sco = pipe.score(X.iloc[test_set], y.iloc[test_set])
    scores_skf.append(sco)
    print("iteration ", i)

iteration  1
iteration  2
iteration  3
iteration  4
iteration  5


In [34]:
scores_skf

[0.8473821587594043,
 0.8432125307125307,
 0.8421375921375921,
 0.8425982800982801,
 0.8536547911547911]

In [35]:
# Check stratification: print each fold's indices and the class counts of its
# train / test parts. The indices from skf.split() are positional, hence
# y.iloc[...] rather than label-based y[...].
for i, (train_set, test_set) in enumerate(skf.split(X=X, y=y), start=1):
    print("iteration ", i)
    print(train_set, " having :" , len(train_set))
    print(test_set, " having :" , len(test_set))
    print()
    print("y train counts: \n", y.iloc[train_set].value_counts())
    print("y test counts: \n", y.iloc[test_set].value_counts())
    print("-------------------------")

iteration  1
[ 6499  6500  6512 ... 32558 32559 32560]  having : 26048
[   0    1    2 ... 6514 6515 6516]  having : 6513

y train counts: 
  <=50K    19776
 >50K      6272
Name:  income, dtype: int64
y test counts: 
  <=50K    4944
 >50K     1569
Name:  income, dtype: int64
-------------------------
iteration  2
[    0     1     2 ... 32558 32559 32560]  having : 26049
[ 6499  6500  6512 ... 13121 13123 13125]  having : 6512

y train counts: 
  <=50K    19776
 >50K      6273
Name:  income, dtype: int64
y test counts: 
  <=50K    4944
 >50K     1568
Name:  income, dtype: int64
-------------------------
iteration  3
[    0     1     2 ... 32558 32559 32560]  having : 26049
[12997 12999 13000 ... 19727 19729 19733]  having : 6512

y train counts: 
  <=50K    19776
 >50K      6273
Name:  income, dtype: int64
y test counts: 
  <=50K    4944
 >50K     1568
Name:  income, dtype: int64
-------------------------
iteration  4
[    0     1     2 ... 32558 32559 32560]  having : 26049
[19482 1948

In [36]:
# cross_val_score runs the split / fit / score loop in a single call.
# With an integer cv and a classifier it uses StratifiedKFold, so despite the
# name `result_kf` these are stratified scores (they match scores_skf above).
result_kf = cross_val_score(estimator=pipe, X=X, y=y, scoring='accuracy', cv=5)

In [37]:
result_kf

array([0.84738216, 0.84321253, 0.84213759, 0.84259828, 0.85365479])

In [38]:
# Time a 10-fold (unstratified) cross-validation over the full dataset.
# The scores are displayed in the next cell; a bare `result_kf10` in the middle
# of this cell displayed nothing, so it was removed.
start = time.time()
result_kf10 = cross_val_score(estimator=pipe, X=X, y=y, scoring='accuracy', cv=KFold(n_splits=10)) #cv kfold
print("time taken: ", time.time()-start)


time taken:  5.185133934020996


In [39]:
result_kf10

array([0.83880872, 0.85165848, 0.84981572, 0.84367322, 0.85135135,
       0.84613022, 0.84520885, 0.84797297, 0.8544226 , 0.84459459])

# LOO CV

In [40]:
# Leave-one-out holds out a single row per split, so this performs one fit per
# row (100 fits); only the first 100 rows are used to keep the runtime short.
start = time.time()
result_loocv = cross_val_score(estimator=pipe, X=X.head(100), y=y.head(100),
                               scoring='accuracy', cv=LeaveOneOut())
print("time taken: ", time.time()-start)

time taken:  2.485150098800659


In [41]:
# even with only the first 100 rows this takes a couple of seconds, because LOO trains one model per row

In [42]:
result_loocv

array([1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1.,
       1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1.,
       1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0.,
       0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
       1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1.])

In [43]:
result_loocv.mean()

0.8

In [44]:
# accuracy on the first 100 rows

# Repeated KFold

In [45]:
# Repeated K-fold: 5 independently shuffled 5-fold splits -> 25 scores.
# random_state pins the shuffles so the scores are reproducible on a fresh
# kernel; without it every run draws different folds.
# The scores are displayed in the next cell; the bare `result_rkf` that sat in
# the middle of this cell displayed nothing, so it was removed.
start = time.time()
result_rkf = cross_val_score(estimator=pipe, X=X, y=y, scoring='accuracy',
                              cv=RepeatedKFold(n_splits=5, n_repeats=5, random_state=0))
print("time taken: ", time.time()-start)

time taken:  11.113617897033691


In [46]:
result_rkf

array([0.85198833, 0.84490172, 0.84781941, 0.84889435, 0.84382678,
       0.85060648, 0.84213759, 0.84198403, 0.8482801 , 0.85227273,
       0.84523261, 0.84781941, 0.84459459, 0.84874079, 0.84229115,
       0.85060648, 0.8470516 , 0.84351966, 0.84966216, 0.84305897,
       0.85521265, 0.83737715, 0.84643735, 0.85472973, 0.84874079])

# So far we have run cross validation on the entire dataset, which isn't wrong.
# But we can also run it on a training set only, and then verify the result on a separate held-out test set to see whether the cross-validation estimate generalizes.

In [47]:
# Hold out 20% of the rows as a final test set; random_state pins the shuffle.
# NOTE(review): the split is not stratified — consider stratify=y given the class imbalance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [48]:
# Cross-validate on the training portion only; X_test / y_test are not used
# here and are kept for the final check below.
start = time.time()
result_tts = cross_val_score(estimator=pipe, X=X_train, y=y_train,
                              scoring='accuracy', cv=KFold(n_splits=5))
print("time taken: ", time.time()-start)

time taken:  1.8605618476867676


In [49]:
result_tts

array([0.85547025, 0.84702495, 0.84184261, 0.84661163, 0.84968324])

In [50]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('ct_step',
                 ColumnTransformer(transformers=[('rob', RobustScaler(),
                                                  Index(['age', ' fnlwgt', ' education-num', ' capital-gain', ' capital-loss',
       ' hours-per-week'],
      dtype='object')),
                                                 ('ohe',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  Index([' workclass', ' education', ' marital-status', ' occupation',
       ' relationship', ' race', ' sex', ' native-country'],
      dtype='object'))])),
                ('model',
                 RandomForestClassifier(n_estimators=10, random_state=0))])

In [51]:
pipe.score(X_test, y_test)

0.8446184553968985

# By comparing this score with the 5 CV scores on the training set, we see the results are quite similar, which is a good sign

In [52]:
# Any of these names can be passed as `scoring=` instead of 'accuracy'.
# `sklearn.metrics.SCORERS` was removed in scikit-learn 1.3; use
# `get_scorer_names()` when the installed version provides it and fall back to
# the old dict on older versions.
scorer_names = (sklearn.metrics.get_scorer_names()
                if hasattr(sklearn.metrics, 'get_scorer_names')
                else sklearn.metrics.SCORERS.keys())
sorted(scorer_names)

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_weighted',
 'v_measure_score']