# Cross Validation

### Data preparation

In [1]:
import numpy as np
import pandas as pd

# Read the Auto data set from the working directory and preview the first rows
auto = pd.read_csv(filepath_or_buffer='auto.csv')
auto.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [4]:
# Boolean mask: True where displacement is at or below the column mean
small_index = auto["displacement"] <= auto["displacement"].mean()

# Label every car 'small' (mask is True) or 'big' (mask is False) in one pass
auto["displacement_binary"] = np.where(small_index, 'small', 'big')
auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name,displacement_binary
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu,big
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320,big
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite,big
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst,big
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino,big


In [5]:
# Numeric response for the classifiers: 1 for 'big', 0 otherwise
is_big = auto['displacement_binary'].eq('big')
auto['displacement_big'] = is_big.astype(int)
auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name,displacement_binary,displacement_big
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu,big,1
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320,big,1
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite,big,1
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst,big,1
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino,big,1


### 10-fold CV

#### Train test split

In [6]:
from sklearn.model_selection import KFold ## for regression
from sklearn.model_selection import StratifiedKFold ## for classification

# 10 stratified folds, shuffled with a fixed seed so the split is reproducible
kfolds = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=1,
)
print(kfolds)

StratifiedKFold(n_splits=10, random_state=1, shuffle=True)


In [8]:
# Pull only the first (train, test) index pair out of the splitter and show it
fold_pairs = iter(kfolds.split(auto, auto['displacement_big']))
train_index, test_index = next(fold_pairs)
print("trian_index:{}\n\ntest_index;{}".format(train_index, test_index))

trian_index:[  0   1   2   3   4   5   6   8   9  12  13  14  15  16  17  18  19  20
  21  22  24  25  26  27  28  29  30  31  32  33  34  35  36  38  39  40
  41  43  44  45  46  47  48  49  50  51  52  53  54  56  57  58  59  61
  62  63  64  65  66  67  68  69  70  71  72  73  74  76  77  78  79  81
  82  83  84  85  87  88  89  90  91  92  93  94  95  96  97  99 100 101
 102 103 104 105 106 107 108 109 110 111 112 113 114 115 118 119 120 121
 122 124 125 127 128 129 130 131 133 134 135 136 137 138 139 141 143 144
 145 146 147 148 149 150 151 152 153 154 155 156 157 159 160 161 162 163
 164 165 166 167 168 169 170 171 173 174 175 176 177 178 179 180 181 182
 183 184 185 186 187 188 190 191 192 194 195 196 197 198 199 200 201 202
 203 204 205 206 207 208 209 210 211 212 214 215 216 217 218 219 220 221
 222 223 224 225 226 227 228 230 231 232 233 234 236 237 238 240 241 242
 243 244 245 246 247 248 249 250 251 252 253 255 256 258 259 260 261 263
 264 265 266 267 268 269 270 271 272 27

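The stratified split above preserves the class proportions of `displacement_big` in each fold, as the counts for the full data and for the test fold below show.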

In [19]:
# Class counts of the response over the whole data set
auto.loc[:, 'displacement_big'].value_counts()

0    222
1    170
Name: displacement_big, dtype: int64

In [20]:
# Class counts of the response inside the test fold.
# `test_index` comes from `kfolds.split`, which yields row *positions*, so the
# rows are selected with position-based `.iloc` rather than label-based `.loc`;
# the two only coincide while `auto` keeps its default 0..n-1 index.
auto['displacement_big'].iloc[test_index].value_counts()

0    23
1    17
Name: displacement_big, dtype: int64

#### CV implementation

In [21]:
# Empty accumulators for the per-fold classification error and AUC
cv_classification_errors_1, cv_auc_1 = [], []

In [22]:
import statsmodels.formula.api as smf
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

# Start from empty accumulators so that re-running this cell does not append a
# second set of fold results to the lists (which would distort the means that
# are reported from them).
cv_classification_errors_1 = []
cv_auc_1 = []

for train_index, test_index in kfolds.split(auto, auto['displacement_big']):
  # kfolds.split yields row *positions*, so rows are selected with .iloc
  # (position-based) instead of label-based lookups
  train_data = auto.iloc[train_index]
  test_data = auto.iloc[test_index]

  # train the logistic model on the training rows of this fold only
  result = smf.logit('displacement_big ~ mpg + horsepower', data=train_data).fit()

  # select the test set according to test_index produced by kfolds.split
  X_test = test_data[["mpg","horsepower"]]
  y_test = test_data["displacement_big"]

  # compute the predicted probabilities for the test data
  result_prob = result.predict(X_test)

  # select 0.5 as the threshold: True means the model predicts class 1
  result_pred = (result_prob > 0.5)

  # classification error = share of test rows whose prediction differs from
  # the 0/1 label (True compares equal to 1, False to 0)
  classification_error = np.mean(result_pred != y_test)

  # add the computed classification error to "cv_classification_errors_1" to store the result
  cv_classification_errors_1.append(classification_error)

  # calculate the auc from the ROC curve of this fold
  fpr,tpr,threshold = roc_curve(y_test, result_prob)
  roc_auc = auc(fpr,tpr) # area under the (fpr, tpr) curve

  # add the computed auc to "cv_auc_1" to store the result
  cv_auc_1.append(roc_auc)

Optimization terminated successfully.
         Current function value: 0.224324
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.240459
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.221340
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.232408
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.224961
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.227415
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.218369
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.220679
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.202915
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.229121
  

Classification Error

In [27]:
# Per-fold classification errors collected by the manual CV loop
fold_errors_report = "classification errors using 10-fold CV: {}\n".format(cv_classification_errors_1)
print(fold_errors_report)

classification errors using 10-fold CV: [0.125, 0.025, 0.1282051282051282, 0.05128205128205128, 0.1282051282051282, 0.05128205128205128, 0.15384615384615385, 0.10256410256410256, 0.07692307692307693, 0.10256410256410256]



In [28]:
# Average the per-fold errors into a single CV estimate
mean_cv_error = np.mean(cv_classification_errors_1)
print("mean of classification errors using 10-fold CV: {}\n".format(mean_cv_error))

mean of classification errors using 10-fold CV: 0.09448717948717947



AUC – an aggregate performance measure across all classification thresholds

In [29]:
# Report the per-fold AUC values first, then their mean
for report_template, reported_value in (
    ("auc using 10-fold CV: {}\n", cv_auc_1),
    ("mean of auc using 10-fold CV: {}\n", np.mean(cv_auc_1)),
):
  print(report_template.format(reported_value))

auc using 10-fold CV: [0.9744245524296675, 1.0, 0.9545454545454546, 0.9893048128342246, 0.9679144385026738, 0.9705882352941176, 0.9598930481283422, 0.9572192513368984, 0.93048128342246, 0.981283422459893]

mean of auc using 10-fold CV: 0.9685654498953731



#### Easier with sklearn
Comparing two models with CV

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings("ignore")

# Unpenalised logistic regression, evaluated with the integer shortcut cv=10
logistic_model = LogisticRegression(max_iter=10000, penalty='none')

# Model 1 uses mpg + horsepower, model 2 uses weight + acceleration; both
# predict displacement_big. Despite the "error" prefix, these variables hold
# the per-fold scores that are printed below as accuracies.
error_model_1_cv, error_model_2_cv = [
    cross_val_score(logistic_model, auto[predictors], auto['displacement_big'], cv=10)
    for predictors in (['mpg','horsepower'], ['weight','acceleration'])
]

print("Logisgic Regression: \n")
# One line per model: fold accuracies and 1 - mean accuracy
for fold_accuracies in (error_model_1_cv, error_model_2_cv):
  print("accuracies of 10-folds:",fold_accuracies,"(mean classification error:",1-np.mean(fold_accuracies),")")


Logisgic Regression: 

accuracies of 10-folds: [0.925      0.75       0.97435897 0.92307692 0.87179487 0.8974359
 0.8974359  1.         0.87179487 0.84615385] (mean classification error: 0.10429487179487162 )
accuracies of 10-folds: [0.925      0.95       1.         0.84615385 0.8974359  0.94871795
 0.92307692 0.94871795 0.87179487 0.92307692] (mean classification error: 0.07660256410256405 )


Instead of passing `cv=10`, we now reuse the `kfolds` splitter created earlier (stratified, shuffled, `random_state=1`), so the folds are the same as in the manual implementation above.

In [35]:
# Display the splitter created earlier; it is passed as `cv` in the next cell
kfolds

StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

In [34]:
# Same two models as before, but scored on the folds produced by `kfolds`
logistic_model = LogisticRegression(max_iter=10000, penalty='none')
response = auto['displacement_big']

# Model 1: mpg + horsepower
error_model_1_cv = cross_val_score(
    logistic_model, auto[['mpg','horsepower']], response, cv=kfolds
)
# Model 2: weight + acceleration
error_model_2_cv = cross_val_score(
    logistic_model, auto[['weight','acceleration']], response, cv=kfolds
)

print("Logisgic Regression: \n")
# Fold accuracies per model, followed by 1 - mean accuracy
print("accuracies of 10-folds:", error_model_1_cv, "(mean classification error:", 1 - np.mean(error_model_1_cv), ")")
print("accuracies of 10-folds:", error_model_2_cv, "(mean classification error:", 1 - np.mean(error_model_2_cv), ")")

Logisgic Regression: 

accuracies of 10-folds: [0.875      0.975      0.87179487 0.94871795 0.87179487 0.94871795
 0.84615385 0.8974359  0.92307692 0.8974359 ] (mean classification error: 0.09448717948717944 )
accuracies of 10-folds: [0.9        0.975      0.84615385 0.97435897 0.94871795 0.8974359
 0.87179487 0.94871795 0.97435897 0.87179487] (mean classification error: 0.07916666666666661 )
