### Evaluating Cross validation methods on Cancer data


In [1]:
import numpy as np
from sklearn.model_selection import KFold
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Load the cancer dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='cancer.csv')

In [4]:
# Preview the first rows (up to 20) to get a feel for the columns.
data.head(n=20)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,
5,843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,...,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244,
6,844359,M,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,...,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368,
7,84458202,M,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,...,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151,
8,844981,M,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,...,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072,
9,84501001,M,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,...,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075,


In [5]:
# Count missing values per column.
data.isna().sum()

id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
fractal_dimension_worst      0
Unnamed:

In [6]:
# Remove the 'Unnamed: 32' column, which shows no values in the preview.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone is a no-op instead of raising KeyError.
data = data.drop(columns=['Unnamed: 32'], errors='ignore')

The `id` feature will not be of any use for our model, so I will drop it when building the feature matrix.

After dropping the empty `Unnamed: 32` column, the data doesn't have any null values.

I will go ahead with model building without doing any exploratory data analysis, as this is a widely used dataset.

Our purpose is to evaluate different cross-validation techniques on the model.

Our aim is to predict the diagnosis (the `diagnosis` column) from the input features.

Since this is a classification problem, we can use a decision tree to solve it.

In [7]:
# Number of distinct classes in the target column.
data.loc[:, 'diagnosis'].nunique()

2

In [8]:
# Class balance of the target column.
data.loc[:, 'diagnosis'].value_counts()

diagnosis
B    357
M    212
Name: count, dtype: int64

In [10]:
# Target: the diagnosis label.
y = data['diagnosis']
# Features: every remaining column except the row identifier and the target.
X = data.drop(columns=['id', 'diagnosis'])

#### Hold-Out Method

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [12]:
# 70/30 hold-out split; the fixed seed makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=100,
)

In [13]:
# Decision tree used for the hold-out evaluation.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split (even with splitter='best'), so an unseeded tree can give
# different scores from one run to the next.
dt = DecisionTreeClassifier(random_state=100)

In [14]:
# Train on the hold-out training split. fit() returns the estimator itself,
# so dtmodel is simply another name for the fitted dt.
dt.fit(X_train, y_train)
dtmodel = dt

In [15]:
# Accuracy on the data the tree was trained on.
dt.score(X=X_train, y=y_train)

1.0

In [16]:
# Accuracy on the held-out test split.
hoo_result = dtmodel.score(X=X_test, y=y_test)

In [17]:
# Report the test-set accuracy obtained from the single train/test split.
print("The accuracy score is for Hold one out method :",hoo_result)

The accuracy score is for Hold one out method : 0.9415204678362573


#### K-Fold Method

In [22]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score

In [23]:
# Fresh, unfitted decision tree for the cross-validation experiments.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split (even with splitter='best'); without a seed the same folds can
# produce different accuracies on each run.
dt = DecisionTreeClassifier(random_state=100)

In [24]:
# Five consecutive, unshuffled folds (shuffle=False is the KFold default).
kf = KFold(n_splits=5, shuffle=False)

In [25]:
# One accuracy value per fold, using the estimator's default scorer.
kfold_score = cross_val_score(estimator=dt, X=X, y=y, cv=kf)

In [26]:
# Show the individual accuracy obtained on each of the five folds.
print("The cross validation scores of k-fold method with 5 folds is",kfold_score)

The cross validation scores of k-fold method with 5 folds is [0.87719298 0.9122807  0.94736842 0.89473684 0.90265487]


In [27]:
# Average accuracy across the five folds.
kfold_score_mean = np.mean(kfold_score)

In [28]:
# Summarise the spread of the fold accuracies together with their mean.
print("The min accuracy from k-fold CV is", kfold_score.min())
print("The max accuracy from k-fold CV is", kfold_score.max())
print(
    "The mean cross validation scores of k-fold method with 5 folds is",
    kfold_score_mean,
)

The min accuracy from k-fold CV is 0.8771929824561403
The max accuracy from k-fold CV is 0.9473684210526315
The mean cross validation scores of k-fold method with 5 folds is 0.9068467629250115


#### Complete implementation of k-fold cross-validation

In [29]:
# Accumulator for per-fold accuracies and the splitter for the hand-written loop.
acc_score_kfold = list()
kf2 = KFold(n_splits=5, shuffle=False)


In [30]:
# Hand-written k-fold cross validation: fit on each training split and score
# on the matching held-out split.
# The score list is (re)created here so that re-running this cell starts from
# an empty list instead of appending to results from a previous run.
acc_score_kfold = []

for train_ind, test_ind in kf2.split(X, y):
    # split() yields positional indices, so X and y are both indexed with
    # .iloc; plain y[...] would do a label lookup and only works by accident
    # when the index happens to be 0..n-1.
    X_train, X_test = X.iloc[train_ind, :], X.iloc[test_ind, :]
    y_train, y_test = y.iloc[train_ind], y.iloc[test_ind]

    dt.fit(X_train, y_train)
    pred_values = dt.predict(X_test)

    # accuracy_score expects (y_true, y_pred).
    acc = accuracy_score(y_test, pred_values)
    acc_score_kfold.append(acc)

# Divide by the number of folds actually evaluated rather than a hard-coded 5.
avg_acc_score = sum(acc_score_kfold) / len(acc_score_kfold)

print('accuracy of each fold : {}'.format(acc_score_kfold))
print('Avg accuracy : {}'.format(avg_acc_score))


accuracy of each fold : [0.8771929824561403, 0.9298245614035088, 0.9473684210526315, 0.9385964912280702, 0.831858407079646]
Avg accuracy : 0.9049681726439992


#### Stratified K-Fold Method


In [31]:
from sklearn.model_selection import StratifiedKFold

In [32]:
# Ten unshuffled folds that keep the class proportions of y in every fold
# (shuffle=False is the StratifiedKFold default).
skfold = StratifiedKFold(n_splits=10, shuffle=False)

In [33]:
# One accuracy value per stratified fold.
skfold_score = cross_val_score(estimator=dt, X=X, y=y, cv=skfold)

In [34]:
# Show the individual accuracy obtained on each of the ten stratified folds.
print("The accuracy of Stratified k-fold method with 10 folds is",skfold_score)

The accuracy of Stratified k-fold method with 10 folds is [0.92982456 0.85964912 0.9122807  0.89473684 0.9122807  0.89473684
 0.89473684 0.94736842 0.9122807  0.92857143]


In [35]:
# Average accuracy across the ten stratified folds.
skfold_score_mean = skfold_score.mean()
# The message now says "mean" and spells "Stratified" correctly, so it is not
# confused with the per-fold printout.
print("The mean accuracy of Stratified k-fold method with 10 folds is", skfold_score_mean)

The accuracy of Stratifieid k-fold method with 10 folds is 0.9086466165413534


#### Leave-One-Out Method

In [None]:
from sklearn.model_selection import LeaveOneOut

In [None]:
# Splitter for leave-one-out cross validation, passed as `cv` to cross_val_score below.
loocv = LeaveOneOut()

In [None]:
# Score the tree under leave-one-out cross validation.
loocv_score = cross_val_score(estimator=dt, X=X, y=y, cv=loocv)

In [None]:
# Show the raw leave-one-out scores.
# NOTE(review): presumably one score per sample, so this prints a long array —
# consider printing only a summary; confirm before sharing the notebook.
print("The accuracy of Leave one out method is",loocv_score)

In [None]:
# Average of the leave-one-out scores.
loocv_score_mean = loocv_score.mean()
# The stray "with" has been removed from the message.
print("The average accuracy of Leave one out method is", loocv_score_mean)