### Cross-Validation and K-Fold Exercise

#### Importing the libraries

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold

#### Importing the dataset

In [3]:
# Load the breast-cancer features/labels and hold out 20% as a stratified test set.
# NOTE: the test features are bound to `X_text` (sic); later cells reference
# that exact name, so it is kept unchanged here.
X, y = datasets.load_breast_cancer(return_X_y=True)
X_train, X_text, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

#### Data Preprocessing

##### Check whether feature scaling is necessary

In [4]:
# Per-feature summary statistics of the raw training features.
df_train = pd.DataFrame(data=X_train)
df_train.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
count,455.0,455.0,455.0,455.0,455.0,455.0,455.0,455.0,455.0,455.0,...,455.0,455.0,455.0,455.0,455.0,455.0,455.0,455.0,455.0,455.0
mean,14.067213,19.247363,91.557407,648.541099,0.096167,0.103869,0.089193,0.048344,0.180618,0.06282,...,16.177226,25.647297,106.625297,869.026593,0.132329,0.254329,0.276578,0.113904,0.290865,0.083945
std,3.49938,4.405291,24.149231,344.944564,0.013458,0.053522,0.081747,0.038925,0.028074,0.007159,...,4.77002,6.22547,33.195053,552.926912,0.02255,0.159882,0.215937,0.066784,0.064624,0.018408
min,6.981,9.71,43.79,143.5,0.06251,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.635,16.0,74.72,415.65,0.086475,0.06375,0.02801,0.02022,0.16175,0.057685,...,13.01,21.09,83.715,513.9,0.11595,0.14585,0.1079,0.06339,0.2494,0.071835
50%,13.27,18.82,85.98,541.8,0.09566,0.09097,0.05999,0.03263,0.1781,0.06144,...,14.91,25.4,97.59,683.4,0.1314,0.2116,0.2298,0.09722,0.2819,0.07993
75%,15.74,21.71,103.7,770.05,0.10485,0.1301,0.1322,0.07382,0.1953,0.06625,...,18.55,29.37,124.95,1033.5,0.1462,0.3368,0.3853,0.1625,0.3201,0.09207
max,28.11,39.28,188.5,2499.0,0.1447,0.3454,0.4268,0.2012,0.304,0.09744,...,33.13,49.54,229.3,3432.0,0.2184,1.058,1.252,0.291,0.6638,0.2075


In [5]:
# Per-feature summary statistics of the raw held-out test features.
df_test = pd.DataFrame(data=X_text)
df_test.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
count,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,...,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0
mean,14.367079,19.458421,93.61193,680.225439,0.09713,0.106225,0.087227,0.051215,0.183332,0.062709,...,16.636237,25.796667,109.799298,926.707895,0.132527,0.254011,0.254668,0.117408,0.286924,0.083951
std,3.626623,3.870388,24.928082,379.002396,0.016305,0.050065,0.071367,0.038397,0.024598,0.00668,...,5.08294,5.844268,35.218674,631.217286,0.024028,0.14741,0.176207,0.061556,0.049475,0.016683
min,7.76,11.97,47.92,181.0,0.05263,0.03116,0.0,0.0,0.1353,0.05044,...,9.456,14.1,59.16,268.6,0.08864,0.05232,0.0,0.0,0.1999,0.05933
25%,11.89,16.685,76.375,434.25,0.085377,0.067132,0.03909,0.022295,0.16425,0.058147,...,13.035,21.0225,85.075,515.95,0.120025,0.1486,0.1214,0.073853,0.25315,0.07118
50%,13.68,19.025,87.91,571.45,0.09687,0.09762,0.06694,0.04165,0.18435,0.062325,...,15.0,25.43,98.79,693.3,0.1312,0.2177,0.21065,0.10955,0.28225,0.080535
75%,16.245,21.9225,106.525,811.05,0.1075,0.131525,0.117375,0.073072,0.197375,0.06575,...,19.49,30.3775,128.425,1145.0,0.14545,0.34235,0.3758,0.157,0.307475,0.092165
max,27.42,29.33,186.9,2501.0,0.1634,0.277,0.3635,0.1878,0.2595,0.09502,...,36.04,41.85,251.2,4254.0,0.2226,0.8681,0.9387,0.2688,0.4753,0.1431


In [6]:
#### The features are not on the same scale, so we need to apply feature scaling

In [7]:
# Fit the scaler on the training features only, then reuse the SAME fitted
# scaler for the test features. Fitting a second scaler on the test set would
# standardise train and test with different means/variances and would leak
# test-set statistics into preprocessing.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_text)

#### View the data after feature scaling

In [8]:
# Summary statistics of the standardised training features.
df_train_std = pd.DataFrame(data=X_train_std)
df_train_std.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
count,455.0,455.0,455.0,455.0,455.0,455.0,455.0,455.0,455.0,455.0,...,455.0,455.0,455.0,455.0,455.0,455.0,455.0,455.0,455.0,455.0
mean,-4.337434e-15,2.240942e-15,-7.437274e-16,1.503071e-16,5.22366e-15,-2.775802e-15,-7.046866e-16,6.031805e-16,-3.263812e-15,-3.031519e-15,...,4.060244e-16,5.676534e-15,-2.402962e-15,-1.022869e-15,4.172487e-15,-5.465713e-16,-6.383172e-16,-4.489693e-16,-1.034581e-15,-2.088683e-15
std,1.001101,1.001101,1.001101,1.001101,1.001101,1.001101,1.001101,1.001101,1.001101,1.001101,...,1.001101,1.001101,1.001101,1.001101,1.001101,1.001101,1.001101,1.001101,1.001101,1.001101
min,-2.02722,-2.167362,-1.980187,-1.465734,-2.50373,-1.58033,-1.092292,-1.243358,-2.660791,-1.798182,...,-1.730874,-2.191368,-1.695348,-1.238101,-2.715107,-1.421602,-1.282241,-1.707442,-2.081459,-1.571972
25%,-0.6958063,-0.737962,-0.6979907,-0.6758983,-0.7210063,-0.7504076,-0.7492716,-0.72332,-0.6728116,-0.7179958,...,-0.6647169,-0.7328464,-0.6909317,-0.6429737,-0.7271308,-0.6792384,-0.7820068,-0.7572169,-0.6423407,-0.6585793
50%,-0.2280663,-0.097118,-0.2312101,-0.3097848,-0.03774691,-0.2412686,-0.3576338,-0.4041472,-0.08978972,-0.1929345,...,-0.2659572,-0.0397671,-0.2724877,-0.3360859,-0.04123889,-0.2675449,-0.2168678,-0.2501005,-0.1388815,-0.2183342
75%,0.47855,0.5596334,0.5033684,0.352644,0.6458845,0.4906419,0.5266743,0.6552186,0.5235422,0.4796473,...,0.4979823,0.5986377,0.5526388,0.297787,0.6157967,0.5163947,0.5040438,0.7284559,0.4528766,0.4418976
max,4.017353,4.55241,4.018733,5.370416,3.610271,4.51774,4.134445,3.931305,4.399657,4.840942,...,3.557938,3.84212,3.69964,4.640386,3.821065,5.032188,4.52214,2.654689,5.777151,6.719538


#### Use nested k-fold cross-validation to tune and evaluate the Decision Tree model

In [11]:
# Nested cross-validation for the decision tree:
#   * the inner loop picks `max_depth` by mean validation accuracy,
#   * the outer loop scores a tree refit with that depth on the outer fold.
k_outer = 5
k_inner = 5

# One entry per outer fold: its validation accuracy and the depth it selected.
accuracies = []
best_hyper_parameter = []

outer_cv = KFold(n_splits=k_outer, shuffle=True, random_state=42)
for train_index, validation_index in outer_cv.split(X_train_std):
    X_train_outer, X_validation_outer = X_train_std[train_index], X_train_std[validation_index]
    y_train_outer, y_validation_outer = y_train[train_index], y_train[validation_index]

    bestAccuracy = 0
    bestHyperParameter = None

    # NOTE(review): with a few hundred training rows the fitted trees may never
    # reach depth 10, in which case every candidate below yields the same tree
    # and the first value always wins -- consider a shallower grid.
    for max_depth in range(10, 101, 10):
        inner_accuracies = []

        inner_cv = KFold(n_splits=k_inner, shuffle=True, random_state=42)
        # Distinct index names: the inner split must not overwrite the outer
        # loop's `train_index` / `validation_index`.
        for inner_train_index, inner_validation_index in inner_cv.split(X_train_outer):
            X_train_inner = X_train_outer[inner_train_index]
            X_validation_inner = X_train_outer[inner_validation_index]
            y_train_inner = y_train_outer[inner_train_index]
            y_validation_inner = y_train_outer[inner_validation_index]

            classifier = DecisionTreeClassifier(criterion='entropy', max_depth=max_depth, random_state=42)
            classifier.fit(X_train_inner, y_train_inner)

            y_pred = classifier.predict(X_validation_inner)
            inner_accuracies.append(accuracy_score(y_validation_inner, y_pred))

        innerMeanAccuracy = np.mean(inner_accuracies)

        # Strict '>' keeps the first (smallest) depth when candidates tie.
        if innerMeanAccuracy > bestAccuracy:
            bestAccuracy = innerMeanAccuracy
            bestHyperParameter = max_depth

    # Refit on the whole outer-training split with the selected depth and
    # score it on the untouched outer-validation split.
    finalClassifier = DecisionTreeClassifier(criterion='entropy', max_depth=bestHyperParameter, random_state=42)
    finalClassifier.fit(X_train_outer, y_train_outer)

    y_pred = finalClassifier.predict(X_validation_outer)
    accuracy = accuracy_score(y_validation_outer, y_pred)
    accuracies.append(accuracy)
    best_hyper_parameter.append(bestHyperParameter)


# Report the mean outer-fold accuracy alongside the per-fold values.
meanAccuracy = np.mean(accuracies)
print("meanAccuracy = ", meanAccuracy)
print("accuracies = ", accuracies)

# Most frequently selected depth across the outer folds.
bestHyperParameter = max(set(best_hyper_parameter), key=best_hyper_parameter.count)
print("bestHyperParameter = ", bestHyperParameter)

accuracies =  [0.9560439560439561, 0.945054945054945, 0.945054945054945, 0.9230769230769231, 0.8791208791208791]
bestHyperParameter =  10


#### Use nested k-fold cross-validation to tune and evaluate the KNN model

In [18]:
# Nested cross-validation for KNN: the inner folds choose `n_neighbors`,
# the outer folds score a model refit with the chosen value.
k_outer = 5
k_inner = 5

accuracies = []
best_hyper_parameter = []
hyperParamterK = [1, 3, 5, 11, 21, 31]

outer_cv = KFold(n_splits=k_outer, shuffle=True, random_state=42)
for train_index, validation_index in outer_cv.split(X_train_std):
    X_train_outer = X_train_std[train_index]
    X_validation_outer = X_train_std[validation_index]
    y_train_outer = y_train[train_index]
    y_validation_outer = y_train[validation_index]

    bestAccuracy = 0
    bestHyperParameter = None

    for kValue in hyperParamterK:
        inner_cv = KFold(n_splits=k_inner, shuffle=True, random_state=42)

        # One accuracy per inner fold: fit on the inner-training rows and
        # score on the inner-validation rows.
        inner_accuracies = [
            accuracy_score(
                y_train_outer[inner_val_idx],
                KNeighborsClassifier(n_neighbors=kValue, metric='minkowski', p=2)
                .fit(X_train_outer[inner_fit_idx], y_train_outer[inner_fit_idx])
                .predict(X_train_outer[inner_val_idx]),
            )
            for inner_fit_idx, inner_val_idx in inner_cv.split(X_train_outer)
        ]

        meanAccuracy = np.mean(inner_accuracies)

        # Strict '>' keeps the earliest candidate when several tie.
        if meanAccuracy > bestAccuracy:
            bestAccuracy, bestHyperParameter = meanAccuracy, kValue

    # Refit with the winning k on the full outer-training split, then score it
    # on the outer-validation split.
    finalClassifier = KNeighborsClassifier(n_neighbors=bestHyperParameter, metric='minkowski', p=2)
    finalClassifier.fit(X_train_outer, y_train_outer)

    y_pred = finalClassifier.predict(X_validation_outer)
    accuracy = accuracy_score(y_validation_outer, y_pred)
    accuracies.append(accuracy)
    best_hyper_parameter.append(bestHyperParameter)

meanAccuracy = np.mean(accuracies)
print("meanAccuracy = ", meanAccuracy)
print("accuracies = ", accuracies)

# Most frequently selected k across the outer folds.
bestHyperParameter = max(set(best_hyper_parameter), key=best_hyper_parameter.count)
print("bestHyperParameter = ", bestHyperParameter)

meanAccuracy =  0.9626373626373628
accuracies =  [0.9560439560439561, 0.989010989010989, 0.978021978021978, 0.967032967032967, 0.9230769230769231]
bestHyperParameter =  3


#### Choose the best model and predict on the test set

In [15]:
# Final model: KNN with n_neighbors=3 (Euclidean distance), trained on the
# whole scaled training set and applied to the scaled test set.
classifier = KNeighborsClassifier(n_neighbors=3, metric='minkowski', p=2).fit(X_train_std, y_train)
y_pred = classifier.predict(X_test_std)

#### Evaluate the model

In [16]:
# Fraction of test samples whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("accuracy = ", accuracy)

accuracy =  0.9736842105263158


In [17]:
# Confusion matrix on the test set: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("cm = ", cm)

cm =  [[39  3]
 [ 0 72]]
