# 3_cross_validation_on_classification.ipynb

## Data import and test

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import mean_squared_error

In [2]:
# Base name (without the .csv extension) of the raw data file to analyse.
name_data_file = "heart_failure_clinical_records_dataset"

# Read the raw records; cells containing a literal "?" are parsed as missing values.
data = pd.read_csv(
    f"../../raw_data/{name_data_file}.csv",
    na_values=["?"],
)

In [5]:
# Separate the target column (labels) from the remaining columns (features).
target_column = "DEATH_EVENT"
y = data[target_column]
X = data.drop(target_column, axis="columns")

# Sanity checks: fail early if the file on disk is not the expected dataset.
assert X.shape == (299, 12), (
    "There should be 299 samples and 12 features in the heart_failure_clinical_records_dataset dataset."
)
assert y.shape == (299,), (
    "There should be 299 labels in the heart_failure_clinical_records_dataset dataset."
)

# Cross validation of the methods

In [None]:
# Two-level (nested) cross-validation.
# Only the majority-class baseline is evaluated so far; the other models still
# have to be added to the inner loop.
outer_folds_k_1 = 10
inner_folds_k_2 = 10
random_state = 42


def _baseline_mse(y_train, y_test):
    """Return the MSE of predicting the most frequent training label for every test sample.

    Parameters
    ----------
    y_train : pd.Series
        Labels used to determine the majority class.
    y_test : pd.Series
        Labels the constant prediction is scored against.

    Returns
    -------
    float
        Mean squared error of the constant prediction. With labels coded as
        0 and 1 every wrong prediction contributes exactly 1, so the value
        equals the misclassification rate.
    """
    # Series.mode() returns all modes; [0] picks the first one.
    y_train_largest_class = y_train.mode()[0]
    y_test_pred = pd.Series(y_train_largest_class, index=y_test.index)
    return mean_squared_error(y_test, y_test_pred)


CV_outer = KFold(n_splits=outer_folds_k_1, shuffle=True, random_state=random_state)

fold_results = []  # one summary record (dict) per outer fold
outer_test_mse = []  # not populated yet: no model selection is implemented in this cell
outer_fold_index = 0
inner_fold_index = 0

# KFold.split() yields positional index arrays for the train and test part of
# each fold. With shuffle=True they are not contiguous, hence the .iloc lookups.
for outer_fold_index, (outer_train_idx, outer_test_idx) in enumerate(CV_outer.split(X), start=1):
    X_train_outer, X_test_outer = X.iloc[outer_train_idx], X.iloc[outer_test_idx]
    y_train_outer, y_test_outer = y.iloc[outer_train_idx], y.iloc[outer_test_idx]

    # Inner cross-validation runs on the outer training part only.
    CV_inner = KFold(n_splits=inner_folds_k_2, shuffle=True, random_state=random_state)
    inner_mse_baseline = []  # baseline error of every inner fold

    # enumerate() restarts the inner counter at 1 for every outer fold
    # (previously it kept counting across outer folds).
    for inner_fold_index, (inner_train_idx, inner_test_idx) in enumerate(
        CV_inner.split(X_train_outer), start=1
    ):
        # The feature splits are not needed by the baseline but will be by real models.
        X_train_inner, X_test_inner = X_train_outer.iloc[inner_train_idx], X_train_outer.iloc[inner_test_idx]
        y_train_inner, y_test_inner = y_train_outer.iloc[inner_train_idx], y_train_outer.iloc[inner_test_idx]

        inner_mse_baseline.append(_baseline_mse(y_train_inner, y_test_inner))

    # Unweighted average of the inner-fold errors.
    inner_mse_mean = np.mean(inner_mse_baseline)

    # Progress output. TODO: run the cross-validation with all three models and
    # evaluate the one with the lowest inner MSE on the outer test set.
    print(f"Baseline: For outer fold {outer_fold_index} Mean Inner fold MSE:", inner_mse_mean)

    # Keep the numbers instead of only printing them. The outer-test error is
    # the baseline fitted on the whole outer training part and scored on the
    # held-out outer test part.
    fold_results.append(
        {
            "outer_fold": outer_fold_index,
            "baseline_inner_mse": inner_mse_mean,
            "baseline_outer_test_mse": _baseline_mse(y_train_outer, y_test_outer),
        }
    )

Baseline: For outer fold 1 Mean Inner fold MSE: 0.31168091168091167
Baseline: For outer fold 2 Mean Inner fold MSE: 0.30826210826210826
Baseline: For outer fold 3 Mean Inner fold MSE: 0.3121082621082621
Baseline: For outer fold 4 Mean Inner fold MSE: 0.3085470085470085
Baseline: For outer fold 5 Mean Inner fold MSE: 0.3384615384615385
Baseline: For outer fold 6 Mean Inner fold MSE: 0.3232193732193732
Baseline: For outer fold 7 Mean Inner fold MSE: 0.3264957264957265
Baseline: For outer fold 8 Mean Inner fold MSE: 0.33048433048433046
Baseline: For outer fold 9 Mean Inner fold MSE: 0.3121082621082621
Baseline: For outer fold 10 Mean Inner fold MSE: 0.337037037037037
