# reg_part_B_NN

# Data import libraries

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import torch

In [22]:
# Base name (without extension) of the raw CSV file to load.
name_data_file = "heart_failure_clinical_records_dataset"

# Read the dataset; literal "?" entries are parsed as missing values.
data = pd.read_csv(
    f"../../raw_data/{name_data_file}.csv",
    na_values=["?"],
)

In [3]:
# Bare last expression: the notebook renders the loaded DataFrame as the cell output.
data

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0,61,1,38,1,155000.00,1.1,143,1,1,270,0
295,55.0,0,1820,0,38,0,270000.00,1.2,139,0,0,271,0
296,45.0,0,2060,1,60,0,742000.00,0.8,138,0,0,278,0
297,45.0,0,2413,0,38,0,140000.00,1.4,140,1,1,280,0


# Cross validation pipeline

In [60]:
# Feature matrix: every column of `data` except the regression target 'time'.
X = data.drop(columns=['time']).to_numpy()
# Target 'time' as a column vector (one row per observation).
y = data['time'].to_numpy().reshape(-1, 1)

# N = number of observations, M = number of features
N, M = X.shape

output_dim = 1  # regression problem: a single continuous output
input_dim = M   # one input per feature

# X.shape, y.shape -> evaluate these to understand the dimensions of X and y

y

array([[  4],
       [  6],
       [  7],
       [  7],
       [  8],
       [  8],
       [ 10],
       [ 10],
       [ 10],
       [ 10],
       [ 10],
       [ 10],
       [ 11],
       [ 11],
       [ 12],
       [ 13],
       [ 14],
       [ 14],
       [ 15],
       [ 15],
       [ 16],
       [ 20],
       [ 20],
       [ 22],
       [ 23],
       [ 23],
       [ 24],
       [ 26],
       [ 26],
       [ 26],
       [ 27],
       [ 28],
       [ 28],
       [ 29],
       [ 29],
       [ 30],
       [ 30],
       [ 30],
       [ 30],
       [ 30],
       [ 31],
       [ 32],
       [ 33],
       [ 33],
       [ 33],
       [ 35],
       [ 38],
       [ 40],
       [ 41],
       [ 42],
       [ 43],
       [ 43],
       [ 43],
       [ 44],
       [ 45],
       [ 50],
       [ 54],
       [ 54],
       [ 55],
       [ 59],
       [ 60],
       [ 60],
       [ 60],
       [ 61],
       [ 63],
       [ 64],
       [ 65],
       [ 65],
       [ 66],
       [ 67],
       [ 68],
      

## Helper Functions

In [24]:
# Normalize data based on training set

def get_fold_data(X, y, train_idx, val_idx):
    """Split one CV fold and standardise the features with training statistics.

    The per-feature mean and standard deviation are computed on the training
    rows only and then applied to both the training and the validation rows,
    so no information from the validation rows enters the normalisation.

    Parameters
    ----------
    X : array-like supporting integer-array row indexing and ``.mean``/``.std``
        along axis 0 (e.g. a 2-D ``numpy.ndarray``).
    y : array-like supporting integer-array indexing (e.g. ``numpy.ndarray``).
    train_idx, val_idx : index arrays selecting the training / validation rows.

    Returns
    -------
    tuple
        ``(X_train_norm, X_val_norm, y_train, y_val)``; the targets are
        returned unscaled.
    """
    X_train = X[train_idx]
    X_val   = X[val_idx]
    y_train = y[train_idx]
    y_val   = y[val_idx]

    mean = X_train.mean(axis=0)
    std  = X_train.std(axis=0)

    # A feature that is constant within the training fold has std == 0;
    # dividing by it would produce NaN/inf. Using 1 instead maps such a
    # feature to 0 in the training set (x - mean == 0) and leaves the
    # validation values simply mean-centred.
    std = np.where(std == 0, 1.0, std)

    X_train_norm = (X_train - mean) / std
    X_val_norm   = (X_val   - mean) / std

    return X_train_norm, X_val_norm, y_train, y_val

# Tensor conversion

def torch_tensor_conversion(X_train, y_train, X_val, y_val):
    """Convert one fold's features and targets to float32 torch tensors.

    Features are converted as-is; targets are additionally reshaped into a
    single column with ``view(-1, 1)``.

    Returns
    -------
    tuple
        ``(X_train_tensor, y_train_tensor, X_val_tensor, y_val_tensor)``.
    """
    def _to_float32(values):
        # Single place where the dtype is fixed for all four tensors.
        return torch.tensor(values, dtype=torch.float32)

    features_train = _to_float32(X_train)
    targets_train = _to_float32(y_train).view(-1, 1)
    features_val = _to_float32(X_val)
    targets_val = _to_float32(y_val).view(-1, 1)

    return features_train, targets_train, features_val, targets_val

## Two-level cross-validation

In [None]:
# Parameters for the two-level cross-validation
random_state = 42      # seed handed to the shuffled KFold splitters

outer_folds_k_1 = 10   # number of outer folds
inner_folds_k_2 = 10   # number of inner folds

In [59]:
# Two-level cross-validation of the baseline model (predict the training mean).
# NOTE: X and y are (re)bound here as pandas objects because the loops below
# select rows positionally with .iloc.
X = data.drop(columns=['time'])
y = data['time']   # pandas Series

CV_outer = KFold(n_splits=outer_folds_k_1, shuffle=True, random_state=random_state)

fold_results = []  # store per-fold errors
outer_test_mse = []  # NOTE(review): declared but not filled in this cell yet
outer_fold_index = 0
inner_fold_index = 0

for outer_train_idx, outer_test_idx in CV_outer.split(X):
    outer_fold_index += 1
    # .split() yields positional indices of the train and test samples for
    # each fold; with shuffle=True these indices are not sequential.
    X_train_outer, X_test_outer = X.iloc[outer_train_idx], X.iloc[outer_test_idx]
    y_train_outer, y_test_outer = y.iloc[outer_train_idx], y.iloc[outer_test_idx]

    # Inner cross-validation, run on the outer training part only.
    CV_inner = KFold(n_splits=inner_folds_k_2, shuffle=True, random_state=random_state) # folds for inner cross-validation object
    inner_mse_baseline = [] # store per-fold errors for inner CV
    for inner_train_idx, inner_test_idx in CV_inner.split(X_train_outer):
        X_train_inner, X_test_inner = X_train_outer.iloc[inner_train_idx], X_train_outer.iloc[inner_test_idx]
        y_train_inner, y_test_inner = y_train_outer.iloc[inner_train_idx], y_train_outer.iloc[inner_test_idx]

        # Baseline prediction: the mean of y in the inner training set,
        # repeated for every inner test sample.
        y_train_mean = y_train_inner.mean()
        # Label the predictions with the test samples' index (not with the
        # target values themselves) so that they line up with y_test_inner.
        y_test_pred_inner = pd.Series(y_train_mean, index=y_test_inner.index)

        # MSE for this inner fold: mean of the squared differences between
        # predicted and actual y values on the inner test set.
        inner_mse_baseline.append(mean_squared_error(y_test_inner, y_test_pred_inner))

    inner_mse_mean = np.mean(inner_mse_baseline) # average MSE across inner folds
    # Progress print only. In reality the inner cross-validation will be done
    # for 3 models, and the one with the lowest inner MSE will be selected for
    # the outer test set evaluation.
    print(f"For outer fold {outer_fold_index} Mean Inner fold MSE:", inner_mse_mean)


For outer fold 1 Mean Inner fold MSE: 6068.262610662071
For outer fold 2 Mean Inner fold MSE: 6064.930475785971
For outer fold 3 Mean Inner fold MSE: 5976.123268870377
For outer fold 4 Mean Inner fold MSE: 6117.984225693169
For outer fold 5 Mean Inner fold MSE: 6164.234766059408
For outer fold 6 Mean Inner fold MSE: 5928.042359268602
For outer fold 7 Mean Inner fold MSE: 5910.197012358446
For outer fold 8 Mean Inner fold MSE: 5868.612915368192
For outer fold 9 Mean Inner fold MSE: 6024.998737762518
For outer fold 10 Mean Inner fold MSE: 6037.150942437636
