Copyright © 2023 "Bronte" Sihan Li

## Heart Disease Prediction Using an ANN

In [1]:
import os
import pandas as pd
import numpy as np
import plotly.express as px
import torch
from torch.utils.data.dataloader import default_collate

In [2]:
# Load training and test set
train_data = pd.read_csv('data/heart_train.csv')
test_data = pd.read_csv('data/heart_test.csv')
train_data.head()

train_data['HeartDisease'].value_counts()

1    403
0    315
Name: HeartDisease, dtype: int64

In [3]:
train_data.shape

(718, 12)

### Preprocessing

In [4]:
from sklearn.preprocessing import StandardScaler
from prince import FAMD

In [5]:
NUMERICAL_COLS = []
CATEGORICAL_COLS = []
for col in train_data.drop(columns=['HeartDisease']).columns:
    print(col, train_data[col].unique())
    if type(train_data[col].unique()[0]) == str:
        print(train_data[col].value_counts())
        CATEGORICAL_COLS.append(col)
    else:
        NUMERICAL_COLS.append(col)

Age [66 65 63 58 54 38 51 62 55 52 46 60 59 36 43 41 47 49 45 53 40 61 57 64
 69 74 75 33 56 35 39 37 50 32 42 76 28 34 72 71 48 44 70 30 67 68 31 77
 29]
Sex ['F' 'M']
M    575
F    143
Name: Sex, dtype: int64
ChestPainType ['NAP' 'ASY' 'ATA' 'TA']
ASY    389
NAP    154
ATA    136
TA      39
Name: ChestPainType, dtype: int64
RestingBP [146 150 136 192 105 100 130 110 140 120 115 112 155 134 125 160 106 200
 142 154 145 104 156 135  80 148 122 118 124 137  95 152 132 126 131 172
 128 170 138 133 114 174 180 108 141 144 101 139 178 190 165 158 143  92
 123  96 102 164 127 117 113   0 185]
Cholesterol [278 235 223 164 283   0 213 224 203 214 100 230 284 293 169 308 204 267
 315 268 211 271 253 237 212 246 264 254 240 177 186 210 247 276 231 338
 303 216 208 173 201 310 298 274 192 225 289 219 160 207 185 281 341 245
 282 295 257 197 312 132 233 221 238 417 152 297 199 184 161 123 180 412
 262 306 232 342 159 258 458 249 327 181 209 260 166 248 266 141 222 234
 182 218 318 564 518 316 294

In [6]:
# Normalize numerical columns with standard scaler
sc = StandardScaler()
train_data[NUMERICAL_COLS] = sc.fit_transform(train_data[NUMERICAL_COLS])
test_data[NUMERICAL_COLS] = sc.transform(test_data[NUMERICAL_COLS])

def transform_famd(data):
    # Apply FAMD to data
    famd = FAMD(n_components=12, n_iter=3, copy=True, check_input=True, engine='auto', random_state=42)
    famd.fit(data.drop(columns=['HeartDisease']))
    projected_famd = famd.transform(data.drop(columns=['HeartDisease']))
    return projected_famd

In [7]:
train_data_projected = transform_famd(train_data)

# Split data into train and validation
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train_data_projected, train_data['HeartDisease'], test_size=0.2, random_state=42)

In [8]:
class HeartDiseaseDataset(torch.utils.data.Dataset):
    def __init__(self, X_train: pd.DataFrame, y_train: pd.DataFrame, transform=None):
        self.transform = transform

        self.x_data = X_train.to_numpy()
        self.y_data = y_train.to_numpy()

    # Get data size
    def __len__(self):
        return len(self.x_data)

    # Getting data samples
    def __getitem__(self, idx):
        x = torch.from_numpy(self.x_data[idx])
        x = x.to(torch.float32)
        y = torch.from_numpy(np.array(self.y_data[idx]))
        sample = (x, y)

        if self.transform:
            sample = self.transform(sample)

        return sample

In [9]:
hd_dataset_train = HeartDiseaseDataset(X_train, y_train)
first_data = hd_dataset_train[0]
features, labels = first_data
print(type(features), type(labels))
print(features, labels)

<class 'torch.Tensor'> <class 'torch.Tensor'>
tensor([-2.3466, -0.0225,  0.6223,  0.4052, -0.3945, -0.6796, -0.4774,  0.1325,
        -0.0051, -1.0397,  0.4054,  0.3462]) tensor(0)


In [10]:

if torch.backends.mps.is_available():
   mps_device = torch.device("mps")

batch_size = 32

#Load training set
train_loader = torch.utils.data.DataLoader(dataset=hd_dataset_train,
                                             batch_size=batch_size,
                                                shuffle=True,
                                                collate_fn=lambda x: [y.to(mps_device) for y in default_collate(x)])

# Load validation set
test_loader = torch.utils.data.DataLoader(dataset=HeartDiseaseDataset(X_val, y_val),
                                             batch_size=X_val.shape[0], shuffle=False,
                                             collate_fn=lambda x: [y.to(mps_device) for y in default_collate(x)])

In [11]:
from utils.net import LinearNN
from utils.train_test import train, test

random_seed = 1
torch.backends.cudnn.enabled = False
torch.manual_seed(random_seed)

# Initialize model and optimizer
model = LinearNN()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)


In [12]:
print(model)
for param in model.parameters():
    print(param.shape)

LinearNN(
  (fc1): Linear(in_features=12, out_features=30, bias=True)
  (activation): ReLU()
  (fc2): Linear(in_features=30, out_features=2, bias=True)
  (softmax): Softmax(dim=None)
)
torch.Size([30, 12])
torch.Size([30])
torch.Size([2, 30])
torch.Size([2])


In [13]:
# Train model
n_epochs = 200
train_losses_all = []
train_counter_all = []
test_losses = test(network=model, test_loader=test_loader)
test_counter = [i*len(train_loader.dataset) for i in range(n_epochs + 1)]
for epoch in range(1, n_epochs + 1):
  train_losses, train_counter = train(epoch=epoch, train_loader=train_loader, batch_size_train=batch_size,
        network=model, optimizer=optimizer, log_interval=10, save_dir='results/heart_disease/nn1')
  test_losses += test(network=model, test_loader=test_loader)
  train_losses_all += train_losses
  train_counter_all += train_counter


Test set: Avg. loss: 0.7105, Accuracy: 55/144 (38%)

torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12

  x = self.softmax(x)



Test set: Avg. loss: 0.7048, Accuracy: 57/144 (40%)

torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12

In [14]:
fig = px.line(x=train_counter_all, y=train_losses_all, title='Training Loss and Test Loss')
fig.add_trace(
    px.scatter(x=test_counter,
               y=test_losses,
               color_discrete_sequence=['red']).data[0])
fig.update_layout(
    xaxis_title='Number of training examples seen',
    yaxis_title='Loss',
    )
fig.show()

Now let's try a different network with more epochs and a different optimizer:

In [15]:
# Initialize model and optimizer
model2 = LinearNN(l2=5, l3=10)
optimizer2 = torch.optim.Adam(model2.parameters(), lr=0.01)
n_epochs = 200

print(model2)
for param in model2.parameters():
    print(param.shape)

LinearNN(
  (fc1): Linear(in_features=12, out_features=5, bias=True)
  (activation): ReLU()
  (fc2): Linear(in_features=5, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=2, bias=True)
  (softmax): Softmax(dim=None)
)
torch.Size([5, 12])
torch.Size([5])
torch.Size([10, 5])
torch.Size([10])
torch.Size([2, 10])
torch.Size([2])


In [16]:
train_losses_all2 = []
train_counter_all2 = []
test_losses2 = test(network=model2, test_loader=test_loader)
test_counter2 = [i*len(train_loader.dataset) for i in range(n_epochs + 1)]
for epoch in range(1, n_epochs + 1):
  train_losses2, train_counter2 = train(epoch=epoch, train_loader=train_loader, batch_size_train=batch_size,
        network=model2, optimizer=optimizer2, log_interval=10, save_dir='results/heart_disease/nn2/')
  test_losses2 += test(network=model2, test_loader=test_loader)
  train_losses_all2 += train_losses2
  train_counter_all2 += train_counter2


Test set: Avg. loss: 0.6953, Accuracy: 63/144 (44%)

torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])



Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.


size_average and reduce args will be deprecated, please use reduction='sum' instead.



torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([30, 12])
torch.Size([30, 2]) torch.Size([30])

Test set: Avg. loss: 0.6604, Accuracy: 103/144 (72%)

torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 12])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 1

In [17]:
fig = px.line(x=train_counter_all2, y=train_losses_all2, title='Training Loss and Test Loss')
fig.add_trace(
    px.scatter(x=test_counter2,
               y=test_losses2,
               color_discrete_sequence=['red']).data[0])
fig.update_layout(
    xaxis_title='Number of training examples seen',
    yaxis_title='Loss',
    )
fig.show()

Let us use the hold-out set to test the performances of our two models:

In [18]:
# Load holdout test set
test_data_projected = transform_famd(test_data)
hd_dateset_test = HeartDiseaseDataset(test_data_projected, test_data['HeartDisease'])

In [19]:
def test_accuracy(net):
    trainset, testset = hd_dataset_train, hd_dateset_test

    testloader = torch.utils.data.DataLoader(
        testset, batch_size=4, shuffle=False,
        collate_fn=lambda x: [y.to(mps_device) for y in default_collate(x)])

    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            features, labels = data
            outputs = net(features)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return correct / total

In [20]:
print(test_accuracy(model))
print(test_accuracy(model2))

0.82
0.79



Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.



In [21]:
total_params1 = sum(
	param.numel() for param in model.parameters()
)
total_params2 = sum(
    param.numel() for param in model2.parameters()
)
print(total_params1, total_params2)

452 147
