# Import dependencies

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, InputLayer, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
from tensorflow.keras.utils import to_categorical
import numpy as np

# Load dataset
Task 1

In [2]:
data = pd.read_csv('heart_failure_clinical_records_dataset.csv')
data.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


# Inspect columns and datatypes
Task 2

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.5 KB


# Identify classes of labels column and count how many in each class
Task 3

In [4]:
print('Classes and number of values in `data`: ', Counter(data['DEATH_EVENT']))

Classes and number of values in `data`:  Counter({0: 203, 1: 96})


# Extract label columns into pandas Series
Task 4

**DO NOT** use `y = data[['DEATH_EVENT']]` as it will generate a *pandas DataFrame* instead!

In [5]:
y = data['DEATH_EVENT']

# Extract features columns
Task 5

Select specific columns by name, not by position; using column names is safer and clearer, especially if the column order changes.

(So don't use `X = data.iloc[:,0:-1]`)

In [6]:
print(data.columns)

Index(['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time',
       'DEATH_EVENT'],
      dtype='object')


In [7]:
X = data[[
    'age',
    'anaemia',
    'creatinine_phosphokinase',
    'diabetes',
    'ejection_fraction',
    'high_blood_pressure',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'sex',
    'smoking',
    'time'
]]

# One-hot encode relevant columns in X
Task 6

But since `data` already has the categorical columns (`'anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking'`) as 1s and 0s (integers), you do not need to use one-hot encoding with `pd.get_dummies()` for those columns. We can skip task 6 in that case, since the features are already in a numeric format suitable for machine learning models.

# `train_test_split` data
Task 7

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 420)

# Scaling features
Task 8

`ColumnTransformer()` lets you apply different preprocessing steps to specific columns in your data.

When you fit and transform your data with it, each transformation is applied to its specified columns, and the results are combined into a single output array.

In [9]:
ct = ColumnTransformer([(
    'numeric',
    # The string `'numeric'` in the `ColumnTransformer` is just a name or label for that transformation step.
    # It helps you identify what the transformation is doing,
    # especially if you have multiple steps (like scaling numeric features and encoding categorical features). 
    # It does not affect the transformation itself—it's only for reference and readability.
    StandardScaler(),
    [
        'age',
        'creatinine_phosphokinase',
        'ejection_fraction',
        'platelets',
        'serum_creatinine',
        'serum_sodium',
        'time'
    ]
)])

# Apply `ct` on `X_train`
Task 9

In [10]:
X_train = ct.fit_transform(X_train)

# Apply `ct` on `X_test`
Task 10

In [11]:
X_test = ct.transform(X_test)

# Initialize `LabelEncoder`
Task 11

In [12]:
le = LabelEncoder()

# Fit `le` to `y_train`
Task 12

`.astype(str)` converts the data in my_labels to strings. This is useful if your labels are not already strings, or if they are a mix of types (like integers and strings).

`LabelEncoder` expects the input to be a consistent type. By converting to string, you make sure all values are treated the same way, which helps avoid errors during encoding.

In [13]:
y_train = le.fit_transform(y_train.astype(str))

# Transform `y_test` using `le`
Task 13

In [14]:
y_test = le.transform(y_test.astype(str))

# Transform encoded training labels y_train into a binary vector
Task 14

In [15]:
y_train = to_categorical(y_train)

# Transform encoded test labels y_test into a binary vector
Task 15

In [16]:
y_test = to_categorical(y_test)

# Setting architecture
Task 16

In [17]:
model = Sequential()

# Add input layer
Task 17

In [18]:
model.add(
    InputLayer(
        shape = (X_train.shape[1], )
    )
)

# Add hidden layers with ReLU
Task 18

In [19]:
model.add(Dropout(0.1))
model.add(
    Dense(
        512,
        activation = 'relu'
    )
)
model.add(Dropout(0.1))
model.add(
    Dense(
        64,
        activation = 'relu'
    )
)
model.add(Dropout(0.1))
model.add(
    Dense(
        16,
        activation = 'relu'
    )
)

# Add an output layer with softmax
Task 19

In [20]:
model.add(
    Dense(
        y_train.shape[1],
        activation = 'softmax'
    )
)

# Compile the model instance `model`
Task 20

In [21]:
model.compile(
    loss = 'categorical_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy']
)

# Fit `model`
Task 21

In [22]:
early_stop = EarlyStopping(monitor='val_loss', patience=100)

model.fit(
    X_train,
    y_train,
    epochs = 500,
    batch_size = 16, 
    validation_split=0.2,
    callbacks=[early_stop]
)

Epoch 1/500
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7185 - loss: 0.6017 - val_accuracy: 0.6875 - val_loss: 0.5497
Epoch 2/500
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6962 - loss: 0.4739 - val_accuracy: 0.7292 - val_loss: 0.5073
Epoch 3/500
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7829 - loss: 0.4578 - val_accuracy: 0.7708 - val_loss: 0.5046
Epoch 4/500
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8170 - loss: 0.4461 - val_accuracy: 0.7500 - val_loss: 0.4619
Epoch 5/500
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8715 - loss: 0.3775 - val_accuracy: 0.7500 - val_loss: 0.4770
Epoch 6/500
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7999 - loss: 0.4299 - val_accuracy: 0.7708 - val_loss: 0.4754
Epoch 7/500
[1m12/12[0m [32m━━━

<keras.src.callbacks.history.History at 0x3209177d0>

# Evaluate trained model
Task 22

In [23]:
loss, acc = model.evaluate(X_test, y_test)
print('Final loss: ', loss)
print('Final accuracy: ', acc)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8264 - loss: 0.4884 
Final loss:  0.4576593339443207
Final accuracy:  0.8333333134651184


# Get predictions for the test data `X_test`
Task 23

In [24]:
y_estimate = model.predict(X_test)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step


# Use `argmax()` to select the indices of the true classes for each label encoding in `y_estimate`
Task 24

If you do not specify the `axis` in `np.argmax(y_estimate, axis=1)`, NumPy will use the default, which is `axis=None`. This means it will find the index of the maximum value in the **flattened** array, not along each row.

For one-hot encoded labels, you want the index of the maximum value **for each row** (each sample), which is why you use `axis=1`. If you leave out `axis=1`, you will get a single value, not a vector of class labels for each sample.

In [25]:
y_estimate = np.argmax(y_estimate, axis = 1)

# Use `argmax()` to select the indices of the true classes for each label encoding in `y_test`
Task 25

If you do not specify the `axis` in `np.argmax(y_test, axis=1)`, NumPy will use the default, which is `axis=None`. This means it will find the index of the maximum value in the **flattened** array, not along each row.

For one-hot encoded labels, you want the index of the maximum value **for each row** (each sample), which is why you use `axis=1`. If you leave out `axis=1`, you will get a single value, not a vector of class labels for each sample.

In [26]:
y_true = np.argmax(y_test, axis = 1)

# Display classification_report
Task 26

In [27]:
print(classification_report(y_true, y_estimate))

              precision    recall  f1-score   support

           0       0.87      0.87      0.87        39
           1       0.76      0.76      0.76        21

    accuracy                           0.83        60
   macro avg       0.82      0.82      0.82        60
weighted avg       0.83      0.83      0.83        60



In [28]:
model.summary()