In [None]:
# Import dependencies
import pandas as pd
import tensorflow as tf
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import classification_report

!pip install -q -U keras-tuner
import keras_tuner as kt

Using TensorFlow backend


In [None]:
# Load the cardiovascular dataset, which is published as a CSV export of a Google Sheet
CVD_DATA_URL = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSDchXr1EhgCSsxlxJ3lWPhh1kT5EJS3yv4DJ2YLeMIC3y4uq-Pp4EQknrs9zAiaI3ulne2Jyi6gR6G/pub?gid=602879552&single=true&output=csv"
cvd_df = pd.read_csv(CVD_DATA_URL)

# Preview the first rows to confirm the load worked
cvd_df.head()

Unnamed: 0,general_health,checkup,exercise,heart_disease,skin_cancer,other_cancer,depression,diabetes,arthritis,sex,age_category,height_cm,weight_kg,bmi,smoking_history,alcohol_consumption,fruit_consumption,green_vegetables_consumption,friedpotato_consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150,32.66,14.54,Yes,0,30,16,12
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165,77.11,28.29,No,0,30,0,4
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163,88.45,33.47,No,4,12,3,16
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180,93.44,28.73,No,0,30,30,8
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191,88.45,24.37,Yes,0,8,4,0


# Data Preprocessing

In [None]:
# Split the columns by dtype: text columns are one-hot encoded later, numeric columns are scaled
categorical_cols = cvd_df.dtypes[cvd_df.dtypes == 'object'].index.tolist()
numeric_cols = cvd_df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Per-column mean and standard deviation of the numeric features
numeric_values = cvd_df[numeric_cols]
means = numeric_values.mean()
stds = numeric_values.std()

# A value is treated as an outlier when it lies more than this many
# standard deviations away from its column mean
threshold_std = 1

# Flag every row that has at least one outlying numeric value
deviation_from_mean = numeric_values.sub(means).abs()
outliers = deviation_from_mean.gt(threshold_std * stds).any(axis=1)

# Keep every 'Yes' row, and only those 'No' rows that have no outlying value.
# NOTE(review): this filter depends on the label and runs before the train/test split,
# so the held-out test rows are filtered in the same way as the training rows.
has_heart_disease = cvd_df['heart_disease'].eq('Yes')
filtered_rows = has_heart_disease | ~outliers

# Apply the mask to obtain the filtered DataFrame
cvd_df_filtered = cvd_df.loc[filtered_rows]

# Class balance of the target after filtering
cvd_df_filtered['heart_disease'].value_counts()

No     55566
Yes    24081
Name: heart_disease, dtype: int64

In [None]:
# One-hot encode the categorical columns.
# dtype=int keeps the dummy columns as 0/1 integers on every pandas version: from pandas 2.0
# the default dummy dtype is bool, and a frame that mixes bool and float columns converts to
# an object array, which Keras may fail to turn into a tensor when the frame is passed to fit().
encoded_df = pd.get_dummies(cvd_df_filtered, columns=categorical_cols, drop_first=False, dtype=int)

# Standardise the numeric columns with StandardScaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so test-set
# statistics influence the scaling of the training data - consider fitting on X_train only.
scaler = StandardScaler()
encoded_df[numeric_cols] = scaler.fit_transform(encoded_df[numeric_cols])
encoded_df.head()

Unnamed: 0,height_cm,weight_kg,bmi,alcohol_consumption,fruit_consumption,green_vegetables_consumption,friedpotato_consumption,general_health_Excellent,general_health_Fair,general_health_Good,...,age_category_45-49,age_category_50-54,age_category_55-59,age_category_60-64,age_category_65-69,age_category_70-74,age_category_75-79,age_category_80+,smoking_history_No,smoking_history_Yes
1,-0.83226,-0.393566,-0.00059,-0.556508,0.413674,-1.25037,-0.166917,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,1.184427,0.70489,0.095604,-0.556508,0.413674,1.515171,0.547281,0,0,0,...,0,0,0,0,0,0,1,0,1,0
6,0.512198,-0.881918,-1.213951,-0.556508,-0.472114,-0.512893,-0.881115,0,1,0,...,0,0,0,1,0,0,0,0,0,1
8,-1.101152,-0.698954,-0.182048,-0.556508,-0.725196,-0.144154,-0.166917,0,1,0,...,0,0,0,0,1,0,0,0,0,1
9,-1.101152,0.583139,1.394228,-0.556508,-0.725196,-0.144154,-0.702566,0,1,0,...,0,0,0,0,0,1,0,0,1,0


In [None]:
# Target: the 'heart_disease_Yes' indicator column
y = encoded_df['heart_disease_Yes']

# Features: every column except the two indicator columns derived from the target
target_dummy_cols = ['heart_disease_Yes', 'heart_disease_No']
X = encoded_df.drop(target_dummy_cols, axis=1)

# Hold out a test set (default split proportions, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Randomly oversample the training portion only, so that both classes end up equally frequent
ROS = RandomOverSampler(random_state=78)
X_train_resampled, y_train_resampled = ROS.fit_resample(X_train, y_train)

# Both classes should now show the same count
y_train_resampled.value_counts()

1    41653
0    41653
Name: heart_disease_Yes, dtype: int64

# Neural Network Optimisation 2

In [None]:
# Hyperparameters of the 3rd best trial according to Keras Tuner, chosen because it uses
# only 2 layers and therefore keeps the model less complex.
# Only 'activation', 'num_layers' and units_0 .. units_{num_layers-1} are read further down.
hyperparameters = dict(
    activation='relu',
    first_units=26,
    num_layers=2,
    units_0=16,
    units_1=6,
    units_2=11,
    units_3=6,
    units_4=26,
)

# The tuner bookkeeping keys contain '/', so they cannot be written as keyword arguments
hyperparameters.update({
    'tuner/epochs': 20,
    'tuner/initial_epoch': 0,
    'tuner/bracket': 0,
    'tuner/round': 0,
})

In [None]:
# Defining new model for the best model
def build_best_model(input_shape, activation, num_layers, units):
    """Build and compile a fully connected binary classifier.

    Args:
        input_shape: Number of input features per sample.
        activation: Activation function used by every hidden layer.
        num_layers: Number of hidden Dense layers.
        units: Either a single width that is applied to every hidden layer, or an
            iterable holding exactly ``num_layers`` widths, one per hidden layer
            (for example ``[16, 6]``).

    Returns:
        A compiled Keras Sequential model with a single sigmoid output unit,
        binary cross-entropy loss, the "adam" optimizer and an accuracy metric.

    Raises:
        ValueError: If ``units`` is an iterable whose length differs from ``num_layers``.
    """
    # A scalar width is not iterable, so it falls back to "same width for every layer"
    try:
        layer_units = list(units)
    except TypeError:
        layer_units = [units] * num_layers
    else:
        if len(layer_units) != num_layers:
            raise ValueError(
                f"units has {len(layer_units)} entries but num_layers is {num_layers}"
            )

    best_nn = tf.keras.models.Sequential()

    # Input layer
    best_nn.add(tf.keras.layers.Input(shape=(input_shape,)))

    # Hidden layers, one Dense layer per requested width
    for width in layer_units:
        best_nn.add(tf.keras.layers.Dense(width, activation=activation))

    # Output layer: one sigmoid unit for binary classification
    best_nn.add(tf.keras.layers.Dense(1, activation="sigmoid"))

    # Compile the model
    best_nn.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["accuracy"]
    )

    return best_nn

In [None]:
# Settings used to rebuild the chosen model
input_shape = X_train_resampled.shape[1]  # number of feature columns
activation = hyperparameters['activation']
num_layers = hyperparameters['num_layers']

# Add up units_0 .. units_{num_layers-1} (16 + 6 = 22 for the values above).
# NOTE(review): this sum is later passed to build_best_model as the width of *every* hidden
# layer, so the rebuilt network is 22/22 rather than the per-layer sizes in `hyperparameters`,
# and 'first_units' is never read - confirm this is the intended architecture.
total_units = sum(hyperparameters[f'units_{layer}'] for layer in range(num_layers))

# Early stopping: halt once val_loss has not improved for 10 epochs and
# roll the model back to the weights of its best epoch
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=10, restore_best_weights=True,
)

In [None]:
# Build the best model for retraining.
# Seed the Python, NumPy and TensorFlow random generators first so that weight
# initialisation (and the subsequent training run) is repeatable between executions.
tf.keras.utils.set_random_seed(78)
best_model = build_best_model(input_shape, activation, num_layers, total_units)

In [None]:
# Retrain the best model.
# Early stopping (with restore_best_weights) selects the epoch that looks best on the
# validation data, so the test set must not be used for validation - otherwise the test
# metrics reported afterwards are optimistically biased. Carve a validation subset out of
# the training data instead, and oversample only the portion actually used for fitting so
# that no duplicated row appears in both subsets.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=78
)
X_fit_resampled, y_fit_resampled = ROS.fit_resample(X_fit, y_fit)

# Keep the History object so the training / validation curves can be inspected later
history = best_model.fit(
    X_fit_resampled,
    y_fit_resampled,
    epochs=20,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20


<keras.src.callbacks.History at 0x7d35807e48e0>

Though only 2 layers have been utilised in this model, it would appear that overfitting is still occurring.

In [None]:
# Score the trained model on the held-out test data (returns loss first, then accuracy)
test_metrics = best_model.evaluate(X_test, y_test, verbose=2)
model_loss, model_accuracy = test_metrics
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

623/623 - 1s - loss: 0.2167 - accuracy: 0.9129 - 786ms/epoch - 1ms/step
Loss: 0.21667662262916565, Accuracy: 0.9129168391227722


In [None]:
# Print model summary: one row per layer with its output shape and parameter count,
# followed by the total, trainable and non-trainable parameter counts
best_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 22)                1034      
                                                                 
 dense_1 (Dense)             (None, 22)                506       
                                                                 
 dense_2 (Dense)             (None, 1)                 23        
                                                                 
Total params: 1563 (6.11 KB)
Trainable params: 1563 (6.11 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Model output for every test sample (the final layer is a single sigmoid unit)
predicted_probabilities = best_model.predict(X_test)

# Turn the sigmoid outputs into 0/1 class labels using a 0.5 threshold,
# the usual cut-off for a binary classifier with a sigmoid output
y_pred = (predicted_probabilities > 0.5).astype(np.int32)

# Per-class precision, recall and f1-score on the test set
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94     13913
           1       0.89      0.81      0.85      5999

    accuracy                           0.91     19912
   macro avg       0.91      0.88      0.89     19912
weighted avg       0.91      0.91      0.91     19912



- For class 0 (no heart disease), precision decreased by 1% from the previous optimisation whilst recall increased by 1%. The f1-score has therefore remained the same.
- For class 1 (heart disease), precision increased by 2% whilst recall decreased by 1%, resulting in the same f1-score.
- Overall accuracy has remained the same compared to the previous model, suggesting that although model complexity was reduced, no significant further optimisation was achieved.
- Though our final accuracy value of 91% is considered good for a predictive model, we believe further scope for improvement still exists. The issue of overfitting still persists and would need to be addressed before such a model can be deployed to make life-altering predictions.
- In the future, we plan to explore advanced regularisation techniques to further combat overfitting, seek methods to balance class distribution without sacrificing dataset richness, expand our data sources beyond the United States, and place increased emphasis on feature engineering and ongoing model evaluation to enhance the predictive capabilities.

In [None]:
# Save the trained model to an .h5 file and download it from the Colab runtime
from google.colab import files

model_filename = 'cvd_nn_model3.h5'
best_model.save(model_filename)
files.download(model_filename)

  saving_api.save_model(


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>