# **Cardiovascular Disease Prediction using Artificial Neural Network (ANN)**

This model is designed to predict the likelihood of cardiovascular disease based on a variety of health-related features. The dataset includes key attributes such as:

1. Height (cm)
2. Weight (kg)
3. Systolic Blood Pressure (ap_hi)
4. Diastolic Blood Pressure (ap_lo)
5. Age in Years
6. Gender
7. Cholesterol Levels
8. Glucose Levels
9. Smoking Habits
10. Alcohol Consumption
11. Physical Activity

The model is built using an Artificial Neural Network (ANN) and leverages these features to predict the presence or absence of cardiovascular disease (the target variable, cardio). The features were carefully processed, including the handling of outliers and transformations to ensure the best possible model performance. The goal is to assist in the early detection of cardiovascular conditions, enabling better prevention and management strategies.

This project is a step towards integrating machine learning into healthcare, providing a tool that can help in predicting and potentially reducing the risk of cardiovascular diseases through data-driven insights.

In [None]:
#Importing Important Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/cardiovascular-disease-dataset/health_data.csv')

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# (rows, columns) of the data before any cleaning.
df.shape

# **Cleaning The Data**

In [None]:
# 'Unnamed: 0' and 'id' are dropped so they are not used as model inputs.
df = df.drop(columns=['Unnamed: 0', 'id'])

In [None]:
# Preview the frame again after the column drop.
df.head()

In [None]:
# Summary statistics per numeric column (count, mean, std, min, quartiles, max).
df.describe()

In [None]:
# Restrict systolic (ap_hi) and diastolic (ap_lo) blood pressure to a
# biologically relevant range.

# Step 1: drop rows whose readings are non-positive or >= 300 (presumably
# data-entry errors). `.copy()` yields an independent frame, so the column
# assignments below do not write into a slice of the original DataFrame.
valid_bp = (
    (df['ap_hi'] > 0) & (df['ap_lo'] > 0)
    & (df['ap_hi'] < 300) & (df['ap_lo'] < 300)
)
df = df[valid_bp].copy()

# Step 2: cap the remaining readings -- systolic to [90, 180] and diastolic to
# [60, 120]. Series.clip is the vectorised equivalent of a per-row lambda.
df['ap_hi'] = df['ap_hi'].clip(lower=90, upper=180)
df['ap_lo'] = df['ap_lo'].clip(lower=60, upper=120)

print(df[['ap_hi', 'ap_lo']].describe())

In [None]:
# Derive age in years and cap the other continuous features to fixed ranges.

# `age` is divided by 365, i.e. it is treated as a number of days (leap years
# are ignored). The result is capped to [40, 80] years.
df['age_years'] = (df['age'] / 365).clip(lower=40, upper=80)

# Cap weight to [40, 120] and height to [140, 190]; the notebook intro gives
# the units as kg and cm. Series.clip replaces the slower row-wise lambdas.
df['weight'] = df['weight'].clip(lower=40, upper=120)
df['height'] = df['height'].clip(lower=140, upper=190)

print(df[['age_years', 'weight']].describe())

In [None]:
# (rows, columns) after the range filtering and capping steps.
df.shape

In [None]:
# The raw `age` column is superseded by `age_years`, so remove it.
df = df.drop(columns=['age'])

In [None]:
# Preview the frame after `age` was replaced by `age_years`.
df.head()

In [None]:
# Visual check for null values: the heatmap plots the boolean mask from
# df.isnull(), so any missing cell appears in a contrasting colour.
sns.heatmap(df.isnull(), yticklabels=False, cmap='viridis')

In [None]:
# Outlier filter: keep only rows within +/- n_std standard deviations of the
# column mean (3 by default).

def remove_outliers(df, column_name, n_std=3):
    """Return the rows of ``df`` whose ``column_name`` lies near the column mean.

    A row is kept when its value is within ``n_std`` sample standard
    deviations of the mean of ``df[column_name]`` (bounds inclusive). The
    number of outliers and the shapes before and after filtering are printed.
    ``df`` itself is not modified.

    Args:
        df: DataFrame to filter.
        column_name: Name of the numeric column to test.
        n_std: Half-width of the accepted band, in standard deviations.
            Defaults to 3, the original hard-coded value.

    Returns:
        A filtered DataFrame containing only the rows inside the band.

    Raises:
        ValueError: If ``column_name`` is not a column of ``df``.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in the DataFrame.")

    values = df[column_name]
    mean_value = values.mean()
    std_value = values.std()

    # Compute the band once and reuse it for both the count and the filter.
    lower = mean_value - n_std * std_value
    upper = mean_value + n_std * std_value

    n_outliers = int(((values < lower) | (values > upper)).sum())
    print(f"\nNumber of outliers in the '{column_name}' column: {n_outliers}")

    # `between` is inclusive on both ends, matching `>= lower & <= upper`.
    df_filtered = df[values.between(lower, upper)]

    print(f"Original DataFrame shape: {df.shape}")
    print(f"DataFrame shape after removing outliers: {df_filtered.shape}")

    return df_filtered

In [None]:
features = ['height', 'weight', 'ap_hi', 'ap_lo', 'age_years']

# Apply the outlier filter to every continuous feature in turn. Each pass
# starts from the previous pass's result; filtering the original `df` on every
# iteration would leave only the last feature's filter in effect.
df_filtered = df
for feature in features:
    df_filtered = remove_outliers(df_filtered, feature)

# Detach the result from `df` so later column assignments act on an
# independent frame rather than on a filtered slice.
df_filtered = df_filtered.copy()

In [None]:
# Preview the first rows of the outlier-filtered frame.
df_filtered.head()

# **Exploring The Data**

In [None]:
# Distribution of each continuous feature: 10-bin histogram with a KDE overlay.
for feature in features:
    sns.histplot(data=df_filtered, x=feature, bins=10, kde=True)
    plt.show()

In [None]:
# Distribution of each continuous feature, split by the target variable.
features = ['height', 'weight', 'ap_hi', 'ap_lo', 'age_years']

# Split once instead of re-filtering the frame for every feature.
no_cardio = df_filtered[df_filtered['cardio'] == 0]
with_cardio = df_filtered[df_filtered['cardio'] == 1]

for feature in features:
    plt.figure(figsize=(10, 6))
    # `fill=True` replaces the deprecated `shade=True` keyword of kdeplot.
    sns.kdeplot(no_cardio[feature], label='No Cardio', fill=True)
    sns.kdeplot(with_cardio[feature], label='Cardio', fill=True)
    plt.title(f'Distribution of {feature}')
    plt.legend()
    plt.show()

In [None]:
# Count of each discrete feature's levels, split by the target variable.
binary_features = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']

for feature in binary_features:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.countplot(data=df_filtered, x=feature, hue='cardio', ax=ax)
    ax.set_title(f'Distribution of {feature} by Cardio')
    ax.legend(title='Cardio')
    plt.show()

In [None]:
# Class balance of the target (0 / 1) in `df_filtered`, the frame that the
# exploration plots and the model use. Counting on the unfiltered `df` would
# describe rows that are not part of that data.
# sort_index() pins the bar order to 0, 1 so it matches the tick labels below.
cardio_counts = df_filtered['cardio'].value_counts().sort_index()

plt.figure(figsize=(8, 6))
sns.barplot(x=cardio_counts.index, y=cardio_counts.values, palette='viridis')

plt.title('Diagnosed with CD vs Not Diagnosed with CD')
plt.xlabel('Cardiovascular Disease (0 = No, 1 = Yes)')
plt.ylabel('Number of Cases')
plt.xticks(ticks=[0, 1], labels=['Not Diagnosed', 'Diagnosed'])

plt.show()

In [None]:
# Multicollinearity check: pairwise correlations between all columns, drawn as
# an annotated heatmap on a fixed [-1, 1] colour scale centred at 0.
correlation_matrix = df_filtered.corr()

fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(
    correlation_matrix,
    ax=ax,
    vmin=-1,
    vmax=1,
    center=0,
    cmap='coolwarm',
    annot=True,
    fmt='.2f',
)
ax.set_title('Correlation Heatmap')
plt.show()

# **Training The Model Using ANN**

In [None]:
# Scale the continuous features to [0, 1] before feeding them to the ANN.
# Work on an explicit copy: `df_filtered` comes from boolean filtering, and
# assigning columns into such a slice is the chained-assignment pattern pandas
# warns about.
# NOTE(review): the scaler is fit on all rows here, before the train/test
# split further down, so test-set min/max values influence the scaling.
# Consider fitting on the training rows only.
df_filtered = df_filtered.copy()
scaler = MinMaxScaler()
df_filtered[features] = scaler.fit_transform(df_filtered[features])

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from keras.metrics import Precision, Recall

In [None]:
# Separate the target from the predictors, then hold out 20% of the rows as a
# test set (fixed seed so the split is reproducible).
y = df_filtered['cardio']
X = df_filtered.drop(columns='cardio')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Build the network: three ReLU hidden layers (80 -> 48 -> 20 units) and a
# single sigmoid output for binary classification. Per the original author's
# note, the layer sizes and learning rate are the values reported as best by
# the Keras tuner.
model = Sequential([
    Dense(units=80, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(units=48, activation='relu'),
    Dense(units=20, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])

model.compile(
    optimizer=Adam(learning_rate=0.00042972),
    loss='binary_crossentropy',
    metrics=['accuracy', Precision(), Recall()],
)

In [None]:
# Train for 10 epochs in mini-batches of 32, with 20% of the training rows
# held out by Keras for validation.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
)

In [None]:
# Evaluate on the held-out test set; the four values are unpacked as the loss
# followed by the compiled metrics (accuracy, precision, recall).
test_loss, test_accuracy, test_precision, test_recall = model.evaluate(X_test, y_test)

# Round the predicted probabilities to hard 0/1 labels and recompute the
# metrics with scikit-learn.
y_pred = np.round(model.predict(X_test)).astype(int)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print every metric to four decimal places.
report = [
    ('Test Loss', test_loss),
    ('Test Accuracy', test_accuracy),
    ('Test Precision', test_precision),
    ('Test Recall', test_recall),
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
]
for label, value in report:
    print(f'{label}: {value:.4f}')

In [None]:
# Confusion matrix of true vs. predicted labels on the test set.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
disp.plot(cmap='Blues')
plt.title("Confusion Matrix")
plt.show()

# **Code For Hyperparameter Tuning**

**The code below can be used for hyperparameter tuning to search for the optimal hyperparameters. Uncomment it to continue tuning.**

In [None]:
# import keras_tuner as kt

# def build_model(hp):
#     model = Sequential()
#     # Tune the number of units in the first Dense layer
#     model.add(Dense(units=hp.Int('units1', min_value=16, max_value=128, step=16), activation='relu', input_shape=(X_train.shape[1],)))
#     model.add(Dense(units=hp.Int('units2', min_value=8, max_value=64, step=8), activation='relu'))
#     model.add(Dense(units=hp.Int('units3', min_value=4, max_value=32, step=4), activation='relu'))
#     model.add(Dense(1, activation='sigmoid'))

#     model.compile(optimizer=Adam(learning_rate=hp.Float('learning_rate', min_value=1e-4, max_value=1e-1, sampling='LOG')),
#                   loss='binary_crossentropy',
#                   metrics=['accuracy'])
#     return model

In [None]:
# tuner = kt.Hyperband(
#     build_model,
#     objective='val_accuracy',
#     max_epochs=50,
#     hyperband_iterations=2,
#     directory='my_dir',
#     project_name='heart_disease_detection'
# )

# tuner.search(X_train, y_train, epochs=50, validation_split=0.2)

In [None]:
# best_model = tuner.get_best_models(num_models=1)[0]

**Note: You can use this notebook to further optimize this model.**