In [23]:
# importing necessary libraries 

# Basic data pre-processing libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

# For splitting dataset between train and test set .
from sklearn.model_selection import train_test_split 

# For applying the feature scaling on dataset
from sklearn.preprocessing import StandardScaler

# For oversampling the unbalanced dataset
from imblearn.over_sampling import SMOTE  

# Select important features from dataset
from sklearn.feature_selection import SelectFromModel

# RandomForest Library
from sklearn.ensemble import RandomForestClassifier

# XGBOOST Library 
import xgboost as xgb
from xgboost import XGBClassifier
import optuna

# Neural Network Library
import tensorflow as tf 
from tensorflow.keras import Sequential,regularizers 
from tensorflow.keras.layers import Dense, Dropout , BatchNormalization,LeakyReLU
from tensorflow.keras.losses import BinaryCrossentropy

# calculating Precision , Recall and F-1 score and other tools  for model evaluation
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, roc_auc_score ,mean_squared_error,make_scorer
from sklearn.model_selection import cross_val_score, GridSearchCV 

In [3]:
# Load the Dataset 
df = pd.read_csv('heart_attack_prediction_dataset.csv')

# Printing the dataset 
df.head()


Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,JLN3497,84,Male,383,163/100,73,1,1,1,0,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
4,GFO8847,66,Male,318,91/88,93,1,1,1,1,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0


In [4]:
# Info about dataset 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 26 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Patient ID                       8763 non-null   object 
 1   Age                              8763 non-null   int64  
 2   Sex                              8763 non-null   object 
 3   Cholesterol                      8763 non-null   int64  
 4   Blood Pressure                   8763 non-null   object 
 5   Heart Rate                       8763 non-null   int64  
 6   Diabetes                         8763 non-null   int64  
 7   Family History                   8763 non-null   int64  
 8   Smoking                          8763 non-null   int64  
 9   Obesity                          8763 non-null   int64  
 10  Alcohol Consumption              8763 non-null   int64  
 11  Exercise Hours Per Week          8763 non-null   float64
 12  Diet                

In [5]:
"""Setting column 'Blood Pressure' 
Splitting Between Diastolic and Systolic Blood Pressure"""

df['BP_Systolic'] = df['Blood Pressure'].apply(lambda x: x.split('/')[0])
df['BP_Diastolic'] = df['Blood Pressure'].apply(lambda x: x.split('/')[1])

In [6]:
# Converting object(here category present) into integer 
mapping = {"Unhealthy": -1, "Average": 0, "Healthy": 1}
df["Diet"] = df["Diet"].map(mapping)

In [7]:
# Converting object(here category present) into integer 
mapping = {"Male": -1, "Female": 0}
df["Sex"] = df["Sex"].map(mapping)

In [8]:
"""Converting 'Object' and 'Boolean' Datatype into int"""
cat_columns = ['BP_Systolic','BP_Diastolic']
df[cat_columns] = df[cat_columns].astype(int)

In [9]:
# Again check the datatypes of the columns
df.dtypes

Patient ID                          object
Age                                  int64
Sex                                  int64
Cholesterol                          int64
Blood Pressure                      object
Heart Rate                           int64
Diabetes                             int64
Family History                       int64
Smoking                              int64
Obesity                              int64
Alcohol Consumption                  int64
Exercise Hours Per Week            float64
Diet                                 int64
Previous Heart Problems              int64
Medication Use                       int64
Stress Level                         int64
Sedentary Hours Per Day            float64
Income                               int64
BMI                                float64
Triglycerides                        int64
Physical Activity Days Per Week      int64
Sleep Hours Per Day                  int64
Country                             object
Continent  

In [10]:
# Removing the target column from the features set
X = df.drop(['Patient ID', 'Blood Pressure', 'Country', 'Continent', 'Hemisphere', 'Heart Attack Risk'], axis = 1)
y = df['Heart Attack Risk']
print(X.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Age                              8763 non-null   int64  
 1   Sex                              8763 non-null   int64  
 2   Cholesterol                      8763 non-null   int64  
 3   Heart Rate                       8763 non-null   int64  
 4   Diabetes                         8763 non-null   int64  
 5   Family History                   8763 non-null   int64  
 6   Smoking                          8763 non-null   int64  
 7   Obesity                          8763 non-null   int64  
 8   Alcohol Consumption              8763 non-null   int64  
 9   Exercise Hours Per Week          8763 non-null   float64
 10  Diet                             8763 non-null   int64  
 11  Previous Heart Problems          8763 non-null   int64  
 12  Medication Use      

In [11]:
# Check class distribution(to check if my class distribution is balanced or not)
class_distribution = df['Heart Attack Risk'].value_counts()
print("Class Distribution:\n", class_distribution)

Class Distribution:
 Heart Attack Risk
0    5624
1    3139
Name: count, dtype: int64


In [15]:
# Oversample the minority class using SMOTE
sm = SMOTE(random_state=42)
X_oversampled, y_oversampled = sm.fit_resample(X, y)

In [16]:
# Again check the distribution of class
check_distribution = y_oversampled.value_counts()
print(check_distribution)

Heart Attack Risk
0    5624
1    5624
Name: count, dtype: int64


In [17]:
# Splitting Data into train set and test set 
X_train , X_test , y_train , y_test = train_test_split(X_oversampled , y_oversampled ,test_size=0.2, random_state= 42)

In [18]:
# Feature Scaling to standarize the features(such that mean is zero and variance is one.)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [24]:
# Doing the feature selection
selector = SelectFromModel(xgb.XGBClassifier()).fit(X_train, y_train)
selected_features = selector.get_support()
X_train_selected = X_train[:, selected_features]
X_test_selected = X_test[:, selected_features]

In [25]:
# Creating New feature lists according to top 10 features only (provided in the last of the notebook)[For XGboost/RandomForest only]
X_new = df.drop(['Patient ID', 'Blood Pressure', 'Country', 'Continent', 'Hemisphere', 'Heart Attack Risk','Sex','Cholesterol','Obesity','Previous Heart Problems','Medication Use','Stress Level','Sedentary Hours Per Day','Income','BMI','Triglycerides','Physical Activity Days Per Week','Sleep Hours Per Day','BP_Systolic','BP_Diastolic'], axis = 1)
y_new = df['Heart Attack Risk']
print(X_new.info())

# Oversample the minority class using SMOTE
sm = SMOTE(random_state=42)
X_new_oversampled, y_new_oversampled = sm.fit_resample(X_new, y_new)

# Again check the distribution of class
check_distribution = y_new_oversampled.value_counts()
print("Class-Distribution\n",check_distribution)

# Splitting Data into train set and test set 
X_new_train , X_new_test , y_new_train , y_new_test = train_test_split(X_new_oversampled , y_new_oversampled ,test_size=0.2, random_state= 42)

# Feature Scaling to standarize the features(such that mean is zero and variance is one.)
scaler = StandardScaler()
X_new_train = scaler.fit_transform(X_new_train)
X_new_test = scaler.transform(X_new_test)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      8763 non-null   int64  
 1   Heart Rate               8763 non-null   int64  
 2   Diabetes                 8763 non-null   int64  
 3   Family History           8763 non-null   int64  
 4   Smoking                  8763 non-null   int64  
 5   Alcohol Consumption      8763 non-null   int64  
 6   Exercise Hours Per Week  8763 non-null   float64
 7   Diet                     8763 non-null   int64  
dtypes: float64(1), int64(7)
memory usage: 547.8 KB
None
Class-Distribution
 Heart Attack Risk
0    5624
1    5624
Name: count, dtype: int64


In [1]:
# Making the XGboost model for our heart attack risk prediction (Model-1) [Train with selected features]
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 40),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5)
    }

    model = xgb.XGBClassifier(**params)
    model.fit(X_train_selected, y_train)
    y_pred = model.predict(X_test_selected)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

# Run Optuna to find the best hyperparameters
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params

# Train the model with the best hyperparameters
model = xgb.XGBClassifier(**best_params)
model.fit(X_train_selected, y_train)
y_pred = model.predict(X_test_selected)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy with tuned hyperparameters:', accuracy)


NameError: name 'optuna' is not defined

In [None]:
# Making the RandomForest model for our heart attack risk prediction (Model-2)
def objective1(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'random_state': trial.suggest_int('random_state', 40, 45)
    }

    rf_model = RandomForestClassifier(**params)  # Pass individual hyperparameters
    rf_model.fit(X_train, y_train)
    rf_pred = rf_model.predict(X_test)
    rf_accuracy = accuracy_score(y_test, rf_pred)
    return rf_accuracy

# Run Optuna to find the best hyperparameters
study = optuna.create_study(direction='maximize')
study.optimize(objective1, n_trials=10)

# Get the best hyperparameters
best_params = study.best_params

# Train the model with the best hyperparameters
model1 = RandomForestClassifier(**best_params)  # Pass individual hyperparameters
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy from Randomforest with tuned hyperparameters:', accuracy)


[I 2024-01-06 22:05:59,257] A new study created in memory with name: no-name-e087df2f-b458-48b1-b703-038f05c1d981
[I 2024-01-06 22:06:03,727] Trial 0 finished with value: 0.6528888888888889 and parameters: {'max_depth': 8, 'n_estimators': 455, 'random_state': 42}. Best is trial 0 with value: 0.6528888888888889.
[I 2024-01-06 22:06:06,331] Trial 1 finished with value: 0.6626666666666666 and parameters: {'max_depth': 10, 'n_estimators': 239, 'random_state': 42}. Best is trial 1 with value: 0.6626666666666666.
[I 2024-01-06 22:06:07,845] Trial 2 finished with value: 0.64 and parameters: {'max_depth': 4, 'n_estimators': 281, 'random_state': 45}. Best is trial 1 with value: 0.6626666666666666.
[I 2024-01-06 22:06:11,059] Trial 3 finished with value: 0.6551111111111111 and parameters: {'max_depth': 8, 'n_estimators': 358, 'random_state': 44}. Best is trial 1 with value: 0.6626666666666666.
[I 2024-01-06 22:06:11,671] Trial 4 finished with value: 0.6368888888888888 and parameters: {'max_depth

Accuracy from Randomforest with tuned hyperparameters: 0.6626666666666666


In [None]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
# print("sample_weight shape:", sample_weight_array.shape)
sample_weights = np.ones(y_train.shape[0])
# Calculate sample weights based on the class distribution
class_counts = np.bincount(y_train)
sample_weights[y_train == 1] = class_counts[0] / class_counts[1]

# Normalize the sample weights
sample_weights /= np.sum(sample_weights)


X_train shape: (8998, 22)
y_train shape: (8998,)


In [None]:
# Making the neural network model for our heart attack risk prediction (Model-3)

model2 = Sequential([
    Dense(units=128, activation='relu'),
    BatchNormalization(),
    Dropout(rate=0.2),  # Add dropout layer with a rate of 0.2
    Dense(units=64, activation='relu'),
    BatchNormalization(),
    Dropout(rate=0.2),  # Add dropout layer with a rate of 0.2
    Dense(units=32, activation='relu'),
    BatchNormalization(),
    Dropout(rate=0.2),  # Add dropout layer with a rate of 0.2
    Dense(units=1, activation='sigmoid')
])

model2.compile(loss= BinaryCrossentropy(from_logits=True ,reduction='none') ,optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),metrics=['accuracy'],sample_weight_mode='temporal')
model2.fit(X_train,y_train,epochs=100,batch_size=32, validation_data=(X_test, y_test), callbacks=[tf.keras.callbacks.EarlyStopping(patience=5)],sample_weight=sample_weights)

test_loss, test_acc = model2.evaluate(X_test, y_test, verbose=0)
print('\nTest accuracy:', test_acc*100)

# Make predictions
y_pred = model2.predict(X_test)
y_pred_binary = np.where(y_pred > 0.5, 1, 0)  # Convert predicted probabilities to binary labels

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred_binary)
precision = precision_score(y_test, y_pred_binary)
recall = recall_score(y_test, y_pred_binary)
f1 = f1_score(y_test, y_pred_binary, average='macro')

# Print the evaluation results
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', f1)

2024-01-06 22:02:16.957781: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-06 22:02:17.124854: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-06 22:02:17.125333: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

Epoch 1/100


  output, from_logits = _get_logits(
2024-01-06 22:02:18.930820: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2024-01-06 22:02:20.544213: I external/local_xla/xla/service/service.cc:168] XLA service 0x7fde692288a0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-01-06 22:02:20.544276: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3050 Laptop GPU, Compute Capability 8.6
2024-01-06 22:02:20.553550: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-01-06 22:02:20.582698: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
I0000 00:00:1704558740.662751   18652 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100

Test accuracy: 65.68889021873474
Accuracy: 0.6568888888888889
Precision: 0.7193627450980392
Recall: 0.5194690265486726
F1 score: 0.6505089250000604


In [None]:
# Evaluating the Neural Network Model
test_loss, test_acc = model2.evaluate(X_test, y_test, verbose=0)
print('\nTest accuracy:', test_acc*100)

# Make predictions
y_pred = model2.predict(X_test)
y_pred_binary = np.where(y_pred > 0.5, 1, 0)  # Convert predicted probabilities to binary labels

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred_binary)
precision = precision_score(y_test, y_pred_binary)
recall = recall_score(y_test, y_pred_binary)
f1 = f1_score(y_test, y_pred_binary, average='macro')

# Print the evaluation results
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', f1)


Test accuracy: 65.68889021873474
Accuracy: 0.6568888888888889
Precision: 0.7193627450980392
Recall: 0.5194690265486726
F1 score: 0.6505089250000604


In [None]:
# Perform 5-fold cross-validation for XGBOOST model
scores = cross_val_score(model, X_train_selected, y_train, cv=5)
print('Cross-validation scores:', scores)

# Calculate the mean cross-validation score
mean_score = np.mean(scores)
print('Mean cross-validation score:', mean_score)

# Make predictions
y_pred = model.predict(X_test_selected)
y_pred_binary = np.where(y_pred > 0.5, 1, 0)  # Convert predicted probabilities to binary labels

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred_binary)
precision = precision_score(y_test, y_pred_binary)
recall = recall_score(y_test, y_pred_binary)
f1 = f1_score(y_test, y_pred_binary, average='macro')

# Print the evaluation results
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', f1)

Cross-validation scores: [0.62555556 0.62666667 0.61277778 0.6364647  0.61756531]
Mean cross-validation score: 0.6238060033351862
Accuracy: 0.6222222222222222
Precision: 0.6639344262295082
Recall: 0.5017699115044247
F1 score: 0.6168673714036617


In [None]:
# Perform 5-fold cross-validation for Randomforest model
scores = cross_val_score(model1, X_train, y_train, cv=5)
print('Cross-validation scores:', scores)

# Calculate the mean cross-validation score
mean_score = np.mean(scores)
print('Mean cross-validation score:', mean_score)

# Make predictions
y_pred = model1.predict(X_test)
y_pred_binary = np.where(y_pred > 0.5, 1, 0)  # Convert predicted probabilities to binary labels

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred_binary)
precision = precision_score(y_test, y_pred_binary)
recall = recall_score(y_test, y_pred_binary)
f1 = f1_score(y_test, y_pred_binary, average='macro')

# Print the evaluation results
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', f1)

Cross-validation scores: [0.64055556 0.65611111 0.65111111 0.67982212 0.64035575]
Mean cross-validation score: 0.6535911308751776
Accuracy: 0.6644444444444444
Precision: 0.6979936642027456
Recall: 0.5849557522123894
F1 score: 0.66244887553499


In [None]:
#Getting the most important features while training with XGBOOST.

# Assuming X_train_selected was created by indexing columns from the original DataFrame (df)
selected_feature_indices = np.where(selector.get_support())[0]
selected_feature_names = df.columns[selected_feature_indices]

# Get feature importance scores from the model
feature_importances = model.feature_importances_

# Create a dataframe of feature importances
feature_importances_df = pd.DataFrame({'Importance': feature_importances, 'Feature': selected_feature_names})

# Sort features by importance
feature_importances_df = feature_importances_df.sort_values('Importance', ascending=False)

# Print the top 10 most important features
print('Top 10 Most Important Features:')
print(feature_importances_df.head(10))


Top 10 Most Important Features:
   Importance                  Feature
5    0.209388      Alcohol Consumption
3    0.165799           Family History
6    0.151671  Exercise Hours Per Week
1    0.133247               Heart Rate
4    0.126508                  Smoking
7    0.093092                     Diet
0    0.085519                      Age
2    0.034777                 Diabetes


In [None]:
#Getting the most important features while training with RandomForest.(When train with selected features [X_train_selected])

# Assuming X_train_selected was created by indexing columns from the original DataFrame (df)
selected_feature_indices = np.where(selector.get_support())[0]
selected_feature_names = df.columns[selected_feature_indices]

# Get feature importance scores from the model
feature_importances = model1.feature_importances_

# Create a dataframe of feature importances
feature_importances_df = pd.DataFrame({'Importance': feature_importances, 'Feature': selected_feature_names})

# Sort features by importance
feature_importances_df = feature_importances_df.sort_values('Importance', ascending=False)

# Print the top 10 most important features
print('Top 10 Most Important Features:')
print(feature_importances_df.head(10))
