**In this notebook, we classify how prone a person is to a heart attack, given some crucial parameters about their heart.**
I have used 5 algorithms along with grid search and cross-validation to find the best hyperparameters for each one, plus a custom neural network trained with fixed settings.

Algorithms used are:
1. SVM
2. Decision trees with pruning
3. K nearest Neighbor
4. AdaBoost
5. Neural Network using MLP Classifier
6. Custom Neural Network 

# IMPORTS

In [None]:
# numpy is imported here because this cell runs before the main import cell;
# without it the seed call raises NameError on a fresh kernel.
import numpy as np

np.random.seed(42)

In [None]:
import warnings
warnings.filterwarnings('ignore')

import os
import seaborn
import time
import tensorflow as tf
import pandas as pd
import numpy as np
import itertools

from sklearn.utils import shuffle
from tqdm import tqdm

# Plots and stats
import statistics as st
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Label Encoding & Scaling
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

# Model Building
from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedKFold,KFold,cross_val_score,cross_val_predict

# Models
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import GridSearchCV

#Model Evaluation
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,f1_score,precision_score, ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Time
import time

In [None]:
# read_csv needs the CSV file itself; the original path pointed at the dataset
# directory, which cannot be parsed as a file.
# NOTE(review): 'heart.csv' is the file name published with this Kaggle dataset — confirm.
df = pd.read_csv('/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Print the cardinality of every column; NaN (if present) counts as a value.
for col in df.columns:
    num_distinct_values = df[col].nunique(dropna=False)
    print(f"{col}: {num_distinct_values} distinct values")

In [None]:
# Number of rows per value of the 'output' target column.
df['output'].value_counts()

In [None]:
# Rows that exactly repeat an earlier row.
duplicate_rows_data = df[df.duplicated()]
# Report the row count itself rather than the (rows, columns) shape tuple,
# so the printed value matches the message.
print("number of duplicate rows: ", len(duplicate_rows_data))

In [None]:
# Drop exact duplicate rows and rebuild a contiguous 0..n-1 index. Without the
# reset, the dropped label leaves a gap, and any later frame built positionally
# (default RangeIndex) no longer lines up with this one in an index-aligned
# pd.concat(axis=1).
df = df.drop_duplicates().reset_index(drop=True)

In [None]:
# Percentage of missing values per column.
data_na = df.isna().sum().div(len(df)).mul(100)
# Keep only columns that actually have gaps, worst first, capped at 30 entries.
data_na = (
    data_na
    .drop(data_na[data_na == 0].index)
    .sort_values(ascending=False)
    .iloc[:30]
)
missing_data = data_na.to_frame(name='Missing Ratio')
missing_data.head(20)

In [None]:
# The thall label mapping applied later only knows codes 1, 2 and 3, so a 0
# would turn into NaN there; recode 0 as 2 beforehand.
# NOTE(review): treating 0 as 2 is the author's choice — confirm it is the intended imputation.
df['thall'] = df['thall'].replace({0: 2})

In [None]:
# Replace the integer category codes with readable labels so that the one-hot
# columns and plot annotations are self-explanatory. Series.map turns any code
# missing from a mapping into NaN.
# NOTE(review): the code -> label assignments follow the author's reading of
# the dataset; confirm against the dataset documentation.
cp_mapping = {0: 'typical angina',
              1: 'atypical angina',
              2: 'non-anginal pain',
              3: 'asymptomatic'}
df['cp'] = df['cp'].map(cp_mapping)

# 'upsloping' fixes the original 'unsloping' typo in the slope label.
slp_mapping = {0: 'upsloping',
               1: 'flat',
               2: 'downsloping'}
df['slp'] = df['slp'].map(slp_mapping)

thall_mapping = {1: 'fixed defect',
                 2: 'normal',
                 3: 'reversible defect'}
df['thall'] = df['thall'].map(thall_mapping)

rest_ecg_mapping = {0: 'normal',
                    1: 'ST-T wave abnormality',
                    2: 'left ventricular hypertrophy'}
df['restecg'] = df['restecg'].map(rest_ecg_mapping)

sex_mapping = {1: 'male',
               0: 'female'}
df['sex'] = df['sex'].map(sex_mapping)

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

In [None]:
# Pie chart of the share of each 'output' class.
target_count = df['output'].value_counts()
plt.pie(
    target_count,
    labels=target_count.index,
    autopct='%1.1f%%',
)
plt.title('Target Distribution')
plt.show()

In [None]:
# Independent (deep) copy, so later in-place edits to df do not touch data.
data = df.copy(deep=True)

In [None]:
# Columns to be one-hot encoded.
categorical_columns = [
    'sex',
    'cp',
    'restecg',
    'slp',
    'thall',
]
# Columns handled as numeric; the target 'output' is listed here as well.
numerical_columns = [
    'age',
    'trtbps',
    'chol',
    'fbs',
    'thalachh',
    'exng',
    'oldpeak',
    'caa',
    'output',
]

# Keep every level of each categorical column (no reference level dropped).
dummy_variables = pd.get_dummies(
    data,
    columns=categorical_columns,
    drop_first=False,
)

In [None]:
scaler = StandardScaler()

# Standardise the numeric columns (zero mean, unit variance).
scaled_numerical = scaler.fit_transform(data[numerical_columns])
# fit_transform returns a bare array, so re-attach data's own index. With the
# default RangeIndex the rows would no longer match data's labels once
# duplicates have been dropped, and an index-aligned concat with other frames
# derived from data would shift rows and introduce NaNs.
scaled_numerical_df = pd.DataFrame(
    scaled_numerical,
    columns=numerical_columns,
    index=data.index,
)


In [None]:
# Remove the raw numeric columns from the one-hot frame ...
dummy_variables = dummy_variables.drop(columns=numerical_columns)

# ... and put their standardised versions next to the dummy columns.
processed_df = pd.concat(
    (dummy_variables, scaled_numerical_df),
    axis='columns',
)

In [None]:
# --- Heatmap of all pairwise correlations ---------------------------------
correlation_matrix = processed_df.corr()

plt.figure(figsize=(12, 11))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
)
plt.title("Correlation Matrix Heatmap")
plt.savefig('Correlation map')
plt.show()

# --- Correlation of each feature with the target, largest first -----------
corr = processed_df.corr()
target_corr = corr.loc[:, 'output'].drop(labels='output')
target_corr_sorted = target_corr.sort_values(ascending=False)

# Global seaborn styling; applies to this and every later plot.
sns.set(font_scale=0.8)
sns.set_style("white")
sns.set_palette("PuBuGn_d")

sns.heatmap(
    target_corr_sorted.to_frame(),
    annot=True,
    fmt='.2f',
    cmap="coolwarm",
)
plt.title('Correlation with Heart attack')

plt.show()

# Preprocessing

In [None]:
# NOTE(review): this cell rewrites df in place, but the modelling cells below
# build X and y from `data` (copied earlier), so these changes do not appear
# to reach the models — confirm whether the cell is still needed.
le = LabelEncoder()
for col in ('sex', 'cp', 'restecg', 'slp', 'thall'):
    # The single encoder is refitted for every column.
    df[col] = le.fit_transform(df[col])

scaler = StandardScaler()
for col in ('age', 'trtbps', 'chol', 'fbs', 'thalachh', 'exng', 'oldpeak', 'caa'):
    # StandardScaler expects a 2-D array, hence the (n, 1) reshape.
    column_as_matrix = df[col].to_numpy().reshape(-1, 1)
    df[col] = scaler.fit_transform(column_as_matrix)

In [None]:
# Feature groups used by the modelling preprocessor (target excluded).
categorical_columns = [
    'sex',
    'cp',
    'restecg',
    'slp',
    'thall',
]
numerical_columns = [
    'age',
    'trtbps',
    'chol',
    'fbs',
    'thalachh',
    'exng',
    'oldpeak',
    'caa',
]

In [None]:
# Target column and the remaining feature columns.
y = data['output']
X = data.drop(columns='output')

In [None]:
# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise numeric columns and one-hot encode categorical ones.
# handle_unknown='ignore' keeps transform() from raising if the test split
# contains a category that never appeared in the training split (such a value
# is encoded as all zeros); the output is unchanged when no unknowns occur.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)])
# Fit on the training split only, then apply the fitted transform to the test split.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [None]:
# Elapsed seconds of every evaluate_model call, in call order.
execution_times = []


def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` and score it on the training and test data.

    The elapsed wall-clock time (fit, both predict calls and both accuracy
    computations) is appended to the module-level ``execution_times`` list.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.

    Returns:
        Tuple ``(train_accuracy, test_accuracy, test_predictions)``.
    """
    started_at = time.time()

    model.fit(X_train, y_train)
    predicted_train = model.predict(X_train)
    predicted_test = model.predict(X_test)

    train_accuracy = accuracy_score(y_train, predicted_train)
    test_accuracy = accuracy_score(y_test, predicted_test)

    execution_times.append(time.time() - started_at)

    return train_accuracy, test_accuracy, predicted_test

In [None]:
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix on the current matplotlib axes.

    Args:
        cm: 2-D confusion matrix (rows = true labels, columns = predictions).
        classes: Tick labels, in the same order as the matrix rows/columns.
        normalize: If True, divide each row by its sum so that the image and
            the cell annotations both show per-true-class fractions.
        title: Plot title.
        cmap: Matplotlib colormap for the image.

    The function neither creates a figure nor calls ``plt.show()``; the caller
    decides when to save or display the plot.
    """
    cm = np.asarray(cm)

    # Normalise *before* drawing; otherwise the image shows raw counts while
    # the annotations show fractions.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Use white text on dark cells and black text on light cells.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Fractions are rounded for readability; raw values are shown as-is.
        label = f"{cm[i, j]:.2f}" if normalize else cm[i, j]
        plt.text(j, i, label, horizontalalignment="center", color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# GRID SEARCH WITH CROSS VALIDATION

In [None]:
# Decision trees: 5-fold grid search over depth, split/leaf sizes and criterion.
import pandas as pd

param_grid = dict(
    max_depth=[2, 3, 5, 7, 9, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

clf = DecisionTreeClassifier()
grid_search = GridSearchCV(clf, param_grid, cv=5).fit(X_train, y_train)

# cv_results_ column -> human-readable header (also fixes the column order).
column_labels = {
    'param_max_depth': 'Max Depth',
    'param_min_samples_split': 'Min Samples Split',
    'param_min_samples_leaf': 'Min Samples Leaf',
    'param_criterion': 'Criterion',
    'mean_test_score': 'Mean Test Score',
}
results = (
    pd.DataFrame(grid_search.cv_results_)[list(column_labels)]
    .rename(columns=column_labels)
    .sort_values(by='Mean Test Score', ascending=False)
)
results.to_csv('grid_search_results.csv', index=False)

print("Best Hyperparameters:")
print(grid_search.best_params_)

# Score the refitted best estimator on the held-out test split.
best_model = grid_search.best_estimator_
accuracy = best_model.score(X_test, y_test)
print(f'The accuracy of the best model is: {accuracy}')


In [None]:
# KNN: 5-fold grid search over neighbour count, weighting and Minkowski power.
import pandas as pd

param_grid = dict(
    n_neighbors=[1, 3, 5, 7, 9],
    weights=['uniform', 'distance'],
    p=[1, 2],
)

clf = KNeighborsClassifier()
grid_search = GridSearchCV(clf, param_grid, cv=5).fit(X_train, y_train)

# cv_results_ column -> human-readable header (also fixes the column order).
column_labels = {
    'param_n_neighbors': 'Number of Neighbors',
    'param_weights': 'Weights',
    'param_p': 'p',
    'mean_test_score': 'Mean Test Score (Accuracy)',
}
results = (
    pd.DataFrame(grid_search.cv_results_)[list(column_labels)]
    .rename(columns=column_labels)
    .sort_values(by='Mean Test Score (Accuracy)', ascending=False)
)
results.to_csv('knn_grid_search_results.csv', index=False)

print("Best Hyperparameters:")
print(grid_search.best_params_)

# Score the refitted best estimator on the held-out test split.
best_model = grid_search.best_estimator_
accuracy = best_model.score(X_test, y_test)
print(f'The accuracy of the best model is: {accuracy}')


In [None]:
# SVM: 5-fold grid search over kernel and gamma.
# NOTE: C stays at its default; a sweep over [0.1, 1, 10] was disabled by the author.
param_grid = dict(
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    gamma=['scale', 'auto', 0.1, 0.01, 0.001],
)

clf = SVC()
grid_search = GridSearchCV(clf, param_grid, cv=5).fit(X_train, y_train)

# cv_results_ column -> human-readable header (also fixes the column order).
column_labels = {
    'param_kernel': 'Kernel',
    'param_gamma': 'Gamma',
    'mean_test_score': 'Mean Test Score (Accuracy)',
}
results = (
    pd.DataFrame(grid_search.cv_results_)[list(column_labels)]
    .rename(columns=column_labels)
    .sort_values(by='Mean Test Score (Accuracy)', ascending=False)
)
results.to_csv('svm_grid_search_results.csv', index=False)

# Evaluate the refitted best estimator on the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = best_model.score(X_test, y_test)
print(f'The accuracy of the best model is: {accuracy}')
print("Best Hyperparameters:")
print(grid_search.best_params_)

In [None]:
# ADABOOST: 5-fold grid search over ensemble size, learning rate and the
# depth of the underlying decision tree.
base_classifier = DecisionTreeClassifier()
clf = AdaBoostClassifier(base_classifier)

# scikit-learn renamed AdaBoost's `base_estimator` parameter to `estimator`
# (1.2) and later removed the old name, so a hard-coded
# 'base_estimator__max_depth' is rejected as an invalid parameter on current
# releases. Pick whichever name this installation exposes.
estimator_param = 'estimator' if 'estimator' in clf.get_params(deep=False) else 'base_estimator'
depth_param = f'{estimator_param}__max_depth'

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.001, 0.01, 0.1],
    depth_param: [1, 2, 3],
}

grid_search = GridSearchCV(clf, param_grid, cv=5)
grid_search.fit(X_train, y_train)
results = pd.DataFrame(grid_search.cv_results_)
results = results[['param_n_estimators', 'param_learning_rate', f'param_{depth_param}', 'mean_test_score']]

# Human-readable headers for the exported table.
results = results.rename(columns={
    'param_n_estimators': 'Number of Estimators',
    'param_learning_rate': 'Learning Rate',
    f'param_{depth_param}': 'Base Estimator Max Depth',
    'mean_test_score': 'Mean Test Score (Accuracy)'
})

results = results.sort_values(by='Mean Test Score (Accuracy)', ascending=False)

results.to_csv('adaboost_grid_search_results.csv', index=False)

# Evaluate the refitted best estimator on the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = best_model.score(X_test, y_test)
print(f'The accuracy of the best model is: {accuracy}')
print("Best Hyperparameters:")
print(grid_search.best_params_)


In [None]:
# MLP Classifier: 5-fold grid search over single-hidden-layer width and
# initial learning rate.
param_grid = dict(
    hidden_layer_sizes=[(width,) for width in (5, 10, 15, 20, 30, 40, 50, 60, 80)],
    learning_rate_init=[0.0001, 0.001, 0.01],
)

clf = MLPClassifier()
grid_search = GridSearchCV(clf, param_grid, cv=5).fit(X_train, y_train)

# cv_results_ column -> human-readable header (also fixes the column order).
column_labels = {
    'param_hidden_layer_sizes': 'Hidden Layer Sizes',
    'param_learning_rate_init': 'Learning Rate Init',
    'mean_test_score': 'Mean Test Score (Accuracy)',
}
results = (
    pd.DataFrame(grid_search.cv_results_)[list(column_labels)]
    .rename(columns=column_labels)
    .sort_values(by='Mean Test Score (Accuracy)', ascending=False)
)
results.to_csv('mlp_classifier_grid_search_results.csv', index=False)
best_model = grid_search.best_estimator_

print("Best Hyperparameters:")
print(grid_search.best_params_)

# Evaluate the refitted best estimator on the held-out test split.
y_pred = best_model.predict(X_test)
accuracy = best_model.score(X_test, y_test)
print(f'The accuracy of the best model is: {accuracy}')

# USING ABOVE PARAMETERS FOR MODEL TRAINING

In [None]:
# One accuracy entry per model, appended by the training cells that follow.
train_results, test_results = [], []

In [None]:
# SVM with fixed kernel/gamma settings.

model_1 = SVC(kernel='sigmoid', gamma='scale')
train_acc, test_acc, y_pred = evaluate_model(model_1, X_train, y_train, X_test, y_test)
train_results.append(train_acc)
test_results.append(test_acc)

# Test-set metrics.
cm = confusion_matrix(y_test, y_pred)
acc_SVM = accuracy_score(y_test, y_pred)
f1_SVM = f1_score(y_test, y_pred)
clf_SVM = classification_report(y_test, y_pred)

section_rule = '=====' * 10
closing_rule = '========' * 30

print('***********SVM***********')
print('\n')
print('Accuracy : ', acc_SVM)
print('F1 Score : ', f1_SVM)
print(section_rule)
print('Confusion Matrix :\n', cm)
print(section_rule)
print('Classification Report :\n', clf_SVM)
print(closing_rule)

plot_confusion_matrix(cm, classes=['No Disease', 'Disease'], title=type(model_1).__name__)
plt.savefig('confusion_matrix_svm.png')
plt.show()

In [None]:
# KNN with one fixed parameter set.
knn_params = dict(n_neighbors=5, p=1, weights='uniform')
model_2 = KNeighborsClassifier(**knn_params)

train_acc, test_acc, y_pred = evaluate_model(model_2, X_train, y_train, X_test, y_test)
train_results.append(train_acc)
test_results.append(test_acc)

# Test-set metrics.
cm = confusion_matrix(y_test, y_pred)
acc_knn = accuracy_score(y_test, y_pred)
f1_knn = f1_score(y_test, y_pred)
clf_knn = classification_report(y_test, y_pred)

section_rule = '=====' * 10
closing_rule = '========' * 30

print('***********KNN with Single Set of Parameters***********')
print('\n')
print('Accuracy : ', acc_knn)
print('F1 Score : ', f1_knn)
print(section_rule)
print('Confusion Matrix :\n', cm)
print(section_rule)
print('Classification Report :\n', clf_knn)
print(closing_rule)

plot_confusion_matrix(cm, classes=['No Disease', 'Disease'], title=type(model_2).__name__)
# Overrides the class-name title set by the helper.
plt.title('Confusion Matrix for KNN with Single Set of Parameters')
plt.savefig('confusion_matrix_knn.png')
plt.show()


In [None]:
# Decision tree with a depth limit and cost-complexity pruning (ccp_alpha).
tree_params = dict(
    criterion='gini',
    max_depth=3,
    min_samples_leaf=1,
    min_samples_split=2,
    ccp_alpha=0.001,
)

model_3 = DecisionTreeClassifier(**tree_params)
train_acc, test_acc, y_pred = evaluate_model(model_3, X_train, y_train, X_test, y_test)
train_results.append(train_acc)
test_results.append(test_acc)

# Test-set metrics.
cm = confusion_matrix(y_test, y_pred)
acc_dt = accuracy_score(y_test, y_pred)
f1_dt = f1_score(y_test, y_pred)
clf_dt = classification_report(y_test, y_pred)

section_rule = '=====' * 10
closing_rule = '========' * 30

print('***********Decision Tree with Pruned Parameters and Early Stopping***********')
print('\n')
print('Accuracy : ', acc_dt)
print('F1 Score : ', f1_dt)
print(section_rule)
print('Confusion Matrix :\n', cm)
print(section_rule)
print('Classification Report :\n', clf_dt)
print(closing_rule)

plot_confusion_matrix(cm, classes=['No Disease', 'Disease'], title=type(model_3).__name__)
# Overrides the class-name title set by the helper.
plt.title('Confusion Matrix for Pruned Parameters and Early Stopping')
plt.savefig('confusion_matrix_pruned_early_stopping.png')
plt.show()


In [None]:
# Boosting with AdaBoost over depth-1 decision trees (stumps).
base_classifier = DecisionTreeClassifier(max_depth=1)
model_4 = AdaBoostClassifier(base_classifier, n_estimators=200, learning_rate=0.01, random_state=42)

train_acc, test_acc, y_pred = evaluate_model(model_4, X_train, y_train, X_test, y_test)
train_results.append(train_acc)
test_results.append(test_acc)

# Test-set metrics.
cm = confusion_matrix(y_test, y_pred)
acc_b = accuracy_score(y_test, y_pred)
f1_b = f1_score(y_test, y_pred)
clf_b = classification_report(y_test, y_pred)

section_rule = '=====' * 10
closing_rule = '========' * 30

print('***********AdaBoost***********')
print('\n')
print('Accuracy : ', acc_b)
print('F1 Score : ', f1_b)
print(section_rule)
print('Confusion Matrix :\n', cm)
print(section_rule)
print('Classification Report :\n', clf_b)
print(closing_rule)

plot_confusion_matrix(cm, classes=['No Disease', 'Disease'], title=type(model_4).__name__)
plt.savefig('confusion_matrix AdaBoost.png')
plt.show()

In [None]:
# Neural network (scikit-learn MLP) with one hidden layer of 30 units.
model_5 = MLPClassifier(hidden_layer_sizes=(30,), learning_rate_init=0.001)
train_acc, test_acc, y_pred = evaluate_model(model_5, X_train, y_train, X_test, y_test)
train_results.append(train_acc)
test_results.append(test_acc)

# Test-set metrics.
cm = confusion_matrix(y_test, y_pred)
acc_nn = accuracy_score(y_test, y_pred)
f1_nn = f1_score(y_test, y_pred)
clf_nn = classification_report(y_test, y_pred)

section_rule = '=====' * 10
closing_rule = '========' * 30

print('***********Neural Network***********')
print('\n')
print('Accuracy : ', acc_nn)
print('F1 Score : ', f1_nn)
print(section_rule)
print('Confusion Matrix :\n', cm)
print(section_rule)
print('Classification Report :\n', clf_nn)
print(closing_rule)

plot_confusion_matrix(cm, classes=['No Disease', 'Disease'], title=type(model_5).__name__)
plt.savefig('confusion_matrix_mlp.png')
plt.show()

In [None]:
# Custom neural network (Keras subclassed model)
class NeuralNetwork(tf.keras.Model):
    """Binary classifier: Dense(30, relu) followed by Dense(1, sigmoid)."""

    def __init__(self):
        super().__init__()
        self.dense1 = tf.keras.layers.Dense(30, activation='relu')
        self.dense2 = tf.keras.layers.Dense(1, activation='sigmoid')

    def call(self, inputs):
        """Run the forward pass and return the sigmoid output of the last layer."""
        hidden = self.dense1(inputs)
        output = self.dense2(hidden)
        return output

# Build and compile the custom network.
model = NeuralNetwork()
learning_rate = 0.001
custom_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

model.compile(optimizer=custom_optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Time training plus test evaluation, and record it next to the timings of
# the other models.
start_time = time.time()
history = model.fit(X_train, y_train, epochs=20, validation_split=0.2)

loss, accuracy = model.evaluate(X_test, y_test)

end_time = time.time()
execution_time = end_time - start_time
execution_times.append(execution_time)

# Learning curves: loss on the left, accuracy on the right.
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Training Loss', color='b')
plt.plot(history.history['val_loss'], label='Validation Loss', color='r')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Training Accuracy', color='b')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy', color='r')
plt.axhline(y=accuracy, color='g', linestyle='-', label='Test Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.tight_layout()
plt.show()

# Metrics of the last epoch.
final_train_accuracy = history.history['accuracy'][-1]
final_val_loss = history.history['val_loss'][-1]
final_val_accuracy = history.history['val_accuracy'][-1]

print("Validation Loss:", final_val_loss)
print("Validation Accuracy:", final_val_accuracy)

# train_results is plotted as "Training Accuracy" and holds training accuracy
# for every other model, so record the training (not validation) accuracy here.
train_results.append(final_train_accuracy)
test_results.append(accuracy)

print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

# The sigmoid output is thresholded at 0.5 and flattened from the (n, 1)
# column returned by predict into 1-D labels for the sklearn metrics.
y_pred = model.predict(X_test)
y_pred_binary = (y_pred >= 0.5).astype(int).ravel()

cm = confusion_matrix(y_test, y_pred_binary)
acc_Cnn = accuracy_score(y_test, y_pred_binary)
f1_Cnn = f1_score(y_test, y_pred_binary)
clf_Cnn = classification_report(y_test, y_pred_binary)

print('***********Custom Neural Network ***********')
print('\n')
print('Accuracy : ', acc_Cnn)
print('F1 Score : ', f1_Cnn)
print(10 * '=====')
print('Confusion Matrix :\n', cm)
print(10 * '=====')
print('Classification Report :\n', clf_Cnn)
print(30 * '========')

plot_confusion_matrix(cm, classes=['No Disease', 'Disease'], title=model.__class__.__name__)
plt.savefig('Custom NN confusion_matrix.png')
plt.show()


# MODEL COMPARISONS

In [None]:
# Number of entries in train_results; the comparison plot pairs them with six model names.
len(train_results)

In [None]:
# Number of entries in test_results; the comparison plot pairs them with six model names.
len(test_results)

In [None]:
# Labels in the order in which the training cells appended their results.
model_names = ["SVM","KNN", "Decision Trees","AdaBoost","MLPClassifier","Custom Neural_Network"]

plt.figure(figsize=(10, 5))
for _scores, _label, _colour in (
    (train_results, "Training Accuracy", 'b'),
    (test_results, "Testing Accuracy", 'r'),
):
    plt.plot(model_names, _scores, 'o-', label=_label, color=_colour)
plt.ylabel("Accuracy")
plt.xticks(rotation=45)
plt.legend()
plt.title("Model Comparisons - Accuracy")
plt.savefig("Model Comparisons - Accuracy")
plt.show()

In [None]:
# Labels in the order in which the timings were appended to execution_times.
model_names = [ "SVM","KNN_1", "Decision Trees","AdaBoost","MLP Classifier","Custom Neural_Network"]

plt.figure(figsize=(10, 5))
plt.plot(
    model_names,
    execution_times,
    marker='o',
    linestyle='-',
    color='b',
    label="Wall Clock Time",
)
plt.ylabel("Time")
plt.xticks(rotation=45)
plt.legend()
plt.title("Model Comparisons - Wall clock time")
plt.savefig("Model Comparisons - Wall clock time")
plt.show()

In [None]:
# Test-set accuracy and F1 for every model, one row per model.
tbl = pd.DataFrame({
    'Model': ['SVM','Decision Tree','KNN','AdaBoost', 'MLP Classifier','Custom Neural Network'],
    'Accuracy': [acc_SVM, acc_dt, acc_knn, acc_b, acc_nn, acc_Cnn],
    'F1_Score': [f1_SVM, f1_dt, f1_knn, f1_b, f1_nn, f1_Cnn],
})
# Displayed with the model name as index; tbl itself is left unchanged.
tbl.set_index('Model')