In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,recall_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [78]:
# 1. Load data: read the CSV (path relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="heart.csv")

In [79]:
# 2. EDA: first rows, column dtypes / non-null counts, and summary statistics.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()
print(df.describe())


   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   
1   49   F           NAP        160          180          0     Normal    156   
2   37   M           ATA        130          283          0         ST     98   
3   48   F           ASY        138          214          0     Normal    108   
4   54   M           NAP        150          195          0     Normal    122   

  ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0              N      0.0       Up             0  
1              N      1.0     Flat             1  
2              N      0.0       Up             0  
3              Y      1.5     Flat             1  
4              N      0.0       Up             0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age    

In [80]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [81]:
# Separate the label column from the feature matrix.
target_column = "HeartDisease"
X = df.drop(columns=[target_column])
y = df[target_column]

In [82]:
# Group feature columns by dtype: int64/float64 columns are treated as numeric,
# object-dtype columns as categorical.
numerical_cols = list(X.select_dtypes(include=["int64", "float64"]).columns)
categorical_cols = list(X.select_dtypes(include=["object"]).columns)

In [83]:
# Preprocessing shared by both models: standardize the numeric columns and
# one-hot encode the categorical columns.
numeric_transformer = StandardScaler()
# handle_unknown="ignore": a category that was absent from the data the encoder
# was fitted on (possible in a cross-validation fold or at prediction time) is
# encoded as all zeros instead of raising an error during transform.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

In [84]:
# Inspect the column names produced by one-hot encoding the categorical
# features (numerical feature names remain unchanged).
# Note: this fits the preprocessor on all of X, purely to look at the names.
fitted_preprocessor = preprocessor.fit(X)
onehot_encoder = fitted_preprocessor.named_transformers_["cat"]
onehot_encoder.get_feature_names_out(categorical_cols)

array(['Sex_F', 'Sex_M', 'ChestPainType_ASY', 'ChestPainType_ATA',
       'ChestPainType_NAP', 'ChestPainType_TA', 'RestingECG_LVH',
       'RestingECG_Normal', 'RestingECG_ST', 'ExerciseAngina_N',
       'ExerciseAngina_Y', 'ST_Slope_Down', 'ST_Slope_Flat',
       'ST_Slope_Up'], dtype=object)

In [85]:
# 5. Split data into train and test sets (80% / 20%, fixed seed for reproducibility).
# NOTE(review): the split is not stratified on y — consider stratify=y.
split_parts = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [86]:
# --- Model 1: Decision Tree ---
# Preprocessing and classifier are chained so both are fitted together.
tree_classifier = DecisionTreeClassifier(random_state=1)
dt_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", tree_classifier),
    ]
)


In [87]:
# Hyperparameter grid for tuning; the "classifier__" prefix routes each
# parameter to the pipeline step named "classifier".
param_grid = dict(
    classifier__max_depth=[3, 5, 7, 10, None],
    classifier__min_samples_split=[2, 5, 10],
    classifier__criterion=["gini", "entropy"],
)

# Grid search with 5-fold cross-validation, selecting on recall and using all cores.
grid_search = GridSearchCV(
    estimator=dt_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring="recall",
    n_jobs=-1,
)

In [88]:
# Run the grid search on the training split, then report the winning combination.
grid_search.fit(X_train, y_train)
best_tree_params = grid_search.best_params_
print("\nBest Decision Tree Parameters:", best_tree_params)



Best Decision Tree Parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': 5, 'classifier__min_samples_split': 2}


In [89]:
# Predict on the test split with the refitted best pipeline found by the search.
y_pred_decision_tree = grid_search.best_estimator_.predict(X_test)


In [90]:
# --- Model 2: Logistic Regression ---
# Same preprocessing as the decision tree, followed by a logistic regression.
logreg_classifier = LogisticRegression(max_iter=1000, random_state=42)
logreg_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("logreg", logreg_classifier),
    ]
)

In [91]:
# Train the logistic regression pipeline on the training split.
logreg_pipeline.fit(X=X_train, y=y_train)

In [92]:
# Predict on the test split with the fitted logistic regression pipeline.
y_pred_logreg = logreg_pipeline.predict(X=X_test)

In [101]:
def evaluate_model(y_true, y_pred, model_name):
    """Print a short evaluation summary for one classifier.

    Prints, in order: the model name, the recall (``recall_score`` with its
    default settings), the confusion matrix, and the classification report.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted labels, aligned with ``y_true``.
    model_name : label used in the printed header.

    Returns
    -------
    None. All results are written to stdout.
    """
    print(f"\n Model: {model_name}")

    recall = recall_score(y_true, y_pred)
    print("Recall:", recall)

    matrix = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:\n", matrix)

    report = classification_report(y_true, y_pred)
    print("Classification Report:\n", report)


In [103]:
# Test-set metrics for the tuned decision tree.
evaluate_model(
    y_true=y_test,
    y_pred=y_pred_decision_tree,
    model_name="Decision Tree",
)


 Model: Decision Tree
Recall: 0.8727272727272727
Confusion Matrix:
 [[66  8]
 [14 96]]
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.89      0.86        74
           1       0.92      0.87      0.90       110

    accuracy                           0.88       184
   macro avg       0.87      0.88      0.88       184
weighted avg       0.88      0.88      0.88       184



In [104]:
# Test-set metrics for the logistic regression model.
evaluate_model(
    y_true=y_test,
    y_pred=y_pred_logreg,
    model_name="Logistic Regression",
)


 Model: Logistic Regression
Recall: 0.9181818181818182
Confusion Matrix:
 [[ 64  10]
 [  9 101]]
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.86      0.87        74
           1       0.91      0.92      0.91       110

    accuracy                           0.90       184
   macro avg       0.89      0.89      0.89       184
weighted avg       0.90      0.90      0.90       184



The model with the best recall on the held-out test set is the logistic regression model (recall ≈ 0.92, versus ≈ 0.87 for the tuned decision tree).