In [16]:
import kagglehub
import os
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

In [None]:
# Fetch the heart-failure dataset through kagglehub. `path` holds whatever
# `dataset_download` returns and is joined with 'heart.csv' when loading.
path = kagglehub.dataset_download("fedesoriano/heart-failure-prediction")

print(f"Path to dataset files: {path}")

## Preprocessing

In [14]:
# Read 'heart.csv' from the downloaded dataset location and preview the first rows.
csv_path = os.path.join(path, 'heart.csv')
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [19]:
# (rows, columns) of the frame as loaded, before any filtering.
df.shape

(918, 12)

In [None]:
# Missing values per column; the recorded output reports 0 for every column.
df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [23]:
# Numeric feature columns that the outlier filter iterates over.
numeric_cols = [
    "Age",
    "RestingBP",
    "Cholesterol",
    "MaxHR",
    "Oldpeak",
]

In [24]:
# Remove outliers with the 1.5 * IQR rule, one numeric column at a time.
#
# The filter is sequential: the quartiles for each column are computed on the
# rows that survived the filters of the preceding columns, so the order of
# `numeric_cols` influences which rows are kept.
#
# NOTE: this cell rebinds `df`, so running it a second time filters the already
# filtered frame again. Re-run the loading cell first to start from raw data.
for col in numeric_cols:
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    # `between` is inclusive on both ends, i.e. lower <= value <= upper.
    df = df[df[col].between(lower, upper)]

# Boolean indexing leaves `df` marked as a slice of the original frame. Take an
# explicit copy so the column assignments in later cells write to an
# independent frame instead of raising SettingWithCopyWarning.
df = df.copy()

df.shape

(701, 12)

In [26]:
# Integer-encode the categorical features. One LabelEncoder is fitted per column
# and stored in `label_encoders`, keyed by column name.
cat_cols = ["Sex", "ChestPainType", "RestingECG", "ExerciseAngina", "ST_Slope"]
label_encoders = {name: LabelEncoder().fit(df[name]) for name in cat_cols}
for name, encoder in label_encoders.items():
    df[name] = encoder.transform(df[name])

In [27]:
# Separate the feature columns from the "HeartDisease" label column.
# NOTE: the name `input` shadows Python's builtin `input()` from this point on;
# it is kept because the cells below refer to it.
input = df.drop(columns=["HeartDisease"])
target = df["HeartDisease"]

In [28]:
# Preview the feature frame: categorical columns now hold integer codes.
input.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,1,1,140,289,0,1,172,0,0.0,2
1,49,0,2,160,180,0,1,156,0,1.0,1
2,37,1,1,130,283,0,2,98,0,0.0,2
3,48,0,0,138,214,0,1,108,1,1.5,1
4,54,1,2,150,195,0,1,122,0,0.0,2


In [29]:
# Preview the label series ("HeartDisease").
target.head()

0    0
1    1
2    0
3    1
4    0
Name: HeartDisease, dtype: int64

## Train Test Split

In [30]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# repeatable. The split is not stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    input,
    target,
    test_size=0.2,
    random_state=42,
)

## Pipeline

In [31]:
# Two-step pipeline: standardise the features, then classify.
# The 'clf' step is only a placeholder; the parameter grid supplies the actual
# candidate estimators through its 'clf' key.
scaling_step = ('scaler', StandardScaler())
classifier_step = ('clf', LogisticRegression())  # Placeholder
pipe = Pipeline(steps=[scaling_step, classifier_step])

## Hyperparameter Tuning

In [33]:
# Search space for GridSearchCV.
#
# Each dict is an independent sub-grid. Its 'clf' entry swaps the pipeline's
# 'clf' step for the given estimator, and the 'clf__*' entries list the
# hyperparameter values tried for that estimator. Together the sub-grids yield
# 6 + 6 + 12 + 8 + 24 = 56 candidates.
param_grid = [
    {
        'clf': [LogisticRegression(max_iter=1000)],
        'clf__C': [0.1, 1, 10],
        'clf__solver': ['liblinear', 'lbfgs']
    },
    {
        'clf': [RandomForestClassifier(random_state=42)],
        'clf__n_estimators': [100, 200],
        'clf__max_depth': [None, 10, 20]
    },
    {
        'clf': [SVC()],
        'clf__C': [0.1, 1, 10],
        'clf__kernel': ['linear', 'rbf'],
        'clf__gamma': ['scale', 'auto']
    },
    {
        # `use_label_encoder` is intentionally not passed: the installed XGBoost
        # ignores it and warns 'Parameters: { "use_label_encoder" } are not used.'
        'clf': [XGBClassifier(eval_metric='logloss')],
        'clf__n_estimators': [100, 200],
        'clf__max_depth': [3, 5],
        'clf__learning_rate': [0.01, 0.1]
    },
    {
        'clf': [MLPClassifier(max_iter=500, random_state=42)],
        'clf__hidden_layer_sizes': [(32,), (64,), (32, 32)],
        'clf__activation': ['relu', 'tanh'],
        'clf__alpha': [0.0001, 0.001],
        'clf__learning_rate': ['constant', 'adaptive']
    }
]


In [34]:
# Exhaustive search over `param_grid` with 5-fold cross-validation on the
# training split, using all available cores (n_jobs=-1).
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 56 candidates, totalling 280 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


## Score and Params of All Models

In [38]:
# Tabulate every grid-search candidate, best-ranked first.
results = pd.DataFrame(grid.cv_results_).sort_values("rank_test_score")

# The header emoji was stored as mojibake (UTF-8 bytes decoded as cp1252);
# it is restored to the intended character here.
print("\n📊 Model Performance Summary (Sorted by Score):\n")
for _, row in results.iterrows():
    # 'param_clf' holds the estimator instance tried for this candidate.
    model_name = type(row['param_clf']).__name__
    # Mean cross-validated score of this candidate across the folds.
    mean_acc = row['mean_test_score']
    params = row['params']

    print(f"Model: {model_name}\n")
    print(f"Accuracy: {mean_acc:.4f}\n")
    print(f"Best Params: {params}\n")
    print("-" * 50)


📊 Model Performance Summary (Sorted by Score):

Model: XGBClassifier

Accuracy: 0.8643

Best Params: {'clf': XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, feature_weights=None, gamma=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, ...), 'clf__learning_rate': 0.1, 'clf__max_depth': 3, 'clf__n_estimators': 100}

-------------------------------

## Best Model

In [40]:
# Show the refitted winning pipeline and the parameter combination that produced it.
print(f"\nBest Estimator:\n {grid.best_estimator_}")
print(f"\nBest Parameters:\n {grid.best_params_}")



Best Estimator:
 Pipeline(steps=[('scaler', StandardScaler()),
                ('clf',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=None, device=None,
                               early_stopping_rounds=None,
                               enable_categorical=False, eval_metric='logloss',
                               feature_types=None, feature_weights=None,
                               gamma=None, grow_policy=None,
                               importance_type=None,
                               interaction_constraints=None, learning_rate=0.1,
                               max_bin=None, max_cat_threshold=None,
                               max_cat_to_onehot=None, max_delta_step=None,
                               max_depth=3, max_leaves=None,
                               min_child_weight=None, missing=nan,
          

In [41]:
# Evaluate the best pipeline on the held-out test split.
y_pred = grid.predict(X_test)
test_report = classification_report(y_test, y_pred)
print("\nTest Set Classification Report:\n", test_report)


Test Set Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.90      0.89        67
           1       0.90      0.89      0.90        74

    accuracy                           0.89       141
   macro avg       0.89      0.89      0.89       141
weighted avg       0.89      0.89      0.89       141



## Saving Model as PKL

In [None]:
import joblib

# Persist the best pipeline (scaler + classifier) to disk in the working directory.
model_path = 'best_heart_model.pkl'
joblib.dump(grid.best_estimator_, model_path)
print(f"\nSaved best model as '{model_path}'")


Saved best model as 'best_heart_model.pkl'


In [45]:
# Usage example kept as an inert string literal: nothing inside it is executed.
# Because the string is the cell's last expression, its repr is what the cell
# displays as output. Copy the body into a code cell to actually load the saved
# model and predict on one test row.
''' 
To use Saved Model

import joblib

model = joblib.load('best_heart_model.pkl')
sample = X_test.iloc[0:1]
prediction = model.predict(sample)
print("Prediction:", prediction)

'''

' \nTo use Saved Model\n\nimport joblib\n\nmodel = joblib.load(\'best_heart_model.pkl\')\nsample = X_test.iloc[0:1]\nprediction = model.predict(sample)\nprint("Prediction:", prediction)\n\n'