In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')

In [16]:
# Load the clinical-records CSV (resolved relative to the working directory)
# into `df`, then preview the first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='heart_failure_clinical_records_dataset.csv')
df.head(5)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [17]:
# Pairwise Pearson correlation between every pair of columns, target included.
df.corr(method='pearson')

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
age,1.0,0.088006,-0.081584,-0.101012,0.060098,0.093289,-0.052354,0.159187,-0.045966,0.06543,0.018668,-0.224068,0.253729
anaemia,0.088006,1.0,-0.190741,-0.012729,0.031557,0.038182,-0.043786,0.052174,0.041882,-0.094769,-0.10729,-0.141414,0.06627
creatinine_phosphokinase,-0.081584,-0.190741,1.0,-0.009639,-0.04408,-0.07059,0.024463,-0.016408,0.05955,0.079791,0.002421,-0.009346,0.062728
diabetes,-0.101012,-0.012729,-0.009639,1.0,-0.00485,-0.012732,0.092193,-0.046975,-0.089551,-0.15773,-0.147173,0.033726,-0.001943
ejection_fraction,0.060098,0.031557,-0.04408,-0.00485,1.0,0.024445,0.072177,-0.011302,0.175902,-0.148386,-0.067315,0.041729,-0.268603
high_blood_pressure,0.093289,0.038182,-0.07059,-0.012732,0.024445,1.0,0.049963,-0.004935,0.037109,-0.104615,-0.055711,-0.196439,0.079351
platelets,-0.052354,-0.043786,0.024463,0.092193,0.072177,0.049963,1.0,-0.041198,0.062125,-0.12512,0.028234,0.010514,-0.049139
serum_creatinine,0.159187,0.052174,-0.016408,-0.046975,-0.011302,-0.004935,-0.041198,1.0,-0.189095,0.00697,-0.027414,-0.149315,0.294278
serum_sodium,-0.045966,0.041882,0.05955,-0.089551,0.175902,0.037109,0.062125,-0.189095,1.0,-0.027566,0.004813,0.08764,-0.195204
sex,0.06543,-0.094769,0.079791,-0.15773,-0.148386,-0.104615,-0.12512,0.00697,-0.027566,1.0,0.445892,-0.015608,-0.004316


In [18]:
# For each float64/int64 column, count the rows that fall outside the
# 1.5 * IQR fences (below Q1 - 1.5*IQR or above Q3 + 1.5*IQR).
num_cols = df.select_dtypes(include=['float64', 'int64']).columns

for col in num_cols:
    values = df[col]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = q3 - q1
    too_low = values < q1 - 1.5 * spread
    too_high = values > q3 + 1.5 * spread
    # Summing the boolean mask counts the flagged rows without building
    # an intermediate filtered frame.
    print(f"{col}: count outliers = {int((too_low | too_high).sum())}")

age: count outliers = 0
anaemia: count outliers = 0
creatinine_phosphokinase: count outliers = 29
diabetes: count outliers = 0
ejection_fraction: count outliers = 2
high_blood_pressure: count outliers = 0
platelets: count outliers = 21
serum_creatinine: count outliers = 29
serum_sodium: count outliers = 4
sex: count outliers = 0
smoking: count outliers = 0
time: count outliers = 0
DEATH_EVENT: count outliers = 0


In [19]:
# Cap (winsorize) the columns that reported outliers in the previous cell:
# anything below Q1 - 1.5*IQR is raised to that fence and anything above
# Q3 + 1.5*IQR is lowered to it. Values inside the fences are untouched.
#
# NOTE(review): the fences are computed on the full `df`, i.e. before the
# train/test split performed in a later cell, so test rows influence the caps.
# Consider deriving the bounds from the training split only.
for col in ['creatinine_phosphokinase', 'ejection_fraction', 'platelets', 'serum_creatinine', 'serum_sodium']:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Vectorized clamp; replaces the per-element lambda and leaves NaN as-is.
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)


In [None]:
# Separate the target column from the predictors, then hold out 20% of the
# rows for testing. `stratify=y` keeps the DEATH_EVENT class ratio the same
# in both splits; the fixed seed makes the split reproducible.
# NOTE(review): `time` stays in the feature set; if it is a follow-up duration
# only known after the outcome, it would leak the label. Confirm.
y = df['DEATH_EVENT']
X = df.drop(columns='DEATH_EVENT')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [21]:
from sklearn.preprocessing import RobustScaler

# Columns passed through the scaler; every other feature column is left
# exactly as it is in X_train / X_test.
cols_to_scale = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'time'
]

# Work on copies so the unscaled splits remain available.
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Fit on the training split only, then reuse the fitted statistics for the
# test split so no test information enters the scaling parameters.
scaler = RobustScaler()
X_train_scaled[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])
X_test_scaled[cols_to_scale] = scaler.transform(X_test[cols_to_scale])


In [22]:
# Logistic regression: exhaustive search over regularisation strength `C`
# and solver, scored by 5-fold cross-validated accuracy on the scaled
# training split. The refit best estimator is kept as `best_log`.
log_params = dict(
    C=[0.01, 0.1, 1, 10],
    solver=['liblinear', 'lbfgs'],
)
log_model = LogisticRegression()
log_grid = GridSearchCV(
    estimator=log_model,
    param_grid=log_params,
    scoring='accuracy',
    cv=5,
)
log_grid.fit(X_train_scaled, y_train)
best_log = log_grid.best_estimator_

In [23]:
# Decision tree: search depth and split/leaf size limits with 3-fold
# cross-validated accuracy, using all available cores (n_jobs=-1).
# The refit best estimator is kept as `best_dt`.
dt_params = dict(
    max_depth=[2, 3, 4, 5],
    min_samples_split=[5, 10, 20],
    min_samples_leaf=[2, 4, 8],
)
dt_model = DecisionTreeClassifier(random_state=42)
dt_grid = GridSearchCV(
    dt_model,
    dt_params,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
dt_grid.fit(X_train_scaled, y_train)
best_dt = dt_grid.best_estimator_

In [24]:
# Random forest: search ensemble size, depth and split/leaf size limits with
# 10-fold cross-validated accuracy. The refit best estimator is kept as
# `best_rf`.
rf_params = dict(
    n_estimators=[50, 100],
    max_depth=[5, 10, 20],
    min_samples_split=[5, 10],
    min_samples_leaf=[2, 4],
)
rf_model = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_params,
    scoring='accuracy',
    cv=10,
)
rf_grid.fit(X_train_scaled, y_train)
best_rf = rf_grid.best_estimator_

In [25]:
# XGBoost: search number of boosting rounds, tree depth and learning rate
# with 5-fold cross-validated accuracy. The refit best estimator is kept as
# `best_xgb`.
# NOTE(review): `use_label_encoder` may no longer be used by recent xgboost
# releases; confirm against the installed version before removing it.
xgb_params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.3],
)
xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
xgb_grid = GridSearchCV(
    estimator=xgb_model,
    param_grid=xgb_params,
    scoring='accuracy',
    cv=5,
)
xgb_grid.fit(X_train_scaled, y_train)
best_xgb = xgb_grid.best_estimator_

In [26]:
# Support vector classifier: search `C`, kernel and polynomial degree with
# 3-fold cross-validated accuracy on all cores. The refit best estimator is
# kept as `best_svc`.
# NOTE(review): per the scikit-learn docs `degree` only affects the 'poly'
# kernel, so the 'linear' and 'rbf' candidates are refit once per degree
# value with identical results. A list-of-dicts grid would avoid that.
svc_params = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'poly', 'rbf'],
    degree=[2, 3, 4],
)
svc_model = SVC()
svc_grid = GridSearchCV(
    svc_model,
    svc_params,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
svc_grid.fit(X_train_scaled, y_train)
best_svc = svc_grid.best_estimator_


In [27]:
# Report held-out performance for each tuned estimator: a banner with the
# model label, overall test accuracy, and the per-class classification report.
# NOTE(review): choosing the final model from these test-set figures makes the
# reported accuracy optimistic; a separate validation split would avoid that.
models = {
    "Logistic Regression": best_log,
    "Decision Tree": best_dt,
    "Random Forest": best_rf,
    "XGBoost": best_xgb,
    "svc": best_svc
}

for name, model in models.items():
    # Banner first, so the label is visible even if prediction fails.
    print("=" * 60, f"🔍 Model: {name}", sep="\n")

    # Predict on the scaled test split and score against the true labels.
    y_test_pred = model.predict(X_test_scaled)
    test_acc = accuracy_score(y_test, y_test_pred)

    print(f"📉 Test Accuracy:  {test_acc:.4f}", "📊 Test Classification Report:", sep="\n")
    print(classification_report(y_test, y_test_pred))

🔍 Model: Logistic Regression
📉 Test Accuracy:  0.8167
📊 Test Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.95      0.88        41
           1       0.83      0.53      0.65        19

    accuracy                           0.82        60
   macro avg       0.82      0.74      0.76        60
weighted avg       0.82      0.82      0.80        60

🔍 Model: Decision Tree
📉 Test Accuracy:  0.8333
📊 Test Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.98      0.89        41
           1       0.91      0.53      0.67        19

    accuracy                           0.83        60
   macro avg       0.86      0.75      0.78        60
weighted avg       0.85      0.83      0.82        60

🔍 Model: Random Forest
📉 Test Accuracy:  0.8500
📊 Test Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.95      0.90        

In [28]:
import joblib

# Persist the tuned random forest together with the fitted scaler so both
# can be reloaded for inference. The scaler was fitted on the
# `cols_to_scale` columns only, so apply it to those same columns at
# prediction time.
joblib.dump(value=best_rf, filename='best_rf_model.pkl')
joblib.dump(value=scaler, filename='scaler.pkl')


['scaler.pkl']