In [1]:
# Import libraries
import pandas as pd
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# -----------------------------
# Load dataset
# -----------------------------
# Read the feature table; the 'target' column is the label and every
# remaining column is used as a model input.
df = pd.read_csv('../data/heart_disease_selected_features.csv')
y = df['target']
X = df.drop(columns='target')

# Split data: hold out 20% of the rows for testing. The fixed random_state
# keeps the partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [3]:
# Standardize features for SVM and Logistic Regression.
# The scaler learns its statistics from the training split only and the
# same fitted transform is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [6]:
# -----------------------------
# Hyperparameter grids
# -----------------------------
# One entry per model name; each value is passed as `param_grid` to
# GridSearchCV, which accepts either a single dict or a list of dicts
# (every dict in a list is expanded and searched independently).
param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'lbfgs']
    },
    'Decision Tree': {
        'max_depth': [None, 3, 5, 7, 10],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 3, 5, 7],
        'min_samples_split': [2, 5, 10]
    },
    # SVC only uses `gamma` for the 'rbf', 'poly' and 'sigmoid' kernels, so
    # crossing it with 'linear' would fit every linear candidate twice.
    # The grid is split in two so each distinct configuration is fitted once.
    'SVM': [
        {
            'C': [0.1, 1, 10, 100],
            'kernel': ['linear']
        },
        {
            'C': [0.1, 1, 10, 100],
            'kernel': ['rbf', 'poly'],
            'gamma': ['scale', 'auto']
        }
    ]
}

In [7]:
# -----------------------------
# GridSearchCV for each model
# -----------------------------
# Maps a model's display name to the refitted best estimator of its search.
best_models = {}

# Logistic Regression: exhaustive 5-fold search scored on accuracy, run on
# the standardized training features.
# NOTE(review): X_train_scaled was standardized with statistics from the
# whole training split, so the scaler has already seen every CV validation
# fold; the cross-validated scores may be slightly optimistic.
lr = LogisticRegression(max_iter=1000)
grid_lr = GridSearchCV(
    lr,
    param_grids['Logistic Regression'],
    scoring='accuracy',
    cv=5,
)
grid_lr.fit(X_train_scaled, y_train)

best_models['Logistic Regression'] = grid_lr.best_estimator_
print("Best Logistic Regression params:", grid_lr.best_params_)







Best Logistic Regression params: {'C': 0.1, 'solver': 'lbfgs'}


In [8]:
# Decision Tree: exhaustive 5-fold search scored on accuracy.
# The tree is fitted on the unscaled training features.
dt = DecisionTreeClassifier(random_state=42)
grid_dt = GridSearchCV(
    dt,
    param_grids['Decision Tree'],
    scoring='accuracy',
    cv=5,
)
grid_dt.fit(X_train, y_train)

best_models['Decision Tree'] = grid_dt.best_estimator_
print("Best Decision Tree params:", grid_dt.best_params_)

Best Decision Tree params: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10}


In [9]:
# Random Forest: exhaustive 5-fold search scored on accuracy.
# The forest is fitted on the unscaled training features.
rf = RandomForestClassifier(random_state=42)
grid_rf = GridSearchCV(
    rf,
    param_grids['Random Forest'],
    scoring='accuracy',
    cv=5,
)
grid_rf.fit(X_train, y_train)

best_models['Random Forest'] = grid_rf.best_estimator_
print("Best Random Forest params:", grid_rf.best_params_)

Best Random Forest params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}


In [10]:
# SVM: exhaustive 5-fold search scored on accuracy, run on the standardized
# training features. probability=True is kept because predict_proba is
# called on this model when the test-set AUC is computed.
# NOTE(review): X_train_scaled was standardized with statistics from the
# whole training split, so the scaler has already seen every CV validation
# fold; the cross-validated scores may be slightly optimistic.
svm = SVC(probability=True, random_state=42)
grid_svm = GridSearchCV(
    svm,
    param_grids['SVM'],
    scoring='accuracy',
    cv=5,
)
grid_svm.fit(X_train_scaled, y_train)

best_models['SVM'] = grid_svm.best_estimator_
print("Best SVM params:", grid_svm.best_params_)

Best SVM params: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}


In [11]:
# -----------------------------
# Evaluate best models on test set
# -----------------------------
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

for name, model in best_models.items():
    # Models that were fitted on standardized features must be scored on
    # the standardized test split; the tree-based models use the raw one.
    X_eval = X_test_scaled if name in ('Logistic Regression', 'SVM') else X_test

    y_pred = model.predict(X_eval)
    # Column 1 of predict_proba is the score used for ROC AUC.
    y_prob = model.predict_proba(X_eval)[:, 1]

    # Label-based metrics all compare the same hard predictions.
    acc, prec, rec, f1 = (
        metric(y_test, y_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )
    auc = roc_auc_score(y_test, y_prob)

    print(f"{name} (Optimized) Metrics:")
    print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1-Score: {f1:.4f}, AUC: {auc:.4f}\n")

Logistic Regression (Optimized) Metrics:
Accuracy: 0.9016, Precision: 0.9062, Recall: 0.9062, F1-Score: 0.9062, AUC: 0.9472

Decision Tree (Optimized) Metrics:
Accuracy: 0.7541, Precision: 0.7576, Recall: 0.7812, F1-Score: 0.7692, AUC: 0.8658

Random Forest (Optimized) Metrics:
Accuracy: 0.8689, Precision: 0.8750, Recall: 0.8750, F1-Score: 0.8750, AUC: 0.9542

SVM (Optimized) Metrics:
Accuracy: 0.9180, Precision: 0.9091, Recall: 0.9375, F1-Score: 0.9231, AUC: 0.9450



In [13]:
# Save each model separately, together with the fitted scaler.
import os

import joblib

# joblib.dump does not create missing parent directories.
os.makedirs('../models', exist_ok=True)

# One file per model; spaces in the display name become underscores.
for name, model in best_models.items():
    joblib.dump(model, f'../models/{name.replace(" ", "_")}_model.pkl')

# Logistic Regression and SVM were fitted on standardized features, so the
# fitted scaler has to be persisted as well for them to be usable on new data.
joblib.dump(scaler, '../models/scaler.pkl')

print("All models and scaler have been saved successfully!")

All models and scaler have been saved successfully!
