In [4]:
# ============================
# 🪜 Step 6: Hyperparameter Tuning
# ============================

from pathlib import Path

import joblib
import pandas as pd
from scipy.stats import randint, uniform
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# 1️⃣ Load preprocessed dataset
# Paths are relative to the working directory; adjust them to match your
# project layout.
DATA_PATH = Path('../data/heart_disease_cleaned.csv')
MODEL_PATH = Path('../models/temp_best_model.pkl')

data = pd.read_csv(DATA_PATH)
X = data.drop('target', axis=1)
y = data['target']

# Stratify on the label so the train and test splits keep the same class
# proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 2️⃣ Define models & hyperparameter grids
# Maps a display name to (unfitted estimator, parameter grid for GridSearchCV).
models = {
    "Logistic Regression": (
        LogisticRegression(max_iter=500),
        {
            'C': [0.01, 0.1, 1, 10],
            'solver': ['liblinear', 'lbfgs'],
        },
    ),
    "Decision Tree": (
        DecisionTreeClassifier(random_state=42),
        {
            'max_depth': [None, 3, 5, 10],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
        },
    ),
    "Random Forest": (
        RandomForestClassifier(random_state=42),
        {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 5, 10],
            'min_samples_split': [2, 5, 10],
        },
    ),
    "SVM": (
        SVC(),
        {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf', 'poly'],
            'gamma': ['scale', 'auto'],
        },
    ),
}

# 3️⃣ Run GridSearchCV for each model
best_models = {}  # name -> best estimator found by the grid search
cv_scores = {}    # name -> mean cross-validated score of that estimator
for name, (model, params) in models.items():
    print(f"\n🔍 Tuning hyperparameters for {name} ...")
    grid = GridSearchCV(model, params, cv=5, n_jobs=-1, verbose=1)
    grid.fit(X_train, y_train)
    best_models[name] = grid.best_estimator_
    cv_scores[name] = grid.best_score_

    print(f"✅ Best Parameters for {name}: {grid.best_params_}")
    print(f"🏆 Best CV Score: {grid.best_score_:.4f}")

# 4️⃣ Evaluate all tuned models on test set
# Predict once per model and cache the result so the final report below does
# not have to re-run inference.
print("\n📊 Final Model Comparison (Test Set Performance):")
test_predictions = {}
for name, model in best_models.items():
    test_predictions[name] = model.predict(X_test)
    acc = accuracy_score(y_test, test_predictions[name])
    print(f"{name:20s} - Accuracy: {acc:.4f}")

# 5️⃣ Select the best performing model
# Pick the winner by cross-validated score, not by test accuracy: choosing on
# the test set would make it part of model selection and bias the reported
# test metrics upward. The test set is used for reporting only.
best_model_name = max(cv_scores, key=cv_scores.get)
best_model = best_models[best_model_name]

print(f"\n🚀 Best Tuned Model: {best_model_name}")
print(classification_report(y_test, test_predictions[best_model_name]))

# Persist the selected model, creating the output directory if it is missing
# so the dump does not fail on a fresh checkout.
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, MODEL_PATH)



🔍 Tuning hyperparameters for Logistic Regression ...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
✅ Best Parameters for Logistic Regression: {'C': 0.1, 'solver': 'liblinear'}
🏆 Best CV Score: 0.8450

🔍 Tuning hyperparameters for Decision Tree ...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
✅ Best Parameters for Decision Tree: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 10}
🏆 Best CV Score: 0.7282

🔍 Tuning hyperparameters for Random Forest ...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
✅ Best Parameters for Random Forest: {'max_depth': 5, 'min_samples_split': 10, 'n_estimators': 200}
🏆 Best CV Score: 0.8244

🔍 Tuning hyperparameters for SVM ...
Fitting 5 folds for each of 18 candidates, totalling 90 fits
✅ Best Parameters for SVM: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
🏆 Best CV Score: 0.8324

📊 Final Model Comparison (Test Set Performance):
Logistic Regression  - Accuracy: 0.8333
Decision Tree        - Acc

['../models/temp_best_model.pkl']