In [1]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split



# Read the dataset from CSV and separate the label column from the features.
df = pd.read_csv("../data/heart_disease_selected.csv")

y = df["target"]
X = df.drop(columns="target")

# Hold out 20% of the rows for the final evaluation. Stratifying on y keeps
# the class proportions alike in both partitions, and the fixed seed makes
# the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


# GridSearchCV with RandomForest
#
# Exhaustive search over 3 x 3 x 3 = 27 hyper-parameter combinations, each
# scored by accuracy under 5-fold cross-validation on the training split only.

param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
)

grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,  # run the candidate fits in parallel on all available cores
)
grid.fit(X_train, y_train)

print("Best Parameters (GridSearch):", grid.best_params_)
print("Best Score (GridSearch):", grid.best_score_)

# With the default refit=True, best_estimator_ is the winning configuration
# retrained on the whole training split; evaluate it on the held-out rows.
best_rf = grid.best_estimator_
y_pred = best_rf.predict(X_test)
rf_report = classification_report(y_test, y_pred)
print("\nClassification Report (Best RF):\n", rf_report)


# RandomizedSearchCV with Logistic Regression
#
# The search space holds 5 x 2 = 10 (C, solver) pairs; n_iter=5 evaluates a
# seeded random subset of five of them, each scored by accuracy under 5-fold
# cross-validation on the training split only.

param_dist = dict(
    C=[0.01, 0.1, 1, 10, 100],
    solver=["liblinear", "saga"],
)

random_search = RandomizedSearchCV(
    estimator=LogisticRegression(max_iter=2000, random_state=42),
    param_distributions=param_dist,
    n_iter=5,
    scoring="accuracy",
    cv=5,
    random_state=42,  # fixes which candidates are sampled
    n_jobs=-1,  # run the candidate fits in parallel on all available cores
)
random_search.fit(X_train, y_train)

print("Best Parameters (RandomizedSearch):", random_search.best_params_)
print("Best Score (RandomizedSearch):", random_search.best_score_)

# Evaluate the refit winner on the held-out rows. Note that y_pred is
# overwritten here and now holds the logistic-regression predictions.
best_lr = random_search.best_estimator_
y_pred = best_lr.predict(X_test)
lr_report = classification_report(y_test, y_pred)
print("\nClassification Report (Best LR):\n", lr_report)


# Persist the tuned Random Forest as the final model.
#
# joblib.dump does not create missing directories, so make sure the target
# folder exists first; otherwise a fresh checkout without ../models/ fails
# with FileNotFoundError only after both searches have already run.
model_path = "../models/final_model.pkl"
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_rf, model_path)
print("Best Random Forest model saved!")

Best Parameters (GridSearch): {'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 100}
Best Score (GridSearch): 0.8100340136054422

Classification Report (Best RF):
               precision    recall  f1-score   support

           0       0.90      0.85      0.88        33
           1       0.83      0.89      0.86        28

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61

Best Parameters (RandomizedSearch): {'solver': 'saga', 'C': 1}
Best Score (RandomizedSearch): 0.8056972789115646

Classification Report (Best LR):
               precision    recall  f1-score   support

           0       0.87      0.82      0.84        33
           1       0.80      0.86      0.83        28

    accuracy                           0.84        61
   macro avg       0.84      0.84      0.84        61
weighted avg       0.84      0.84      0.84        61

Best Random Forest model sa