In [24]:
import joblib
import pandas as pd
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

In [25]:
# Load data
# Read the feature table and separate the predictors (X) from the label
# column 'target' (y).
df = pd.read_csv('data/heart_disease_selected_features.csv')
X = df.drop(columns='target')
y = df['target']


In [26]:
# Split data
# Hold out 20% of the rows for testing; stratify on y so both partitions keep
# the label proportions, and fix the seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [27]:
# Baseline model
# Random forest with default hyperparameters (seeded), used as the reference
# point for the tuned models below. fit() returns the estimator itself.
baseline_rf = RandomForestClassifier(random_state=42).fit(x_train, y_train)
baseline_pred = baseline_rf.predict(x_test)
baseline_acc = accuracy_score(y_true=y_test, y_pred=baseline_pred)
print("Baseline Random Forest Accuracy:{:.2f}".format(baseline_acc))

Baseline Random Forest Accuracy:0.78


In [28]:
# 1. GridSearchCV
# Exhaustively evaluate every combination in the grid (3 * 4 * 3 = 36
# candidates) with 5-fold cross-validation on the training split.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
}
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,  # use all available cores
)
grid_search.fit(x_train, y_train)
print("Best GridSearchCV Params:", grid_search.best_params_)

# predict() on the fitted search delegates to the refitted best estimator.
grid_pred = grid_search.predict(x_test)
grid_acc = accuracy_score(y_true=y_test, y_pred=grid_pred)
print("GridSearchCV Accuracy:", grid_acc)


Best GridSearchCV Params: {'max_depth': 5, 'min_samples_split': 10, 'n_estimators': 50}
GridSearchCV Accuracy: 0.7833333333333333


In [29]:
# 2. RandomizedSearchCV
# Draw 10 random configurations from the same ranges the grid search covers
# and score each with 5-fold cross-validation on the training split.
#
# scipy.stats.randint(low, high) samples integers in [low, high), i.e. `high`
# is excluded. The upper bounds are therefore one past the largest value we
# want to allow: 201 -> n_estimators up to 200, 11 -> min_samples_split up
# to 10 (matching the grid-search ranges).
param_dist = {
    'n_estimators': randint(50, 201),
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': randint(2, 11),
}
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_dist,
    n_iter=10,
    cv=5,
    n_jobs=-1,        # use all available cores
    random_state=42,  # makes the sampled configurations repeatable
)
random_search.fit(x_train, y_train)
print("Best RandomizedSearchCV Params:", random_search.best_params_)

# predict() on the fitted search delegates to the refitted best estimator.
rand_pred = random_search.predict(x_test)
rand_acc = accuracy_score(y_test, rand_pred)
print("RandomizedSearchCV Accuracy:", rand_acc)

Best RandomizedSearchCV Params: {'max_depth': 10, 'min_samples_split': 9, 'n_estimators': 70}
RandomizedSearchCV Accuracy: 0.7666666666666667


In [30]:
# Compare results
# Print the held-out accuracy of each model side by side, three decimals each.
print("\n--- Model Comparison ---")
for label, acc in (
    ("Baseline", baseline_acc),
    ("GridSearchCV", grid_acc),
    ("RandomizedSearchCV", rand_acc),
):
    print(f"{label} Accuracy: {acc:.3f}")



--- Model Comparison ---
Baseline Accuracy: 0.783
GridSearchCV Accuracy: 0.783
RandomizedSearchCV Accuracy: 0.767


In [31]:
# Best performing model
# Candidates are listed in tie-break priority order (grid search, then
# randomized search, then baseline). max() returns the first entry among
# equal accuracies, so a tie is resolved in favour of the earlier candidate.
ranked_models = [
    ("GridSearchCV Random Forest", grid_acc, grid_pred),
    ("RandomizedSearchCV Random Forest", rand_acc, rand_pred),
    ("Baseline Random Forest", baseline_acc, baseline_pred),
]
best_entry = max(ranked_models, key=lambda entry: entry[1])
best_label, best_pred = best_entry[0], best_entry[2]

print(f"\n✔ Best performing model: {best_label}")
print(classification_report(y_test, best_pred))


✔ Best performing model: GridSearchCV Random Forest
              precision    recall  f1-score   support

           0       0.76      0.88      0.81        32
           1       0.83      0.68      0.75        28

    accuracy                           0.78        60
   macro avg       0.79      0.78      0.78        60
weighted avg       0.79      0.78      0.78        60



In [None]:
# Persist the best model.
# Choose among all three fitted models by test accuracy. Candidates are in
# tie-break priority order (grid search, randomized search, baseline); max()
# returns the first entry among equal accuracies. Including the baseline
# matters: if the untuned forest is strictly the most accurate, it is the
# one that must be saved rather than a weaker tuned model.
# NOTE(review): the winner is picked using test-set accuracy, so the reported
# test score is an optimistic estimate for the saved model.
model_candidates = [
    (grid_acc, grid_search.best_estimator_),
    (rand_acc, random_search.best_estimator_),
    (baseline_acc, baseline_rf),
]
final_model = max(model_candidates, key=lambda entry: entry[0])[1]

# NOTE(review): machine-specific absolute Windows path; the target directory
# must already exist. Consider a path relative to the project (as is done for
# the input CSV) so the notebook runs on other machines.
joblib.dump(final_model, "D:\\Training Sets\\Sprints\\Heart_Disease\\Models\\final_model.pkl")

NameError: name 'scaler' is not defined