In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset; every column except 'target' is used as a model input.
df = pd.read_csv('/home/youmna/Heart-Disease-Prediction/data/reduced_heart_disease.csv')

X = df.drop('target', axis=1)
y = df['target']

# Split BEFORE scaling: fitting the scaler on the full table would let the
# held-out rows influence the mean/variance used to transform the training
# data (train/test leakage). Same test_size, seed and stratification as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn the scaling statistics from the training rows only, then apply that
# same fitted transform to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Retained so any later cell that still refers to X_scaled keeps working;
# this is transform-only, so it does not refit the scaler on test rows.
X_scaled = scaler.transform(X)


In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier


# Exhaustive search over a small random-forest grid:
# 2 * 3 * 2 * 2 = 24 candidate settings, each scored by 5-fold CV accuracy.
param_grid = dict(
    n_estimators=[100, 150],
    max_depth=[5, 10, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Fixed seed so the forest itself is reproducible across runs.
rf = RandomForestClassifier(random_state=42)

grid = GridSearchCV(rf, param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

# Header and parameter dict are emitted on separate lines.
print("Best Parameters from GridSearchCV:", grid.best_params_, sep="\n")

# Keep the best estimator found by the search for evaluation and export.
best_rf_grid = grid.best_estimator_


Best Parameters from GridSearchCV:
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}


In [4]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Randomized search: integer hyperparameters are drawn from scipy randint
# distributions (upper bound exclusive); max_depth is sampled from a fixed list.
param_dist = dict(
    n_estimators=randint(100, 200),
    max_depth=[5, 10, None],
    min_samples_split=randint(2, 10),
    min_samples_leaf=randint(1, 5),
)

rf = RandomForestClassifier(random_state=42)

# 10 sampled candidates, each scored by 5-fold CV accuracy; random_state
# seeds the candidate sampling.
rand_search = RandomizedSearchCV(
    rf,
    param_dist,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,
)
rand_search.fit(X_train, y_train)

# Header and parameter dict are emitted on separate lines.
print("Best Parameters from RandomizedSearchCV:", rand_search.best_params_, sep="\n")

# Keep the best estimator found by the randomized search.
best_rf_rand = rand_search.best_estimator_


Best Parameters from RandomizedSearchCV:
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 188}


In [5]:
from sklearn.metrics import classification_report, accuracy_score

# Score the grid-search winner on the held-out test split.
y_pred = best_rf_grid.predict(X_test)
print("Evaluation after GridSearchCV:")
# Some classes may never be predicted, which makes their precision undefined.
# zero_division=0 reports those entries as 0.00 (the same value the default
# produces) without emitting an UndefinedMetricWarning for each metric.
print(classification_report(y_test, y_pred, zero_division=0))


Evaluation after GridSearchCV:
              precision    recall  f1-score   support

           0       0.79      0.94      0.86        32
           1       0.21      0.27      0.24        11
           2       0.00      0.00      0.00         7
           3       0.17      0.14      0.15         7
           4       0.00      0.00      0.00         3

    accuracy                           0.57        60
   macro avg       0.23      0.27      0.25        60
weighted avg       0.48      0.57      0.52        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [7]:
import pickle

# Serialize the grid-search winner to disk with pickle.
# NOTE(review): only the classifier is written here; the model was trained on
# scaled features, so the fitted `scaler` presumably needs to be persisted too
# for inference on new data — confirm with whatever loads this file.
model_path = '/home/youmna/Heart-Disease-Prediction/models/best_rf.pkl'
with open(model_path, mode='wb') as model_file:
    pickle.dump(best_rf_grid, model_file)
