In [3]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import joblib

In [5]:
df_pca = pd.read_csv('../data/heart_disease_pca.csv')
X = df_pca.drop('target', axis=1)
y = df_pca['target']

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 3, 5, 7],
    'min_samples_split': [2, 4, 6]
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters found:  {'max_depth': None, 'min_samples_split': 6, 'n_estimators': 50}
Best cross-validation score:  0.6321428571428571


In [9]:
# Evaluate the best model on the test set

best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

from sklearn.metrics import classification_report

print("Classification Report:\n", classification_report(y_test, y_pred_best))

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.93      0.82        29
           1       0.57      0.33      0.42        12
           2       0.22      0.22      0.22         9
           3       0.25      0.29      0.27         7
           4       0.00      0.00      0.00         4

    accuracy                           0.57        61
   macro avg       0.35      0.35      0.35        61
weighted avg       0.52      0.57      0.54        61



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
joblib.dump(best_model, '../models/best_random_forest_model_gridSearch.pkl')

['../models/best_random_forest_model_gridSearch.pkl']

In [12]:
param_dist = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [None, 3, 5, 7, 9],
    'min_samples_split': [2, 4, 6, 8]
}

random_search = RandomizedSearchCV(RandomForestClassifier(random_state=42), param_distributions=param_dist, n_iter=10, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

random_search.fit(X_train, y_train)

print("Best parameters found by Randomized Search: ", random_search.best_params_)
print("Best cross-validation score by Randomized Search: ", random_search.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters found by Randomized Search:  {'n_estimators': 200, 'min_samples_split': 4, 'max_depth': 5}
Best cross-validation score by Randomized Search:  0.6238095238095238


In [14]:
best_random_model = random_search.best_estimator_

joblib.dump(best_random_model, '../models/best_random_forest_model_randomGridSearch.pkl')

['../models/best_random_forest_model_randomGridSearch.pkl']