In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import pandas as pd
import joblib

# Hyperparameter search for a Random Forest classifier: load the pre-split
# data, run a 5-fold grid search scored by ROC-AUC, evaluate the winning model
# on the held-out test set, and persist it with joblib.

# Load the data
X_train = pd.read_csv('../archive/X_train.csv')
X_test = pd.read_csv('../archive/X_test.csv')

# read_csv always returns a DataFrame, so a single-column label file arrives
# as an (n_samples, 1) column vector. scikit-learn expects a 1-D target and
# emits a DataConversionWarning on every fit otherwise. squeeze("columns")
# turns a one-column frame into a Series and leaves anything wider untouched.
y_train = pd.read_csv('../archive/y_train.csv').squeeze("columns")
y_test = pd.read_csv('../archive/y_test.csv').squeeze("columns")

# Define the parameter grid for Random Forest
# 3 * 4 * 3 * 3 = 108 candidate combinations.
param_grid = {
    'n_estimators': [100, 200, 300],      # Number of trees in the forest
    'max_depth': [None, 5, 10, 20],       # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],      # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],        # Minimum samples required at a leaf node
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),  # fixed seed for reproducibility
    param_grid=param_grid,
    cv=5,                                 # 5-fold cross-validation
    scoring='roc_auc',                    # Use ROC-AUC as the metric
    n_jobs=-1,                            # Use all available CPU cores
    verbose=2,                            # Print progress
)

# Fit to training data
grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_)
print("Best ROC-AUC score (CV):", grid_search.best_score_)

# Evaluate on test set
best_rf = grid_search.best_estimator_
y_pred_best = best_rf.predict(X_test)
# ROC-AUC needs scores rather than hard labels; column 1 of predict_proba is
# the probability of the second class in best_rf.classes_.
y_proba_best = best_rf.predict_proba(X_test)[:, 1]
print("Test set classification report:\n", classification_report(y_test, y_pred_best))
print("Test set ROC-AUC:", roc_auc_score(y_test, y_proba_best))

# Save the best model
joblib.dump(best_rf, "../models/mental_health_rf_best.pkl")
print("Best model saved as ../models/mental_health_rf_best.pkl")

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters found: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Best ROC-AUC score (CV): 0.8193298109672427
Test set classification report:
               precision    recall  f1-score   support

           0       0.77      0.74      0.75       129
           1       0.73      0.76      0.75       123

    accuracy                           0.75       252
   macro avg       0.75      0.75      0.75       252
weighted avg       0.75      0.75      0.75       252

Test set ROC-AUC: 0.791580008823344
Best model saved as ../models/mental_health_rf_best.pkl


  return fit_method(estimator, *args, **kwargs)
