In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
# Load your dataset
pcos_df = pd.read_csv('new_pcos_dataset.csv')  # Load your data here
# For example, assuming 'target' is your label
X = pcos_df.drop('PCOS (Y/N)', axis=1)  # Features
y = pcos_df['PCOS (Y/N)']  # Target variable

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
# Initialize the Random Forest model
rf = RandomForestClassifier(random_state=42)

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700],  # Increase range for more trees
    'max_depth': [None, 5, 10, 15, 20, 25, 30, 35],      # Add more granular depth options
    'min_samples_split': [2, 5, 10, 15],                 # Broaden range for minimum samples to split
    'min_samples_leaf': [1, 2, 3, 4, 5, 10],             # Increase leaf sample sizes for more granularity
    'max_features': ['auto', 'sqrt', 'log2'],            # Keep this as is; useful for feature selection
    'bootstrap': [True, False]                           # Consider whether to use bootstrapping
}


In [None]:
# Set up GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           scoring='accuracy', cv=5, n_jobs=-1, verbose=2)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Cross-Validation Score: {best_score:.4f}")


Fitting 5 folds for each of 8064 candidates, totalling 40320 fits


In [5]:
# Train the best model
best_rf = grid_search.best_estimator_

# Make predictions
y_pred = best_rf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Test Accuracy: {accuracy:.4f}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)


Test Accuracy: 0.8889
Confusion Matrix:
 [[70  5]
 [ 7 26]]
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.93      0.92        75
           1       0.84      0.79      0.81        33

    accuracy                           0.89       108
   macro avg       0.87      0.86      0.87       108
weighted avg       0.89      0.89      0.89       108

