In [5]:
import optuna

import numpy as np
import pandas as pd 
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

## OPTUNA 

took 1min 13 secs for finding the best params and give the best accuracy

In [6]:
x = load_breast_cancer(as_frame=True).data
y= load_breast_cancer(as_frame=True).target

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42)

In [9]:
def objective(trial):
    params={
        'n_estimators' : trial.suggest_int('n_estimators',100,1000, step=100),
        'max_depth' : trial.suggest_int('max_depth',3,15),   # this will intelligently choose values from 3 to 15 (inclusive) for the param
        'min_samples_split': trial.suggest_int('min_samples_split',2,20),  # same here
        'max_features' : trial.suggest_categorical('max_features', ['sqrt','log2'])
    }

    model = RandomForestClassifier(random_state=42, **params)
    model.fit(x_train, y_train)
    y_pred= model.predict(x_test)
    accuracy= accuracy_score(y_test, y_pred)
    return accuracy  # this is the metric that the objective function gives , as we are using accuracy we will maximize

In [10]:
study= optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print('Number of finished trials:  ', len(study.trials))
print('Best Trial:  ', study.best_trial.params)
print('Best accuracy:  ', study.best_trial.value)

[I 2025-07-13 16:29:39,209] A new study created in memory with name: no-name-e77a3a40-bb8f-46cc-9bad-7bd28d80abed
[I 2025-07-13 16:29:39,583] Trial 0 finished with value: 0.9649122807017544 and parameters: {'n_estimators': 100, 'max_depth': 9, 'min_samples_split': 15, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.9649122807017544.
[I 2025-07-13 16:29:40,732] Trial 1 finished with value: 0.9649122807017544 and parameters: {'n_estimators': 600, 'max_depth': 12, 'min_samples_split': 19, 'max_features': 'log2'}. Best is trial 0 with value: 0.9649122807017544.
[I 2025-07-13 16:29:41,934] Trial 2 finished with value: 0.9649122807017544 and parameters: {'n_estimators': 600, 'max_depth': 8, 'min_samples_split': 14, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.9649122807017544.
[I 2025-07-13 16:29:42,102] Trial 3 finished with value: 0.9649122807017544 and parameters: {'n_estimators': 100, 'max_depth': 3, 'min_samples_split': 8, 'max_features': 'log2'}. Best is trial 0 with v

Number of finished trials:   100
Best Trial:   {'n_estimators': 100, 'max_depth': 9, 'min_samples_split': 15, 'max_features': 'sqrt'}
Best accuracy:   0.9649122807017544


In [11]:
best_params= study.best_trial.params
best_model = RandomForestClassifier(random_state=42, **best_params)
best_model.fit(x_train, y_train)
y_pred = best_model.predict(x_test)

best_accuracy= accuracy_score(y_test, y_pred)

print('best accuracy: ', best_accuracy)

best accuracy:  0.9649122807017544


## GRID SEARCH

took around 11 mins for the total of 4000 fits to get the best accuracy 

In [12]:
from sklearn.model_selection import GridSearchCV

In [13]:
param_grid = {
    'n_estimators': [100, 300, 500, 800, 1000],
    'max_depth': [3, 5, 7, 10, 15],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 5, 10],
    'max_features': ['sqrt', 'log2']
}

model=RandomForestClassifier(random_state=42)

grid_search= GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=5, verbose=2, n_jobs=-1)
grid_search.fit(x_train, y_train)

print('best params :  ')
print(grid_search.best_params_)

print('best cross val accuracy:  ')
print(grid_search.best_score_)

Fitting 5 folds for each of 800 candidates, totalling 4000 fits
best params :  
{'max_depth': 7, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
best cross val accuracy:  
0.9626373626373625


In [17]:
best_model= grid_search.best_estimator_
y_pred= best_model.predict(x_test)

best_score= accuracy_score(y_test, y_pred)

print('best accuracy with Grid Search :  ', best_score)

best accuracy with Grid Search :   0.9649122807017544


## RANDOM SEARCH 

works by sampling a fixed number of hyperparameter settings from the specified distributions and is more efficient for larger search spaces, when computational resources are limited.

In [18]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint


param_dist = {
    'n_estimators': randint(100, 1000),
    'max_depth': randint(3, 15),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['sqrt', 'log2']
}

model=RandomForestClassifier(random_state=42)

random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist,
                                   n_iter=100, scoring='accuracy', cv=5, verbose=1, n_jobs=-1, random_state=42)


random_search.fit(x_train, y_train)

print("Best parameters found by RandomizedSearchCV:")
print(random_search.best_params_)
print("Best cross-validation accuracy:", random_search.best_score_)


best_random_model = random_search.best_estimator_
val_preds_random = best_random_model.predict(x_test)
val_accuracy_random = accuracy_score(y_test, val_preds_random)
print("Validation accuracy with best hyperparameters (RandomizedSearchCV):", val_accuracy_random)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters found by RandomizedSearchCV:
{'max_depth': 11, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 996}
Best cross-validation accuracy: 0.9582417582417582
Validation accuracy with best hyperparameters (RandomizedSearchCV): 0.9649122807017544
