In [13]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.1-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.1 colorlog-6.9.0 optuna-4.3.0


In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import optuna

In [2]:
df = pd.read_csv('MNIST.csv')

In [3]:
df.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
X = df.drop(columns='label')
Y = df['label']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42, test_size=0.2)

In [7]:
print(f'Training set: {X_train.shape}')
print(f'Testing set: {X_test.shape}')

Training set: (3210, 784)
Testing set: (803, 784)


In [19]:
def objective(trial):
  n_estimators = trial.suggest_int('n_estimators', 10,200)
  max_depth = trial.suggest_int('max_depth',3,20)
  criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
  min_samples_split = trial.suggest_int('min_samples_split',2,10)
  min_samples_leaf = trial.suggest_int('min_samples_leaf',1,10)
  bootstrap = trial.suggest_categorical('bootstrap', [True, False])

  model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                 criterion=criterion, min_samples_split=min_samples_split,
                                 min_samples_leaf=min_samples_leaf, bootstrap=bootstrap)

  score = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy').mean()
  return score

In [20]:
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(),)
study.optimize(objective, n_trials=50)

[I 2025-06-08 07:15:56,302] A new study created in memory with name: no-name-bdbc354d-bb38-4b5b-93ef-4a0f98c335a8
[I 2025-06-08 07:16:03,948] Trial 0 finished with value: 0.9249221183800623 and parameters: {'n_estimators': 65, 'max_depth': 17, 'criterion': 'gini', 'min_samples_split': 9, 'min_samples_leaf': 2, 'bootstrap': False}. Best is trial 0 with value: 0.9249221183800623.
[I 2025-06-08 07:16:05,224] Trial 1 finished with value: 0.7221183800623052 and parameters: {'n_estimators': 34, 'max_depth': 3, 'criterion': 'gini', 'min_samples_split': 7, 'min_samples_leaf': 2, 'bootstrap': False}. Best is trial 0 with value: 0.9249221183800623.
[I 2025-06-08 07:16:10,323] Trial 2 finished with value: 0.9087227414330217 and parameters: {'n_estimators': 43, 'max_depth': 18, 'criterion': 'entropy', 'min_samples_split': 9, 'min_samples_leaf': 2, 'bootstrap': True}. Best is trial 0 with value: 0.9249221183800623.
[I 2025-06-08 07:16:18,935] Trial 3 finished with value: 0.9006230529595015 and para

In [22]:
print(f'Best Accuracy: {study.best_trial.value}')
print(f'Best Parameters: {study.best_params}')

Best Accuracy: 0.9327102803738319
Best Parameters: {'n_estimators': 118, 'max_depth': 19, 'criterion': 'entropy', 'min_samples_split': 7, 'min_samples_leaf': 1, 'bootstrap': False}


In [23]:
from sklearn.metrics import accuracy_score
best_model = RandomForestClassifier(**study.best_trial.params, random_state=42)

best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy Score: {test_accuracy:.2f}')

Accuracy Score: 0.93


# Samplers in Optuna

In [24]:
def objective(trial):
  n_estimators = trial.suggest_int('n_estimators', 10,200)
  max_depth = trial.suggest_int('max_depth',3,20)
  criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
  min_samples_split = trial.suggest_int('min_samples_split',2,10)
  min_samples_leaf = trial.suggest_int('min_samples_leaf',1,10)
  bootstrap = trial.suggest_categorical('bootstrap', [True, False])

  model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                 criterion=criterion, min_samples_split=min_samples_split,
                                 min_samples_leaf=min_samples_leaf, bootstrap=bootstrap)

  score = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy').mean()
  return score

In [25]:
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler())
study.optimize(objective, n_trials=50)

[I 2025-06-08 07:36:34,796] A new study created in memory with name: no-name-ec1fd53f-34be-49c9-bbd0-5104e6187bbd
[I 2025-06-08 07:36:37,443] Trial 0 finished with value: 0.894392523364486 and parameters: {'n_estimators': 24, 'max_depth': 14, 'criterion': 'gini', 'min_samples_split': 8, 'min_samples_leaf': 5, 'bootstrap': True}. Best is trial 0 with value: 0.894392523364486.
[I 2025-06-08 07:36:41,826] Trial 1 finished with value: 0.8925233644859814 and parameters: {'n_estimators': 45, 'max_depth': 7, 'criterion': 'entropy', 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': True}. Best is trial 0 with value: 0.894392523364486.
[I 2025-06-08 07:36:52,872] Trial 2 finished with value: 0.9137071651090343 and parameters: {'n_estimators': 143, 'max_depth': 10, 'criterion': 'gini', 'min_samples_split': 5, 'min_samples_leaf': 2, 'bootstrap': True}. Best is trial 2 with value: 0.9137071651090343.
[I 2025-06-08 07:36:59,544] Trial 3 finished with value: 0.9152647975077881 and paramete

In [26]:
print(f'Best Accuracy: {study.best_trial.value}')
print(f'Best parameters: {study.best_params}')

Best Accuracy: 0.9314641744548288
Best parameters: {'n_estimators': 93, 'max_depth': 16, 'criterion': 'entropy', 'min_samples_split': 2, 'min_samples_leaf': 2, 'bootstrap': False}


In [27]:
from sklearn.metrics import accuracy_score
best_model = RandomForestClassifier(**study.best_trial.params, random_state=42)

best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy Score: {test_accuracy:.2f}')

Accuracy Score: 0.93


# Optuna Visualizations

In [28]:
# For visualizations
from optuna.visualization import plot_optimization_history, plot_parallel_coordinate, plot_slice, plot_contour, plot_param_importances

In [29]:
plot_optimization_history(study).show()

In [31]:
plot_parallel_coordinate(study).show()

In [32]:
plot_slice(study).show()

In [33]:
plot_contour(study).show()

In [34]:
plot_param_importances(study).show()