<a href="https://colab.research.google.com/github/abhicse002/Py_DS_libs/blob/main/hyperparam_tuning_using_optuna_bayessian_optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [32]:
import optuna
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

In [6]:
# Load Diabetes Dataset
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
column_names = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age", "Outcome"]
df = pd.read_csv(url, header=None, names=column_names)

In [23]:
# Replace Zero with NaN, impute it with mean value
missing_column_names = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
df[missing_column_names] = df[missing_column_names].replace(0, np.nan)
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63,0
764,2,122.0,70.0,27.0,,36.8,0.340,27,0
765,5,121.0,72.0,23.0,112.0,26.2,0.245,30,0
766,1,126.0,60.0,,,30.1,0.349,47,1


In [24]:
# imputing
df.fillna(df.mean(), inplace=True)

In [26]:
# check if there us any Nan value present
df.isnull().sum()

Unnamed: 0,0
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,0
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [31]:
# Split nito Feature & Target
X = df.drop('Outcome', axis=1)
y = df['Outcome']

X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train) # learn scaling parameters from the training data
X_test = scaler.transform(X_test) # applying the scaling on test data

print()





In [46]:
"""
Steps for Optuna
1. Define the Objective function:- pass the max_depth & n_estimator to the model & get the accuracy
2. Creating the Study with Optuna with direction maximize or minimize & Sampler
3. Sampler defines the value of n_estimator & max_depth based on past data
4. TRIAL has the intelligence(Logic to choose the next hyper parameters)
"""

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# define objective fucntion which picks optimal n_estimator, max_depth and gives accuracy
def objective_function(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 5, 15)

    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    cross_val_scores = cross_val_score(model, X_train, Y_train, cv=3, scoring='accuracy')
    accuracy = cross_val_scores.mean()

    return accuracy

In [48]:
# creating study
# direction = maximize - we want to maximize the accuracy
# direction = minimize - if we want to minize the loss, r2_score or working in loss function
# TPESampler -

study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler())
study.optimize(objective_function, n_trials=50) # run 50 trils to find the best hyperparameters

print(f"Best Trial Accuracy {study.best_value}")
print(f"Best Trial Params {study.best_params}")

[I 2025-01-01 11:18:13,732] A new study created in memory with name: no-name-5f0658b6-a8f9-43e5-b06f-db07c494f5b5
[I 2025-01-01 11:18:15,507] Trial 0 finished with value: 0.7653631284916201 and parameters: {'n_estimators': 111, 'max_depth': 11}. Best is trial 0 with value: 0.7653631284916201.
[I 2025-01-01 11:18:16,820] Trial 1 finished with value: 0.7616387337057727 and parameters: {'n_estimators': 177, 'max_depth': 5}. Best is trial 0 with value: 0.7653631284916201.
[I 2025-01-01 11:18:17,158] Trial 2 finished with value: 0.7616387337057727 and parameters: {'n_estimators': 51, 'max_depth': 9}. Best is trial 0 with value: 0.7653631284916201.
[I 2025-01-01 11:18:17,797] Trial 3 finished with value: 0.7690875232774674 and parameters: {'n_estimators': 106, 'max_depth': 5}. Best is trial 3 with value: 0.7690875232774674.
[I 2025-01-01 11:18:18,734] Trial 4 finished with value: 0.7672253258845437 and parameters: {'n_estimators': 143, 'max_depth': 12}. Best is trial 3 with value: 0.76908752

Best Trial Accuracy 0.7802607076350093
Best Trial Params {'n_estimators': 129, 'max_depth': 7}
