In [2]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.1-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.1 colorlog-6.9.0 optuna-4.3.0


In [6]:
import optuna
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

In [5]:
# Pima Indians Diabetes data, read straight from the raw CSV on GitHub.
# Column names are supplied here and the first line of the file is read as data.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'
columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]
df = pd.read_csv(url, header=None, names=columns)

# Preview the first rows of the frame.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [8]:
# In these columns a recorded 0 is treated as a missing value, so it is converted to NaN.
cols_with_missing_vals = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_missing_vals] = df[cols_with_missing_vals].replace(to_replace=0, value=np.nan)

# Fill every NaN with the mean of its column.
# NOTE(review): the means are computed over the whole frame, i.e. before the
# train/test split that happens later, so rows that end up in the test set
# contribute to the imputed values.
df = df.fillna(df.mean())

# Sanity check: per-column count of remaining missing values.
print(df.isna().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [9]:
# Separate the target column from the feature columns.
y = df['Outcome']
X = df.drop(columns='Outcome')

# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print(f'Training set shape: {X_train.shape}')
print(f'Test set shape: {X_test.shape}')

Training set shape: (537, 8)
Test set shape: (231, 8)


In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# define the objective function
def objective(trail):
    """Score one random-forest configuration for Optuna.

    Asks the trial object for `n_estimators` (50-200) and `max_depth` (3-20),
    builds a RandomForestClassifier with those values and `random_state=42`,
    and evaluates it with 3-fold cross-validation on the module-level
    `X_train` / `y_train`.

    NOTE(review): the parameter name `trail` looks like a typo for `trial`;
    it is kept as-is so the signature does not change.

    Returns:
        The mean cross-validated accuracy, for Optuna to maximize.
    """
    # Sampled hyperparameters; the dict keeps the same suggestion order (n_estimators, then max_depth).
    params = {
        'n_estimators': trail.suggest_int('n_estimators', 50, 200),
        'max_depth': trail.suggest_int('max_depth', 3, 20),
    }

    forest = RandomForestClassifier(random_state=42, **params)

    # One accuracy value per fold; their mean is the trial's score.
    fold_scores = cross_val_score(forest, X_train, y_train, cv=3, scoring='accuracy')
    return fold_scores.mean()

In [12]:
# Create a study that maximizes the value returned by `objective`.
# The TPE sampler is seeded so that re-running the notebook from a fresh kernel
# proposes the same sequence of hyperparameters (trials run sequentially here).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Run 50 trials to find the best hyperparameters.
study.optimize(objective, n_trials=50)

[I 2025-06-08 04:17:53,291] A new study created in memory with name: no-name-1befc66a-7d4c-4467-991c-b9c3239e6c01
[I 2025-06-08 04:17:53,997] Trial 0 finished with value: 0.7672253258845437 and parameters: {'n_estimators': 73, 'max_depth': 9}. Best is trial 0 with value: 0.7672253258845437.
[I 2025-06-08 04:17:55,469] Trial 1 finished with value: 0.7802607076350093 and parameters: {'n_estimators': 133, 'max_depth': 7}. Best is trial 1 with value: 0.7802607076350093.
[I 2025-06-08 04:17:57,174] Trial 2 finished with value: 0.7728119180633147 and parameters: {'n_estimators': 200, 'max_depth': 15}. Best is trial 1 with value: 0.7802607076350093.
[I 2025-06-08 04:17:57,482] Trial 3 finished with value: 0.7728119180633147 and parameters: {'n_estimators': 50, 'max_depth': 16}. Best is trial 1 with value: 0.7802607076350093.
[I 2025-06-08 04:17:58,182] Trial 4 finished with value: 0.7560521415270017 and parameters: {'n_estimators': 133, 'max_depth': 3}. Best is trial 1 with value: 0.780260707

In [13]:
# Report the best objective value found by the study and the hyperparameters that produced it.
print(f'Best trial accuracy: {study.best_trial.value}')
print(f'Best hyperparameters: {study.best_trial.params}')

Bets trial accuracy: 0.7821229050279329
Best hyperparameters: {'n_estimators': 124, 'max_depth': 18}


In [14]:
from sklearn.metrics import accuracy_score
# Rebuild the random forest with the study's best hyperparameters, fit it on the
# training split and predict the held-out test split.
best_model = RandomForestClassifier(random_state=42, **study.best_trial.params)
y_pred = best_model.fit(X_train, y_train).predict(X_test)

# Accuracy on the test split (the ' .2f' spec reserves a leading space for the sign).
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Test accuracy with best hyperparameters: {test_accuracy: .2f}')

Test accuracy with best hyperparameters:  0.74


# Best Algorithm

In [15]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

In [20]:
# Define the objective function for Optuna
def objective(trial):
    """Score one (classifier, hyperparameters) combination for Optuna.

    The trial first picks a classifier family ('SVM', 'RandomForest' or
    'GradientBoosting'), then samples that family's hyperparameters. The
    resulting model is evaluated with 3-fold cross-validation on the
    module-level `X_train` / `y_train`.

    Args:
        trial: object providing `suggest_categorical`, `suggest_int` and
            `suggest_float` (an Optuna trial when called by `study.optimize`).

    Returns:
        The mean cross-validated accuracy, for Optuna to maximize.
    """
    classifier_name = trial.suggest_categorical(
        'classifier', ['SVM', 'RandomForest', 'GradientBoosting']
    )

    # Each branch samples its parameters in a fixed order (dict literals
    # evaluate top to bottom) and builds the matching estimator.
    if classifier_name == 'SVM':
        svm_params = {
            'C': trial.suggest_float('C', 0.1, 100, log=True),
            'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly', 'sigmoid']),
            'gamma': trial.suggest_categorical('gamma', ['scale', 'auto']),
        }
        model = SVC(random_state=42, **svm_params)

    elif classifier_name == 'RandomForest':
        forest_params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'max_depth': trial.suggest_int('max_depth', 3, 20),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
            'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        }
        model = RandomForestClassifier(random_state=42, **forest_params)

    elif classifier_name == 'GradientBoosting':
        boosting_params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 20),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        }
        model = GradientBoostingClassifier(random_state=42, **boosting_params)

    # One accuracy value per fold; their mean is the trial's score.
    fold_scores = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy')
    return fold_scores.mean()

In [21]:
# Create a study that maximizes the value returned by `objective`.
# The TPE sampler is passed explicitly with a fixed seed so that re-running the
# notebook from a fresh kernel proposes the same sequence of trials.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Run 100 trials across the three classifier families.
study.optimize(objective, n_trials=100)

[I 2025-06-08 05:51:26,781] A new study created in memory with name: no-name-358d4d8d-63bc-4a9b-a488-a8fbaf6edcea
[I 2025-06-08 05:51:26,912] Trial 0 finished with value: 0.7709497206703911 and parameters: {'classifier': 'SVM', 'C': 0.6935284540858504, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 0 with value: 0.7709497206703911.
[I 2025-06-08 05:51:26,996] Trial 1 finished with value: 0.7690875232774674 and parameters: {'classifier': 'SVM', 'C': 0.5919707235190612, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 0 with value: 0.7709497206703911.
[I 2025-06-08 05:51:27,084] Trial 2 finished with value: 0.7653631284916201 and parameters: {'classifier': 'SVM', 'C': 1.297765503428147, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 0 with value: 0.7709497206703911.
[I 2025-06-08 05:51:27,158] Trial 3 finished with value: 0.7858472998137803 and parameters: {'classifier': 'SVM', 'C': 0.20300046060009772, 'kernel': 'linear', 'gamma': 'scale'}. Best is trial 3 with value: 0.785847299

In [23]:
# Look up the top-scoring trial once, then report its sampled parameters and its score.
best_trials = study.best_trial
print('Best trial parameters: ', best_trials.params)
print(f'Best trial accuracy: {best_trials.value}')

Best trial parameters:  {'classifier': 'SVM', 'C': 0.12465198557281104, 'kernel': 'linear', 'gamma': 'scale'}
Best trial accuracy: 0.7895716945996275


In [24]:
# Tabulate the study's trials as a DataFrame: one row per trial with its value,
# timing columns, the sampled `params_*` columns and the trial state.
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_bootstrap,params_classifier,params_gamma,params_kernel,params_learning_rate,params_max_depth,params_min_samples_leaf,params_min_samples_split,params_n_estimators,state
0,0,0.770950,2025-06-08 05:51:26.785414,2025-06-08 05:51:26.909738,0 days 00:00:00.124324,0.693528,,SVM,scale,rbf,,,,,,COMPLETE
1,1,0.769088,2025-06-08 05:51:26.916409,2025-06-08 05:51:26.995359,0 days 00:00:00.078950,0.591971,,SVM,scale,rbf,,,,,,COMPLETE
2,2,0.765363,2025-06-08 05:51:27.000874,2025-06-08 05:51:27.084569,0 days 00:00:00.083695,1.297766,,SVM,scale,rbf,,,,,,COMPLETE
3,3,0.785847,2025-06-08 05:51:27.090225,2025-06-08 05:51:27.158038,0 days 00:00:00.067813,0.203000,,SVM,scale,linear,,,,,,COMPLETE
4,4,0.754190,2025-06-08 05:51:27.165117,2025-06-08 05:51:30.490998,0 days 00:00:03.325881,,,GradientBoosting,,,0.011963,17.0,8.0,6.0,131.0,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,0.715084,2025-06-08 05:52:24.013194,2025-06-08 05:52:24.089791,0 days 00:00:00.076597,0.154345,,SVM,scale,poly,,,,,,COMPLETE
96,96,0.750466,2025-06-08 05:52:24.091296,2025-06-08 05:52:25.586962,0 days 00:00:01.495666,,,GradientBoosting,,,0.149069,5.0,9.0,3.0,165.0,COMPLETE
97,97,0.789572,2025-06-08 05:52:25.588425,2025-06-08 05:52:25.631435,0 days 00:00:00.043010,0.121039,,SVM,auto,linear,,,,,,COMPLETE
98,98,0.785847,2025-06-08 05:52:25.632753,2025-06-08 05:52:25.682264,0 days 00:00:00.049511,0.236844,,SVM,scale,linear,,,,,,COMPLETE


In [26]:
# Count how many trials sampled each classifier family.
study.trials_dataframe()['params_classifier'].value_counts()

Unnamed: 0_level_0,count
params_classifier,Unnamed: 1_level_1
SVM,80
GradientBoosting,10
RandomForest,10


In [27]:
# Average trial value per classifier family; `value` is what the objective
# returned for that trial (its mean cross-validated accuracy).
study.trials_dataframe().groupby('params_classifier')['value'].mean()

Unnamed: 0_level_0,value
params_classifier,Unnamed: 1_level_1
GradientBoosting,0.741527
RandomForest,0.761825
SVM,0.774604
