In [1]:
! pip install optuna

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


In [2]:
import optuna
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
import pandas as pd

In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("nancyalaswad90/review")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/nancyalaswad90/review?dataset_version_number=5...


100%|██████████| 8.91k/8.91k [00:00<00:00, 10.1MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/nancyalaswad90/review/versions/5





In [5]:
import os
dataset_path = "/root/.cache/kagglehub/datasets/nancyalaswad90/review/versions/5"
os.listdir(dataset_path)

['diabetes.csv']

In [6]:
file_path = os.path.join(dataset_path, 'diabetes.csv')
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [14]:
import numpy as np

print(df.columns==0)

[False False False False False False False False False]


In [99]:
X = df.drop(columns=['Outcome'])

columns_with_zeros = [
    col for col in X.columns
    if (X[col] == 0).any()
]

print(df['Outcome'].value_counts())

Outcome
1.0    768
Name: count, dtype: int64


In [100]:
import numpy as np

# Create a list of feature columns that contained zeros and need imputation, excluding the 'Outcome' column
feature_cols_to_impute = [col for col in columns_with_zeros if col != 'Outcome']

# Replace 0s with NaN in the selected feature columns
df[feature_cols_to_impute] = df[feature_cols_to_impute].replace(0, np.nan)

# Fill NaNs with the mean for only the selected feature columns
for col in feature_cols_to_impute:
    df[col].fillna(df[col].mean(), inplace=True)

# Verify that there are no null values left in the entire DataFrame
print(df.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [101]:
from sklearn.preprocessing import StandardScaler
X = df.drop('Outcome',axis=1)
y = df["Outcome"].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = 42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print(f"Training set shape : {X_train.shape}")
print(f"Testing set shape : {X_test.shape}")

Training set shape : (537, 8)
Testing set shape : (231, 8)


In [102]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [103]:
def objective(trail):
  n_estimators = trail.suggest_int("n_estimators",50,200)
  max_depth = trail.suggest_int("max_depth",3,20)
  model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state = 42)
  score = cross_val_score(model,X_train, y_train,cv=5,scoring='accuracy').mean()
  return score

In [104]:
# create a study object and optimize the objective function
study = optuna.create_study(direction='maximize',sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials = 50) # run 50 trails to find the best hyperparameters

[I 2026-01-05 16:01:44,011] A new study created in memory with name: no-name-795db948-1b1c-49ff-ba50-ab25216b56c4
[I 2026-01-05 16:01:46,464] Trial 0 finished with value: 1.0 and parameters: {'n_estimators': 198, 'max_depth': 18}. Best is trial 0 with value: 1.0.
[I 2026-01-05 16:01:47,852] Trial 1 finished with value: 1.0 and parameters: {'n_estimators': 168, 'max_depth': 4}. Best is trial 0 with value: 1.0.
[I 2026-01-05 16:01:49,114] Trial 2 finished with value: 1.0 and parameters: {'n_estimators': 135, 'max_depth': 18}. Best is trial 0 with value: 1.0.
[I 2026-01-05 16:01:51,393] Trial 3 finished with value: 1.0 and parameters: {'n_estimators': 180, 'max_depth': 16}. Best is trial 0 with value: 1.0.
[I 2026-01-05 16:01:52,435] Trial 4 finished with value: 1.0 and parameters: {'n_estimators': 129, 'max_depth': 12}. Best is trial 0 with value: 1.0.
[I 2026-01-05 16:01:53,834] Trial 5 finished with value: 1.0 and parameters: {'n_estimators': 165, 'max_depth': 8}. Best is trial 0 with 

In [105]:
print(f"best trail accuracy : {study.best_trial.value}")
print(f"best hyperparameters : {study.best_trial.params}")

best trail accuracy : 1.0
best hyperparameters : {'n_estimators': 198, 'max_depth': 18}


In [106]:
from sklearn.metrics import accuracy_score
best_model  = RandomForestClassifier(**study.best_trial.params, random_state = 42)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
print(f"Test accuracy: {accuracy_score(y_test,y_pred)}")

Test accuracy: 1.0


In [107]:
# visulization
from optuna.visualization import plot_optimization_history, plot_parallel_coordinate, plot_slice, plot_contour, plot_param_importances

In [108]:
# optimization History
plot_optimization_history(study).show()

In [109]:
# parallel_plot
plot_parallel_coordinate(study).show()

In [110]:
# slice plot
plot_slice(study).show()

In [111]:
plot_contour(study).show()

In [112]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

def objective(trial):

    classifier_name = trial.suggest_categorical(
        "classifier", ["SVM", "RandomForest", "GradientBoosting"]
    )

    if classifier_name == "SVM":
        c = trial.suggest_float("svm_c", 0.1, 100, log=True)
        kernel = trial.suggest_categorical("svm_kernel", ["linear", "rbf"])
        gamma = trial.suggest_categorical("svm_gamma", ["scale", "auto"])

        model = SVC(
            C=c,
            kernel=kernel,
            gamma=gamma,
            class_weight="balanced"
        )

    elif classifier_name == "RandomForest":
        n_estimators = trial.suggest_int("rf_n_estimators", 50, 200)
        max_depth = trial.suggest_int("rf_max_depth", 3, 20)

        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            class_weight="balanced",
            random_state=42
        )

    else:
        n_estimators = trial.suggest_int("gb_n_estimators", 50, 200)
        learning_rate = trial.suggest_float("gb_lr", 0.01, 0.1)
        max_depth = trial.suggest_int("gb_max_depth", 3, 10)

        model = GradientBoostingClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            random_state=42
        )

    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    score = cross_val_score(
        model, X_train, y_train, cv=cv, scoring="accuracy"
    ).mean()

    return score


In [113]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2026-01-05 16:03:03,035] A new study created in memory with name: no-name-a7efa924-592b-425f-ac60-78643e43f2c6
[I 2026-01-05 16:03:05,286] Trial 0 finished with value: 1.0 and parameters: {'classifier': 'RandomForest', 'rf_n_estimators': 168, 'rf_max_depth': 13}. Best is trial 0 with value: 1.0.
[W 2026-01-05 16:03:05,302] Trial 1 failed with parameters: {'classifier': 'GradientBoosting', 'gb_n_estimators': 123, 'gb_lr': 0.0985109235555657, 'gb_max_depth': 7} because of the following error: ValueError('\nAll the 3 fits failed.\nIt is very likely that your model is misconfigured.\nYou can try to debug the error by setting error_score=\'raise\'.\n\nBelow are more details about the failures:\n--------------------------------------------------------------------------------\n3 fits failed with the following error:\nTraceback (most recent call last):\n  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score\n    estimator.fit(X_t

ValueError: 
All the 3 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/ensemble/_gb.py", line 669, in fit
    y = self._encode_y(y=y, sample_weight=None)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/ensemble/_gb.py", line 1532, in _encode_y
    raise ValueError(
ValueError: y contains 1 class after sample_weight trimmed classes with zero weights, while a minimum of 2 classes are required.


In [None]:
# code is same due to data imbalance this error has been occoured