# Optuna Demo Notebook

In [2]:
# # Uncomment and run once if Optuna is not yet installed on your local machine
# %pip install optuna

In [None]:
import os
import time

import numpy as np
import optuna
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Location of the cleaned crash dataset. Set the TRAFFIC_CRASHES_CSV environment
# variable to point at your own copy of the file; when it is unset, the original
# author's local path is used so existing behavior is unchanged.
DATA_PATH = os.environ.get(
    "TRAFFIC_CRASHES_CSV",
    "/Users/katecastillo/Documents/MSDS/MSDS2025/Term2/COSCI221/cleaned_traffic_crashes_people.csv",
)

# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like a
# row index saved into the CSV. It is kept as-is here and therefore ends up among
# the model features -- confirm whether it should be dropped (e.g. index_col=0).
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,age,injury_classification,person_type_BICYCLE,person_type_DRIVER,person_type_NON-CONTACT VEHICLE,person_type_NON-MOTOR VEHICLE,person_type_PASSENGER,person_type_PEDESTRIAN,state_AL,state_AR,...,driver_vision_HILLCREST,driver_vision_MOVING VEHICLES,driver_vision_NOT OBSCURED,driver_vision_OTHER,driver_vision_PARKED VEHICLES,"driver_vision_TREES, PLANTS",driver_vision_UNKNOWN,driver_vision_WINDSHIELD (WATER/ICE),driver_vision_nan,driver_vision_infrequent_sklearn
0,41.0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,38.184669,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,32.0,4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,38.184669,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,38.184669,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [5]:
# Count how many rows fall into each injury class (the prediction target).
target_counts = df["injury_classification"].value_counts()
target_counts

1    185273
2      9387
3      5485
4      1768
5        85
Name: injury_classification, dtype: int64

## Defining the Study Object
- A study object in Optuna is a collection of trials that share the same search space and objective function
- To create a study object and optimize your objective function, you can use the `optuna.create_study()` function and the `study.optimize()` method

In [6]:
# Define the objective function

def objective(trial):
    """Optuna objective: hold-out MSE of a Ridge model for a sampled ``alpha``.

    Reads the notebook-level ``df``, uses ``injury_classification`` as the
    target and every other column as a feature, and holds out 30% of the rows
    with a fixed split seed.

    Args:
        trial: Optuna trial used to sample ``alpha`` from ``[0.0, 1.0]``.

    Returns:
        Mean squared error of the fitted model on the held-out rows
        (lower is better).
    """
    target = df.injury_classification
    features = df.drop("injury_classification", axis=1)
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=42
    )

    # The only tuned hyperparameter: the regularization strength.
    alpha = trial.suggest_float("alpha", 0.0, 1.0)

    regressor = Ridge(alpha=alpha).fit(X_train, y_train)
    return mean_squared_error(y_test, regressor.predict(X_test))

In [7]:
# Create a study object.
# TPESampler is Optuna's default sampler; it is passed explicitly only to fix
# its seed, so the sequence of suggested values is the same on every
# Restart & Run All.
study = optuna.create_study(
    study_name="basic_example",
    direction="minimize",  # the objective returns an error, so lower is better
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Time the whole search so readers know what the cell costs to run.
start = time.perf_counter()
study.optimize(objective, n_trials=100) # Optimize the objective function
end = time.perf_counter()

print(f"Time taken: {end-start:.6f} seconds")

[I 2024-11-11 11:17:58,851] A new study created in memory with name: basic_example
[I 2024-11-11 11:17:59,409] Trial 0 finished with value: 0.19471158368999578 and parameters: {'alpha': 0.7556415079217539}. Best is trial 0 with value: 0.19471158368999578.
[I 2024-11-11 11:17:59,825] Trial 1 finished with value: 0.19471612123161872 and parameters: {'alpha': 0.4933446581826705}. Best is trial 0 with value: 0.19471158368999578.
[I 2024-11-11 11:18:00,329] Trial 2 finished with value: 0.19470870584702996 and parameters: {'alpha': 0.9348404603370335}. Best is trial 2 with value: 0.19470870584702996.
[I 2024-11-11 11:18:00,972] Trial 3 finished with value: 0.19471073725761487 and parameters: {'alpha': 0.8072723236857982}. Best is trial 2 with value: 0.19470870584702996.
[I 2024-11-11 11:18:01,337] Trial 4 finished with value: 0.1947232616730779 and parameters: {'alpha': 0.12652428969068696}. Best is trial 2 with value: 0.19470870584702996.
[I 2024-11-11 11:18:01,700] Trial 5 finished with va

Time taken: 38.407587 seconds


In [None]:
# optuna hub

## Suggesting Hyperparameters

***Suggest Uniform***<br>
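`suggest_uniform(name, low, high)`: Suggests a floating-point value uniformly distributed between `low` and `high`. Deprecated since Optuna 3.0; use `suggest_float(name, low, high)` instead, as the examples in this notebook do.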

***Suggest Loguniform*** <br>
`suggest_loguniform(name, low, high)`: Suggests a floating-point value logarithmically distributed between `low` and `high`. Deprecated since Optuna 3.0; use `suggest_float(name, low, high, log=True)` instead.

***Suggest Int*** <br>
`suggest_int(name, low, high)`: Suggests an integer value between `low` and `high` (both endpoints included)

***Suggest Categorical*** <br>
`suggest_categorical(name, choices)`: Suggests a categorical value from the given list of choices

In [26]:
# Define the objective function

def objective(trial):
    """Optuna objective: hold-out error of a random forest for a sampled ``n_estimators``.

    Reads the notebook-level ``df``, uses ``injury_classification`` as the
    target and every other column as a feature, and holds out 30% of the rows
    with a fixed split seed.

    Args:
        trial: Optuna trial used to sample ``n_estimators`` (integer, 10 to 200).

    Returns:
        Mean squared error between the predicted and true class labels on the
        held-out rows (lower is better).
    """
    X, y = df.drop("injury_classification", axis=1), df.injury_classification
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Suggest hyperparameters
    n_estimators = trial.suggest_int("n_estimators", 10, 200)

    # Seed the forest: without a fixed random_state the same n_estimators gives
    # a different score on every call, so trials are not comparable and the
    # study partly optimizes noise.
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    # NOTE(review): MSE on class labels treats the injury classes as ordinal
    # numbers -- confirm this is the intended metric for a classifier.
    score = mean_squared_error(y_test, y_pred)
    return score

In [28]:
# Create a study object.
# TPESampler is Optuna's default sampler; it is passed explicitly only to fix
# its seed, so the sequence of suggested values is the same on every
# Restart & Run All.
study = optuna.create_study(
    study_name="int_rf_example",
    direction="minimize",  # the objective returns an error, so lower is better
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Time the whole search so readers know what the cell costs to run.
start = time.perf_counter()
study.optimize(objective, n_trials=10) # Optimize the objective function
end = time.perf_counter()

print(f"Time taken: {end-start:.6f} seconds")

[I 2024-11-10 14:53:43,543] A new study created in memory with name: int_rf_example
[I 2024-11-10 14:54:01,963] Trial 0 finished with value: 0.2298019801980198 and parameters: {'n_estimators': 110}. Best is trial 0 with value: 0.2298019801980198.
[I 2024-11-10 14:54:04,306] Trial 1 finished with value: 0.231996699669967 and parameters: {'n_estimators': 12}. Best is trial 0 with value: 0.2298019801980198.
[I 2024-11-10 14:54:37,697] Trial 2 finished with value: 0.23031353135313531 and parameters: {'n_estimators': 168}. Best is trial 0 with value: 0.2298019801980198.
[I 2024-11-10 14:54:44,005] Trial 3 finished with value: 0.2316006600660066 and parameters: {'n_estimators': 30}. Best is trial 0 with value: 0.2298019801980198.
[I 2024-11-10 14:54:52,463] Trial 4 finished with value: 0.23113861386138615 and parameters: {'n_estimators': 42}. Best is trial 0 with value: 0.2298019801980198.
[I 2024-11-10 14:55:28,581] Trial 5 finished with value: 0.23034653465346536 and parameters: {'n_estima

Time taken: 203.980730 seconds


In [29]:
# Report the winning configuration of the most recently run study.
best_params, best_value = study.best_params, study.best_value
print("Best hyperparameters:", best_params)
print("Best value:", best_value)

Best hyperparameters: {'n_estimators': 141}
Best value: 0.22962046204620462
