# PPO hyperparameter tuning

In [2]:
import optuna
from stable_baselines3.common.evaluation import evaluate_policy
from PPO.ppo_utils import create_agent, train
import gym
from env.custom_hopper import *

# Search budget: how many hyperparameter configurations Optuna evaluates.
n_trials = 50
# Training budget per trial is n_episodes * mean_timestep timesteps
# (the product is what gets forwarded to the training call).
n_episodes = 1_000
mean_timestep = 100
# Number of episodes used to score each trained agent.
n_eval_episodes = 1_000

def optimize_call(clip_range, learning_rate, gamma, n_episodes, n_eval_episodes, env: str = "CustomHopper-source-v0"):
    """Train an agent with the given hyperparameters and return its mean evaluation reward.

    Args:
        clip_range: Forwarded to ``create_agent`` as ``clip_range``.
        learning_rate: Forwarded to ``create_agent`` as ``learning_rate``.
        gamma: Forwarded to ``create_agent`` as ``gamma``.
        n_episodes: Training budget, forwarded to ``train`` as ``total_timestep``.
            Despite the name, this is used as a timestep count, not an
            episode count.
        n_eval_episodes: Number of episodes ``evaluate_policy`` runs.
        env: Gym environment id used to build the evaluation environment.

    Returns:
        The mean reward reported by ``evaluate_policy``.
    """
    eval_env = gym.make(env)
    try:
        # NOTE(review): the agent is created without being given `eval_env`;
        # presumably `create_agent` builds its own training environment.
        # Confirm that it matches `env`.
        agent = create_agent(
            clip_range=clip_range,
            verbose=0,
            learning_rate=learning_rate,
            gamma=gamma,
        )
        train(agent, total_timestep=n_episodes)

        mean_reward, _ = evaluate_policy(agent, eval_env, n_eval_episodes=n_eval_episodes)
    finally:
        # Always release the environment: this function runs once per trial
        # (and concurrently across trials), so unclosed environments pile up.
        eval_env.close()
    return mean_reward


def objective(trial):
    """Optuna objective: sample one hyperparameter set and score it.

    Samples ``clip_range`` (log scale), ``learning_rate`` and ``gamma``
    (both on a fixed-step grid) from ``trial``, then returns the value
    produced by ``optimize_call`` for that configuration.
    """
    # Dict literals evaluate in order, so the suggestions are requested in
    # the same sequence: clip_range, learning_rate, gamma.
    sampled = {
        "clip_range": trial.suggest_float("clip_range", 0.01, 0.3, log=True),
        "learning_rate": trial.suggest_float("learning_rate", 5e-4, 1e-3, step=1e-4),
        "gamma": trial.suggest_float("gamma", 0.99, 0.999, step=0.001),
    }

    # The training budget is expressed in timesteps.
    training_budget = n_episodes * mean_timestep

    return optimize_call(
        sampled["clip_range"],
        sampled["learning_rate"],
        sampled["gamma"],
        training_budget,
        n_eval_episodes,
    )



# Create a study that maximises the objective's return value
# (the mean evaluation reward of the trained agent).
study = optuna.create_study(direction="maximize")
# Run n_trials trials with n_jobs=4, so several trials are in flight at once;
# trials may therefore finish out of order in the log.
study.optimize(objective, n_trials=n_trials, n_jobs=4)

# Report the best hyperparameter set found and the reward it achieved.
print("Best parameters:", study.best_params)
print("Best reward:", study.best_value)

[I 2025-05-29 22:13:31,964] A new study created in memory with name: no-name-c590d939-6183-49c7-9683-6fac34239599
[I 2025-05-29 22:22:13,788] Trial 2 finished with value: 335.87948635315894 and parameters: {'clip_range': 0.04061024305777686, 'learning_rate': 0.0005, 'gamma': 0.995}. Best is trial 2 with value: 335.87948635315894.
[I 2025-05-29 22:22:45,944] Trial 3 finished with value: 498.42115930950644 and parameters: {'clip_range': 0.2976021480198951, 'learning_rate': 0.0007, 'gamma': 0.994}. Best is trial 3 with value: 498.42115930950644.
[I 2025-05-29 22:24:12,527] Trial 0 finished with value: 458.86427682989836 and parameters: {'clip_range': 0.06076594587760699, 'learning_rate': 0.001, 'gamma': 0.998}. Best is trial 3 with value: 498.42115930950644.
[I 2025-05-29 22:25:14,816] Trial 1 finished with value: 534.3012834342718 and parameters: {'clip_range': 0.06355317008283, 'learning_rate': 0.0008, 'gamma': 0.999}. Best is trial 1 with value: 534.3012834342718.
[I 2025-05-29 22:30:2

Best parameters: {'clip_range': 0.19877024509129543, 'learning_rate': 0.0008, 'gamma': 0.992}
Best reward: 1542.6249746930598
