In [1]:
from stable_baselines3 import PPO, DQN
from stable_baselines3.common.evaluation import evaluate_policy
import gymnasium as gym
import gym_gridworlds
import optuna

  from .autonotebook import tqdm as notebook_tqdm


## Parameter optimization

In [6]:
def optimize_model(trial):
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-2)
    n_steps = trial.suggest_categorical("n_steps", [64, 128, 256, 512, 1024])
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128, 256])
    gamma = trial.suggest_uniform("gamma", 0.9, 0.9999)
    exploration_fraction = trial.suggest_uniform("exploration_fraction", 0, 0.1)
    env_path = f"Gym-Gridworlds/FourRooms-Original-13x13-v0"

    env = gym.make(env_path,
                   no_stay = True,
                   distance_reward = True,
                   start_pos = None,
                   random_goals = False)

    model = DQN(
    "MlpPolicy",
    env,
    learning_rate=learning_rate,
    n_steps = n_steps, 
    # batch_size=64,
    gamma=gamma,
    exploration_fraction=0.05,
    # ent_coef=0.1,
    verbose=1,
    )

    model.learn(total_timesteps = 50000)

    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes = 10)
    return mean_reward


In [7]:
study = optuna.create_study(direction="maximize")
study.optimize(optimize_model, n_trials=20, show_progress_bar=True)


[I 2025-12-06 19:46:14,909] A new study created in memory with name: no-name-d0cccd9c-4b5d-4659-a6b2-626d1743d4ec
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-2)
  gamma = trial.suggest_uniform("gamma", 0.9, 0.9999)
  exploration_fraction = trial.suggest_uniform("exploration_fraction", 0, 0.1)


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 171      |
|    ep_rew_mean      | -9.74    |
|    exploration_rate | 0.74     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 3317     |
|    time_elapsed     | 0        |
|    total_timesteps  | 685      |
| train/              |          |
|    learning_rate    | 0.000398 |
|    loss             | 0.0628   |
|    n_updates        | 146      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 129      |
|    ep_rew_mean      | -6.91    |
|    exploration_rate | 0.609    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 3233     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1029     |
| train/              |        

Best trial: 0. Best value: 0.518343:   5%|▌         | 1/20 [00:20<06:30, 20.55s/it]

[I 2025-12-06 19:46:35,463] Trial 0 finished with value: 0.5183431951329112 and parameters: {'learning_rate': 0.00039794239747465506, 'n_steps': 1024, 'batch_size': 256, 'gamma': 0.9336220961406864, 'exploration_fraction': 0.027733828091331326}. Best is trial 0 with value: 0.5183431951329112.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 151      |
|    ep_rew_mean      | -10.2    |
|    exploration_rate | 0.77     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 5060     |
|    time_elapsed     | 0        |
|    total_timesteps  | 605      |
| train/              |          |
|    learning_rate    | 2.65e-05 |
|    loss             | 0.461    |
|    n_updates        | 126      |
----------------------------------


  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-2)
  gamma = trial.suggest_uniform("gamma", 0.9, 0.9999)
  exploration_fraction = trial.suggest_uniform("exploration_fraction", 0, 0.1)


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 137      |
|    ep_rew_mean      | -7.99    |
|    exploration_rate | 0.584    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 4639     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1095     |
| train/              |          |
|    learning_rate    | 2.65e-05 |
|    loss             | 0.305    |
|    n_updates        | 248      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 144      |
|    ep_rew_mean      | -9.01    |
|    exploration_rate | 0.343    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 4408     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1728     |
| train/              |          |
|    learning_rate    | 2.65e-05 |
|    loss             | 0.323    |
|    n_updates      

Best trial: 0. Best value: 0.518343:  10%|█         | 2/20 [00:34<04:59, 16.66s/it]

[I 2025-12-06 19:46:49,396] Trial 1 finished with value: -7.024260372109711 and parameters: {'learning_rate': 2.6536523433274936e-05, 'n_steps': 128, 'batch_size': 256, 'gamma': 0.9203839203586897, 'exploration_fraction': 0.08622886561788662}. Best is trial 0 with value: 0.5183431951329112.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 154      |
|    ep_rew_mean      | -8.84    |
|    exploration_rate | 0.767    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 3708     |
|    time_elapsed     | 0        |
|    total_timesteps  | 614      |
| train/              |          |
|    learning_rate    | 0.00365  |
|    loss             | 0.0245   |
|    n_updates        | 128      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean   

Best trial: 2. Best value: 0.526036:  15%|█▌        | 3/20 [00:55<05:15, 18.54s/it]

[I 2025-12-06 19:47:10,170] Trial 2 finished with value: 0.526035501435399 and parameters: {'learning_rate': 0.003654217634892923, 'n_steps': 1024, 'batch_size': 256, 'gamma': 0.9337936049491153, 'exploration_fraction': 0.051476024675492}. Best is trial 2 with value: 0.526035501435399.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 112      |
|    ep_rew_mean      | -6.51    |
|    exploration_rate | 0.83     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 4153     |
|    time_elapsed     | 0        |
|    total_timesteps  | 448      |
| train/              |          |
|    learning_rate    | 3.52e-05 |
|    loss             | 1.36     |
|    n_updates        | 86       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 

Best trial: 2. Best value: 0.526036:  20%|██        | 4/20 [01:08<04:25, 16.59s/it]

[I 2025-12-06 19:47:23,768] Trial 3 finished with value: -3.2112426405772565 and parameters: {'learning_rate': 3.523592011984309e-05, 'n_steps': 64, 'batch_size': 32, 'gamma': 0.9749514075927469, 'exploration_fraction': 0.07741120807514597}. Best is trial 2 with value: 0.526035501435399.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 129      |
|    ep_rew_mean      | -7.11    |
|    exploration_rate | 0.804    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 3684     |
|    time_elapsed     | 0        |
|    total_timesteps  | 517      |
| train/              |          |
|    learning_rate    | 2.38e-05 |
|    loss             | 0.346    |
|    n_updates        | 104      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      

Best trial: 2. Best value: 0.526036:  25%|██▌       | 5/20 [01:28<04:26, 17.79s/it]

[I 2025-12-06 19:47:43,683] Trial 4 finished with value: -7.368047321587801 and parameters: {'learning_rate': 2.3790437624665392e-05, 'n_steps': 1024, 'batch_size': 128, 'gamma': 0.9263637191697062, 'exploration_fraction': 0.00733886077239615}. Best is trial 2 with value: 0.526035501435399.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 122      |
|    ep_rew_mean      | -4.57    |
|    exploration_rate | 0.815    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 2666     |
|    time_elapsed     | 0        |
|    total_timesteps  | 487      |
| train/              |          |
|    learning_rate    | 0.00185  |
|    loss             | 0.0205   |
|    n_updates        | 96       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean   

Best trial: 5. Best value: 0.57929:  30%|███       | 6/20 [01:47<04:12, 18.06s/it] 

[I 2025-12-06 19:48:02,269] Trial 5 finished with value: 0.5792899399995803 and parameters: {'learning_rate': 0.001852676639454492, 'n_steps': 512, 'batch_size': 64, 'gamma': 0.9273682690981515, 'exploration_fraction': 0.05213505207466268}. Best is trial 5 with value: 0.5792899399995803.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 130      |
|    ep_rew_mean      | -6.91    |
|    exploration_rate | 0.803    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 3891     |
|    time_elapsed     | 0        |
|    total_timesteps  | 518      |
| train/              |          |
|    learning_rate    | 0.00984  |
|    loss             | 0.0526   |
|    n_updates        | 104      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      

Best trial: 5. Best value: 0.57929:  35%|███▌      | 7/20 [02:03<03:46, 17.44s/it]

[I 2025-12-06 19:48:18,428] Trial 6 finished with value: 0.5005917157977819 and parameters: {'learning_rate': 0.00983598958227145, 'n_steps': 128, 'batch_size': 128, 'gamma': 0.9469977845584036, 'exploration_fraction': 0.05522309180260768}. Best is trial 5 with value: 0.5792899399995803.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 169      |
|    ep_rew_mean      | -7.76    |
|    exploration_rate | 0.743    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 4858     |
|    time_elapsed     | 0        |
|    total_timesteps  | 676      |
| train/              |          |
|    learning_rate    | 0.00784  |
|    loss             | 0.264    |
|    n_updates        | 143      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      

Best trial: 5. Best value: 0.57929:  35%|███▌      | 7/20 [02:18<03:46, 17.44s/it]

[I 2025-12-06 19:48:33,104] Trial 7 finished with value: -1.6775147467851639 and parameters: {'learning_rate': 0.007843763277153014, 'n_steps': 64, 'batch_size': 64, 'gamma': 0.981690912611863, 'exploration_fraction': 0.056998584975060884}. Best is trial 5 with value: 0.5792899399995803.


Best trial: 5. Best value: 0.57929:  40%|████      | 8/20 [02:18<03:18, 16.56s/it]

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 196      |
|    ep_rew_mean      | -12.8    |
|    exploration_rate | 0.701    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 2940     |
|    time_elapsed     | 0        |
|    total_timesteps  | 786      |
| train/              |          |
|    learning_rate    | 0.000415 |
|    loss             | 0.11     |
|    n_updates        | 171      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 177      |
|    ep_rew_mean      | -10.6    |
|    exploration_rate | 0.462    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 2937     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1417     |
| train/              |        

Best trial: 5. Best value: 0.57929:  45%|████▌     | 9/20 [02:37<03:13, 17.56s/it]

[I 2025-12-06 19:48:52,875] Trial 8 finished with value: 0.5153846146538854 and parameters: {'learning_rate': 0.0004147658365724272, 'n_steps': 1024, 'batch_size': 256, 'gamma': 0.9540951321508936, 'exploration_fraction': 0.0651851275162269}. Best is trial 5 with value: 0.5792899399995803.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 200      |
|    ep_rew_mean      | -19.2    |
|    exploration_rate | 0.696    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 4895     |
|    time_elapsed     | 0        |
|    total_timesteps  | 800      |
| train/              |          |
|    learning_rate    | 4.05e-05 |
|    loss             | 0.635    |
|    n_updates        | 174      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean    

Best trial: 5. Best value: 0.57929:  50%|█████     | 10/20 [02:52<02:44, 16.47s/it]

[I 2025-12-06 19:49:06,912] Trial 9 finished with value: -6.847929081134498 and parameters: {'learning_rate': 4.0462739529626596e-05, 'n_steps': 128, 'batch_size': 256, 'gamma': 0.9322654092412163, 'exploration_fraction': 0.03725108211235363}. Best is trial 5 with value: 0.5792899399995803.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 200      |
|    ep_rew_mean      | -13.8    |
|    exploration_rate | 0.696    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 4189     |
|    time_elapsed     | 0        |
|    total_timesteps  | 800      |
| train/              |          |
|    learning_rate    | 0.0016   |
|    loss             | 0.00682  |
|    n_updates        | 174      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean   

Best trial: 10. Best value: 0.64497:  55%|█████▌    | 11/20 [03:10<02:32, 16.94s/it]

[I 2025-12-06 19:49:24,908] Trial 10 finished with value: 0.6449704134836793 and parameters: {'learning_rate': 0.0015996182282361803, 'n_steps': 512, 'batch_size': 64, 'gamma': 0.9072682682772428, 'exploration_fraction': 0.09850839151986085}. Best is trial 10 with value: 0.6449704134836793.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 137      |
|    ep_rew_mean      | -6.44    |
|    exploration_rate | 0.791    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 4412     |
|    time_elapsed     | 0        |
|    total_timesteps  | 549      |
| train/              |          |
|    learning_rate    | 0.00117  |
|    loss             | 0.013    |
|    n_updates        | 112      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean   

Best trial: 10. Best value: 0.64497:  55%|█████▌    | 11/20 [03:28<02:32, 16.94s/it]

[I 2025-12-06 19:49:43,209] Trial 11 finished with value: 0.620710059069097 and parameters: {'learning_rate': 0.001169368741314363, 'n_steps': 512, 'batch_size': 64, 'gamma': 0.9017569849761643, 'exploration_fraction': 0.09411034248967172}. Best is trial 10 with value: 0.6449704134836793.


Best trial: 10. Best value: 0.64497:  60%|██████    | 12/20 [03:28<02:18, 17.35s/it]

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 150      |
|    ep_rew_mean      | -6.58    |
|    exploration_rate | 0.772    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 3188     |
|    time_elapsed     | 0        |
|    total_timesteps  | 600      |
| train/              |          |
|    learning_rate    | 0.000997 |
|    loss             | 0.0118   |
|    n_updates        | 124      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 142      |
|    ep_rew_mean      | -6.29    |
|    exploration_rate | 0.568    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 3379     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1138     |
| train/              |        

Best trial: 10. Best value: 0.64497:  65%|██████▌   | 13/20 [03:46<02:03, 17.64s/it]

[I 2025-12-06 19:50:01,493] Trial 12 finished with value: 0.3207100581377745 and parameters: {'learning_rate': 0.0009969878437724432, 'n_steps': 512, 'batch_size': 64, 'gamma': 0.9008386733518482, 'exploration_fraction': 0.09902084023652968}. Best is trial 10 with value: 0.6449704134836793.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 114      |
|    ep_rew_mean      | -4.91    |
|    exploration_rate | 0.826    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 2655     |
|    time_elapsed     | 0        |
|    total_timesteps  | 458      |
| train/              |          |
|    learning_rate    | 0.000132 |
|    loss             | 0.0797   |
|    n_updates        | 89       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean   

Best trial: 10. Best value: 0.64497:  65%|██████▌   | 13/20 [04:03<02:03, 17.64s/it]

[I 2025-12-06 19:50:18,112] Trial 13 finished with value: -0.19881656020879745 and parameters: {'learning_rate': 0.00013241973643811383, 'n_steps': 512, 'batch_size': 64, 'gamma': 0.9022020090135077, 'exploration_fraction': 0.09983545694886012}. Best is trial 10 with value: 0.6449704134836793.


Best trial: 10. Best value: 0.64497:  70%|███████   | 14/20 [04:03<01:43, 17.33s/it]

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 150      |
|    ep_rew_mean      | -7.96    |
|    exploration_rate | 0.772    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 4087     |
|    time_elapsed     | 0        |
|    total_timesteps  | 601      |
| train/              |          |
|    learning_rate    | 0.00105  |
|    loss             | 0.0132   |
|    n_updates        | 125      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 116      |
|    ep_rew_mean      | -5.11    |
|    exploration_rate | 0.648    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 4058     |
|    time_elapsed     | 0        |
|    total_timesteps  | 927      |
| train/              |        

Best trial: 10. Best value: 0.64497:  75%|███████▌  | 15/20 [04:19<01:25, 17.06s/it]

[I 2025-12-06 19:50:34,547] Trial 14 finished with value: 0.6402366857975721 and parameters: {'learning_rate': 0.0010492335594888304, 'n_steps': 256, 'batch_size': 64, 'gamma': 0.9116353872263502, 'exploration_fraction': 0.0798573907768481}. Best is trial 10 with value: 0.6449704134836793.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 154      |
|    ep_rew_mean      | -10.4    |
|    exploration_rate | 0.767    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 3805     |
|    time_elapsed     | 0        |
|    total_timesteps  | 614      |
| train/              |          |
|    learning_rate    | 0.000126 |
|    loss             | 0.258    |
|    n_updates        | 128      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean    

Best trial: 10. Best value: 0.64497:  80%|████████  | 16/20 [04:34<01:05, 16.45s/it]

[I 2025-12-06 19:50:49,576] Trial 15 finished with value: -3.113609511964023 and parameters: {'learning_rate': 0.00012638232215572492, 'n_steps': 256, 'batch_size': 32, 'gamma': 0.9154413929882579, 'exploration_fraction': 0.07937912615652332}. Best is trial 10 with value: 0.6449704134836793.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 104      |
|    ep_rew_mean      | -8.36    |
|    exploration_rate | 0.841    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 2898     |
|    time_elapsed     | 0        |
|    total_timesteps  | 418      |
| train/              |          |
|    learning_rate    | 0.00268  |
|    loss             | 0.201    |
|    n_updates        | 79       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean  

Best trial: 10. Best value: 0.64497:  85%|████████▌ | 17/20 [04:51<00:49, 16.54s/it]

[I 2025-12-06 19:51:06,340] Trial 16 finished with value: 0.46745562087744474 and parameters: {'learning_rate': 0.0026781009014327137, 'n_steps': 256, 'batch_size': 64, 'gamma': 0.9625236882425027, 'exploration_fraction': 0.07121289105262242}. Best is trial 10 with value: 0.6449704134836793.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 124      |
|    ep_rew_mean      | -8.12    |
|    exploration_rate | 0.812    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 5004     |
|    time_elapsed     | 0        |
|    total_timesteps  | 494      |
| train/              |          |
|    learning_rate    | 0.000655 |
|    loss             | 0.0438   |
|    n_updates        | 98       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean  

Best trial: 17. Best value: 0.73432:  90%|█████████ | 18/20 [05:09<00:34, 17.05s/it]

[I 2025-12-06 19:51:24,561] Trial 17 finished with value: 0.734319526515901 and parameters: {'learning_rate': 0.0006554357064255008, 'n_steps': 256, 'batch_size': 64, 'gamma': 0.915659405413554, 'exploration_fraction': 0.08612428844489553}. Best is trial 17 with value: 0.734319526515901.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 153      |
|    ep_rew_mean      | -8.93    |
|    exploration_rate | 0.768    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 3544     |
|    time_elapsed     | 0        |
|    total_timesteps  | 611      |
| train/              |          |
|    learning_rate    | 0.00017  |
|    loss             | 0.465    |
|    n_updates        | 127      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      

Best trial: 17. Best value: 0.73432:  95%|█████████▌| 19/20 [05:25<00:16, 16.64s/it]

[I 2025-12-06 19:51:40,259] Trial 18 finished with value: 0.4461538452655077 and parameters: {'learning_rate': 0.00016972384639744138, 'n_steps': 256, 'batch_size': 64, 'gamma': 0.9465970387663994, 'exploration_fraction': 0.08788596552415467}. Best is trial 17 with value: 0.734319526515901.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 200      |
|    ep_rew_mean      | -10.9    |
|    exploration_rate | 0.696    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 3775     |
|    time_elapsed     | 0        |
|    total_timesteps  | 800      |
| train/              |          |
|    learning_rate    | 0.000725 |
|    loss             | 1.96     |
|    n_updates        | 174      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean   

Best trial: 17. Best value: 0.73432: 100%|██████████| 20/20 [05:41<00:00, 17.06s/it]

[I 2025-12-06 19:51:56,014] Trial 19 finished with value: -0.09230770170688629 and parameters: {'learning_rate': 0.0007246450755302936, 'n_steps': 256, 'batch_size': 128, 'gamma': 0.9992319568248705, 'exploration_fraction': 0.03804060822554044}. Best is trial 17 with value: 0.734319526515901.





In [8]:
print("Best params:", study.best_params)
print("Best value:", study.best_value)


Best params: {'learning_rate': 0.0006554357064255008, 'n_steps': 256, 'batch_size': 64, 'gamma': 0.915659405413554, 'exploration_fraction': 0.08612428844489553}
Best value: 0.734319526515901
