In [1]:
from stable_baselines3 import PPO, DQN
from stable_baselines3.common.evaluation import evaluate_policy
import gymnasium as gym
import gym_gridworlds
from gym_gridworlds.observation_wrappers import MatrixWithGoalWrapper, AddGoalWrapper
import optuna

  from .autonotebook import tqdm as notebook_tqdm


## Parameter optimization

In [None]:
def optimize_model(trial):
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-2)
    gamma = trial.suggest_uniform("gamma", 0.8, 0.99)
    exploration_fraction = trial.suggest_uniform("exploration_fraction", 0, 0.3)

    env_path = f"Gym-Gridworlds/FourRooms-Original-13x13-v0"


    env = gym.make(env_path,
                   no_stay = True,
                   distance_reward = True,
                   start_pos = None,
                   random_goals = False)
    
    # env = AddGoalWrapper(env)

    model = DQN(
    "MlpPolicy",
    env,
    learning_rate=learning_rate,
    # buffer_size=150000,
    # learning_starts=150,
    gamma=gamma,
    exploration_fraction=exploration_fraction,
    # ent_coef=0.1,
    verbose=1,
    )

    model.learn(total_timesteps = 40000)

    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes = 10)
    return mean_reward

In [45]:
study = optuna.create_study(direction="maximize")
study.optimize(optimize_model, n_trials=20, show_progress_bar=True)


[I 2025-12-09 11:11:49,321] A new study created in memory with name: no-name-f60a62d4-20c2-4801-bfb1-4fcb74cbae92
  0%|          | 0/20 [00:00<?, ?it/s]

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 240      |
|    ep_rew_mean      | -316     |
|    exploration_rate | 0.679    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 5456     |
|    time_elapsed     | 0        |
|    total_timesteps  | 962      |
| train/              |          |
|    learning_rate    | 1.23e-05 |
|    loss             | 0.822    |
|    n_updates        | 215      |
----------------------------------


  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-2)
  gamma = trial.suggest_uniform("gamma", 0.9, 0.9999)
  exploration_fraction = trial.suggest_uniform("exploration_fraction", 0, 0.1)


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 222      |
|    ep_rew_mean      | -283     |
|    exploration_rate | 0.406    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 5105     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1779     |
| train/              |          |
|    learning_rate    | 1.23e-05 |
|    loss             | 0.728    |
|    n_updates        | 419      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 267      |
|    ep_rew_mean      | -314     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 12       |
|    fps              | 4823     |
|    time_elapsed     | 0        |
|    total_timesteps  | 3202     |
| train/              |          |
|    learning_rate    | 1.23e-05 |
|    loss             | 0.672    |
|    n_updates      

Best trial: 0. Best value: -393.696:   5%|▌         | 1/20 [00:09<02:57,  9.35s/it]

[I 2025-12-09 11:11:58,678] Trial 0 finished with value: -393.69599497020243 and parameters: {'learning_rate': 1.2260743031395027e-05, 'gamma': 0.9866273293482073, 'exploration_fraction': 0.07113464685261743}. Best is trial 0 with value: -393.69599497020243.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 121      |
|    ep_rew_mean      | -150     |
|    exploration_rate | 0.869    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 6720     |
|    time_elapsed     | 0        |
|    total_timesteps  | 484      |
| train/              |          |
|    learning_rate    | 1.1e-05  |
|    loss             | 0.873    |
|    n_updates        | 95       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 132      |
|    ep_rew_mean 

Best trial: 0. Best value: -393.696:  10%|█         | 2/20 [00:18<02:48,  9.36s/it]

[I 2025-12-09 11:12:08,048] Trial 1 finished with value: -599.5460066795349 and parameters: {'learning_rate': 1.0989344053236885e-05, 'gamma': 0.927859883465056, 'exploration_fraction': 0.08785552138963995}. Best is trial 0 with value: -393.69599497020243.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 376      |
|    ep_rew_mean      | -428     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 4804     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1502     |
| train/              |          |
|    learning_rate    | 0.000247 |
|    loss             | 0.0674   |
|    n_updates        | 350      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 320      |
|    ep_rew_mean   

Best trial: 2. Best value: -248.216:  15%|█▌        | 3/20 [00:28<02:39,  9.39s/it]

[I 2025-12-09 11:12:17,464] Trial 2 finished with value: -248.2160002797842 and parameters: {'learning_rate': 0.0002467052253826289, 'gamma': 0.9440371229823037, 'exploration_fraction': 0.014385307926520619}. Best is trial 2 with value: -248.2160002797842.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 344      |
|    ep_rew_mean      | -384     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 4722     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1375     |
| train/              |          |
|    learning_rate    | 0.000758 |
|    loss             | 0.00453  |
|    n_updates        | 318      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 396      |
|    ep_rew_mean   

Best trial: 3. Best value: -187.41:  20%|██        | 4/20 [00:37<02:29,  9.36s/it] 

[I 2025-12-09 11:12:26,770] Trial 3 finished with value: -187.41000615656375 and parameters: {'learning_rate': 0.0007583330201885997, 'gamma': 0.9640573818229236, 'exploration_fraction': 0.0006570126346668093}. Best is trial 3 with value: -187.41000615656375.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 377      |
|    ep_rew_mean      | -441     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 4878     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1508     |
| train/              |          |
|    learning_rate    | 0.000294 |
|    loss             | 0.0418   |
|    n_updates        | 351      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 376      |
|    ep_rew_mean

Best trial: 3. Best value: -187.41:  25%|██▌       | 5/20 [00:46<02:20,  9.37s/it]

[I 2025-12-09 11:12:36,162] Trial 4 finished with value: -310.7479959964752 and parameters: {'learning_rate': 0.0002935909414968343, 'gamma': 0.9610025696291566, 'exploration_fraction': 0.016597450202798136}. Best is trial 3 with value: -187.41000615656375.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 403      |
|    ep_rew_mean      | -439     |
|    exploration_rate | 0.61     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 5718     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1611     |
| train/              |          |
|    learning_rate    | 0.000197 |
|    loss             | 0.0961   |
|    n_updates        | 377      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 451      |
|    ep_rew_mean  

Best trial: 3. Best value: -187.41:  30%|███       | 6/20 [00:56<02:10,  9.36s/it]

[I 2025-12-09 11:12:45,496] Trial 5 finished with value: -264.09499670267104 and parameters: {'learning_rate': 0.00019675740865747758, 'gamma': 0.9557958369332604, 'exploration_fraction': 0.09816617514494061}. Best is trial 3 with value: -187.41000615656375.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 249      |
|    ep_rew_mean      | -304     |
|    exploration_rate | 0.281    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 5532     |
|    time_elapsed     | 0        |
|    total_timesteps  | 996      |
| train/              |          |
|    learning_rate    | 0.000536 |
|    loss             | 0.0098   |
|    n_updates        | 223      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 374      |
|    ep_rew_mean 

Best trial: 3. Best value: -187.41:  35%|███▌      | 7/20 [01:05<02:01,  9.37s/it]

[I 2025-12-09 11:12:54,902] Trial 6 finished with value: -331.7629989504814 and parameters: {'learning_rate': 0.000535581970394697, 'gamma': 0.9605182936119997, 'exploration_fraction': 0.03288439175843726}. Best is trial 3 with value: -187.41000615656375.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 300      |
|    ep_rew_mean      | -252     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 4797     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1200     |
| train/              |          |
|    learning_rate    | 0.00443  |
|    loss             | 9.98e-05 |
|    n_updates        | 274      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 335      |
|    ep_rew_mean    

Best trial: 3. Best value: -187.41:  40%|████      | 8/20 [01:14<01:52,  9.36s/it]

[I 2025-12-09 11:13:04,227] Trial 7 finished with value: -319.8920026808977 and parameters: {'learning_rate': 0.004433805122136388, 'gamma': 0.9946346507416757, 'exploration_fraction': 0.011505488356362982}. Best is trial 3 with value: -187.41000615656375.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 104      |
|    ep_rew_mean      | -94.5    |
|    exploration_rate | 0.753    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 6629     |
|    time_elapsed     | 0        |
|    total_timesteps  | 414      |
| train/              |          |
|    learning_rate    | 0.000284 |
|    loss             | 0.408    |
|    n_updates        | 78       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 90.1     |
|    ep_rew_mean   

Best trial: 3. Best value: -187.41:  45%|████▌     | 9/20 [01:24<01:44,  9.47s/it]

[I 2025-12-09 11:13:13,948] Trial 8 finished with value: -293.16099348664284 and parameters: {'learning_rate': 0.00028435808343684106, 'gamma': 0.9578047726981455, 'exploration_fraction': 0.039835245665221834}. Best is trial 3 with value: -187.41000615656375.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 383      |
|    ep_rew_mean      | -433     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 4687     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1532     |
| train/              |          |
|    learning_rate    | 6.42e-05 |
|    loss             | 0.49     |
|    n_updates        | 357      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 442      |
|    ep_rew_mean

Best trial: 3. Best value: -187.41:  50%|█████     | 10/20 [01:34<01:34,  9.48s/it]

[I 2025-12-09 11:13:23,435] Trial 9 finished with value: -319.1379979074001 and parameters: {'learning_rate': 6.42005164784534e-05, 'gamma': 0.9617607343826254, 'exploration_fraction': 0.01711364121997}. Best is trial 3 with value: -187.41000615656375.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 384      |
|    ep_rew_mean      | -424     |
|    exploration_rate | 0.366    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 5527     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1538     |
| train/              |          |
|    learning_rate    | 0.00315  |
|    loss             | 0.000177 |
|    n_updates        | 359      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 442      |
|    ep_rew_mean      |

Best trial: 3. Best value: -187.41:  55%|█████▌    | 11/20 [01:43<01:25,  9.49s/it]

[I 2025-12-09 11:13:32,955] Trial 10 finished with value: -189.05000509917735 and parameters: {'learning_rate': 0.0031529667341791195, 'gamma': 0.9038639434567716, 'exploration_fraction': 0.05763435765569747}. Best is trial 3 with value: -187.41000615656375.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 222      |
|    ep_rew_mean      | -283     |
|    exploration_rate | 0.651    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 5966     |
|    time_elapsed     | 0        |
|    total_timesteps  | 886      |
| train/              |          |
|    learning_rate    | 0.00413  |
|    loss             | 0.00138  |
|    n_updates        | 196      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 142      |
|    ep_rew_mean 

Best trial: 3. Best value: -187.41:  60%|██████    | 12/20 [01:53<01:15,  9.49s/it]

[I 2025-12-09 11:13:42,458] Trial 11 finished with value: -197.02399439513684 and parameters: {'learning_rate': 0.004125061801836976, 'gamma': 0.9027252304320699, 'exploration_fraction': 0.06022283813004535}. Best is trial 3 with value: -187.41000615656375.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 285      |
|    ep_rew_mean      | -348     |
|    exploration_rate | 0.473    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 4694     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1140     |
| train/              |          |
|    learning_rate    | 0.00141  |
|    loss             | 0.00127  |
|    n_updates        | 259      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 331      |
|    ep_rew_mean  

Best trial: 12. Best value: -174.896:  65%|██████▌   | 13/20 [02:02<01:06,  9.49s/it]

[I 2025-12-09 11:13:51,939] Trial 12 finished with value: -174.89599908590316 and parameters: {'learning_rate': 0.0014064767678852608, 'gamma': 0.9015010469594037, 'exploration_fraction': 0.05139468680051764}. Best is trial 12 with value: -174.89599908590316.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 243      |
|    ep_rew_mean      | -284     |
|    exploration_rate | 0.345    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 5662     |
|    time_elapsed     | 0        |
|    total_timesteps  | 973      |
| train/              |          |
|    learning_rate    | 0.00153  |
|    loss             | 0.00338  |
|    n_updates        | 218      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 372      |
|    ep_rew_mean

Best trial: 12. Best value: -174.896:  70%|███████   | 14/20 [02:12<00:56,  9.48s/it]

[I 2025-12-09 11:14:01,385] Trial 13 finished with value: -316.9469946861267 and parameters: {'learning_rate': 0.0015271586880596826, 'gamma': 0.9784416475427241, 'exploration_fraction': 0.035289912886496265}. Best is trial 12 with value: -174.89599908590316.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 500      |
|    ep_rew_mean      | -578     |
|    exploration_rate | 0.379    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 5458     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2000     |
| train/              |          |
|    learning_rate    | 0.00116  |
|    loss             | 0.000125 |
|    n_updates        | 474      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 441      |
|    ep_rew_mean

Best trial: 14. Best value: -157.199:  75%|███████▌  | 15/20 [02:21<00:47,  9.55s/it]

[I 2025-12-09 11:14:11,116] Trial 14 finished with value: -157.1990006506443 and parameters: {'learning_rate': 0.0011557394961492881, 'gamma': 0.9280825624354934, 'exploration_fraction': 0.07643557080564387}. Best is trial 14 with value: -157.1990006506443.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 276      |
|    ep_rew_mean      | -315     |
|    exploration_rate | 0.654    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 5764     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1104     |
| train/              |          |
|    learning_rate    | 0.00969  |
|    loss             | 0.000496 |
|    n_updates        | 250      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 299      |
|    ep_rew_mean  

Best trial: 14. Best value: -157.199:  80%|████████  | 16/20 [02:31<00:38,  9.67s/it]

[I 2025-12-09 11:14:21,070] Trial 15 finished with value: -265.5089978337288 and parameters: {'learning_rate': 0.00968965934585395, 'gamma': 0.923406120414499, 'exploration_fraction': 0.07574307751583995}. Best is trial 14 with value: -157.1990006506443.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 156      |
|    ep_rew_mean      | -185     |
|    exploration_rate | 0.799    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 6285     |
|    time_elapsed     | 0        |
|    total_timesteps  | 624      |
| train/              |          |
|    learning_rate    | 0.00129  |
|    loss             | 0.0413   |
|    n_updates        | 130      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 212      |
|    ep_rew_mean     

Best trial: 14. Best value: -157.199:  85%|████████▌ | 17/20 [02:41<00:28,  9.61s/it]

[I 2025-12-09 11:14:30,538] Trial 16 finished with value: -284.51700420081613 and parameters: {'learning_rate': 0.0012885401640729329, 'gamma': 0.9201216669565845, 'exploration_fraction': 0.07384082640916315}. Best is trial 14 with value: -157.1990006506443.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 99.8     |
|    ep_rew_mean      | -106     |
|    exploration_rate | 0.803    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 6801     |
|    time_elapsed     | 0        |
|    total_timesteps  | 399      |
| train/              |          |
|    learning_rate    | 8.38e-05 |
|    loss             | 0.576    |
|    n_updates        | 74       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 80.1     |
|    ep_rew_mean 

Best trial: 14. Best value: -157.199:  90%|█████████ | 18/20 [02:50<00:19,  9.58s/it]

[I 2025-12-09 11:14:40,034] Trial 17 finished with value: -566.2859941184521 and parameters: {'learning_rate': 8.380420622773312e-05, 'gamma': 0.9377067201420017, 'exploration_fraction': 0.04814341785992329}. Best is trial 14 with value: -157.1990006506443.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 226      |
|    ep_rew_mean      | -271     |
|    exploration_rate | 0.748    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 6028     |
|    time_elapsed     | 0        |
|    total_timesteps  | 903      |
| train/              |          |
|    learning_rate    | 0.00172  |
|    loss             | 0.0112   |
|    n_updates        | 200      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 281      |
|    ep_rew_mean  

Best trial: 14. Best value: -157.199:  95%|█████████▌| 19/20 [03:00<00:09,  9.54s/it]

[I 2025-12-09 11:14:49,491] Trial 18 finished with value: -240.00000173449516 and parameters: {'learning_rate': 0.0017239154091343019, 'gamma': 0.9136295996899851, 'exploration_fraction': 0.0851519765961872}. Best is trial 14 with value: -157.1990006506443.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 88       |
|    ep_rew_mean      | -115     |
|    exploration_rate | 0.843    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 7106     |
|    time_elapsed     | 0        |
|    total_timesteps  | 352      |
| train/              |          |
|    learning_rate    | 0.000666 |
|    loss             | 0.586    |
|    n_updates        | 62       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 138      |
|    ep_rew_mean  

Best trial: 14. Best value: -157.199: 100%|██████████| 20/20 [03:09<00:00,  9.48s/it]

[I 2025-12-09 11:14:58,962] Trial 19 finished with value: -247.3080027192831 and parameters: {'learning_rate': 0.0006658254864602453, 'gamma': 0.9350507544705394, 'exploration_fraction': 0.05328981100979126}. Best is trial 14 with value: -157.1990006506443.





In [46]:
print("Best params:", study.best_params)
print("Best value:", study.best_value)


Best params: {'learning_rate': 0.0011557394961492881, 'gamma': 0.9280825624354934, 'exploration_fraction': 0.07643557080564387}
Best value: -157.1990006506443
