In [1]:
import gymnasium as gym
from pogema import GridConfig
from stable_baselines3 import DQN, A2C
from stable_baselines3.common.evaluation import evaluate_policy

%load_ext autoreload
%autoreload 2
%matplotlib inline

grid_config = GridConfig(
    size=16,
    density=0.5,
    num_agents=1,
    max_episode_steps=64
)

env = gym.make("Pogema-v0",grid_config=grid_config)

a2c_model = A2C(
    "MlpPolicy",
    env,
    verbose=1
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


  logger.warn(
  logger.warn(


#### Optuna Integration

In [2]:
""" Optuna example that optimizes the hyperparameters of
a reinforcement learning agent using A2C implementation from Stable-Baselines3
on a Gymnasium environment.

This is a simplified version of what can be found in https://github.com/DLR-RM/rl-baselines3-zoo.

You can run this example as follows:
    $ python sb3_simple.py

"""
from typing import Any
from typing import Dict

import gymnasium
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from stable_baselines3 import A2C
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
import torch
import torch.nn as nn


N_TRIALS = 100
N_STARTUP_TRIALS = 5
N_EVALUATIONS = 2
N_TIMESTEPS = int(1.0e4)
EVAL_FREQ = int(N_TIMESTEPS / N_EVALUATIONS)
N_EVAL_EPISODES = 3


DEFAULT_HYPERPARAMS = {
    "policy": "MlpPolicy",
    "verbose": 1,
    "env": env
}

def sample_a2c_params(trial: optuna.Trial) -> Dict[str, Any]:
    """Sampler for A2C hyperparameters."""
    gamma = 1.0 - trial.suggest_float("gamma", 0.0001, 0.1, log=True)
    learning_rate = trial.suggest_float("lr", 1e-5, 1, log=True)

    trial.set_user_attr("gamma", gamma)

    return {
        "gamma": gamma,
        "learning_rate": learning_rate,
    }


class TrialEvalCallback(EvalCallback):
    """Callback used for evaluating and reporting a trial."""

    def __init__(
        self,
        eval_env: gymnasium.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        self.trial = trial
        self.eval_idx = 0
        self.is_pruned = False

    def _on_step(self) -> bool:
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            super()._on_step()
            self.eval_idx += 1
            self.trial.report(self.last_mean_reward, self.eval_idx)
            # Prune trial if need.
            if self.trial.should_prune():
                self.is_pruned = True
                return False
        return True


def objective(trial: optuna.Trial) -> float:
    kwargs = DEFAULT_HYPERPARAMS.copy()
    # Sample hyperparameters.
    kwargs.update(sample_a2c_params(trial))
    # Create the RL model.
    model = A2C(**kwargs)
    # Create env used for evaluation.
    eval_env = Monitor(env)
    # Create the callback that will periodically evaluate and report the performance.
    eval_callback = TrialEvalCallback(
        eval_env, trial, n_eval_episodes=N_EVAL_EPISODES, eval_freq=EVAL_FREQ, deterministic=True
    )

    nan_encountered = False
    try:
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN.
        print(e)
        nan_encountered = True
    finally:
        # Free memory.
        model.env.close()
        eval_env.close()

    # Tell the optimizer that the trial failed.
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward


if __name__ == "__main__":
    # Set pytorch num threads to 1 for faster training.
    torch.set_num_threads(1)

    sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
    # Do not prune before 1/3 of the max budget is used.
    pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3)

    study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")
    try:
        study.optimize(objective, n_trials=N_TRIALS, n_jobs=4, timeout=600)
    except KeyboardInterrupt:
        pass

    print("Number of finished trials: ", len(study.trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    print("  User attrs:")
    for key, value in trial.user_attrs.items():
        print("    {}: {}".format(key, value))


  from .autonotebook import tqdm as notebook_tqdm
[I 2023-11-10 17:19:25,386] A new study created in memory with name: no-name-52c464f8-98df-4fb4-8bb4-2184d70c7a25


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 38       |
|    ep_rew_mean        | 0.769    |
| time/                 |          |
|    fps                | 207      |
|    iterations         | 100      |
|    time_elapsed       | 2        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | 0.993    |
|    learning_rate      | 1.01e-05 |
|    n_updates          | 99       |
|    policy_loss        | -0.436   |
|    value_loss         | 0.0738   |
--------------------------------

[I 2023-11-10 17:20:20,319] Trial 1 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.0006524574108032467, 'lr': 2.3087090920528374e-05}. Best is trial 1 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 81.7      |
|    mean_reward        | 0         |
| time/                 |           |
|    total_timesteps    | 10000     |
| train/                |           |
|    entropy_loss       | -4.02e-08 |
|    explained_variance | 5.96e-08  |
|    learning_rate      | 0.474     |
|    n_updates          | 1999      |
|    policy_loss        | -0        |
|    value_loss         | 4.75e-06  |
-------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 76.7     |
|    ep_rew_mean     | 0        |
| time/              |          |
|    fps             | 181      |
|    iterations      | 2000     |
|    time_elapsed    | 55       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:20:20,500] Trial 0 finished with value: 0.0 and parameters: {'gamma': 0.0004218386393968535, 'lr': 0.47353318794973187}. Best is trial 1 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 35.3      |
|    mean_reward        | 0.333     |
| time/                 |           |
|    total_timesteps    | 10000     |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -2.36e+04 |
|    learning_rate      | 1.01e-05  |
|    n_updates          | 1999      |
|    policy_loss        | 0.459     |
|    value_loss         | 0.153     |
-------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 37.1     |
|    ep_rew_mean     | 0.52     |
| time/              |          |
|    fps             | 179      |
|    iterations      | 2000     |
|    time_elapsed    | 55       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:20:21,035] Trial 2 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.001355851714624764, 'lr': 1.0080639948900417e-05}. Best is trial 1 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 44.3      |
|    mean_reward        | 0.333     |
| time/                 |           |
|    total_timesteps    | 10000     |
| train/                |           |
|    entropy_loss       | -1.66e-07 |
|    explained_variance | 1.19e-07  |
|    learning_rate      | 0.0489    |
|    n_updates          | 1999      |
|    policy_loss        | -0        |
|    value_loss         | 0.385     |
-------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 42.7     |
|    ep_rew_mean     | 0.37     |
| time/              |          |
|    fps             | 178      |
|    iterations      | 2000     |
|    time_elapsed    | 56       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:20:21,489] Trial 3 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.00015435647997484727, 'lr': 0.04894624732030454}. Best is trial 1 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 33.8      |
|    ep_rew_mean        | 0.5       |
| time/                 |           |
|    fps                | 177       |
|    iterations         | 100       |
|    time_elapsed       | 2         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.57     |
|    explained_variance | -5.48e+04 |
|    learning_rate      | 0.000601  |
|    n_updates          | 99        |
|    policy_loss        | 0.405     |
|    value_loss         | 0.159     |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 40.6     |
|    ep_rew_mean        | 0.667    |
| time/                 |          |
|    fps                | 173      |
|    iterations         | 100      |
|

[I 2023-11-10 17:21:20,520] Trial 4 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.003855494842587123, 'lr': 0.0006014299621162612}. Best is trial 1 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 50.7     |
|    mean_reward        | 0.667    |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -0       |
|    explained_variance | 0        |
|    learning_rate      | 0.423    |
|    n_updates          | 1999     |
|    policy_loss        | -0       |
|    value_loss         | 4.94e-06 |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 37.6     |
|    ep_rew_mean     | 0.48     |
| time/              |          |
|    fps             | 166      |
|    iterations      | 2000     |
|    time_elapsed    | 60       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:21:21,108] Trial 6 finished with value: 0.6666666666666666 and parameters: {'gamma': 0.0005665530132925133, 'lr': 0.4234305103244359}. Best is trial 6 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 6.67      |
|    mean_reward        | 1         |
| time/                 |           |
|    total_timesteps    | 10000     |
| train/                |           |
|    entropy_loss       | -0.0017   |
|    explained_variance | 0.633     |
|    learning_rate      | 0.00134   |
|    n_updates          | 1999      |
|    policy_loss        | -9.74e-07 |
|    value_loss         | 5.04e-05  |
-------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 38.4     |
|    ep_rew_mean     | 0.39     |
| time/              |          |
|    fps             | 163      |
|    iterations      | 2000     |
|    time_elapsed    | 61       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:21:21,583] Trial 5 finished with value: 1.0 and parameters: {'gamma': 0.010626448733732002, 'lr': 0.0013429429973806648}. Best is trial 5 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 60.3      |
|    mean_reward        | 0         |
| time/                 |           |
|    total_timesteps    | 10000     |
| train/                |           |
|    entropy_loss       | -0.0103   |
|    explained_variance | -0.143    |
|    learning_rate      | 0.00117   |
|    n_updates          | 1999      |
|    policy_loss        | -2.49e-06 |
|    value_loss         | 1.29e-05  |
-------------------------------------


[I 2023-11-10 17:21:21,864] Trial 7 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 29       |
|    ep_rew_mean        | 0.471    |
| time/                 |          |
|    fps                | 151      |
|    iterations         | 100      |
|    time_elapsed       | 3        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -0.419   |
|    learning_rate      | 1.07e-05 |
|    n_updates          | 99       |
|    policy_loss        | 0.384    |
|    value_loss         | 0.166    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 35.7     |
|    ep_rew_mean        | 0.643    |
| time/                 |          |
|    fps                | 153      |
|    iterations         | 100      |
|    time_elapsed 

[I 2023-11-10 17:21:53,738] Trial 10 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 56        |
|    mean_reward        | 0.333     |
| time/                 |           |
|    total_timesteps    | 5000      |
| train/                |           |
|    entropy_loss       | -0.000381 |
|    explained_variance | -1.19e-07 |
|    learning_rate      | 0.00968   |
|    n_updates          | 999       |
|    policy_loss        | -8.36e-07 |
|    value_loss         | 0.000803  |
-------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 43.4     |
|    ep_rew_mean     | 0.42     |
| time/              |          |
|    fps             | 154      |
|    iterations      | 1000     |
|    time_elapsed    | 32       |
|    total_timesteps | 5000     |
---------------------------------
----------------------------------

[I 2023-11-10 17:22:21,150] Trial 11 finished with value: 0.6666666666666666 and parameters: {'gamma': 0.02864939172467701, 'lr': 0.009678438925682426}. Best is trial 5 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 34.7     |
|    mean_reward        | 0.667    |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -0       |
|    explained_variance | 0        |
|    learning_rate      | 0.619    |
|    n_updates          | 1999     |
|    policy_loss        | -0       |
|    value_loss         | 0.988    |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 46.6     |
|    ep_rew_mean     | 0.41     |
| time/              |          |
|    fps             | 166      |
|    iterations      | 2000     |
|    time_elapsed    | 60       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:22:21,309] Trial 9 finished with value: 0.6666666666666666 and parameters: {'gamma': 0.03843148906801046, 'lr': 0.6193807161570288}. Best is trial 5 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 59       |
|    mean_reward        | 0.333    |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -4.42    |
|    learning_rate      | 1.07e-05 |
|    n_updates          | 1999     |
|    policy_loss        | -0.038   |
|    value_loss         | 0.00238  |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 41.4     |
|    ep_rew_mean     | 0.46     |
| time/              |          |
|    fps             | 163      |
|    iterations      | 2000     |
|    time_elapsed    | 61       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:22:21,567] Trial 8 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.05258511801339614, 'lr': 1.0700743295887206e-05}. Best is trial 5 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 33.6     |
|    ep_rew_mean        | 0.786    |
| time/                 |          |
|    fps                | 243      |
|    iterations         | 100      |
|    time_elapsed       | 2        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -562     |
|    learning_rate      | 0.000166 |
|    n_updates          | 99       |
|    policy_loss        | -0.281   |
|    value_loss         | 0.0525   |
------------------------------------
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 32.3      |
|    mean_reward        | 0         |
| time/                 |           |
|    total_timesteps    | 5000      |
| train/                |           |
|    entrop

[I 2023-11-10 17:22:23,796] Trial 12 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 28.9      |
|    ep_rew_mean        | 0.588     |
| time/                 |           |
|    fps                | 176       |
|    iterations         | 100       |
|    time_elapsed       | 2         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -5.14e-09 |
|    explained_variance | -2.5e-06  |
|    learning_rate      | 0.0517    |
|    n_updates          | 99        |
|    policy_loss        | -0        |
|    value_loss         | 0.205     |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 49.1      |
|    ep_rew_mean        | 0.333     |
| time/                 |           |
|    fps                | 177       |
|    iterations         | 100   

[I 2023-11-10 17:22:40,806] Trial 13 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 57.1      |
|    ep_rew_mean        | 0         |
| time/                 |           |
|    fps                | 287       |
|    iterations         | 100       |
|    time_elapsed       | 1         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -4.82e-24 |
|    explained_variance | -2.74e-06 |
|    learning_rate      | 0.13      |
|    n_updates          | 99        |
|    policy_loss        | -0        |
|    value_loss         | 0.0556    |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 47.7      |
|    ep_rew_mean        | 0.446     |
| time/                 |           |
|    fps                | 189       |
|    iterations         | 800   

[I 2023-11-10 17:22:48,189] Trial 15 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 90.7      |
|    ep_rew_mean        | 0         |
| time/                 |           |
|    fps                | 270       |
|    iterations         | 400       |
|    time_elapsed       | 7         |
|    total_timesteps    | 2000      |
| train/                |           |
|    entropy_loss       | -4.64e-24 |
|    explained_variance | 0         |
|    learning_rate      | 0.13      |
|    n_updates          | 399       |
|    policy_loss        | -0        |
|    value_loss         | 0.0134    |
-------------------------------------
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 31.3      |
|    mean_reward        | 0.667     |
| time/                 |           |
|    total_timesteps    | 5000      |
| train/                |       

[I 2023-11-10 17:23:00,128] Trial 17 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 50.6     |
|    ep_rew_mean        | 0.327    |
| time/                 |          |
|    fps                | 178      |
|    iterations         | 500      |
|    time_elapsed       | 13       |
|    total_timesteps    | 2500     |
| train/                |          |
|    entropy_loss       | -0.00101 |
|    explained_variance | 0        |
|    learning_rate      | 0.00652  |
|    n_updates          | 499      |
|    policy_loss        | 7.97e-08 |
|    value_loss         | 8.26e-07 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 30.6      |
|    ep_rew_mean        | 0.625     |
| time/                 |           |
|    fps                | 237       |
|    iterations         | 100       |
|    time_e

[I 2023-11-10 17:23:17,973] Trial 14 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.005038165666964593, 'lr': 0.05173141541708777}. Best is trial 5 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 35.3      |
|    ep_rew_mean        | 0.42      |
| time/                 |           |
|    fps                | 241       |
|    iterations         | 900       |
|    time_elapsed       | 18        |
|    total_timesteps    | 4500      |
| train/                |           |
|    entropy_loss       | -0.00188  |
|    explained_variance | 0         |
|    learning_rate      | 0.0058    |
|    n_updates          | 899       |
|    policy_loss        | -5.23e-08 |
|    value_loss         | 1.02e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 43.2      |
|    ep_rew_mean        | 0.5       |
| time/                 |           |
|    fps                | 177       |
|    iterations         | 1100  

[I 2023-11-10 17:23:19,970] Trial 16 finished with value: 0.0 and parameters: {'gamma': 0.007263661850270324, 'lr': 0.08415730331927479}. Best is trial 5 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 71        |
|    ep_rew_mean        | 0         |
| time/                 |           |
|    fps                | 174       |
|    iterations         | 100       |
|    time_elapsed       | 2         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -0.0303   |
|    explained_variance | -3.63     |
|    learning_rate      | 0.0112    |
|    n_updates          | 99        |
|    policy_loss        | -8.95e-07 |
|    value_loss         | 8.08e-08  |
-------------------------------------
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 65.7     |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 5000     |
| train/                |          |
|

[I 2023-11-10 17:23:21,066] Trial 19 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 42.2      |
|    ep_rew_mean        | 0.5       |
| time/                 |           |
|    fps                | 175       |
|    iterations         | 1200      |
|    time_elapsed       | 34        |
|    total_timesteps    | 6000      |
| train/                |           |
|    entropy_loss       | -0.000793 |
|    explained_variance | 0         |
|    learning_rate      | 0.00652   |
|    n_updates          | 1199      |
|    policy_loss        | -7.77e-10 |
|    value_loss         | 1.61e-10  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 83.3      |
|    ep_rew_mean        | 0         |
| time/                 |           |
|    fps                | 190       |
|    iterations         | 100   

[I 2023-11-10 17:23:41,067] Trial 22 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 52.4      |
|    ep_rew_mean        | 0.35      |
| time/                 |           |
|    fps                | 177       |
|    iterations         | 1900      |
|    time_elapsed       | 53        |
|    total_timesteps    | 9500      |
| train/                |           |
|    entropy_loss       | -0.000727 |
|    explained_variance | 0         |
|    learning_rate      | 0.00652   |
|    n_updates          | 1899      |
|    policy_loss        | -1.66e-09 |
|    value_loss         | 9.04e-10  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 60        |
|    ep_rew_mean        | 0         |
| time/                 |           |
|    fps                | 180       |
|    iterations         | 800   

[I 2023-11-10 17:23:45,017] Trial 18 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.007836212823880502, 'lr': 0.006520470478358697}. Best is trial 5 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 47.4      |
|    ep_rew_mean        | 0.381     |
| time/                 |           |
|    fps                | 232       |
|    iterations         | 200       |
|    time_elapsed       | 4         |
|    total_timesteps    | 1000      |
| train/                |           |
|    entropy_loss       | -2.84e-16 |
|    explained_variance | -1.19e-07 |
|    learning_rate      | 0.266     |
|    n_updates          | 199       |
|    policy_loss        | -0        |
|    value_loss         | 0.449     |
-------------------------------------
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 42.3     |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 5000     |
| train/                |          |
|

[I 2023-11-10 17:23:46,214] Trial 20 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 41.7      |
|    ep_rew_mean        | 0.273     |
| time/                 |           |
|    fps                | 224       |
|    iterations         | 100       |
|    time_elapsed       | 2         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -0        |
|    explained_variance | -1.19e-07 |
|    learning_rate      | 0.859     |
|    n_updates          | 99        |
|    policy_loss        | -0        |
|    value_loss         | 4.76      |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 62.2      |
|    ep_rew_mean        | 0.333     |
| time/                 |           |
|    fps                | 237       |
|    iterations         | 300   

[I 2023-11-10 17:23:48,311] Trial 21 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 31.1      |
|    ep_rew_mean        | 0.0625    |
| time/                 |           |
|    fps                | 176       |
|    iterations         | 100       |
|    time_elapsed       | 2         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -3.06e-27 |
|    explained_variance | 0         |
|    learning_rate      | 0.407     |
|    n_updates          | 99        |
|    policy_loss        | -0        |
|    value_loss         | 0.0982    |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 62.2      |
|    ep_rew_mean        | 0.344     |
| time/                 |           |
|    fps                | 234       |
|    iterations         | 400   

[I 2023-11-10 17:24:12,905] Trial 24 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 46.6      |
|    ep_rew_mean        | 0.18      |
| time/                 |           |
|    fps                | 227       |
|    iterations         | 1500      |
|    time_elapsed       | 32        |
|    total_timesteps    | 7500      |
| train/                |           |
|    entropy_loss       | -2.84e-16 |
|    explained_variance | 0         |
|    learning_rate      | 0.266     |
|    n_updates          | 1499      |
|    policy_loss        | -0        |
|    value_loss         | 0.402     |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 33.4     |
|    ep_rew_mean        | 0.46     |
| time/                 |          |
|    fps                | 172      |
|    iterations         | 900      |
|

[I 2023-11-10 17:24:24,774] Trial 23 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.015790499125537757, 'lr': 0.2659186845563921}. Best is trial 5 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 37.3      |
|    ep_rew_mean        | 0.32      |
| time/                 |           |
|    fps                | 174       |
|    iterations         | 1300      |
|    time_elapsed       | 37        |
|    total_timesteps    | 6500      |
| train/                |           |
|    entropy_loss       | -0        |
|    explained_variance | -1.19e-07 |
|    learning_rate      | 0.877     |
|    n_updates          | 1299      |
|    policy_loss        | -0        |
|    value_loss         | 6.14      |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 57.1     |
|    ep_rew_mean        | 0.25     |
| time/                 |          |
|    fps                | 176      |
|    iterations         | 1400     |
|

[I 2023-11-10 17:24:40,450] Trial 27 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 55.8      |
|    ep_rew_mean        | 0.25      |
| time/                 |           |
|    fps                | 174       |
|    iterations         | 1900      |
|    time_elapsed       | 54        |
|    total_timesteps    | 9500      |
| train/                |           |
|    entropy_loss       | -1.5e-31  |
|    explained_variance | -1.19e-07 |
|    learning_rate      | 0.407     |
|    n_updates          | 1899      |
|    policy_loss        | -0        |
|    value_loss         | 0.124     |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 51.9      |
|    ep_rew_mean        | 0.276     |
| time/                 |           |
|    fps                | 240       |
|    iterations         | 800   

[I 2023-11-10 17:24:43,608] Trial 25 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.015033740413923384, 'lr': 0.40678897182383084}. Best is trial 5 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 49.9      |
|    ep_rew_mean        | 0.278     |
| time/                 |           |
|    fps                | 237       |
|    iterations         | 900       |
|    time_elapsed       | 18        |
|    total_timesteps    | 4500      |
| train/                |           |
|    entropy_loss       | -3.68e-14 |
|    explained_variance | 0         |
|    learning_rate      | 0.938     |
|    n_updates          | 899       |
|    policy_loss        | -0        |
|    value_loss         | 0.217     |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 50.2     |
|    ep_rew_mean        | 0        |
| time/                 |          |
|    fps                | 192      |
|    iterations         | 200      |
|

[I 2023-11-10 17:24:46,230] Trial 26 finished with value: 0.0 and parameters: {'gamma': 0.015712636080014067, 'lr': 0.8769590864482943}. Best is trial 5 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 56.4     |
|    ep_rew_mean        | 0.5      |
| time/                 |          |
|    fps                | 180      |
|    iterations         | 100      |
|    time_elapsed       | 2        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0       |
|    explained_variance | 0        |
|    learning_rate      | 0.163    |
|    n_updates          | 99       |
|    policy_loss        | -0       |
|    value_loss         | 0.00467  |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 48.9      |
|    ep_rew_mean        | 0.29      |
| time/                 |           |
|    fps                | 235       |
|    iterations         | 1100      |
|    time_e

[I 2023-11-10 17:25:07,923] Trial 28 finished with value: 0.0 and parameters: {'gamma': 0.0032172390713837862, 'lr': 0.9375840652392353}. Best is trial 5 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 82.3     |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 5000     |
| train/                |          |
|    entropy_loss       | -0       |
|    explained_variance | 0        |
|    learning_rate      | 0.501    |
|    n_updates          | 999      |
|    policy_loss        | -0       |
|    value_loss         | 0.0559   |
------------------------------------


[I 2023-11-10 17:25:08,815] Trial 29 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 38.7      |
|    ep_rew_mean        | 0.32      |
| time/                 |           |
|    fps                | 177       |
|    iterations         | 900       |
|    time_elapsed       | 25        |
|    total_timesteps    | 4500      |
| train/                |           |
|    entropy_loss       | -0        |
|    explained_variance | -1.19e-07 |
|    learning_rate      | 0.163     |
|    n_updates          | 899       |
|    policy_loss        | -0        |
|    value_loss         | 0.00195   |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 38.9     |
|    ep_rew_mean        | 0.44     |
| time/                 |          |
|    fps                | 173      |
|    iterations         | 800      |
|

[I 2023-11-10 17:25:28,884] Trial 32 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 66.7      |
|    ep_rew_mean        | 0         |
| time/                 |           |
|    fps                | 171       |
|    iterations         | 700       |
|    time_elapsed       | 20        |
|    total_timesteps    | 3500      |
| train/                |           |
|    entropy_loss       | -2.53e-10 |
|    explained_variance | 0         |
|    learning_rate      | 0.179     |
|    n_updates          | 699       |
|    policy_loss        | -0        |
|    value_loss         | 1.68e-09  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 37.8     |
|    ep_rew_mean        | 0.41     |
| time/                 |          |
|    fps                | 174      |
|    iterations         | 1500     |
|

[I 2023-11-10 17:25:38,755] Trial 33 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 44.6      |
|    ep_rew_mean        | 0.309     |
| time/                 |           |
|    fps                | 237       |
|    iterations         | 500       |
|    time_elapsed       | 10        |
|    total_timesteps    | 2500      |
| train/                |           |
|    entropy_loss       | -3.19e-10 |
|    explained_variance | 0         |
|    learning_rate      | 0.148     |
|    n_updates          | 499       |
|    policy_loss        | -0        |
|    value_loss         | 0.202     |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 41.2     |
|    ep_rew_mean        | 0.44     |
| time/                 |          |
|    fps                | 174      |
|    iterations         | 1900     |
|

[I 2023-11-10 17:25:41,363] Trial 30 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.003023364940930544, 'lr': 0.16310197487811412}. Best is trial 5 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 49.3      |
|    ep_rew_mean        | 0         |
| time/                 |           |
|    fps                | 163       |
|    iterations         | 100       |
|    time_elapsed       | 3         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.9e-10  |
|    explained_variance | -0.000552 |
|    learning_rate      | 0.0343    |
|    n_updates          | 99        |
|    policy_loss        | -0        |
|    value_loss         | 0.0107    |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 46.6      |
|    ep_rew_mean        | 0.329     |
| time/                 |           |
|    fps                | 250       |
|    iterations         | 700   

[I 2023-11-10 17:25:44,073] Trial 31 finished with value: 0.0 and parameters: {'gamma': 0.0029605644529943267, 'lr': 0.14771801481923688}. Best is trial 5 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 48.4      |
|    ep_rew_mean        | 0.321     |
| time/                 |           |
|    fps                | 257       |
|    iterations         | 800       |
|    time_elapsed       | 15        |
|    total_timesteps    | 4000      |
| train/                |           |
|    entropy_loss       | -3.08e-10 |
|    explained_variance | 0         |
|    learning_rate      | 0.148     |
|    n_updates          | 799       |
|    policy_loss        | -0        |
|    value_loss         | 0.201     |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 45.2     |
|    ep_rew_mean        | 0.556    |
| time/                 |          |
|    fps                | 149      |
|    iterations         | 100      |
|

[I 2023-11-10 17:25:47,771] Trial 34 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 52.8     |
|    ep_rew_mean        | 0.333    |
| time/                 |          |
|    fps                | 152      |
|    iterations         | 200      |
|    time_elapsed       | 6        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -0.423   |
|    explained_variance | -31.7    |
|    learning_rate      | 0.00269  |
|    n_updates          | 199      |
|    policy_loss        | 0.000718 |
|    value_loss         | 7.24e-05 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 47.5      |
|    ep_rew_mean        | 0         |
| time/                 |           |
|    fps                | 158       |
|    iterations         | 300       |
|    time_e

[I 2023-11-10 17:26:05,840] Trial 38 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 41.5      |
|    ep_rew_mean        | 0.429     |
| time/                 |           |
|    fps                | 160       |
|    iterations         | 700       |
|    time_elapsed       | 21        |
|    total_timesteps    | 3500      |
| train/                |           |
|    entropy_loss       | -8.99e-05 |
|    explained_variance | 0         |
|    learning_rate      | 0.0339    |
|    n_updates          | 699       |
|    policy_loss        | -5.2e-07  |
|    value_loss         | 0.00562   |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 44.5     |
|    ep_rew_mean        | 0.438    |
| time/                 |          |
|    fps                | 154      |
|    iterations         | 800      |
|

[I 2023-11-10 17:26:11,316] Trial 35 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 38.9     |
|    ep_rew_mean        | 0.4      |
| time/                 |          |
|    fps                | 161      |
|    iterations         | 900      |
|    time_elapsed       | 27       |
|    total_timesteps    | 4500     |
| train/                |          |
|    entropy_loss       | -5.8e-05 |
|    explained_variance | 0        |
|    learning_rate      | 0.0339   |
|    n_updates          | 899      |
|    policy_loss        | 2.79e-07 |
|    value_loss         | 0.00644  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 53.4     |
|    ep_rew_mean        | 0.216    |
| time/                 |          |
|    fps                | 306      |
|    iterations         | 400      |
|    time_elapsed 

[I 2023-11-10 17:26:14,189] Trial 36 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 40.2     |
|    ep_rew_mean        | 0.667    |
| time/                 |          |
|    fps                | 153      |
|    iterations         | 100      |
|    time_elapsed       | 3        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.12    |
|    explained_variance | -6.71    |
|    learning_rate      | 0.00195  |
|    n_updates          | 99       |
|    policy_loss        | -5.4e-05 |
|    value_loss         | 5.85e-05 |
------------------------------------
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 68.3      |
|    mean_reward        | 0.333     |
| time/                 |           |
|    total_timesteps    | 5000      |
| train/                |           |
|    entrop

[I 2023-11-10 17:26:21,930] Trial 39 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 33       |
|    ep_rew_mean        | 0.6      |
| time/                 |          |
|    fps                | 164      |
|    iterations         | 300      |
|    time_elapsed       | 9        |
|    total_timesteps    | 1500     |
| train/                |          |
|    entropy_loss       | -1.58    |
|    explained_variance | -5.79    |
|    learning_rate      | 0.000466 |
|    n_updates          | 299      |
|    policy_loss        | -0.0854  |
|    value_loss         | 0.00376  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 25.9     |
|    ep_rew_mean        | 0.625    |
| time/                 |          |
|    fps                | 306      |
|    iterations         | 100      |
|    time_elapsed 

[I 2023-11-10 17:26:39,483] Trial 42 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 39       |
|    ep_rew_mean        | 0.46     |
| time/                 |          |
|    fps                | 152      |
|    iterations         | 900      |
|    time_elapsed       | 29       |
|    total_timesteps    | 4500     |
| train/                |          |
|    entropy_loss       | -0.695   |
|    explained_variance | -0.0802  |
|    learning_rate      | 0.00195  |
|    n_updates          | 899      |
|    policy_loss        | -0.0409  |
|    value_loss         | 0.00348  |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 47.4      |
|    ep_rew_mean        | 0.44      |
| time/                 |           |
|    fps                | 158       |
|    iterations         | 1800      |
|    time_e

[I 2023-11-10 17:26:44,466] Trial 40 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 20.7     |
|    mean_reward        | 0.333    |
| time/                 |          |
|    total_timesteps    | 5000     |
| train/                |          |
|    entropy_loss       | -1.46    |
|    explained_variance | -18      |
|    learning_rate      | 0.000466 |
|    n_updates          | 999      |
|    policy_loss        | 0.052    |
|    value_loss         | 0.00213  |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 33.1     |
|    ep_rew_mean     | 0.42     |
| time/              |          |
|    fps             | 158      |
|    iterations      | 1000     |
|    time_elapsed    | 31       |
|    total_timesteps | 5000     |
---------------------------------
-------------------------------------
| rollout/

[I 2023-11-10 17:26:47,974] Trial 37 finished with value: 0.0 and parameters: {'gamma': 0.027072528520359358, 'lr': 0.03389940311748451}. Best is trial 5 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 32.5     |
|    ep_rew_mean        | 0.45     |
| time/                 |          |
|    fps                | 158      |
|    iterations         | 1100     |
|    time_elapsed       | 34       |
|    total_timesteps    | 5500     |
| train/                |          |
|    entropy_loss       | -1.33    |
|    explained_variance | -1.12    |
|    learning_rate      | 0.000466 |
|    n_updates          | 1099     |
|    policy_loss        | -0.00693 |
|    value_loss         | 9.28e-05 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 80.9      |
|    ep_rew_mean        | 0         |
| time/                 |           |
|    fps                | 312       |
|    iterations         | 600       |
|    time_e

[I 2023-11-10 17:26:55,594] Trial 43 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 35.2     |
|    ep_rew_mean        | 0.571    |
| time/                 |          |
|    fps                | 319      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.59    |
|    explained_variance | -73.3    |
|    learning_rate      | 0.000232 |
|    n_updates          | 99       |
|    policy_loss        | -0.0279  |
|    value_loss         | 0.089    |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 38.8      |
|    ep_rew_mean        | 0.49      |
| time/                 |           |
|    fps                | 157       |
|    iterations         | 400       |
|    time_e

[I 2023-11-10 17:27:18,414] Trial 41 finished with value: 0.6666666666666666 and parameters: {'gamma': 0.05252125439226925, 'lr': 0.0004664825230118767}. Best is trial 5 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 29.3      |
|    ep_rew_mean        | 0.44      |
| time/                 |           |
|    fps                | 157       |
|    iterations         | 1100      |
|    time_elapsed       | 34        |
|    total_timesteps    | 5500      |
| train/                |           |
|    entropy_loss       | -1.6      |
|    explained_variance | -1.06e+04 |
|    learning_rate      | 0.000263  |
|    n_updates          | 1099      |
|    policy_loss        | 0.00189   |
|    value_loss         | 0.00103   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 43.7      |
|    ep_rew_mean        | 0.48      |
| time/                 |           |
|    fps                | 289       |
|    iterations         | 1400  

[I 2023-11-10 17:27:21,162] Trial 45 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 29.9     |
|    ep_rew_mean        | 0.333    |
| time/                 |          |
|    fps                | 157      |
|    iterations         | 100      |
|    time_elapsed       | 3        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -166     |
|    learning_rate      | 0.000238 |
|    n_updates          | 99       |
|    policy_loss        | 0.111    |
|    value_loss         | 0.0097   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 47.2     |
|    ep_rew_mean        | 0.46     |
| time/                 |          |
|    fps                | 300      |
|    iterations         | 1600     |
|    time_elapsed 

[I 2023-11-10 17:27:28,042] Trial 46 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.00010349789743020243, 'lr': 0.00023198187881877852}. Best is trial 5 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 32.2      |
|    ep_rew_mean        | 0.45      |
| time/                 |           |
|    fps                | 157       |
|    iterations         | 1400      |
|    time_elapsed       | 44        |
|    total_timesteps    | 7000      |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -2.12e+03 |
|    learning_rate      | 0.000263  |
|    n_updates          | 1399      |
|    policy_loss        | -0.0416   |
|    value_loss         | 0.000819  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 40.4     |
|    ep_rew_mean        | 0.417    |
| time/                 |          |
|    fps                | 281      |
|    iterations         | 100      |
|

[I 2023-11-10 17:27:49,281] Trial 44 finished with value: 0.0 and parameters: {'gamma': 0.0002074553158325352, 'lr': 0.000263065692974712}. Best is trial 5 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 35.7      |
|    mean_reward        | 0         |
| time/                 |           |
|    total_timesteps    | 5000      |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -1.87e+03 |
|    learning_rate      | 0.000238  |
|    n_updates          | 999       |
|    policy_loss        | -0.0739   |
|    value_loss         | 0.0074    |
-------------------------------------


[I 2023-11-10 17:27:50,254] Trial 47 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 37.3     |
|    ep_rew_mean        | 0.62     |
| time/                 |          |
|    fps                | 154      |
|    iterations         | 900      |
|    time_elapsed       | 29       |
|    total_timesteps    | 4500     |
| train/                |          |
|    entropy_loss       | -1.09    |
|    explained_variance | 0.0163   |
|    learning_rate      | 0.000979 |
|    n_updates          | 899      |
|    policy_loss        | 0.756    |
|    value_loss         | 0.634    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 33       |
|    ep_rew_mean        | 0.53     |
| time/                 |          |
|    fps                | 279      |
|    iterations         | 1300     |
|    time_elapsed 

[I 2023-11-10 17:28:07,266] Trial 49 finished with value: 1.0 and parameters: {'gamma': 0.039240248167976284, 'lr': 0.0004694029861764035}. Best is trial 5 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 37.2     |
|    ep_rew_mean        | 0.525    |
| time/                 |          |
|    fps                | 156      |
|    iterations         | 600      |
|    time_elapsed       | 19       |
|    total_timesteps    | 3000     |
| train/                |          |
|    entropy_loss       | -1.16    |
|    explained_variance | 0.619    |
|    learning_rate      | 0.00129  |
|    n_updates          | 599      |
|    policy_loss        | 0.0103   |
|    value_loss         | 9.82e-05 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 44       |
|    ep_rew_mean        | 0.412    |
| time/                 |          |
|    fps                | 156      |
|    iterations         | 600      |
|    time_elapsed 

[I 2023-11-10 17:28:26,072] Trial 48 finished with value: 0.0 and parameters: {'gamma': 0.03983968592848979, 'lr': 0.000979004266751471}. Best is trial 5 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 41.1     |
|    ep_rew_mean        | 0.45     |
| time/                 |          |
|    fps                | 152      |
|    iterations         | 1100     |
|    time_elapsed       | 36       |
|    total_timesteps    | 5500     |
| train/                |          |
|    entropy_loss       | -0.704   |
|    explained_variance | 0.878    |
|    learning_rate      | 0.00118  |
|    n_updates          | 1099     |
|    policy_loss        | -0.0684  |
|    value_loss         | 0.00134  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 40.7     |
|    ep_rew_mean        | 0.51     |
| time/                 |          |
|    fps                | 155      |
|    iterations         | 1200     |
|    time_elapsed 

[I 2023-11-10 17:28:29,208] Trial 52 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 70.7     |
|    ep_rew_mean        | 0        |
| time/                 |          |
|    fps                | 155      |
|    iterations         | 100      |
|    time_elapsed       | 3        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.0174  |
|    explained_variance | -560     |
|    learning_rate      | 0.00446  |
|    n_updates          | 99       |
|    policy_loss        | 1.03e-05 |
|    value_loss         | 6.58e-05 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 41.6     |
|    ep_rew_mean        | 0.49     |
| time/                 |          |
|    fps                | 152      |
|    iterations         | 1200     |
|    time_elapsed 

[I 2023-11-10 17:28:43,289] Trial 54 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 58.8      |
|    ep_rew_mean        | 0.196     |
| time/                 |           |
|    fps                | 163       |
|    iterations         | 600       |
|    time_elapsed       | 18        |
|    total_timesteps    | 3000      |
| train/                |           |
|    entropy_loss       | -0.0113   |
|    explained_variance | -2.24     |
|    learning_rate      | 0.00446   |
|    n_updates          | 599       |
|    policy_loss        | -3.63e-07 |
|    value_loss         | 1.13e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 32.4     |
|    ep_rew_mean        | 0.333    |
| time/                 |          |
|    fps                | 215      |
|    iterations         | 100      |
|

[I 2023-11-10 17:28:52,146] Trial 50 finished with value: 0.0 and parameters: {'gamma': 0.043790855281505066, 'lr': 0.001291646084209183}. Best is trial 5 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 32.3     |
|    ep_rew_mean        | 0.443    |
| time/                 |          |
|    fps                | 210      |
|    iterations         | 400      |
|    time_elapsed       | 9        |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -70.7    |
|    learning_rate      | 0.000114 |
|    n_updates          | 399      |
|    policy_loss        | 0.157    |
|    value_loss         | 0.0139   |
------------------------------------
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 40.7     |
|    mean_reward        | 0.667    |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss 

[I 2023-11-10 17:28:53,184] Trial 51 finished with value: 0.6666666666666666 and parameters: {'gamma': 0.057903200164067685, 'lr': 0.0011790136621941496}. Best is trial 5 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 52.3      |
|    ep_rew_mean        | 0.244     |
| time/                 |           |
|    fps                | 162       |
|    iterations         | 900       |
|    time_elapsed       | 27        |
|    total_timesteps    | 4500      |
| train/                |           |
|    entropy_loss       | -0.00371  |
|    explained_variance | -0.00457  |
|    learning_rate      | 0.00446   |
|    n_updates          | 899       |
|    policy_loss        | -7.13e-06 |
|    value_loss         | 0.000445  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 54.6     |
|    ep_rew_mean        | 0.556    |
| time/                 |          |
|    fps                | 166      |
|    iterations         | 100      |
|

[I 2023-11-10 17:29:21,998] Trial 57 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 43       |
|    ep_rew_mean        | 0.46     |
| time/                 |          |
|    fps                | 173      |
|    iterations         | 1100     |
|    time_elapsed       | 31       |
|    total_timesteps    | 5500     |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -66.8    |
|    learning_rate      | 0.000107 |
|    n_updates          | 1099     |
|    policy_loss        | -0.105   |
|    value_loss         | 0.00741  |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 39.3      |
|    ep_rew_mean        | 0.42      |
| time/                 |           |
|    fps                | 221       |
|    iterations         | 1800      |
|    time_e

[I 2023-11-10 17:29:25,168] Trial 53 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.07261464358484622, 'lr': 0.00446159005530824}. Best is trial 5 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 40.9     |
|    ep_rew_mean        | 0.44     |
| time/                 |          |
|    fps                | 220      |
|    iterations         | 1900     |
|    time_elapsed       | 42       |
|    total_timesteps    | 9500     |
| train/                |          |
|    entropy_loss       | -1.59    |
|    explained_variance | -257     |
|    learning_rate      | 0.000114 |
|    n_updates          | 1899     |
|    policy_loss        | -0.0412  |
|    value_loss         | 0.00191  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 43.3     |
|    ep_rew_mean        | 0.47     |
| time/                 |          |
|    fps                | 172      |
|    iterations         | 1200     |
|    time_elapsed 

[I 2023-11-10 17:29:28,827] Trial 55 finished with value: 0.0 and parameters: {'gamma': 0.0399202199651066, 'lr': 0.00011366146972199006}. Best is trial 5 with value: 1.0.


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 39.3     |
|    ep_rew_mean        | 0.51     |
| time/                 |          |
|    fps                | 174      |
|    iterations         | 1300     |
|    time_elapsed       | 37       |
|    total_timesteps    | 6500     |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | 0        |
|    learning_rate      | 0.000107 |
|    n_updates          | 1299     |
|    policy_loss        | -0.00135 |
|    value_loss         | 8.04e-07 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 59.5     |
|    ep_rew_mean        | 0.52     |
| time/                 |          |
|    fps                | 175      |
|    iterations         | 300      |
|    time_elapsed       | 8        |
|    total_timesteps    | 1500     |
| train/                |          |
|

[I 2023-11-10 17:29:42,463] Trial 56 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.034335538973827955, 'lr': 0.00010663459940371851}. Best is trial 5 with value: 1.0.


------------------------------------
| eval/                 |          |
|    mean_ep_length     | 36.7     |
|    mean_reward        | 0.333    |
| time/                 |          |
|    total_timesteps    | 5000     |
| train/                |          |
|    entropy_loss       | -0.00129 |
|    explained_variance | 0.0777   |
|    learning_rate      | 0.00449  |
|    n_updates          | 999      |
|    policy_loss        | 1.05e-06 |
|    value_loss         | 9.97e-05 |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50.5     |
|    ep_rew_mean     | 0.429    |
| time/              |          |
|    fps             | 238      |
|    iterations      | 1000     |
|    time_elapsed    | 20       |
|    total_timesteps | 5000     |
---------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 48.4      |
|    ep_rew_mean        | 0.

[I 2023-11-10 17:29:43,994] Trial 59 pruned. 


-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 55.3      |
|    ep_rew_mean        | 0.34      |
| time/                 |           |
|    fps                | 269       |
|    iterations         | 1200      |
|    time_elapsed       | 22        |
|    total_timesteps    | 6000      |
| train/                |           |
|    entropy_loss       | -0.00118  |
|    explained_variance | 0         |
|    learning_rate      | 0.00449   |
|    n_updates          | 1199      |
|    policy_loss        | -1.49e-06 |
|    value_loss         | 0.000235  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 53.7     |
|    ep_rew_mean        | 0.32     |
| time/                 |          |
|    fps                | 288      |
|    iterations         | 1300     |
|    time_elapsed       | 22       |
|    total_timesteps    | 6500     |
| train/             

[I 2023-11-10 17:29:46,456] Trial 58 finished with value: 0.0 and parameters: {'gamma': 0.07780183194607228, 'lr': 0.004490940691248326}. Best is trial 5 with value: 1.0.


Number of finished trials:  60
Best trial:
  Value:  1.0
  Params: 
    gamma: 0.010626448733732002
    lr: 0.0013429429973806648
  User attrs:
    gamma: 0.989373551266268


### Train agent with best hyper parameters

In [8]:
a2c_model = A2C(
    "MlpPolicy",
    env,
    verbose=1,
    gamma=1-0.005090895994889371,
    learning_rate=0.03807432884729837,
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [9]:
# Train agent and save it
a2c_model.learn(int(1.0e4))
a2c_model.save("saved/a2c_b_mini")

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 64        |
|    ep_rew_mean        | 0         |
| time/                 |           |
|    fps                | 1721      |
|    iterations         | 100       |
|    time_elapsed       | 0         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -5.05e-06 |
|    explained_variance | 0         |
|    learning_rate      | 0.0381    |
|    n_updates          | 99        |
|    policy_loss        | -0        |
|    value_loss         | 2.01e-08  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 60.1      |
|    ep_rew_mean        | 0.0625    |
| time/                 |           |
|    fps                | 1764      |
|    iterations         | 200       |
|    time_elapsed       | 0         |
|    total_timesteps    | 1000      |
| train/    

#### Testing 

In [10]:
a2c_model = A2C.load("saved/a2c_b_mini")

env.reset()

mean_reward, std_reward = evaluate_policy(a2c_model, env, deterministic=True, n_eval_episodes=20)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")



mean_reward:0.20 +/- 0.40


In [11]:
# RANDOM SEED
from IPython.display import SVG, display
from pogema.animation import AnimationMonitor, AnimationConfig

env = AnimationMonitor(env)

def evaluate_success_rate(model, env, num_episodes=1000):
    success_count = 0
    step_array = []
    for i in range(num_episodes):
        print(f'---{i}---')
        obs = env.reset()

        # Check if observation is a tuple and extract the first element if true.
        if isinstance(obs, tuple):
            obs = obs[0]
        max_step = 64
        steps_taken = 0
        done = truncated = False
        while not done and max_step > 0:
            action, _ = model.predict(obs)
            next_obs, reward, done, truncated, info = env.step(action)
            print(action,max_step,success_count,done)
            max_step -= 1
            steps_taken += 1
            # Check if next_obs is a tuple and extract the first element if true.
            if isinstance(next_obs, tuple):
                next_obs = next_obs[0]
            obs = next_obs

            # Check if agent was successful in that episode.
            if done:
                success_count += 1
                step_array.append(steps_taken)
                env.save_animation(f"render{i}.svg", AnimationConfig(egocentric_idx=0))
                break

    success_rate = success_count / num_episodes
    return success_rate, step_array

success_rate,step_array = evaluate_success_rate(a2c_model, env)
print(f"Agent Success Rate: {success_rate * 100:.2f}%")
print(f"steps to termination : {step_array}")

---0---
4 64 0 False
4 63 0 False
4 62 0 False
4 61 0 False
4 60 0 False
4 59 0 False
4 58 0 False
4 57 0 False
4 56 0 False
4 55 0 False
4 54 0 False
4 53 0 False
4 52 0 False
4 51 0 False
4 50 0 False
4 49 0 False
4 48 0 False
4 47 0 False
4 46 0 False
4 45 0 False
4 44 0 False
4 43 0 False
4 42 0 False
4 41 0 False
4 40 0 False
4 39 0 False
4 38 0 False
4 37 0 False
4 36 0 False
4 35 0 False
4 34 0 False
4 33 0 False
4 32 0 False
4 31 0 False
4 30 0 False
4 29 0 False
4 28 0 False
4 27 0 False
4 26 0 False
4 25 0 False
4 24 0 False
4 23 0 False
4 22 0 False
4 21 0 False
4 20 0 False
4 19 0 False
4 18 0 False
4 17 0 False
4 16 0 False
4 15 0 False
4 14 0 False
4 13 0 False
4 12 0 False
4 11 0 False
4 10 0 False
4 9 0 False
4 8 0 False
4 7 0 False
4 6 0 False
4 5 0 False
4 4 0 False
4 3 0 False
4 2 0 False
4 1 0 False
---1---
4 64 0 False
4 63 0 False
4 62 0 False
4 61 0 False
4 60 0 False
4 59 0 False
4 58 0 False
4 57 0 False
4 56 0 False
4 55 0 False
4 54 0 False
4 53 0 False
4 52 