In [1]:
import gymnasium as gym
from pogema import GridConfig
from stable_baselines3 import DQN, A2C
from stable_baselines3.common.evaluation import evaluate_policy

%load_ext autoreload
%autoreload 2
%matplotlib inline

grid_config = GridConfig(
    size=8,
    density=0.3,
    num_agents=1,
    max_episode_steps=64
)

env = gym.make("Pogema-v0",grid_config=grid_config)

a2c_model = A2C(
    "MlpPolicy",
    env,
    verbose=1
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


  logger.warn(
  logger.warn(


#### Optuna Integration

In [2]:
""" Optuna example that optimizes the hyperparameters of
a reinforcement learning agent using A2C implementation from Stable-Baselines3
on a Gymnasium environment.

This is a simplified version of what can be found in https://github.com/DLR-RM/rl-baselines3-zoo.

You can run this example as follows:
    $ python sb3_simple.py

"""
from typing import Any
from typing import Dict

import gymnasium
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from stable_baselines3 import A2C
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
import torch
import torch.nn as nn


N_TRIALS = 100
N_STARTUP_TRIALS = 5
N_EVALUATIONS = 2
N_TIMESTEPS = int(1.0e4)
EVAL_FREQ = int(N_TIMESTEPS / N_EVALUATIONS)
N_EVAL_EPISODES = 3


DEFAULT_HYPERPARAMS = {
    "policy": "MlpPolicy",
    "verbose": 1,
    "env": env
}

def sample_a2c_params(trial: optuna.Trial) -> Dict[str, Any]:
    """Sampler for A2C hyperparameters."""
    gamma = 1.0 - trial.suggest_float("gamma", 0.0001, 0.1, log=True)
    learning_rate = trial.suggest_float("lr", 1e-5, 1, log=True)

    trial.set_user_attr("gamma", gamma)

    return {
        "gamma": gamma,
        "learning_rate": learning_rate,
    }


class TrialEvalCallback(EvalCallback):
    """Callback used for evaluating and reporting a trial."""

    def __init__(
        self,
        eval_env: gymnasium.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        self.trial = trial
        self.eval_idx = 0
        self.is_pruned = False

    def _on_step(self) -> bool:
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            super()._on_step()
            self.eval_idx += 1
            self.trial.report(self.last_mean_reward, self.eval_idx)
            # Prune trial if need.
            if self.trial.should_prune():
                self.is_pruned = True
                return False
        return True


def objective(trial: optuna.Trial) -> float:
    kwargs = DEFAULT_HYPERPARAMS.copy()
    # Sample hyperparameters.
    kwargs.update(sample_a2c_params(trial))
    # Create the RL model.
    model = A2C(**kwargs)
    # Create env used for evaluation.
    eval_env = Monitor(env)
    # Create the callback that will periodically evaluate and report the performance.
    eval_callback = TrialEvalCallback(
        eval_env, trial, n_eval_episodes=N_EVAL_EPISODES, eval_freq=EVAL_FREQ, deterministic=True
    )

    nan_encountered = False
    try:
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN.
        print(e)
        nan_encountered = True
    finally:
        # Free memory.
        model.env.close()
        eval_env.close()

    # Tell the optimizer that the trial failed.
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward


if __name__ == "__main__":
    # Set pytorch num threads to 1 for faster training.
    torch.set_num_threads(1)

    sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
    # Do not prune before 1/3 of the max budget is used.
    pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3)

    study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")
    try:
        study.optimize(objective, n_trials=N_TRIALS, n_jobs=4, timeout=600)
    except KeyboardInterrupt:
        pass

    print("Number of finished trials: ", len(study.trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    print("  User attrs:")
    for key, value in trial.user_attrs.items():
        print("    {}: {}".format(key, value))


  from .autonotebook import tqdm as notebook_tqdm
[I 2023-11-10 17:04:43,433] A new study created in memory with name: no-name-429a8ea4-4bf9-4968-95c0-a65f0b107470


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 53.6      |
|    ep_rew_mean        | 0.444     |
| time/                 |           |
|    fps                | 234       |
|    iterations         | 100       |
|    time_elapsed       | 2         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -1.17e+05 |
|    learning_rate      | 0.000114  |
|    n_updates          | 99        |
|    policy_loss        | 0.285     |
|    value_loss         | 0.0753    |
----------------

[I 2023-11-10 17:05:27,437] Trial 2 finished with value: 0.0 and parameters: {'gamma': 0.0003476973944621764, 'lr': 0.0376067695210782}. Best is trial 2 with value: 0.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 37.7     |
|    mean_reward        | 0.333    |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | 0        |
|    learning_rate      | 0.000114 |
|    n_updates          | 1999     |
|    policy_loss        | -0.165   |
|    value_loss         | 0.0109   |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 48.8     |
|    ep_rew_mean     | 0.48     |
| time/              |          |
|    fps             | 221      |
|    iterations      | 2000     |
|    time_elapsed    | 45       |
|    total_timesteps | 10000    |
---------------------------------
-------------------------------------
| eval/   

[I 2023-11-10 17:05:28,627] Trial 3 finished with value: 0.0 and parameters: {'gamma': 0.0003802282720031166, 'lr': 0.00012098464019917173}. Best is trial 2 with value: 0.0.
[I 2023-11-10 17:05:28,640] Trial 1 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.0020336678930584252, 'lr': 0.00011441085777384108}. Best is trial 1 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 56.7     |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -953     |
|    learning_rate      | 7.51e-05 |
|    n_updates          | 1999     |
|    policy_loss        | 0.017    |
|    value_loss         | 0.000329 |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 47.6     |
|    ep_rew_mean     | 0.34     |
| time/              |          |
|    fps             | 218      |
|    iterations      | 2000     |
|    time_elapsed    | 45       |
|    total_timesteps |

[I 2023-11-10 17:05:29,288] Trial 0 finished with value: 0.0 and parameters: {'gamma': 0.0003440645335876197, 'lr': 7.509375249075675e-05}. Best is trial 1 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 65.6      |
|    ep_rew_mean        | 0.286     |
| time/                 |           |
|    fps                | 256       |
|    iterations         | 100       |
|    time_elapsed       | 1         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -8.57e+06 |
|    learning_rate      | 0.000179  |
|    n_updates          | 99        |
|    policy_loss        | -0.218    |
|    value_loss         | 0.0302    |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 41.3     |
|    ep_rew_mean        | 0.167    |
| time/                 |          |
|    fps                | 231      |
|    iterations         | 100      |
|

[I 2023-11-10 17:06:11,143] Trial 4 finished with value: 0.0 and parameters: {'gamma': 0.037696807311298215, 'lr': 0.0001787218411933565}. Best is trial 1 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 18.3     |
|    mean_reward        | 0.667    |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -1.32    |
|    explained_variance | -88.3    |
|    learning_rate      | 0.000621 |
|    n_updates          | 1999     |
|    policy_loss        | 0.0103   |
|    value_loss         | 0.00214  |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 49       |
|    ep_rew_mean     | 0.37     |
| time/              |          |
|    fps             | 232      |
|    iterations      | 2000     |
|    time_elapsed    | 43       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:06:11,692] Trial 6 finished with value: 0.6666666666666666 and parameters: {'gamma': 0.0877339550531947, 'lr': 0.0006213653088768164}. Best is trial 6 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 38        |
|    mean_reward        | 0         |
| time/                 |           |
|    total_timesteps    | 10000     |
| train/                |           |
|    entropy_loss       | -2.88e-10 |
|    explained_variance | 0         |
|    learning_rate      | 0.0402    |
|    n_updates          | 1999      |
|    policy_loss        | -0        |
|    value_loss         | 0.00867   |
-------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 62       |
|    ep_rew_mean     | 0        |
| time/              |          |
|    fps             | 229      |
|    iterations      | 2000     |
|    time_elapsed    | 43       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:06:12,241] Trial 5 finished with value: 0.0 and parameters: {'gamma': 0.02029355612959305, 'lr': 0.040188999303229285}. Best is trial 6 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 56.3     |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -0.00147 |
|    explained_variance | -5.9     |
|    learning_rate      | 0.00283  |
|    n_updates          | 1999     |
|    policy_loss        | 1.92e-06 |
|    value_loss         | 0.000234 |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 53.2     |
|    ep_rew_mean     | 0.22     |
| time/              |          |
|    fps             | 230      |
|    iterations      | 2000     |
|    time_elapsed    | 43       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:06:12,657] Trial 7 finished with value: 0.0 and parameters: {'gamma': 0.057693720931998545, 'lr': 0.0028334776825732055}. Best is trial 6 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 57.7      |
|    ep_rew_mean        | 0         |
| time/                 |           |
|    fps                | 229       |
|    iterations         | 100       |
|    time_elapsed       | 2         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -0        |
|    explained_variance | -1.19e-07 |
|    learning_rate      | 0.88      |
|    n_updates          | 99        |
|    policy_loss        | -0        |
|    value_loss         | 0.31      |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 61.2      |
|    ep_rew_mean        | 0         |
| time/                 |           |
|    fps                | 217       |
|    iterations         | 100   

[I 2023-11-10 17:06:55,367] Trial 9 finished with value: 0.0 and parameters: {'gamma': 0.06010935449013103, 'lr': 0.6746216263755623}. Best is trial 6 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 89.7     |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -0       |
|    explained_variance | 1.19e-07 |
|    learning_rate      | 0.88     |
|    n_updates          | 1999     |
|    policy_loss        | -0       |
|    value_loss         | 0.0165   |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 62.4     |
|    ep_rew_mean     | 0.12     |
| time/              |          |
|    fps             | 225      |
|    iterations      | 2000     |
|    time_elapsed    | 44       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:06:55,591] Trial 8 finished with value: 0.0 and parameters: {'gamma': 0.0051274472423178535, 'lr': 0.8800324926693016}. Best is trial 6 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 24       |
|    mean_reward        | 0.667    |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -23.6    |
|    learning_rate      | 1.14e-05 |
|    n_updates          | 1999     |
|    policy_loss        | 0.158    |
|    value_loss         | 0.0163   |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 49       |
|    ep_rew_mean     | 0.4      |
| time/              |          |
|    fps             | 230      |
|    iterations      | 2000     |
|    time_elapsed    | 43       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:06:56,131] Trial 11 finished with value: 0.6666666666666666 and parameters: {'gamma': 0.08869933032780512, 'lr': 1.1378362482244722e-05}. Best is trial 6 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 69.7     |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -0       |
|    explained_variance | 5.96e-08 |
|    learning_rate      | 0.984    |
|    n_updates          | 1999     |
|    policy_loss        | -0       |
|    value_loss         | 69.3     |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 47.7     |
|    ep_rew_mean     | 0.14     |
| time/              |          |
|    fps             | 227      |
|    iterations      | 2000     |
|    time_elapsed    | 43       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:06:56,216] Trial 10 finished with value: 0.0 and parameters: {'gamma': 0.06273784506151216, 'lr': 0.983745303742442}. Best is trial 6 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 91.8     |
|    ep_rew_mean        | 0.2      |
| time/                 |          |
|    fps                | 221      |
|    iterations         | 100      |
|    time_elapsed       | 2        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.48    |
|    explained_variance | -9.04    |
|    learning_rate      | 0.00125  |
|    n_updates          | 99       |
|    policy_loss        | -0.00786 |
|    value_loss         | 0.000486 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 55.4      |
|    ep_rew_mean        | 0.222     |
| time/                 |           |
|    fps                | 223       |
|    iterations         | 100       |
|    time_e

[I 2023-11-10 17:07:41,647] Trial 12 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.01179369739249944, 'lr': 0.0012453778097373217}. Best is trial 6 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 61.7     |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -265     |
|    learning_rate      | 1.64e-05 |
|    n_updates          | 1999     |
|    policy_loss        | 0.331    |
|    value_loss         | 0.0614   |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 48.2     |
|    ep_rew_mean     | 0.38     |
| time/              |          |
|    fps             | 211      |
|    iterations      | 2000     |
|    time_elapsed    | 47       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:07:43,041] Trial 13 finished with value: 0.0 and parameters: {'gamma': 0.01098949303104898, 'lr': 1.6372360528135037e-05}. Best is trial 6 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 53.7      |
|    mean_reward        | 0         |
| time/                 |           |
|    total_timesteps    | 10000     |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -5.88e+03 |
|    learning_rate      | 1.04e-05  |
|    n_updates          | 1999      |
|    policy_loss        | -0.0656   |
|    value_loss         | 0.103     |
-------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 43.5     |
|    ep_rew_mean     | 0.39     |
| time/              |          |
|    fps             | 211      |
|    iterations      | 2000     |
|    time_elapsed    | 47       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:07:43,471] Trial 14 finished with value: 0.0 and parameters: {'gamma': 0.014165109935163558, 'lr': 1.0385736933884185e-05}. Best is trial 6 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 58.3     |
|    mean_reward        | 0.667    |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -10.1    |
|    learning_rate      | 1.36e-05 |
|    n_updates          | 1999     |
|    policy_loss        | 0.291    |
|    value_loss         | 0.0446   |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50.1     |
|    ep_rew_mean     | 0.33     |
| time/              |          |
|    fps             | 210      |
|    iterations      | 2000     |
|    time_elapsed    | 47       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:07:43,892] Trial 15 finished with value: 0.6666666666666666 and parameters: {'gamma': 0.09653484389471109, 'lr': 1.3633644755765542e-05}. Best is trial 6 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 53.9     |
|    ep_rew_mean        | 0.444    |
| time/                 |          |
|    fps                | 203      |
|    iterations         | 100      |
|    time_elapsed       | 2        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -0.165   |
|    learning_rate      | 3.49e-05 |
|    n_updates          | 99       |
|    policy_loss        | 0.0674   |
|    value_loss         | 0.0022   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 54       |
|    ep_rew_mean        | 0.444    |
| time/                 |          |
|    fps                | 202      |
|    iterations         | 100      |
|    time_elapsed 

[I 2023-11-10 17:08:24,152] Trial 16 finished with value: 0.0 and parameters: {'gamma': 0.09168868766397441, 'lr': 3.48644706636956e-05}. Best is trial 6 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 47.7     |
|    ep_rew_mean        | 0.4      |
| time/                 |          |
|    fps                | 234      |
|    iterations         | 1900     |
|    time_elapsed       | 40       |
|    total_timesteps    | 9500     |
| train/                |          |
|    entropy_loss       | -1.45    |
|    explained_variance | -26.6    |
|    learning_rate      | 0.000439 |
|    n_updates          | 1899     |
|    policy_loss        | 0.0754   |
|    value_loss         | 0.00255  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 57.6     |
|    ep_rew_mean        | 0.3      |
| time/                 |          |
|    fps                | 231      |
|    iterations         | 1900     |
|    time_elapsed 

[I 2023-11-10 17:08:26,358] Trial 17 finished with value: 0.0 and parameters: {'gamma': 0.0980180778257475, 'lr': 1.0594587904618382e-05}. Best is trial 6 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 34       |
|    mean_reward        | 0.667    |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -1.55    |
|    explained_variance | -4.63    |
|    learning_rate      | 0.000439 |
|    n_updates          | 1999     |
|    policy_loss        | 0.0391   |
|    value_loss         | 0.000589 |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 47       |
|    ep_rew_mean     | 0.39     |
| time/              |          |
|    fps             | 234      |
|    iterations      | 2000     |
|    time_elapsed    | 42       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:08:26,539] Trial 19 finished with value: 0.6666666666666666 and parameters: {'gamma': 0.031103932424987595, 'lr': 0.00043925132929670924}. Best is trial 6 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 74.7      |
|    mean_reward        | 0         |
| time/                 |           |
|    total_timesteps    | 10000     |
| train/                |           |
|    entropy_loss       | -0.211    |
|    explained_variance | -6.15     |
|    learning_rate      | 0.00114   |
|    n_updates          | 1999      |
|    policy_loss        | -0.000487 |
|    value_loss         | 0.000985  |
-------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 53.4     |
|    ep_rew_mean     | 0.3      |
| time/              |          |
|    fps             | 230      |
|    iterations      | 2000     |
|    time_elapsed    | 43       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:08:26,920] Trial 18 finished with value: 0.0 and parameters: {'gamma': 0.09352890752440088, 'lr': 0.0011420315658105793}. Best is trial 6 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 55.1     |
|    ep_rew_mean        | 0.389    |
| time/                 |          |
|    fps                | 249      |
|    iterations         | 200      |
|    time_elapsed       | 4        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | 0.18     |
|    learning_rate      | 0.000502 |
|    n_updates          | 199      |
|    policy_loss        | 0.161    |
|    value_loss         | 0.129    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 31.5     |
|    ep_rew_mean        | 0.333    |
| time/                 |          |
|    fps                | 219      |
|    iterations         | 100      |
|    time_elapsed 

[I 2023-11-10 17:09:09,147] Trial 20 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.029176991968283404, 'lr': 0.0005016662435298765}. Best is trial 6 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 50.9     |
|    ep_rew_mean        | 0.37     |
| time/                 |          |
|    fps                | 220      |
|    iterations         | 1900     |
|    time_elapsed       | 43       |
|    total_timesteps    | 9500     |
| train/                |          |
|    entropy_loss       | -1.02    |
|    explained_variance | 0        |
|    learning_rate      | 0.000542 |
|    n_updates          | 1899     |
|    policy_loss        | -0.00996 |
|    value_loss         | 0.000114 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 54.9     |
|    ep_rew_mean        | 0.26     |
| time/                 |          |
|    fps                | 219      |
|    iterations         | 1900     |
|    time_elapsed 

[I 2023-11-10 17:09:11,409] Trial 22 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.029513459858026246, 'lr': 0.0004958244240581512}. Best is trial 6 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 31.8     |
|    ep_rew_mean        | 0.154    |
| time/                 |          |
|    fps                | 211      |
|    iterations         | 100      |
|    time_elapsed       | 2        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -14.6    |
|    learning_rate      | 3.52e-05 |
|    n_updates          | 99       |
|    policy_loss        | -0.00152 |
|    value_loss         | 0.0121   |
------------------------------------
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 54.3     |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss 

[I 2023-11-10 17:09:12,285] Trial 21 finished with value: 0.0 and parameters: {'gamma': 0.02806627344542503, 'lr': 0.000541928914195127}. Best is trial 6 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 46       |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -108     |
|    learning_rate      | 4.38e-05 |
|    n_updates          | 1999     |
|    policy_loss        | -0.327   |
|    value_loss         | 0.0516   |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 57.2     |
|    ep_rew_mean     | 0.25     |
| time/              |          |
|    fps             | 218      |
|    iterations      | 2000     |
|    time_elapsed    | 45       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:09:12,761] Trial 23 finished with value: 0.0 and parameters: {'gamma': 0.02711174746106919, 'lr': 4.380124050414981e-05}. Best is trial 6 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 49.6      |
|    ep_rew_mean        | 0.556     |
| time/                 |           |
|    fps                | 213       |
|    iterations         | 100       |
|    time_elapsed       | 2         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -2.54e+04 |
|    learning_rate      | 3.57e-05  |
|    n_updates          | 99        |
|    policy_loss        | 0.0485    |
|    value_loss         | 0.0534    |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 39        |
|    ep_rew_mean        | 0.28      |
| time/                 |           |
|    fps                | 211       |
|    iterations         | 200   

[I 2023-11-10 17:09:55,443] Trial 24 finished with value: 1.0 and parameters: {'gamma': 0.042613270090891706, 'lr': 3.5229093454971854e-05}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 47.8     |
|    ep_rew_mean        | 0.26     |
| time/                 |          |
|    fps                | 215      |
|    iterations         | 1900     |
|    time_elapsed       | 44       |
|    total_timesteps    | 9500     |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -89.7    |
|    learning_rate      | 3.57e-05 |
|    n_updates          | 1899     |
|    policy_loss        | 0.154    |
|    value_loss         | 0.0315   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 47.1     |
|    ep_rew_mean        | 0.32     |
| time/                 |          |
|    fps                | 214      |
|    iterations         | 1900     |
|    time_elapsed 

[I 2023-11-10 17:09:57,995] Trial 25 finished with value: 0.0 and parameters: {'gamma': 0.043977543823189164, 'lr': 3.565472362525121e-05}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 78.3     |
|    mean_reward        | 0.333    |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | 0        |
|    learning_rate      | 3.49e-05 |
|    n_updates          | 1999     |
|    policy_loss        | -0.0176  |
|    value_loss         | 0.000147 |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 46.7     |
|    ep_rew_mean     | 0.33     |
| time/              |          |
|    fps             | 214      |
|    iterations      | 2000     |
|    time_elapsed    | 46       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:09:59,443] Trial 27 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.05624144931023298, 'lr': 3.489424412095154e-05}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 59.2     |
|    ep_rew_mean        | 0.375    |
| time/                 |          |
|    fps                | 218      |
|    iterations         | 200      |
|    time_elapsed       | 4        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -44      |
|    learning_rate      | 3.34e-05 |
|    n_updates          | 199      |
|    policy_loss        | 0.0172   |
|    value_loss         | 0.00704  |
------------------------------------
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 43.3     |
|    mean_reward        | 0.333    |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss 

[I 2023-11-10 17:10:00,350] Trial 26 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.049252283428299924, 'lr': 3.6836060485108124e-05}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 53.2     |
|    ep_rew_mean        | 0.222    |
| time/                 |          |
|    fps                | 205      |
|    iterations         | 100      |
|    time_elapsed       | 2        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -23.4    |
|    learning_rate      | 3.47e-05 |
|    n_updates          | 99       |
|    policy_loss        | -0.304   |
|    value_loss         | 0.0754   |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 45.3      |
|    ep_rew_mean        | 0.455     |
| time/                 |           |
|    fps                | 211       |
|    iterations         | 100       |
|    time_e

[I 2023-11-10 17:10:41,966] Trial 28 finished with value: 0.6666666666666666 and parameters: {'gamma': 0.04771858292103918, 'lr': 3.337266355619559e-05}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 49.4     |
|    ep_rew_mean        | 0.27     |
| time/                 |          |
|    fps                | 215      |
|    iterations         | 1900     |
|    time_elapsed       | 44       |
|    total_timesteps    | 9500     |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -215     |
|    learning_rate      | 3.47e-05 |
|    n_updates          | 1899     |
|    policy_loss        | -0.0693  |
|    value_loss         | 0.00655  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 50.6     |
|    ep_rew_mean        | 0.29     |
| time/                 |          |
|    fps                | 218      |
|    iterations         | 1900     |
|    time_elapsed 

[I 2023-11-10 17:10:44,522] Trial 29 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.048879198092471134, 'lr': 3.465513495564522e-05}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 34.7      |
|    ep_rew_mean        | 0.357     |
| time/                 |           |
|    fps                | 193       |
|    iterations         | 100       |
|    time_elapsed       | 2         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -3.33e+03 |
|    learning_rate      | 7.42e-05  |
|    n_updates          | 99        |
|    policy_loss        | -0.265    |
|    value_loss         | 0.18      |
-------------------------------------
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 44.3     |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|

[I 2023-11-10 17:10:46,469] Trial 30 finished with value: 0.0 and parameters: {'gamma': 0.016944191248333993, 'lr': 0.00015804919997643675}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 51.9     |
|    ep_rew_mean     | 0.29     |
| time/              |          |
|    fps             | 216      |
|    iterations      | 2000     |
|    time_elapsed    | 46       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:10:46,509] Trial 31 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.016757483467215915, 'lr': 0.0001866898039243518}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 45.5      |
|    ep_rew_mean        | 0.286     |
| time/                 |           |
|    fps                | 204       |
|    iterations         | 200       |
|    time_elapsed       | 4         |
|    total_timesteps    | 1000      |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -1.28e+05 |
|    learning_rate      | 7.42e-05  |
|    n_updates          | 199       |
|    policy_loss        | 0.146     |
|    value_loss         | 0.0239    |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 69.1     |
|    ep_rew_mean        | 0.714    |
| time/                 |          |
|    fps                | 210      |
|    iterations         | 100      |
|

[I 2023-11-10 17:11:25,214] Trial 32 finished with value: 0.0 and parameters: {'gamma': 0.017853867880259944, 'lr': 7.42119021509752e-05}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 44.8     |
|    ep_rew_mean        | 0.38     |
| time/                 |          |
|    fps                | 244      |
|    iterations         | 1900     |
|    time_elapsed       | 38       |
|    total_timesteps    | 9500     |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -21.5    |
|    learning_rate      | 1.78e-05 |
|    n_updates          | 1899     |
|    policy_loss        | 0.146    |
|    value_loss         | 0.0403   |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 52.4      |
|    ep_rew_mean        | 0.29      |
| time/                 |           |
|    fps                | 240       |
|    iterations         | 1900      |
|    time_e

[I 2023-11-10 17:11:26,886] Trial 33 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.02034311431959245, 'lr': 9.797783975097004e-05}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 47.5     |
|    ep_rew_mean        | 0.2      |
| time/                 |          |
|    fps                | 221      |
|    iterations         | 100      |
|    time_elapsed       | 2        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -18.5    |
|    learning_rate      | 1.71e-05 |
|    n_updates          | 99       |
|    policy_loss        | 0.215    |
|    value_loss         | 0.0743   |
------------------------------------
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 130      |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss 

[I 2023-11-10 17:11:27,784] Trial 34 finished with value: 0.0 and parameters: {'gamma': 0.09780088321582509, 'lr': 1.7848304857301413e-05}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 44.3     |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -0.511   |
|    learning_rate      | 1.75e-05 |
|    n_updates          | 1999     |
|    policy_loss        | 0.511    |
|    value_loss         | 0.635    |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 51.3     |
|    ep_rew_mean     | 0.28     |
| time/              |          |
|    fps             | 239      |
|    iterations      | 2000     |
|    time_elapsed    | 41       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:11:28,392] Trial 35 finished with value: 0.0 and parameters: {'gamma': 0.09505562634673678, 'lr': 1.7536699530925264e-05}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 48.6     |
|    ep_rew_mean        | 0.222    |
| time/                 |          |
|    fps                | 240      |
|    iterations         | 100      |
|    time_elapsed       | 2        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -284     |
|    learning_rate      | 1.5e-05  |
|    n_updates          | 99       |
|    policy_loss        | 0.559    |
|    value_loss         | 0.187    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 62.2     |
|    ep_rew_mean        | 0.188    |
| time/                 |          |
|    fps                | 235      |
|    iterations         | 200      |
|    time_elapsed 

[I 2023-11-10 17:12:07,783] Trial 36 finished with value: 0.0 and parameters: {'gamma': 0.09802268360687398, 'lr': 1.706506645807341e-05}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 89.3     |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -81.6    |
|    learning_rate      | 1.5e-05  |
|    n_updates          | 1999     |
|    policy_loss        | 0.025    |
|    value_loss         | 0.00474  |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 54.5     |
|    ep_rew_mean     | 0.27     |
| time/              |          |
|    fps             | 237      |
|    iterations      | 2000     |
|    time_elapsed    | 42       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:12:08,972] Trial 37 finished with value: 0.0 and parameters: {'gamma': 0.09258940320543768, 'lr': 1.496410845972207e-05}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 34.7      |
|    mean_reward        | 0.333     |
| time/                 |           |
|    total_timesteps    | 10000     |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -1.19e-07 |
|    learning_rate      | 1.74e-05  |
|    n_updates          | 1999      |
|    policy_loss        | -0.0823   |
|    value_loss         | 0.00312   |
-------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 47.5     |
|    ep_rew_mean     | 0.3      |
| time/              |          |
|    fps             | 239      |
|    iterations      | 2000     |
|    time_elapsed    | 41       |
|    total_timesteps | 10000    |
---------------------------------
----------------------------------

[I 2023-11-10 17:12:09,605] Trial 39 finished with value: 0.0 and parameters: {'gamma': 0.07080515473536506, 'lr': 7.203879489550789e-05}. Best is trial 24 with value: 1.0.
[I 2023-11-10 17:12:09,622] Trial 38 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.07401442612210851, 'lr': 1.739610377846516e-05}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 38.1     |
|    ep_rew_mean        | 0.538    |
| time/                 |          |
|    fps                | 251      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -7.25    |
|    learning_rate      | 8.57e-05 |
|    n_updates          | 99       |
|    policy_loss        | 1.25     |
|    value_loss         | 0.651    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 41.9     |
|    ep_rew_mean        | 0.4      |
| time/                 |          

[I 2023-11-10 17:12:48,716] Trial 40 finished with value: 0.0 and parameters: {'gamma': 0.06484307104285858, 'lr': 8.57314109038375e-05}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 73.3     |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -192     |
|    learning_rate      | 7.55e-05 |
|    n_updates          | 1999     |
|    policy_loss        | 0.0455   |
|    value_loss         | 0.00141  |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 45.7     |
|    ep_rew_mean     | 0.38     |
| time/              |          |
|    fps             | 250      |
|    iterations      | 2000     |
|    time_elapsed    | 39       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:12:48,999] Trial 41 finished with value: 0.0 and parameters: {'gamma': 0.06611188712269743, 'lr': 7.548829135917205e-05}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 81       |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | 0        |
|    learning_rate      | 7.55e-05 |
|    n_updates          | 1999     |
|    policy_loss        | -0.0632  |
|    value_loss         | 0.00155  |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 49.3     |
|    ep_rew_mean     | 0.32     |
| time/              |          |
|    fps             | 252      |
|    iterations      | 2000     |
|    time_elapsed    | 39       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:12:49,243] Trial 43 finished with value: 0.0 and parameters: {'gamma': 0.04016495050568748, 'lr': 7.548473300483619e-05}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 44.7     |
|    ep_rew_mean        | 0.273    |
| time/                 |          |
|    fps                | 261      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.6     |
|    explained_variance | 0.52     |
|    learning_rate      | 0.000254 |
|    n_updates          | 99       |
|    policy_loss        | -0.0895  |
|    value_loss         | 0.00973  |
------------------------------------
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 38.7     |
|    mean_reward        | 0.333    |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss 

[I 2023-11-10 17:12:50,745] Trial 42 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.03644356478611209, 'lr': 5.993747744167092e-05}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 36       |
|    ep_rew_mean        | 0.667    |
| time/                 |          |
|    fps                | 240      |
|    iterations         | 100      |
|    time_elapsed       | 2        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -20.1    |
|    learning_rate      | 0.000272 |
|    n_updates          | 99       |
|    policy_loss        | -0.0391  |
|    value_loss         | 0.000848 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 31.2     |
|    ep_rew_mean        | 0.357    |
| time/                 |          |
|    fps                | 210      |
|    iterations         | 100      |
|    time_elapsed 

[I 2023-11-10 17:13:29,083] Trial 44 finished with value: 0.0 and parameters: {'gamma': 0.03870314139041518, 'lr': 0.0002538442476555441}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 34       |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -1.59    |
|    explained_variance | -20.2    |
|    learning_rate      | 0.000253 |
|    n_updates          | 1999     |
|    policy_loss        | -0.0126  |
|    value_loss         | 0.000423 |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 48.1     |
|    ep_rew_mean     | 0.42     |
| time/              |          |
|    fps             | 243      |
|    iterations      | 2000     |
|    time_elapsed    | 41       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:13:30,424] Trial 46 finished with value: 0.0 and parameters: {'gamma': 0.04336188014142975, 'lr': 0.00025272477602438046}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 48.2      |
|    ep_rew_mean        | 0.35      |
| time/                 |           |
|    fps                | 239       |
|    iterations         | 1900      |
|    time_elapsed       | 39        |
|    total_timesteps    | 9500      |
| train/                |           |
|    entropy_loss       | -1.6      |
|    explained_variance | -1.57e+03 |
|    learning_rate      | 0.000222  |
|    n_updates          | 1899      |
|    policy_loss        | 0.13      |
|    value_loss         | 0.011     |
-------------------------------------
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 40.3     |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|

[I 2023-11-10 17:13:30,755] Trial 45 finished with value: 0.0 and parameters: {'gamma': 0.03775478187673999, 'lr': 0.00027178275560107675}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 40.5      |
|    ep_rew_mean        | 0.417     |
| time/                 |           |
|    fps                | 197       |
|    iterations         | 100       |
|    time_elapsed       | 2         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -3.53e+03 |
|    learning_rate      | 0.000274  |
|    n_updates          | 99        |
|    policy_loss        | 0.412     |
|    value_loss         | 0.102     |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 44.5      |
|    ep_rew_mean        | 0.182     |
| time/                 |           |
|    fps                | 238       |
|    iterations         | 100   

[I 2023-11-10 17:13:32,799] Trial 47 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.009498224699289716, 'lr': 0.00022219805878962682}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 59.5     |
|    ep_rew_mean        | 0.5      |
| time/                 |          |
|    fps                | 204      |
|    iterations         | 100      |
|    time_elapsed       | 2        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.0683  |
|    explained_variance | -28      |
|    learning_rate      | 0.00375  |
|    n_updates          | 99       |
|    policy_loss        | 0.000306 |
|    value_loss         | 0.00165  |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 40.5      |
|    ep_rew_mean        | 0.292     |
| time/                 |           |
|    fps                | 218       |
|    iterations         | 200       |
|    time_e

[I 2023-11-10 17:14:12,615] Trial 48 finished with value: 0.0 and parameters: {'gamma': 0.008321377574412703, 'lr': 0.0002739811707783564}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 49.7      |
|    ep_rew_mean        | 0.14      |
| time/                 |           |
|    fps                | 238       |
|    iterations         | 1900      |
|    time_elapsed       | 39        |
|    total_timesteps    | 9500      |
| train/                |           |
|    entropy_loss       | -0.000447 |
|    explained_variance | -1.77e+03 |
|    learning_rate      | 0.00546   |
|    n_updates          | 1899      |
|    policy_loss        | 2.75e-07  |
|    value_loss         | 7.08e-05  |
-------------------------------------
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 61.3     |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|

[I 2023-11-10 17:14:13,011] Trial 49 finished with value: 0.0 and parameters: {'gamma': 0.007536127864358732, 'lr': 0.005285692897607962}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 48.7     |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -0.00062 |
|    explained_variance | -41.2    |
|    learning_rate      | 0.00375  |
|    n_updates          | 1999     |
|    policy_loss        | 7.39e-09 |
|    value_loss         | 5.1e-08  |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 55.9     |
|    ep_rew_mean     | 0.26     |
| time/              |          |
|    fps             | 233      |
|    iterations      | 2000     |
|    time_elapsed    | 42       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:14:13,693] Trial 50 finished with value: 0.0 and parameters: {'gamma': 0.008855750378619045, 'lr': 0.0037521258243010244}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 71        |
|    ep_rew_mean        | 0.429     |
| time/                 |           |
|    fps                | 234       |
|    iterations         | 100       |
|    time_elapsed       | 2         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -2.07e+05 |
|    learning_rate      | 0.000129  |
|    n_updates          | 99        |
|    policy_loss        | 0.29      |
|    value_loss         | 0.109     |
-------------------------------------
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 72.7      |
|    mean_reward        | 0         |
| time/                 |           |
|    total_timesteps    | 10000     |
| train/                |       

[I 2023-11-10 17:14:15,207] Trial 51 finished with value: 0.0 and parameters: {'gamma': 0.06835812912005368, 'lr': 0.0054619785644897875}. Best is trial 24 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 41.7      |
|    ep_rew_mean        | 0.25      |
| time/                 |           |
|    fps                | 205       |
|    iterations         | 100       |
|    time_elapsed       | 2         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -6.47e+03 |
|    learning_rate      | 0.000138  |
|    n_updates          | 99        |
|    policy_loss        | 0.326     |
|    value_loss         | 0.0668    |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 54.1     |
|    ep_rew_mean        | 0.444    |
| time/                 |          |
|    fps                | 229      |
|    iterations         | 100      |
|

[I 2023-11-10 17:14:55,081] Trial 53 finished with value: 0.0 and parameters: {'gamma': 0.001953739159483827, 'lr': 0.00013750348149947072}. Best is trial 24 with value: 1.0.


------------------------------------
| eval/                 |          |
|    mean_ep_length     | 67       |
|    mean_reward        | 0.333    |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -0.37    |
|    learning_rate      | 0.000129 |
|    n_updates          | 1999     |
|    policy_loss        | -0.193   |
|    value_loss         | 0.0155   |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 44.4     |
|    ep_rew_mean     | 0.39     |
| time/              |          |
|    fps             | 235      |
|    iterations      | 2000     |
|    time_elapsed    | 42       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:14:55,169] Trial 52 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.0026673939837890378, 'lr': 0.0001287142281659044}. Best is trial 24 with value: 1.0.


------------------------------------
| eval/                 |          |
|    mean_ep_length     | 55.3     |
|    mean_reward        | 0.333    |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -39.5    |
|    learning_rate      | 1e-05    |
|    n_updates          | 1999     |
|    policy_loss        | -0.698   |
|    value_loss         | 0.234    |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50.4     |
|    ep_rew_mean     | 0.28     |
| time/              |          |
|    fps             | 238      |
|    iterations      | 2000     |
|    time_elapsed    | 41       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:14:55,632] Trial 54 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.06127288433187494, 'lr': 1.0045117614749572e-05}. Best is trial 24 with value: 1.0.


-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 64        |
|    mean_reward        | 0         |
| time/                 |           |
|    total_timesteps    | 10000     |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -2.31e+04 |
|    learning_rate      | 2.44e-05  |
|    n_updates          | 1999      |
|    policy_loss        | 0.163     |
|    value_loss         | 0.026     |
-------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 44.2     |
|    ep_rew_mean     | 0.38     |
| time/              |          |
|    fps             | 246      |
|    iterations      | 2000     |
|    time_elapsed    | 40       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:14:55,982] Trial 55 finished with value: 0.0 and parameters: {'gamma': 0.05309843977384957, 'lr': 2.441799698747516e-05}. Best is trial 24 with value: 1.0.


Number of finished trials:  56
Best trial:
  Value:  1.0
  Params: 
    gamma: 0.042613270090891706
    lr: 3.5229093454971854e-05
  User attrs:
    gamma: 0.9573867299091083


### Train agent with best hyper parameters

In [3]:
a2c_model = A2C(
    "MlpPolicy",
    env,
    verbose=1,
    gamma=1-0.042613270090891706,
    learning_rate=3.5229093454971854e-05,
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [4]:
# Train agent and save it
a2c_model.learn(int(1.0e4))
a2c_model.save("saved/a2c_a_mini")

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 50.3      |
|    ep_rew_mean        | 0.222     |
| time/                 |           |
|    fps                | 707       |
|    iterations         | 100       |
|    time_elapsed       | 0         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -1.35e+03 |
|    learning_rate      | 3.52e-05  |
|    n_updates          | 99        |
|    policy_loss        | 0.287     |
|    value_loss         | 0.145     |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 49.8     |
|    ep_rew_mean        | 0.3      |
| time/                 |          |
|    fps                | 672      |
|    iterations         | 200      |
|    time_elapsed       | 1        |
|    total_timesteps    | 1000     |
| train/             

#### Testing 

In [5]:
a2c_model = A2C.load("saved/a2c_a_mini")

env.reset()

mean_reward, std_reward = evaluate_policy(a2c_model, env, deterministic=True, n_eval_episodes=20)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")



mean_reward:0.05 +/- 0.22


In [6]:
# RANDOM SEED
from IPython.display import SVG, display
from pogema.animation import AnimationMonitor, AnimationConfig

env = AnimationMonitor(env)

def evaluate_success_rate(model, env, num_episodes=1000):
    success_count = 0
    step_array = []
    for i in range(num_episodes):
        print(f'---{i}---')
        obs = env.reset()

        # Check if observation is a tuple and extract the first element if true.
        if isinstance(obs, tuple):
            obs = obs[0]
        max_step = 64
        steps_taken = 0
        done = truncated = False
        while not done and max_step > 0:
            action, _ = model.predict(obs)
            next_obs, reward, done, truncated, info = env.step(action)
            print(action,max_step,success_count,done)
            max_step -= 1
            steps_taken += 1
            # Check if next_obs is a tuple and extract the first element if true.
            if isinstance(next_obs, tuple):
                next_obs = next_obs[0]
            obs = next_obs

            # Check if agent was successful in that episode.
            if done:
                success_count += 1
                step_array.append(steps_taken)
                env.save_animation(f"render{i}.svg", AnimationConfig(egocentric_idx=0))
                break

    success_rate = success_count / num_episodes
    return success_rate, step_array

success_rate,step_array = evaluate_success_rate(a2c_model, env)
print(f"Agent Success Rate: {success_rate * 100:.2f}%")
print(f"steps to termination : {step_array}")

---0---
2 64 0 False
1 63 0 False
2 62 0 False
2 61 0 False
0 60 0 False
3 59 0 True
---1---
4 64 1 False
2 63 1 False
3 62 1 False
3 61 1 False
1 60 1 False
1 59 1 False
0 58 1 False
2 57 1 False
3 56 1 False
3 55 1 False
4 54 1 False
3 53 1 False
4 52 1 False
2 51 1 False
4 50 1 False
1 49 1 False
0 48 1 False
1 47 1 False
3 46 1 False
3 45 1 False
0 44 1 False
0 43 1 False
4 42 1 False
2 41 1 False
0 40 1 False
2 39 1 False
1 38 1 False
2 37 1 False
1 36 1 False
2 35 1 False
0 34 1 False
4 33 1 False
3 32 1 False
3 31 1 False
1 30 1 False
4 29 1 False
3 28 1 False
2 27 1 False
4 26 1 False
2 25 1 False
2 24 1 False
1 23 1 False
2 22 1 False
4 21 1 False
2 20 1 False
0 19 1 False
4 18 1 False
4 17 1 False
0 16 1 False
3 15 1 False
2 14 1 False
1 13 1 False
0 12 1 False
0 11 1 False
2 10 1 False
1 9 1 False
3 8 1 False
0 7 1 False
4 6 1 False
1 5 1 False
1 4 1 False
0 3 1 False
1 2 1 False
1 1 1 False
---2---
1 64 1 False
4 63 1 True
---3---
2 64 2 False
1 63 2 False
3 62 2 False
4 61