In [1]:
import gymnasium as gym
from pogema import GridConfig
from stable_baselines3 import DQN, A2C
from stable_baselines3.common.evaluation import evaluate_policy

%load_ext autoreload
%autoreload 2
%matplotlib inline

grid_config = GridConfig(
    size=8,
    density=0.3,
    num_agents=1,
    max_episode_steps=128,
    obs_radius=3
)

env = gym.make("Pogema-v0",grid_config=grid_config)

a2c_model = A2C(
    "MlpPolicy",
    env,
    verbose=1
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


  logger.warn(
  logger.warn(


#### Optuna Integration

In [2]:
""" Optuna example that optimizes the hyperparameters of
a reinforcement learning agent using A2C implementation from Stable-Baselines3
on a Gymnasium environment.

This is a simplified version of what can be found in https://github.com/DLR-RM/rl-baselines3-zoo.

You can run this example as follows:
    $ python sb3_simple.py

"""
from typing import Any
from typing import Dict

import gymnasium
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from stable_baselines3 import A2C
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
import torch
import torch.nn as nn


N_TRIALS = 100
N_STARTUP_TRIALS = 5
N_EVALUATIONS = 2
N_TIMESTEPS = int(1.0e4)
EVAL_FREQ = int(N_TIMESTEPS / N_EVALUATIONS)
N_EVAL_EPISODES = 3


DEFAULT_HYPERPARAMS = {
    "policy": "MlpPolicy",
    "verbose": 1,
    "env": env
}

def sample_a2c_params(trial: optuna.Trial) -> Dict[str, Any]:
    """Sampler for A2C hyperparameters."""
    gamma = 1.0 - trial.suggest_float("gamma", 0.0001, 0.1, log=True)
    learning_rate = trial.suggest_float("lr", 1e-5, 1, log=True)

    trial.set_user_attr("gamma", gamma)

    return {
        "gamma": gamma,
        "learning_rate": learning_rate,
    }


class TrialEvalCallback(EvalCallback):
    """Callback used for evaluating and reporting a trial."""

    def __init__(
        self,
        eval_env: gymnasium.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        self.trial = trial
        self.eval_idx = 0
        self.is_pruned = False

    def _on_step(self) -> bool:
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            super()._on_step()
            self.eval_idx += 1
            self.trial.report(self.last_mean_reward, self.eval_idx)
            # Prune trial if need.
            if self.trial.should_prune():
                self.is_pruned = True
                return False
        return True


def objective(trial: optuna.Trial) -> float:
    kwargs = DEFAULT_HYPERPARAMS.copy()
    # Sample hyperparameters.
    kwargs.update(sample_a2c_params(trial))
    # Create the RL model.
    model = A2C(**kwargs)
    # Create env used for evaluation.
    eval_env = Monitor(env)
    # Create the callback that will periodically evaluate and report the performance.
    eval_callback = TrialEvalCallback(
        eval_env, trial, n_eval_episodes=N_EVAL_EPISODES, eval_freq=EVAL_FREQ, deterministic=True
    )

    nan_encountered = False
    try:
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN.
        print(e)
        nan_encountered = True
    finally:
        # Free memory.
        model.env.close()
        eval_env.close()

    # Tell the optimizer that the trial failed.
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward


if __name__ == "__main__":
    # Set pytorch num threads to 1 for faster training.
    torch.set_num_threads(1)

    sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
    # Do not prune before 1/3 of the max budget is used.
    pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3)

    study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")
    try:
        study.optimize(objective, n_trials=N_TRIALS, n_jobs=4, timeout=600)
    except KeyboardInterrupt:
        pass

    print("Number of finished trials: ", len(study.trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    print("  User attrs:")
    for key, value in trial.user_attrs.items():
        print("    {}: {}".format(key, value))


  from .autonotebook import tqdm as notebook_tqdm
[I 2023-11-10 17:20:44,841] A new study created in memory with name: no-name-524e7d75-8d7a-4d81-bcb0-14ed89b8a1da


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 93.8      |
|    ep_rew_mean        | 0.2       |
| time/                 |           |
|    fps                | 209       |
|    iterations         | 100       |
|    time_elapsed       | 2         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -3.52e-40 |
|    explained_variance | -1.51e+11 |
|    learning_rate      | 0.447     |
|    n_updates          | 99        |
|    policy_loss        | -0        |
|    value_loss         | 22.9      |
----------------

[I 2023-11-10 17:21:44,158] Trial 0 finished with value: 0.0 and parameters: {'gamma': 0.00018636767042011222, 'lr': 0.4471282662030949}. Best is trial 0 with value: 0.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 83       |
|    mean_reward        | 0.333    |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -1.59    |
|    explained_variance | -216     |
|    learning_rate      | 0.000193 |
|    n_updates          | 1999     |
|    policy_loss        | -0.0268  |
|    value_loss         | 0.0129   |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 72.2     |
|    ep_rew_mean     | 0.48     |
| time/              |          |
|    fps             | 166      |
|    iterations      | 2000     |
|    time_elapsed    | 60       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:21:45,081] Trial 2 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.09993907433325912, 'lr': 0.00019329599456242825}. Best is trial 2 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 119      |
|    mean_reward        | 0.333    |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -1.59    |
|    explained_variance | -176     |
|    learning_rate      | 0.000396 |
|    n_updates          | 1999     |
|    policy_loss        | -0.0502  |
|    value_loss         | 0.00142  |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 71.6     |
|    ep_rew_mean     | 0.43     |
| time/              |          |
|    fps             | 165      |
|    iterations      | 2000     |
|    time_elapsed    | 60       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:21:45,277] Trial 1 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.07121920927765338, 'lr': 0.00039649489170431445}. Best is trial 2 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 138       |
|    mean_reward        | 0         |
| time/                 |           |
|    total_timesteps    | 10000     |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -3.88e+06 |
|    learning_rate      | 4.68e-05  |
|    n_updates          | 1999      |
|    policy_loss        | -0.0381   |
|    value_loss         | 0.0127    |
-------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 92.1     |
|    ep_rew_mean     | 0.5      |
| time/              |          |
|    fps             | 163      |
|    iterations      | 2000     |
|    time_elapsed    | 61       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:21:45,963] Trial 3 finished with value: 0.0 and parameters: {'gamma': 0.0002730095895553212, 'lr': 4.679341855389713e-05}. Best is trial 2 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 220       |
|    ep_rew_mean        | 0.5       |
| time/                 |           |
|    fps                | 151       |
|    iterations         | 100       |
|    time_elapsed       | 3         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -0.000821 |
|    explained_variance | 0         |
|    learning_rate      | 0.0162    |
|    n_updates          | 99        |
|    policy_loss        | -1.37e-07 |
|    value_loss         | 3.74e-06  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 72.7     |
|    ep_rew_mean        | 0.167    |
| time/                 |          |
|    fps                | 154      |
|    iterations         | 100      |
|

[I 2023-11-10 17:22:41,370] Trial 4 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.00015190180069623313, 'lr': 0.016247726439922373}. Best is trial 2 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 82.3      |
|    mean_reward        | 0.333     |
| time/                 |           |
|    total_timesteps    | 10000     |
| train/                |           |
|    entropy_loss       | -9.87e-06 |
|    explained_variance | 5.01e-06  |
|    learning_rate      | 0.0393    |
|    n_updates          | 1999      |
|    policy_loss        | -0        |
|    value_loss         | 0.00923   |
-------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 99.6     |
|    ep_rew_mean     | 0.32     |
| time/              |          |
|    fps             | 173      |
|    iterations      | 2000     |
|    time_elapsed    | 57       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:22:42,947] Trial 6 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.025503226868120274, 'lr': 0.03926537433372906}. Best is trial 2 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 97.3      |
|    mean_reward        | 0         |
| time/                 |           |
|    total_timesteps    | 10000     |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -2.16e+05 |
|    learning_rate      | 5.33e-05  |
|    n_updates          | 1999      |
|    policy_loss        | 0.0186    |
|    value_loss         | 0.00699   |
-------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 91.4     |
|    ep_rew_mean     | 0.31     |
| time/              |          |
|    fps             | 171      |
|    iterations      | 2000     |
|    time_elapsed    | 58       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:22:43,573] Trial 5 finished with value: 0.0 and parameters: {'gamma': 0.004968246424374478, 'lr': 5.33284343411552e-05}. Best is trial 2 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 119      |
|    ep_rew_mean        | 0.25     |
| time/                 |          |
|    fps                | 173      |
|    iterations         | 100      |
|    time_elapsed       | 2        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.22    |
|    explained_variance | 0.661    |
|    learning_rate      | 0.00112  |
|    n_updates          | 99       |
|    policy_loss        | -0.305   |
|    value_loss         | 0.0293   |
------------------------------------
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 227      |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss 

[I 2023-11-10 17:22:44,524] Trial 7 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 116       |
|    ep_rew_mean        | 0.5       |
| time/                 |           |
|    fps                | 179       |
|    iterations         | 100       |
|    time_elapsed       | 2         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.6      |
|    explained_variance | -2.78e+03 |
|    learning_rate      | 0.000668  |
|    n_updates          | 99        |
|    policy_loss        | -0.15     |
|    value_loss         | 0.0426    |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 52.9     |
|    ep_rew_mean        | 0.889    |
| time/                 |          |
|    fps                | 192      |
|    iterations         | 100      |
|

[I 2023-11-10 17:23:37,698] Trial 8 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 83.7     |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -1.26    |
|    explained_variance | -849     |
|    learning_rate      | 0.000668 |
|    n_updates          | 1999     |
|    policy_loss        | 0.0351   |
|    value_loss         | 0.00286  |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 71.7     |
|    ep_rew_mean     | 0.41     |
| time/              |          |
|    fps             | 180      |
|    iterations      | 2000     |
|    time_elapsed    | 55       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:23:38,541] Trial 9 finished with value: 0.0 and parameters: {'gamma': 0.006189096052623425, 'lr': 0.0006675798557185033}. Best is trial 2 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 69.1     |
|    ep_rew_mean        | 0.429    |
| time/                 |          |
|    fps                | 170      |
|    iterations         | 100      |
|    time_elapsed       | 2        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.57    |
|    explained_variance | -0.204   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.0835   |
|    value_loss         | 0.00332  |
------------------------------------
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 98        |
|    mean_reward        | 0         |
| time/                 |           |
|    total_timesteps    | 10000     |
| train/                |           |
|    entrop

[I 2023-11-10 17:23:40,867] Trial 11 finished with value: 0.0 and parameters: {'gamma': 0.09563869131331719, 'lr': 0.0009095598146932561}. Best is trial 2 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 178      |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -1.36    |
|    explained_variance | -2.87    |
|    learning_rate      | 0.000925 |
|    n_updates          | 1999     |
|    policy_loss        | 0.00747  |
|    value_loss         | 8.88e-05 |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 85.9     |
|    ep_rew_mean     | 0.44     |
| time/              |          |
|    fps             | 174      |
|    iterations      | 2000     |
|    time_elapsed    | 57       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:23:41,013] Trial 10 finished with value: 0.0 and parameters: {'gamma': 0.0963829289721275, 'lr': 0.0009251846306117754}. Best is trial 2 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 70.5      |
|    ep_rew_mean        | 0.5       |
| time/                 |           |
|    fps                | 177       |
|    iterations         | 100       |
|    time_elapsed       | 2         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -1.29e+03 |
|    learning_rate      | 1.29e-05  |
|    n_updates          | 99        |
|    policy_loss        | -0.0757   |
|    value_loss         | 0.0229    |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 72.1     |
|    ep_rew_mean        | 0.308    |
| time/                 |          |
|    fps                | 181      |
|    iterations         | 200      |
|

[I 2023-11-10 17:24:06,361] Trial 12 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 71.5     |
|    ep_rew_mean        | 0.484    |
| time/                 |          |
|    fps                | 177      |
|    iterations         | 900      |
|    time_elapsed       | 25       |
|    total_timesteps    | 4500     |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -661     |
|    learning_rate      | 1.52e-05 |
|    n_updates          | 899      |
|    policy_loss        | -0.332   |
|    value_loss         | 0.0502   |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 78.3      |
|    ep_rew_mean        | 0.482     |
| time/                 |           |
|    fps                | 175       |
|    iterations         | 900       |
|    time_e

[I 2023-11-10 17:24:37,178] Trial 13 finished with value: 0.0 and parameters: {'gamma': 0.024097553807068475, 'lr': 1.2925652677544566e-05}. Best is trial 2 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 86        |
|    mean_reward        | 0         |
| time/                 |           |
|    total_timesteps    | 10000     |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -9.11e+03 |
|    learning_rate      | 1.52e-05  |
|    n_updates          | 1999      |
|    policy_loss        | -0.435    |
|    value_loss         | 0.0912    |
-------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 78.3     |
|    ep_rew_mean     | 0.47     |
| time/              |          |
|    fps             | 173      |
|    iterations      | 2000     |
|    time_elapsed    | 57       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:24:38,600] Trial 15 finished with value: 0.0 and parameters: {'gamma': 0.019951043203902397, 'lr': 1.5181804826612145e-05}. Best is trial 2 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 79.4      |
|    ep_rew_mean        | 0.522     |
| time/                 |           |
|    fps                | 163       |
|    iterations         | 1100      |
|    time_elapsed       | 33        |
|    total_timesteps    | 5500      |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -4.46e+06 |
|    learning_rate      | 0.000113  |
|    n_updates          | 1099      |
|    policy_loss        | -0.187    |
|    value_loss         | 0.0234    |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 69.4     |
|    ep_rew_mean        | 0.6      |
| time/                 |          |
|    fps                | 169      |
|    iterations         | 100      |
|

[I 2023-11-10 17:24:40,414] Trial 14 finished with value: 0.6666666666666666 and parameters: {'gamma': 0.02908089172528587, 'lr': 1.0513887686517062e-05}. Best is trial 14 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 79.3     |
|    ep_rew_mean        | 0.833    |
| time/                 |          |
|    fps                | 190      |
|    iterations         | 100      |
|    time_elapsed       | 2        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -277     |
|    learning_rate      | 0.000161 |
|    n_updates          | 99       |
|    policy_loss        | -0.114   |
|    value_loss         | 0.02     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 80.7     |
|    ep_rew_mean        | 0.5      |
| time/                 |          |
|    fps                | 164      |
|    iterations         | 1200     |
|    time_elapsed 

[I 2023-11-10 17:25:06,160] Trial 16 finished with value: 0.0 and parameters: {'gamma': 0.02755047845804029, 'lr': 0.00011250203986694826}. Best is trial 14 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 73       |
|    ep_rew_mean        | 0.508    |
| time/                 |          |
|    fps                | 169      |
|    iterations         | 900      |
|    time_elapsed       | 26       |
|    total_timesteps    | 4500     |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -24.9    |
|    learning_rate      | 0.000119 |
|    n_updates          | 899      |
|    policy_loss        | 0.229    |
|    value_loss         | 0.0297   |
------------------------------------
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 75.7     |
|    mean_reward        | 0.333    |
| time/                 |          |
|    total_timesteps    | 5000     |
| train/                |          |
|    entropy_loss 

[I 2023-11-10 17:25:10,818] Trial 19 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 75.7      |
|    ep_rew_mean        | 0.462     |
| time/                 |           |
|    fps                | 178       |
|    iterations         | 200       |
|    time_elapsed       | 5         |
|    total_timesteps    | 1000      |
| train/                |           |
|    entropy_loss       | -0.0281   |
|    explained_variance | -6.49     |
|    learning_rate      | 0.00507   |
|    n_updates          | 199       |
|    policy_loss        | -0.000187 |
|    value_loss         | 0.00283   |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 75.1     |
|    ep_rew_mean        | 0.507    |
| time/                 |          |
|    fps                | 163      |
|    iterations         | 1100     |
|

[I 2023-11-10 17:25:38,837] Trial 17 finished with value: 0.0 and parameters: {'gamma': 0.04283577981661914, 'lr': 0.00014202836825113227}. Best is trial 14 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 84.1     |
|    ep_rew_mean        | 0.375    |
| time/                 |          |
|    fps                | 168      |
|    iterations         | 1100     |
|    time_elapsed       | 32       |
|    total_timesteps    | 5500     |
| train/                |          |
|    entropy_loss       | -0.00882 |
|    explained_variance | -2.96    |
|    learning_rate      | 0.00507  |
|    n_updates          | 1099     |
|    policy_loss        | 1.15e-06 |
|    value_loss         | 6.6e-06  |
------------------------------------
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 145      |
|    mean_reward        | 0.333    |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss 

[I 2023-11-10 17:25:39,112] Trial 18 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.04264621552893762, 'lr': 0.0001607381939188163}. Best is trial 14 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 78       |
|    mean_reward        | 0.333    |
| time/                 |          |
|    total_timesteps    | 5000     |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -250     |
|    learning_rate      | 1.15e-05 |
|    n_updates          | 999      |
|    policy_loss        | -0.349   |
|    value_loss         | 0.0705   |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 82.1     |
|    ep_rew_mean     | 0.433    |
| time/              |          |
|    fps             | 164      |
|    iterations      | 1000     |
|    time_elapsed    | 30       |
|    total_timesteps | 5000     |
---------------------------------
------------------------------------
| rollout/ 

[I 2023-11-10 17:26:07,025] Trial 20 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.03820964435923624, 'lr': 0.005066609796774014}. Best is trial 14 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 72.1      |
|    ep_rew_mean        | 0.55      |
| time/                 |           |
|    fps                | 163       |
|    iterations         | 1900      |
|    time_elapsed       | 58        |
|    total_timesteps    | 9500      |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -1.48e+03 |
|    learning_rate      | 1.15e-05  |
|    n_updates          | 1899      |
|    policy_loss        | -0.358    |
|    value_loss         | 0.0999    |
-------------------------------------
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 102      |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 5000     |
| train/                |          |
|

[I 2023-11-10 17:26:09,151] Trial 22 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 115      |
|    mean_reward        | 0.667    |
| time/                 |          |
|    total_timesteps    | 5000     |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -424     |
|    learning_rate      | 1.24e-05 |
|    n_updates          | 999      |
|    policy_loss        | -0.0714  |
|    value_loss         | 0.00532  |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 78       |
|    ep_rew_mean     | 0.531    |
| time/              |          |
|    fps             | 163      |
|    iterations      | 1000     |
|    time_elapsed    | 30       |
|    total_timesteps | 5000     |
---------------------------------
------------------------------------
| rollout/ 

[I 2023-11-10 17:26:12,745] Trial 21 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.010529909815532498, 'lr': 1.1532101216281496e-05}. Best is trial 14 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 68.6     |
|    ep_rew_mean        | 0.333    |
| time/                 |          |
|    fps                | 173      |
|    iterations         | 200      |
|    time_elapsed       | 5        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -18.5    |
|    learning_rate      | 1.17e-05 |
|    n_updates          | 199      |
|    policy_loss        | 0.434    |
|    value_loss         | 0.0841   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 101      |
|    ep_rew_mean        | 0.444    |
| time/                 |          |
|    fps                | 159      |
|    iterations         | 200      |
|    time_elapsed 

[I 2023-11-10 17:26:40,679] Trial 23 finished with value: 0.0 and parameters: {'gamma': 0.010939810868229509, 'lr': 1.2448861903231966e-05}. Best is trial 14 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 135      |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 5000     |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -15.4    |
|    learning_rate      | 2.82e-05 |
|    n_updates          | 999      |
|    policy_loss        | -0.126   |
|    value_loss         | 0.0105   |
------------------------------------


[I 2023-11-10 17:26:42,847] Trial 26 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 101      |
|    ep_rew_mean        | 0.39     |
| time/                 |          |
|    fps                | 165      |
|    iterations         | 1200     |
|    time_elapsed       | 36       |
|    total_timesteps    | 6000     |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -207     |
|    learning_rate      | 1.17e-05 |
|    n_updates          | 1199     |
|    policy_loss        | -0.104   |
|    value_loss         | 0.044    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 72.7     |
|    ep_rew_mean        | 0.417    |
| time/                 |          |
|    fps                | 160      |
|    iterations         | 1100     |
|    time_elapsed 

[I 2023-11-10 17:27:08,605] Trial 24 finished with value: 0.0 and parameters: {'gamma': 0.0665778221926173, 'lr': 1.1672739578723183e-05}. Best is trial 14 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 81.3     |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -91.7    |
|    learning_rate      | 2.69e-05 |
|    n_updates          | 1999     |
|    policy_loss        | -0.34    |
|    value_loss         | 0.081    |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 76.9     |
|    ep_rew_mean     | 0.44     |
| time/              |          |
|    fps             | 164      |
|    iterations      | 2000     |
|    time_elapsed    | 60       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:27:09,917] Trial 25 finished with value: 0.0 and parameters: {'gamma': 0.06988082265548134, 'lr': 2.692411535398355e-05}. Best is trial 14 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 84.4     |
|    ep_rew_mean        | 0.472    |
| time/                 |          |
|    fps                | 161      |
|    iterations         | 900      |
|    time_elapsed       | 27       |
|    total_timesteps    | 4500     |
| train/                |          |
|    entropy_loss       | -1.6     |
|    explained_variance | -10.4    |
|    learning_rate      | 0.00025  |
|    n_updates          | 899      |
|    policy_loss        | 0.116    |
|    value_loss         | 0.00633  |
------------------------------------
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 122      |
|    mean_reward        | 0.333    |
| time/                 |          |
|    total_timesteps    | 5000     |
| train/                |          |
|    entropy_loss 

[I 2023-11-10 17:27:40,900] Trial 27 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.0613962777875986, 'lr': 0.00026451485499620825}. Best is trial 14 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 83.5     |
|    ep_rew_mean        | 0.492    |
| time/                 |          |
|    fps                | 167      |
|    iterations         | 1100     |
|    time_elapsed       | 32       |
|    total_timesteps    | 5500     |
| train/                |          |
|    entropy_loss       | -1.59    |
|    explained_variance | -8.11    |
|    learning_rate      | 0.000405 |
|    n_updates          | 1099     |
|    policy_loss        | 0.00785  |
|    value_loss         | 0.000167 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 94.7     |
|    ep_rew_mean        | 0.483    |
| time/                 |          |
|    fps                | 167      |
|    iterations         | 1100     |
|    time_elapsed 

[I 2023-11-10 17:27:43,461] Trial 28 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.09973510898010474, 'lr': 0.00024971512728787194}. Best is trial 14 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 83.2     |
|    ep_rew_mean        | 0.333    |
| time/                 |          |
|    fps                | 153      |
|    iterations         | 100      |
|    time_elapsed       | 3        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -15.4    |
|    learning_rate      | 0.000391 |
|    n_updates          | 99       |
|    policy_loss        | 0.0599   |
|    value_loss         | 0.0041   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 84.1     |
|    ep_rew_mean        | 0.521    |
| time/                 |          |
|    fps                | 165      |
|    iterations         | 1200     |
|    time_elapsed 

[I 2023-11-10 17:28:10,088] Trial 29 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.057527649886595936, 'lr': 0.00040452894045729714}. Best is trial 14 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 98.4     |
|    ep_rew_mean        | 0.455    |
| time/                 |          |
|    fps                | 159      |
|    iterations         | 900      |
|    time_elapsed       | 28       |
|    total_timesteps    | 4500     |
| train/                |          |
|    entropy_loss       | -1.59    |
|    explained_variance | -11.7    |
|    learning_rate      | 0.000324 |
|    n_updates          | 899      |
|    policy_loss        | -0.0331  |
|    value_loss         | 0.000812 |
------------------------------------
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 63.3     |
|    mean_reward        | 0.333    |
| time/                 |          |
|    total_timesteps    | 5000     |
| train/                |          |
|    entropy_loss 

[I 2023-11-10 17:28:13,240] Trial 30 finished with value: 0.6666666666666666 and parameters: {'gamma': 0.04278994535892764, 'lr': 0.0002671182307617942}. Best is trial 14 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 61.6      |
|    ep_rew_mean        | 0.286     |
| time/                 |           |
|    fps                | 150       |
|    iterations         | 100       |
|    time_elapsed       | 3         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -5.53e+03 |
|    learning_rate      | 5.35e-05  |
|    n_updates          | 99        |
|    policy_loss        | -0.721    |
|    value_loss         | 0.316     |
-------------------------------------
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 82        |
|    mean_reward        | 0         |
| time/                 |           |
|    total_timesteps    | 5000      |
| train/                |       

[I 2023-11-10 17:28:15,358] Trial 32 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 92.2     |
|    ep_rew_mean        | 0.448    |
| time/                 |          |
|    fps                | 159      |
|    iterations         | 1100     |
|    time_elapsed       | 34       |
|    total_timesteps    | 5500     |
| train/                |          |
|    entropy_loss       | -1.6     |
|    explained_variance | -26.5    |
|    learning_rate      | 0.000391 |
|    n_updates          | 1099     |
|    policy_loss        | 0.00338  |
|    value_loss         | 0.000766 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 80.7     |
|    ep_rew_mean        | 0.333    |
| time/                 |          |
|    fps                | 153      |
|    iterations         | 200      |
|    time_elapsed 

[I 2023-11-10 17:28:40,441] Trial 31 finished with value: 0.0 and parameters: {'gamma': 0.099048497438559, 'lr': 0.0003906460191208291}. Best is trial 14 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 105      |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 5000     |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -155     |
|    learning_rate      | 6.13e-05 |
|    n_updates          | 999      |
|    policy_loss        | -0.109   |
|    value_loss         | 0.0279   |
------------------------------------


[I 2023-11-10 17:28:42,325] Trial 35 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 93.2      |
|    ep_rew_mean        | 0.458     |
| time/                 |           |
|    fps                | 170       |
|    iterations         | 1100      |
|    time_elapsed       | 32        |
|    total_timesteps    | 5500      |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -1.09e+03 |
|    learning_rate      | 5.35e-05  |
|    n_updates          | 1099      |
|    policy_loss        | 0.103     |
|    value_loss         | 0.0174    |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 73.2     |
|    ep_rew_mean        | 0.467    |
| time/                 |          |
|    fps                | 186      |
|    iterations         | 1100     |
|

[I 2023-11-10 17:29:08,371] Trial 33 finished with value: 0.6666666666666666 and parameters: {'gamma': 0.03342789831555966, 'lr': 5.346323892084361e-05}. Best is trial 14 with value: 0.6666666666666666.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 79.7      |
|    mean_reward        | 1         |
| time/                 |           |
|    total_timesteps    | 10000     |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -4.77e+03 |
|    learning_rate      | 6.95e-05  |
|    n_updates          | 1999      |
|    policy_loss        | 0.0116    |
|    value_loss         | 0.00166   |
-------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 73.7     |
|    ep_rew_mean     | 0.49     |
| time/              |          |
|    fps             | 181      |
|    iterations      | 2000     |
|    time_elapsed    | 55       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:29:08,466] Trial 34 finished with value: 1.0 and parameters: {'gamma': 0.03288164634677655, 'lr': 6.949018099784743e-05}. Best is trial 34 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 122      |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 5000     |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -166     |
|    learning_rate      | 7.72e-05 |
|    n_updates          | 999      |
|    policy_loss        | 0.201    |
|    value_loss         | 0.0287   |
------------------------------------


[I 2023-11-10 17:29:09,878] Trial 36 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 59        |
|    ep_rew_mean        | 0.5       |
| time/                 |           |
|    fps                | 200       |
|    iterations         | 100       |
|    time_elapsed       | 2         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -2.14e+03 |
|    learning_rate      | 7.02e-05  |
|    n_updates          | 99        |
|    policy_loss        | 0.423     |
|    value_loss         | 0.113     |
-------------------------------------
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 81.3     |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 5000     |
| train/                |          |
|

[I 2023-11-10 17:29:10,913] Trial 37 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 77.8      |
|    ep_rew_mean        | 0         |
| time/                 |           |
|    fps                | 199       |
|    iterations         | 100       |
|    time_elapsed       | 2         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -5.59e+03 |
|    learning_rate      | 7e-05     |
|    n_updates          | 99        |
|    policy_loss        | 0.327     |
|    value_loss         | 0.0972    |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 96.2      |
|    ep_rew_mean        | 0.6       |
| time/                 |           |
|    fps                | 177       |
|    iterations         | 100   

[I 2023-11-10 17:29:37,371] Trial 38 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 130      |
|    mean_reward        | 0.333    |
| time/                 |          |
|    total_timesteps    | 5000     |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -882     |
|    learning_rate      | 7e-05    |
|    n_updates          | 999      |
|    policy_loss        | 0.139    |
|    value_loss         | 0.0155   |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 74.1     |
|    ep_rew_mean     | 0.448    |
| time/              |          |
|    fps             | 164      |
|    iterations      | 1000     |
|    time_elapsed    | 30       |
|    total_timesteps | 5000     |
---------------------------------
------------------------------------
| eval/    

[I 2023-11-10 17:29:39,420] Trial 40 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 94.6     |
|    ep_rew_mean        | 0.2      |
| time/                 |          |
|    fps                | 157      |
|    iterations         | 100      |
|    time_elapsed       | 3        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -127     |
|    learning_rate      | 2.56e-05 |
|    n_updates          | 99       |
|    policy_loss        | 0.206    |
|    value_loss         | 0.0284   |
------------------------------------
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 173       |
|    mean_reward        | 0         |
| time/                 |           |
|    total_timesteps    | 5000      |
| train/                |           |
|    entrop

[I 2023-11-10 17:29:40,629] Trial 41 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 74.7     |
|    ep_rew_mean        | 0.431    |
| time/                 |          |
|    fps                | 164      |
|    iterations         | 1100     |
|    time_elapsed       | 33       |
|    total_timesteps    | 5500     |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | 0        |
|    learning_rate      | 7e-05    |
|    n_updates          | 1099     |
|    policy_loss        | -0.00832 |
|    value_loss         | 3.52e-05 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 54.6      |
|    ep_rew_mean        | 0.25      |
| time/                 |           |
|    fps                | 163       |
|    iterations         | 100       |
|    time_e

[I 2023-11-10 17:30:00,581] Trial 42 pruned. 


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 176      |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -979     |
|    learning_rate      | 7e-05    |
|    n_updates          | 1999     |
|    policy_loss        | -0.129   |
|    value_loss         | 0.00718  |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 84.1     |
|    ep_rew_mean     | 0.41     |
| time/              |          |
|    fps             | 186      |
|    iterations      | 2000     |
|    time_elapsed    | 53       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:30:02,116] Trial 39 finished with value: 0.0 and parameters: {'gamma': 0.019308408866853418, 'lr': 6.997075865588204e-05}. Best is trial 34 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 80.7      |
|    mean_reward        | 0.667     |
| time/                 |           |
|    total_timesteps    | 5000      |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -7.17e+03 |
|    learning_rate      | 2.07e-05  |
|    n_updates          | 999       |
|    policy_loss        | -0.0867   |
|    value_loss         | 0.00782   |
-------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 75.5     |
|    ep_rew_mean     | 0.485    |
| time/              |          |
|    fps             | 218      |
|    iterations      | 1000     |
|    time_elapsed    | 22       |
|    total_timesteps | 5000     |
---------------------------------
----------------------------------

[I 2023-11-10 17:30:23,436] Trial 46 pruned. 


-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 125       |
|    mean_reward        | 0         |
| time/                 |           |
|    total_timesteps    | 10000     |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -1.15e+04 |
|    learning_rate      | 2.07e-05  |
|    n_updates          | 1999      |
|    policy_loss        | -0.225    |
|    value_loss         | 0.0241    |
-------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 73.2     |
|    ep_rew_mean     | 0.44     |
| time/              |          |
|    fps             | 227      |
|    iterations      | 2000     |
|    time_elapsed    | 44       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:30:23,491] Trial 43 finished with value: 0.0 and parameters: {'gamma': 0.0016498533673050637, 'lr': 2.068087142032362e-05}. Best is trial 34 with value: 1.0.


------------------------------------
| eval/                 |          |
|    mean_ep_length     | 130      |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 10000    |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -404     |
|    learning_rate      | 2.05e-05 |
|    n_updates          | 1999     |
|    policy_loss        | -0.244   |
|    value_loss         | 0.0861   |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 75.1     |
|    ep_rew_mean     | 0.5      |
| time/              |          |
|    fps             | 232      |
|    iterations      | 2000     |
|    time_elapsed    | 42       |
|    total_timesteps | 10000    |
---------------------------------


[I 2023-11-10 17:30:23,593] Trial 44 finished with value: 0.0 and parameters: {'gamma': 0.05320464176079523, 'lr': 2.0526861812999135e-05}. Best is trial 34 with value: 1.0.


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 94.6     |
|    ep_rew_mean        | 0.368    |
| time/                 |          |
|    fps                | 239      |
|    iterations         | 1100     |
|    time_elapsed       | 23       |
|    total_timesteps    | 5500     |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -182     |
|    learning_rate      | 4.79e-05 |
|    n_updates          | 1099     |
|    policy_loss        | 0.0961   |
|    value_loss         | 0.0374   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 96.6     |
|    ep_rew_mean        | 0.361    |
| time/                 |          |
|    fps                | 258      |
|    iterations         | 1200     |
|    time_elapsed       | 23       |
|    total_timesteps    | 6000     |
| train/                |          |
|

[I 2023-11-10 17:30:25,474] Trial 45 finished with value: 0.0 and parameters: {'gamma': 0.05326799764287967, 'lr': 4.789654290415112e-05}. Best is trial 34 with value: 1.0.


Number of finished trials:  47
Best trial:
  Value:  1.0
  Params: 
    gamma: 0.03288164634677655
    lr: 6.949018099784743e-05
  User attrs:
    gamma: 0.9671183536532234


### Train agent with best hyper parameters

In [7]:
a2c_model = A2C(
    "MlpPolicy",
    env,
    verbose=1,
    gamma=1-0.0033700061384865873,
    learning_rate=0.08419759274683712,
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [8]:
# Train agent and save it
a2c_model.learn(int(1.0e4))
a2c_model.save("saved/a2c_c_mini")

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 128       |
|    ep_rew_mean        | 0         |
| time/                 |           |
|    fps                | 2141      |
|    iterations         | 100       |
|    time_elapsed       | 0         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -4.39e-21 |
|    explained_variance | 0         |
|    learning_rate      | 0.0842    |
|    n_updates          | 99        |
|    policy_loss        | -0        |
|    value_loss         | 0.000294  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 128       |
|    ep_rew_mean        | 0         |
| time/                 |           |
|    fps                | 2226      |
|    iterations         | 200       |
|    time_elapsed       | 0         |
|    total_timesteps    | 1000      |
| train/    

#### Testing 

In [9]:
a2c_model = A2C.load("saved/a2c_c_mini")

env.reset()

mean_reward, std_reward = evaluate_policy(a2c_model, env, deterministic=True, n_eval_episodes=20)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")



mean_reward:0.00 +/- 0.00


In [10]:
# RANDOM SEED
from IPython.display import SVG, display
from pogema.animation import AnimationMonitor, AnimationConfig

env = AnimationMonitor(env)

def evaluate_success_rate(model, env, num_episodes=1000):
    success_count = 0
    step_array = []
    for i in range(num_episodes):
        print(f'---{i}---')
        obs = env.reset()

        # Check if observation is a tuple and extract the first element if true.
        if isinstance(obs, tuple):
            obs = obs[0]
        max_step = 128
        steps_taken = 0
        done = truncated = False
        while not done and max_step > 0:
            action, _ = model.predict(obs)
            next_obs, reward, done, truncated, info = env.step(action)
            print(action,max_step,success_count,done)
            max_step -= 1
            steps_taken += 1
            # Check if next_obs is a tuple and extract the first element if true.
            if isinstance(next_obs, tuple):
                next_obs = next_obs[0]
            obs = next_obs

            # Check if agent was successful in that episode.
            if done:
                success_count += 1
                step_array.append(steps_taken)
                env.save_animation(f"render{i}.svg", AnimationConfig(egocentric_idx=0))
                break

    success_rate = success_count / num_episodes
    return success_rate, step_array

success_rate,step_array = evaluate_success_rate(a2c_model, env)
print(f"Agent Success Rate: {success_rate * 100:.2f}%")
print(f"steps to termination : {step_array}")

---0---
1 128 0 False
1 127 0 False
1 126 0 False
1 125 0 False
1 124 0 False
1 123 0 False
1 122 0 False
1 121 0 False
1 120 0 False
1 119 0 False
1 118 0 False
1 117 0 False
1 116 0 False
1 115 0 False
1 114 0 False
1 113 0 False
1 112 0 False
1 111 0 False
1 110 0 False
1 109 0 False
1 108 0 False
1 107 0 False
1 106 0 False
1 105 0 False
1 104 0 False
1 103 0 False
1 102 0 False
1 101 0 False
1 100 0 False
1 99 0 False
1 98 0 False
1 97 0 False
1 96 0 False
1 95 0 False
1 94 0 False
1 93 0 False
1 92 0 False
1 91 0 False
1 90 0 False
1 89 0 False
1 88 0 False
1 87 0 False
1 86 0 False
1 85 0 False
1 84 0 False
1 83 0 False
1 82 0 False
1 81 0 False
1 80 0 False
1 79 0 False
1 78 0 False
1 77 0 False
1 76 0 False
1 75 0 False
1 74 0 False
1 73 0 False
1 72 0 False
1 71 0 False
1 70 0 False
1 69 0 False
1 68 0 False
1 67 0 False
1 66 0 False
1 65 0 False
1 64 0 False
1 63 0 False
1 62 0 False
1 61 0 False
1 60 0 False
1 59 0 False
1 58 0 False
1 57 0 False
1 56 0 False
1 55 0 False
1