In [1]:
import gymnasium as gym
from pogema import GridConfig
from stable_baselines3 import DQN, A2C
from stable_baselines3.common.evaluation import evaluate_policy

%load_ext autoreload
%autoreload 2
%matplotlib inline

grid_config = GridConfig(
    size=8,
    density=0.3,
    num_agents=1,
    max_episode_steps=30
)

env = gym.make("Pogema-v0",grid_config=grid_config)

a2c_model = A2C(
    "MlpPolicy",
    env,
    verbose=1
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


  logger.warn(
  logger.warn(


#### Optuna Integration

In [2]:
""" Optuna example that optimizes the hyperparameters of
a reinforcement learning agent using A2C implementation from Stable-Baselines3
on a Gymnasium environment.

This is a simplified version of what can be found in https://github.com/DLR-RM/rl-baselines3-zoo.

You can run this example as follows:
    $ python sb3_simple.py

"""
from typing import Any
from typing import Dict

import gymnasium
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from stable_baselines3 import A2C
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
import torch
import torch.nn as nn


N_TRIALS = 100
N_STARTUP_TRIALS = 5
N_EVALUATIONS = 2
N_TIMESTEPS = int(1.2e5)
EVAL_FREQ = int(N_TIMESTEPS / N_EVALUATIONS)
N_EVAL_EPISODES = 3


DEFAULT_HYPERPARAMS = {
    "policy": "MlpPolicy",
    "verbose": 1,
    "env": env
}

def sample_a2c_params(trial: optuna.Trial) -> Dict[str, Any]:
    """Sampler for A2C hyperparameters."""
    gamma = 1.0 - trial.suggest_float("gamma", 0.0001, 0.1, log=True)
    learning_rate = trial.suggest_float("lr", 1e-5, 1, log=True)

    trial.set_user_attr("gamma", gamma)

    return {
        "gamma": gamma,
        "learning_rate": learning_rate,
    }


class TrialEvalCallback(EvalCallback):
    """Callback used for evaluating and reporting a trial."""

    def __init__(
        self,
        eval_env: gymnasium.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        self.trial = trial
        self.eval_idx = 0
        self.is_pruned = False

    def _on_step(self) -> bool:
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            super()._on_step()
            self.eval_idx += 1
            self.trial.report(self.last_mean_reward, self.eval_idx)
            # Prune trial if need.
            if self.trial.should_prune():
                self.is_pruned = True
                return False
        return True


def objective(trial: optuna.Trial) -> float:
    kwargs = DEFAULT_HYPERPARAMS.copy()
    # Sample hyperparameters.
    kwargs.update(sample_a2c_params(trial))
    # Create the RL model.
    model = A2C(**kwargs)
    # Create env used for evaluation.
    eval_env = Monitor(env)
    # Create the callback that will periodically evaluate and report the performance.
    eval_callback = TrialEvalCallback(
        eval_env, trial, n_eval_episodes=N_EVAL_EPISODES, eval_freq=EVAL_FREQ, deterministic=True
    )

    nan_encountered = False
    try:
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN.
        print(e)
        nan_encountered = True
    finally:
        # Free memory.
        model.env.close()
        eval_env.close()

    # Tell the optimizer that the trial failed.
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward


if __name__ == "__main__":
    # Set pytorch num threads to 1 for faster training.
    torch.set_num_threads(1)

    sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
    # Do not prune before 1/3 of the max budget is used.
    pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3)

    study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")
    try:
        study.optimize(objective, n_trials=N_TRIALS, n_jobs=4)
    except KeyboardInterrupt:
        pass

    print("Number of finished trials: ", len(study.trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    print("  User attrs:")
    for key, value in trial.user_attrs.items():
        print("    {}: {}".format(key, value))


  from .autonotebook import tqdm as notebook_tqdm
[I 2023-11-05 19:17:20,572] A new study created in memory with name: no-name-2271a94c-47cf-49b3-b3a1-8acbb5ae4497


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 20.6     |
|    ep_rew_mean        | 0.364    |
| time/                 |          |
|    fps                | 235      |
|    iterations         | 100      |
|    time_elapsed       | 2        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -21.5    |
|    learning_rate      | 3.41e-05 |
|    n_updates          | 99       |
|    policy_loss        | 0.12     |
|    value_loss         | 0.0126   |
--------------------------------

[I 2023-11-05 19:24:59,257] Trial 2 finished with value: 0.0 and parameters: {'gamma': 0.0010533686291300326, 'lr': 0.28153746448902656}. Best is trial 2 with value: 0.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 49.3     |
|    mean_reward        | 0.333    |
| time/                 |          |
|    total_timesteps    | 120000   |
| train/                |          |
|    entropy_loss       | -1.59    |
|    explained_variance | -5.25    |
|    learning_rate      | 4.93e-05 |
|    n_updates          | 23999    |
|    policy_loss        | 0.00932  |
|    value_loss         | 0.000497 |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 25.9     |
|    ep_rew_mean     | 0.33     |
| time/              |          |
|    fps             | 261      |
|    iterations      | 24000    |
|    time_elapsed    | 458      |
|    total_timesteps | 120000   |
---------------------------------


[I 2023-11-05 19:24:59,366] Trial 0 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.00011138443927042074, 'lr': 4.930438883401521e-05}. Best is trial 0 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 30        |
|    mean_reward        | 0         |
| time/                 |           |
|    total_timesteps    | 120000    |
| train/                |           |
|    entropy_loss       | -7.04e-05 |
|    explained_variance | -73.9     |
|    learning_rate      | 0.00807   |
|    n_updates          | 23999     |
|    policy_loss        | -3.69e-09 |
|    value_loss         | 1.15e-06  |
-------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 25.3     |
|    ep_rew_mean     | 0.18     |
| time/              |          |
|    fps             | 261      |
|    iterations      | 24000    |
|    time_elapsed    | 458      |
|    total_timesteps | 120000   |
---------------------------------


[I 2023-11-05 19:24:59,569] Trial 1 finished with value: 0.0 and parameters: {'gamma': 0.00010053199046234635, 'lr': 0.00807110905519633}. Best is trial 0 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 40       |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 120000   |
| train/                |          |
|    entropy_loss       | -1.53    |
|    explained_variance | -81.1    |
|    learning_rate      | 3.41e-05 |
|    n_updates          | 23999    |
|    policy_loss        | 0.0258   |
|    value_loss         | 0.0019   |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 26       |
|    ep_rew_mean     | 0.25     |
| time/              |          |
|    fps             | 260      |
|    iterations      | 24000    |
|    time_elapsed    | 459      |
|    total_timesteps | 120000   |
---------------------------------


[I 2023-11-05 19:25:00,532] Trial 3 finished with value: 0.0 and parameters: {'gamma': 0.041977260804786753, 'lr': 3.410711293837213e-05}. Best is trial 0 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 21.9      |
|    ep_rew_mean        | 0.136     |
| time/                 |           |
|    fps                | 258       |
|    iterations         | 100       |
|    time_elapsed       | 1         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -7.33e-07 |
|    explained_variance | 0         |
|    learning_rate      | 0.0209    |
|    n_updates          | 99        |
|    policy_loss        | -0        |
|    value_loss         | 0.0444    |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 21.3      |
|    ep_rew_mean        | 0         |
| time/                 |           |
|    fps                | 264       |
|    iterations         | 100   

[I 2023-11-05 19:32:46,169] Trial 5 finished with value: 0.0 and parameters: {'gamma': 0.04767645039290211, 'lr': 0.028406998983909606}. Best is trial 0 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 24.1     |
|    ep_rew_mean        | 0.24     |
| time/                 |          |
|    fps                | 255      |
|    iterations         | 23900    |
|    time_elapsed       | 466      |
|    total_timesteps    | 119500   |
| train/                |          |
|    entropy_loss       | -1.57    |
|    explained_variance | -192     |
|    learning_rate      | 2.59e-05 |
|    n_updates          | 23899    |
|    policy_loss        | -0.0188  |
|    value_loss         | 0.00109  |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 20.1      |
|    ep_rew_mean        | 0.32      |
| time/                 |           |
|    fps                | 256       |
|    iterations         | 23900     |
|    time_e

[I 2023-11-05 19:32:47,823] Trial 4 finished with value: 0.0 and parameters: {'gamma': 0.05709326555967728, 'lr': 0.020929022489975194}. Best is trial 0 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 24.1      |
|    ep_rew_mean        | 0.1       |
| time/                 |           |
|    fps                | 243       |
|    iterations         | 100       |
|    time_elapsed       | 2         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -1.44e+07 |
|    learning_rate      | 2.5e-05   |
|    n_updates          | 99        |
|    policy_loss        | -0.441    |
|    value_loss         | 0.107     |
-------------------------------------
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 23.3     |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 120000   |
| train/                |          |
|

[I 2023-11-05 19:32:48,602] Trial 6 finished with value: 0.0 and parameters: {'gamma': 0.004069347809466794, 'lr': 2.5894085209266214e-05}. Best is trial 0 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 34.3      |
|    mean_reward        | 0         |
| time/                 |           |
|    total_timesteps    | 120000    |
| train/                |           |
|    entropy_loss       | -9.06e-18 |
|    explained_variance | 0         |
|    learning_rate      | 0.108     |
|    n_updates          | 23999     |
|    policy_loss        | -0        |
|    value_loss         | 3.27e-14  |
-------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21       |
|    ep_rew_mean     | 0.31     |
| time/              |          |
|    fps             | 256      |
|    iterations      | 24000    |
|    time_elapsed    | 468      |
|    total_timesteps | 120000   |
---------------------------------


[I 2023-11-05 19:32:48,882] Trial 7 finished with value: 0.0 and parameters: {'gamma': 0.000678430189218288, 'lr': 0.10780980825269564}. Best is trial 0 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 27.4      |
|    ep_rew_mean        | 0.188     |
| time/                 |           |
|    fps                | 249       |
|    iterations         | 100       |
|    time_elapsed       | 2         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -4.18e+06 |
|    learning_rate      | 4.92e-05  |
|    n_updates          | 99        |
|    policy_loss        | 0.415     |
|    value_loss         | 0.0907    |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23.2      |
|    ep_rew_mean        | 0.0976    |
| time/                 |           |
|    fps                | 247       |
|    iterations         | 200   

[I 2023-11-05 19:40:47,923] Trial 8 finished with value: 0.0 and parameters: {'gamma': 0.00012012236509474243, 'lr': 2.49836660880548e-05}. Best is trial 0 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 27.3     |
|    ep_rew_mean        | 0.24     |
| time/                 |          |
|    fps                | 248      |
|    iterations         | 23900    |
|    time_elapsed       | 481      |
|    total_timesteps    | 119500   |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | 0.862    |
|    learning_rate      | 4.92e-05 |
|    n_updates          | 23899    |
|    policy_loss        | 0.0124   |
|    value_loss         | 0.000314 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 25.5      |
|    ep_rew_mean        | 0.17      |
| time/                 |           |
|    fps                | 248       |
|    iterations         | 23900     |
|    time_e

[I 2023-11-05 19:40:51,191] Trial 9 finished with value: 0.0 and parameters: {'gamma': 0.00013358435610265243, 'lr': 4.9216706585886065e-05}. Best is trial 0 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 24.3      |
|    mean_reward        | 0         |
| time/                 |           |
|    total_timesteps    | 120000    |
| train/                |           |
|    entropy_loss       | -0.00936  |
|    explained_variance | -20.7     |
|    learning_rate      | 0.00056   |
|    n_updates          | 23999     |
|    policy_loss        | -3.16e-06 |
|    value_loss         | 8.23e-06  |
-------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 25.9     |
|    ep_rew_mean     | 0.16     |
| time/              |          |
|    fps             | 248      |
|    iterations      | 24000    |
|    time_elapsed    | 482      |
|    total_timesteps | 120000   |
---------------------------------


[I 2023-11-05 19:40:51,655] Trial 11 finished with value: 0.0 and parameters: {'gamma': 0.00010754084941771735, 'lr': 0.0005595429581414466}. Best is trial 0 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 14.7      |
|    mean_reward        | 0.333     |
| time/                 |           |
|    total_timesteps    | 120000    |
| train/                |           |
|    entropy_loss       | -0.84     |
|    explained_variance | -15.5     |
|    learning_rate      | 0.000344  |
|    n_updates          | 23999     |
|    policy_loss        | -0.000404 |
|    value_loss         | 3.98e-07  |
-------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 24.2     |
|    ep_rew_mean     | 0.26     |
| time/              |          |
|    fps             | 248      |
|    iterations      | 24000    |
|    time_elapsed    | 483      |
|    total_timesteps | 120000   |
---------------------------------


[I 2023-11-05 19:40:52,057] Trial 10 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.00010654339422518009, 'lr': 0.00034382371861133433}. Best is trial 0 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 26.9      |
|    ep_rew_mean        | 0.306     |
| time/                 |           |
|    fps                | 239       |
|    iterations         | 200       |
|    time_elapsed       | 4         |
|    total_timesteps    | 1000      |
| train/                |           |
|    entropy_loss       | -1.58     |
|    explained_variance | -6.72e+03 |
|    learning_rate      | 0.00045   |
|    n_updates          | 199       |
|    policy_loss        | -0.0325   |
|    value_loss         | 0.00167   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 21.6      |
|    ep_rew_mean        | 0.304     |
| time/                 |           |
|    fps                | 241       |
|    iterations         | 100   

[I 2023-11-05 19:48:57,207] Trial 12 finished with value: 0.0 and parameters: {'gamma': 0.0004208283783785676, 'lr': 0.0004502491337976066}. Best is trial 0 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 20.3      |
|    ep_rew_mean        | 0.29      |
| time/                 |           |
|    fps                | 245       |
|    iterations         | 23800     |
|    time_elapsed       | 485       |
|    total_timesteps    | 119000    |
| train/                |           |
|    entropy_loss       | -0.000121 |
|    explained_variance | 0         |
|    learning_rate      | 0.00159   |
|    n_updates          | 23799     |
|    policy_loss        | -2.77e-08 |
|    value_loss         | 1.08e-05  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 24.5      |
|    ep_rew_mean        | 0.29      |
| time/                 |           |
|    fps                | 243       |
|    iterations         | 23700 

[I 2023-11-05 19:49:01,486] Trial 14 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.00036760523621234744, 'lr': 0.0015911701154873357}. Best is trial 0 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23.8      |
|    ep_rew_mean        | 0.26      |
| time/                 |           |
|    fps                | 243       |
|    iterations         | 23900     |
|    time_elapsed       | 489       |
|    total_timesteps    | 119500    |
| train/                |           |
|    entropy_loss       | -0.425    |
|    explained_variance | -2.94     |
|    learning_rate      | 0.000513  |
|    n_updates          | 23899     |
|    policy_loss        | -0.000761 |
|    value_loss         | 5.81e-06  |
-------------------------------------
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 26.7      |
|    mean_reward        | 0.333     |
| time/                 |           |
|    total_timesteps    | 120000    |
| train/                |       

[I 2023-11-05 19:49:02,564] Trial 13 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.0004767811116872881, 'lr': 0.0007935671216617669}. Best is trial 0 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23.8      |
|    ep_rew_mean        | 0.29      |
| time/                 |           |
|    fps                | 240       |
|    iterations         | 300       |
|    time_elapsed       | 6         |
|    total_timesteps    | 1500      |
| train/                |           |
|    entropy_loss       | -1.57     |
|    explained_variance | -2.78e+04 |
|    learning_rate      | 0.00041   |
|    n_updates          | 299       |
|    policy_loss        | -0.162    |
|    value_loss         | 0.0179    |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 34.1      |
|    ep_rew_mean        | 0.0714    |
| time/                 |           |
|    fps                | 237       |
|    iterations         | 100   

[I 2023-11-05 19:49:04,193] Trial 15 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.0003890614480722814, 'lr': 0.0005130986987618192}. Best is trial 0 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 33        |
|    ep_rew_mean        | 0.333     |
| time/                 |           |
|    fps                | 236       |
|    iterations         | 100       |
|    time_elapsed       | 2         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -7.41e+04 |
|    learning_rate      | 0.000138  |
|    n_updates          | 99        |
|    policy_loss        | -0.251    |
|    value_loss         | 0.0454    |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 26       |
|    ep_rew_mean        | 0.263    |
| time/                 |          |
|    fps                | 241      |
|    iterations         | 400      |
|

[I 2023-11-05 19:59:34,365] Trial 19 finished with value: 0.0 and parameters: {'gamma': 0.0028718395125904997, 'lr': 0.0001751086575312934}. Best is trial 0 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 24.4     |
|    ep_rew_mean        | 0.25     |
| time/                 |          |
|    fps                | 178      |
|    iterations         | 22500    |
|    time_elapsed       | 631      |
|    total_timesteps    | 112500   |
| train/                |          |
|    entropy_loss       | -0.971   |
|    explained_variance | -7.54    |
|    learning_rate      | 0.000138 |
|    n_updates          | 22499    |
|    policy_loss        | 0.0337   |
|    value_loss         | 0.000792 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 22.9      |
|    ep_rew_mean        | 0.35      |
| time/                 |           |
|    fps                | 178       |
|    iterations         | 22800     |
|    time_e

[I 2023-11-05 20:00:11,030] Trial 16 finished with value: 0.0 and parameters: {'gamma': 0.00033709393111252384, 'lr': 0.0004101422457828232}. Best is trial 0 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 28.6     |
|    ep_rew_mean        | 0.16     |
| time/                 |          |
|    fps                | 198      |
|    iterations         | 1500     |
|    time_elapsed       | 37       |
|    total_timesteps    | 7500     |
| train/                |          |
|    entropy_loss       | -1.6     |
|    explained_variance | -0.683   |
|    learning_rate      | 0.000114 |
|    n_updates          | 1499     |
|    policy_loss        | -0.276   |
|    value_loss         | 0.0743   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 25.6     |
|    ep_rew_mean        | 0.25     |
| time/                 |          |
|    fps                | 177      |
|    iterations         | 23800    |
|    time_elapsed 

[I 2023-11-05 20:00:18,863] Trial 17 finished with value: 0.0 and parameters: {'gamma': 0.0016528661760854043, 'lr': 0.0001629854284910418}. Best is trial 0 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 19       |
|    mean_reward        | 0.333    |
| time/                 |          |
|    total_timesteps    | 120000   |
| train/                |          |
|    entropy_loss       | -1.11    |
|    explained_variance | -4.91    |
|    learning_rate      | 0.000138 |
|    n_updates          | 23999    |
|    policy_loss        | 0.00479  |
|    value_loss         | 3.85e-05 |
------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 23       |
|    ep_rew_mean     | 0.27     |
| time/              |          |
|    fps             | 177      |
|    iterations      | 24000    |
|    time_elapsed    | 676      |
|    total_timesteps | 120000   |
---------------------------------


[I 2023-11-05 20:00:19,453] Trial 18 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.0019871870263294183, 'lr': 0.00013849420173035274}. Best is trial 0 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 28.3      |
|    ep_rew_mean        | 0.19      |
| time/                 |           |
|    fps                | 197       |
|    iterations         | 1800      |
|    time_elapsed       | 45        |
|    total_timesteps    | 9000      |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -5.42e+04 |
|    learning_rate      | 0.000114  |
|    n_updates          | 1799      |
|    policy_loss        | 0.0553    |
|    value_loss         | 0.00485   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 22.5      |
|    ep_rew_mean        | 0.273     |
| time/                 |           |
|    fps                | 167       |
|    iterations         | 300   

[I 2023-11-05 20:09:54,165] Trial 20 finished with value: 0.0 and parameters: {'gamma': 0.00020284802419147743, 'lr': 0.00011388867193993195}. Best is trial 0 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 25.5     |
|    ep_rew_mean        | 0.22     |
| time/                 |          |
|    fps                | 170      |
|    iterations         | 19900    |
|    time_elapsed       | 583      |
|    total_timesteps    | 99500    |
| train/                |          |
|    entropy_loss       | -1.56    |
|    explained_variance | -138     |
|    learning_rate      | 8.58e-05 |
|    n_updates          | 19899    |
|    policy_loss        | 0.0131   |
|    value_loss         | 9.13e-05 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 30.3      |
|    ep_rew_mean        | 0.16      |
| time/                 |           |
|    fps                | 169       |
|    iterations         | 19500     |
|    time_e

[I 2023-11-05 20:11:56,379] Trial 21 finished with value: 0.0 and parameters: {'gamma': 0.0002149055388192363, 'lr': 8.583477830288253e-05}. Best is trial 0 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 26        |
|    ep_rew_mean        | 0.18      |
| time/                 |           |
|    fps                | 187       |
|    iterations         | 4600      |
|    time_elapsed       | 122       |
|    total_timesteps    | 23000     |
| train/                |           |
|    entropy_loss       | -0.000963 |
|    explained_variance | -55.9     |
|    learning_rate      | 0.00216   |
|    n_updates          | 4599      |
|    policy_loss        | -2.65e-07 |
|    value_loss         | 2.16e-05  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 26.2      |
|    ep_rew_mean        | 0.32      |
| time/                 |           |
|    fps                | 170       |
|    iterations         | 23800 

[I 2023-11-05 20:12:02,817] Trial 22 finished with value: 0.0 and parameters: {'gamma': 0.0001904524632163451, 'lr': 1.1539516515013368e-05}. Best is trial 0 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 24.3      |
|    ep_rew_mean        | 0.22      |
| time/                 |           |
|    fps                | 168       |
|    iterations         | 23800     |
|    time_elapsed       | 705       |
|    total_timesteps    | 119000    |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -3.18e+04 |
|    learning_rate      | 1.05e-05  |
|    n_updates          | 23799     |
|    policy_loss        | -0.124    |
|    value_loss         | 0.00803   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 24.4      |
|    ep_rew_mean        | 0.26      |
| time/                 |           |
|    fps                | 187       |
|    iterations         | 4900  

[I 2023-11-05 20:12:10,540] Trial 23 finished with value: 0.0 and parameters: {'gamma': 0.0002078336301372773, 'lr': 1.0491450806890773e-05}. Best is trial 0 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 22.3      |
|    ep_rew_mean        | 0.26      |
| time/                 |           |
|    fps                | 168       |
|    iterations         | 500       |
|    time_elapsed       | 14        |
|    total_timesteps    | 2500      |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -2.04e+05 |
|    learning_rate      | 1.14e-05  |
|    n_updates          | 499       |
|    policy_loss        | 0.711     |
|    value_loss         | 0.209     |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 29.4     |
|    ep_rew_mean        | 0.275    |
| time/                 |          |
|    fps                | 168      |
|    iterations         | 300      |
|

[I 2023-11-05 20:18:25,918] Trial 24 finished with value: 0.0 and parameters: {'gamma': 0.00021352297668658418, 'lr': 0.002158407353809441}. Best is trial 0 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 29.4      |
|    ep_rew_mean        | 0.22      |
| time/                 |           |
|    fps                | 235       |
|    iterations         | 18400     |
|    time_elapsed       | 390       |
|    total_timesteps    | 92000     |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -7.83e+04 |
|    learning_rate      | 1.14e-05  |
|    n_updates          | 18399     |
|    policy_loss        | -0.0814   |
|    value_loss         | 0.00691   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 26.6      |
|    ep_rew_mean        | 0.2       |
| time/                 |           |
|    fps                | 238       |
|    iterations         | 18000 

[I 2023-11-05 20:20:13,084] Trial 25 finished with value: 0.0 and parameters: {'gamma': 0.00021205402829358054, 'lr': 1.1364736637350329e-05}. Best is trial 0 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 25.8      |
|    ep_rew_mean        | 0.22      |
| time/                 |           |
|    fps                | 274       |
|    iterations         | 5900      |
|    time_elapsed       | 107       |
|    total_timesteps    | 29500     |
| train/                |           |
|    entropy_loss       | -0.00465  |
|    explained_variance | 0         |
|    learning_rate      | 0.00138   |
|    n_updates          | 5899      |
|    policy_loss        | -8.75e-07 |
|    value_loss         | 3.82e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 26.4      |
|    ep_rew_mean        | 0.2       |
| time/                 |           |
|    fps                | 242       |
|    iterations         | 23800 

[I 2023-11-05 20:20:18,363] Trial 26 finished with value: 0.0 and parameters: {'gamma': 0.00021951966065434535, 'lr': 0.0014662574487097045}. Best is trial 0 with value: 0.3333333333333333.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 25.3      |
|    ep_rew_mean        | 0.15      |
| time/                 |           |
|    fps                | 244       |
|    iterations         | 23900     |
|    time_elapsed       | 488       |
|    total_timesteps    | 119500    |
| train/                |           |
|    entropy_loss       | -3.75e-05 |
|    explained_variance | -2.22     |
|    learning_rate      | 0.00177   |
|    n_updates          | 23899     |
|    policy_loss        | -6.99e-09 |
|    value_loss         | 1e-05     |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 24.2      |
|    ep_rew_mean        | 0.28      |
| time/                 |           |
|    fps                | 273       |
|    iterations         | 6200  

[I 2023-11-05 20:20:21,119] Trial 27 finished with value: 1.0 and parameters: {'gamma': 0.0005900168535066029, 'lr': 0.001773005206375445}. Best is trial 27 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 26.6      |
|    ep_rew_mean        | 0.194     |
| time/                 |           |
|    fps                | 261       |
|    iterations         | 200       |
|    time_elapsed       | 3         |
|    total_timesteps    | 1000      |
| train/                |           |
|    entropy_loss       | -0.2      |
|    explained_variance | -21.6     |
|    learning_rate      | 0.00454   |
|    n_updates          | 199       |
|    policy_loss        | -9.33e-05 |
|    value_loss         | 1.08e-05  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 27.1      |
|    ep_rew_mean        | 0.25      |
| time/                 |           |
|    fps                | 273       |
|    iterations         | 6400  

[I 2023-11-05 20:26:01,548] Trial 28 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.0006551658660718711, 'lr': 0.0013831135767516391}. Best is trial 27 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 25.5      |
|    ep_rew_mean        | 0.14      |
| time/                 |           |
|    fps                | 249       |
|    iterations         | 17000     |
|    time_elapsed       | 341       |
|    total_timesteps    | 85000     |
| train/                |           |
|    entropy_loss       | -0.000217 |
|    explained_variance | -0.49     |
|    learning_rate      | 0.00314   |
|    n_updates          | 16999     |
|    policy_loss        | 3.01e-08  |
|    value_loss         | 3.74e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 26.1      |
|    ep_rew_mean        | 0.27      |
| time/                 |           |
|    fps                | 249       |
|    iterations         | 17400 

[I 2023-11-05 20:28:04,331] Trial 29 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.0005972275584032104, 'lr': 0.0011459516510804793}. Best is trial 27 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 24.6      |
|    ep_rew_mean        | 0.2       |
| time/                 |           |
|    fps                | 254       |
|    iterations         | 23600     |
|    time_elapsed       | 463       |
|    total_timesteps    | 118000    |
| train/                |           |
|    entropy_loss       | -4.86e-05 |
|    explained_variance | -52.5     |
|    learning_rate      | 0.00314   |
|    n_updates          | 23599     |
|    policy_loss        | -2.63e-08 |
|    value_loss         | 5.98e-05  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 27.1     |
|    ep_rew_mean        | 0.11     |
| time/                 |          |
|    fps                | 255      |
|    iterations         | 23800    |
|

[I 2023-11-05 20:28:08,441] Trial 30 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.0006729174889033615, 'lr': 0.004536288569949908}. Best is trial 27 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 24.4      |
|    ep_rew_mean        | 0.12      |
| time/                 |           |
|    fps                | 277       |
|    iterations         | 7100      |
|    time_elapsed       | 127       |
|    total_timesteps    | 35500     |
| train/                |           |
|    entropy_loss       | -0.00045  |
|    explained_variance | -2.55e+03 |
|    learning_rate      | 0.00479   |
|    n_updates          | 7099      |
|    policy_loss        | 1.26e-08  |
|    value_loss         | 1.51e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 22.9      |
|    ep_rew_mean        | 0.18      |
| time/                 |           |
|    fps                | 254       |
|    iterations         | 23900 

[I 2023-11-05 20:28:11,967] Trial 31 finished with value: 0.0 and parameters: {'gamma': 0.0007514922696902498, 'lr': 0.003135578477716134}. Best is trial 27 with value: 1.0.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 36.3     |
|    ep_rew_mean        | 0.0741   |
| time/                 |          |
|    fps                | 267      |
|    iterations         | 200      |
|    time_elapsed       | 3        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -0.0107  |
|    explained_variance | -44.2    |
|    learning_rate      | 0.004    |
|    n_updates          | 199      |
|    policy_loss        | 2.47e-06 |
|    value_loss         | 7.12e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 25.4      |
|    ep_rew_mean        | 0.13      |
| time/                 |           |
|    fps                | 277       |
|    iterations         | 7300      |
|    time_e

### Train agent with best hyper parameters

In [23]:
a2c_model = A2C(
    "MlpPolicy",
    env,
    verbose=1,
    gamma=1-0.001583962761136084,
    learning_rate=0.00030466139396632876,
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [24]:
# Train agent and save it
a2c_model.learn(int(1.2e5))
a2c_model.save("saved/a2c_baseline")

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 28.3      |
|    ep_rew_mean        | 0.176     |
| time/                 |           |
|    fps                | 2091      |
|    iterations         | 100       |
|    time_elapsed       | 0         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.61     |
|    explained_variance | -1.25e+03 |
|    learning_rate      | 0.000305  |
|    n_updates          | 99        |
|    policy_loss        | 0.0797    |
|    value_loss         | 0.0024    |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 26.1      |
|    ep_rew_mean        | 0.263     |
| time/                 |           |
|    fps                | 2117      |
|    iterations         | 200       |
|    time_elapsed       | 0         |
|    total_timesteps    | 1000      |
| train/    

#### Testing 

In [25]:
a2c_model = A2C.load("saved/a2c_baseline")

env.reset()

mean_reward, std_reward = evaluate_policy(a2c_model, env, deterministic=True, n_eval_episodes=20)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward:0.15 +/- 0.36




In [26]:
# RANDOM SEED
from IPython.display import SVG, display
from pogema.animation import AnimationMonitor, AnimationConfig

env = AnimationMonitor(env)

def evaluate_success_rate(model, env, num_episodes=100):
    success_count = 0
    step_array = []
    for i in range(num_episodes):
        print(f'---{i}---')
        obs = env.reset()

        # Check if observation is a tuple and extract the first element if true.
        if isinstance(obs, tuple):
            obs = obs[0]
        max_step = 100
        steps_taken = 0
        done = truncated = False
        while not done and max_step > 0:
            action, _ = model.predict(obs)
            next_obs, reward, done, truncated, info = env.step(action)
            print(action,max_step,success_count,done)
            max_step -= 1
            steps_taken += 1
            # Check if next_obs is a tuple and extract the first element if true.
            if isinstance(next_obs, tuple):
                next_obs = next_obs[0]
            obs = next_obs

            # Check if agent was successful in that episode.
            if done:
                success_count += 1
                step_array.append(steps_taken)
                env.save_animation(f"render{i}.svg", AnimationConfig(egocentric_idx=0))
                break

    success_rate = success_count / num_episodes
    return success_rate, step_array

success_rate,step_array = evaluate_success_rate(a2c_model, env)
print(f"Agent Success Rate: {success_rate * 100:.2f}%")
print(f"steps to termination : {step_array}")

---0---
2 100 0 False
2 99 0 False
3 98 0 False
4 97 0 False
1 96 0 False
2 95 0 False
3 94 0 False
4 93 0 False
3 92 0 False
4 91 0 False
3 90 0 False
4 89 0 False
0 88 0 False
1 87 0 False
2 86 0 False
3 85 0 False
0 84 0 False
3 83 0 False
2 82 0 False
2 81 0 False
4 80 0 False
0 79 0 False
3 78 0 False
4 77 0 False
3 76 0 False
2 75 0 False
3 74 0 False
2 73 0 False
2 72 0 False
2 71 0 False
2 70 0 False
2 69 0 False
2 68 0 False
2 67 0 False
2 66 0 False
2 65 0 False
2 64 0 False
2 63 0 False
2 62 0 False
2 61 0 False
2 60 0 False
2 59 0 False
2 58 0 False
2 57 0 False
2 56 0 False
2 55 0 False
2 54 0 False
2 53 0 False
2 52 0 False
2 51 0 False
2 50 0 False
2 49 0 False
2 48 0 False
2 47 0 False
2 46 0 False
2 45 0 False
2 44 0 False
2 43 0 False
2 42 0 False
2 41 0 False
2 40 0 False
2 39 0 False
2 38 0 False
2 37 0 False
2 36 0 False
2 35 0 False
2 34 0 False
2 33 0 False
2 32 0 False
2 31 0 False
2 30 0 False
2 29 0 False
2 28 0 False
2 27 0 False
2 26 0 False
2 25 0 False
2 2

In [None]:
# we got 35% success rate for 1.25e5 steps
# we got 7% success rate for 3.0e5 steps
# better to use 1.25e5 steps