In [16]:
import gymnasium as gym
from pogema import GridConfig
from stable_baselines3 import DQN, A2C
from stable_baselines3.common.evaluation import evaluate_policy

%load_ext autoreload
%autoreload 2
%matplotlib inline

grid_config = GridConfig(
    size=8,
    density=0.3,
    num_agents=1,
    max_episode_steps=30
)

env = gym.make("Pogema-v0",grid_config=grid_config)

a2c_model = A2C(
    "MlpPolicy",
    env,
    verbose=1
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


  logger.warn(
  logger.warn(


#### Optuna Integration

In [17]:
""" Optuna example that optimizes the hyperparameters of
a reinforcement learning agent using A2C implementation from Stable-Baselines3
on a Gymnasium environment.

This is a simplified version of what can be found in https://github.com/DLR-RM/rl-baselines3-zoo.

You can run this example as follows:
    $ python sb3_simple.py

"""
from typing import Any
from typing import Dict

import gymnasium
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from stable_baselines3 import A2C
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
import torch
import torch.nn as nn


N_TRIALS = 100
N_STARTUP_TRIALS = 5
N_EVALUATIONS = 2
N_TIMESTEPS = int(1.2e5)
EVAL_FREQ = int(N_TIMESTEPS / N_EVALUATIONS)
N_EVAL_EPISODES = 3


ENV_ID = "Pogema-v0"

DEFAULT_HYPERPARAMS = {
    "policy": "MlpPolicy",
    "verbose": 1,
    "env": ENV_ID
}

def sample_a2c_params(trial: optuna.Trial) -> Dict[str, Any]:
    """Sampler for A2C hyperparameters."""
    gamma = 1.0 - trial.suggest_float("gamma", 0.0001, 0.1, log=True)
    max_grad_norm = trial.suggest_float("max_grad_norm", 0.3, 5.0, log=True)
    gae_lambda = 1.0 - trial.suggest_float("gae_lambda", 0.001, 0.2, log=True)
    n_steps = 2 ** trial.suggest_int("exponent_n_steps", 3, 10)
    learning_rate = trial.suggest_float("lr", 1e-5, 1, log=True)
    ent_coef = trial.suggest_float("ent_coef", 0.00000001, 0.1, log=True)
    ortho_init = trial.suggest_categorical("ortho_init", [False, True])
    net_arch = trial.suggest_categorical("net_arch", ["tiny", "small"])
    activation_fn = trial.suggest_categorical("activation_fn", ["tanh", "relu"])

    # Display true values.
    trial.set_user_attr("gamma_", gamma)
    trial.set_user_attr("gae_lambda_", gae_lambda)
    trial.set_user_attr("n_steps", n_steps)

    net_arch = [
        {"pi": [64], "vf": [64]} if net_arch == "tiny" else {"pi": [64, 64], "vf": [64, 64]}
    ]

    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU}[activation_fn]

    return {
        "n_steps": n_steps,
        "gamma": gamma,
        "gae_lambda": gae_lambda,
        "learning_rate": learning_rate,
        "ent_coef": ent_coef,
        "max_grad_norm": max_grad_norm,
        "policy_kwargs": {
            "net_arch": net_arch,
            "activation_fn": activation_fn,
            "ortho_init": ortho_init,
        },
    }


class TrialEvalCallback(EvalCallback):
    """Callback used for evaluating and reporting a trial."""

    def __init__(
        self,
        eval_env: gymnasium.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        self.trial = trial
        self.eval_idx = 0
        self.is_pruned = False

    def _on_step(self) -> bool:
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            super()._on_step()
            self.eval_idx += 1
            self.trial.report(self.last_mean_reward, self.eval_idx)
            # Prune trial if need.
            if self.trial.should_prune():
                self.is_pruned = True
                return False
        return True


def objective(trial: optuna.Trial) -> float:
    kwargs = DEFAULT_HYPERPARAMS.copy()
    # Sample hyperparameters.
    kwargs.update(sample_a2c_params(trial))
    # Create the RL model.
    model = A2C(**kwargs)
    # Create env used for evaluation.
    eval_env = Monitor(env)
    # Create the callback that will periodically evaluate and report the performance.
    eval_callback = TrialEvalCallback(
        eval_env, trial, n_eval_episodes=N_EVAL_EPISODES, eval_freq=EVAL_FREQ, deterministic=True
    )

    nan_encountered = False
    try:
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN.
        print(e)
        nan_encountered = True
    finally:
        # Free memory.
        model.env.close()
        eval_env.close()

    # Tell the optimizer that the trial failed.
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward


if __name__ == "__main__":
    # Set pytorch num threads to 1 for faster training.
    torch.set_num_threads(1)

    sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
    # Do not prune before 1/3 of the max budget is used.
    pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3)

    study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")
    try:
        study.optimize(objective, n_trials=N_TRIALS, timeout=600)
    except KeyboardInterrupt:
        pass

    print("Number of finished trials: ", len(study.trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    print("  User attrs:")
    for key, value in trial.user_attrs.items():
        print("    {}: {}".format(key, value))


[I 2023-10-31 21:54:20,111] A new study created in memory with name: no-name-466b23a5-262d-4128-85aa-19e661157efa


Using cpu device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 64        |
|    ep_rew_mean        | 0         |
| time/                 |           |
|    fps                | 4994      |
|    iterations         | 100       |
|    time_elapsed       | 5         |
|    total_timesteps    | 25600     |
| train/                |           |
|    entropy_loss       | -1.97e-19 |
|    explained_variance | 0         |
|    learning_rate      | 0.138     |
|    n_updates          | 99        |
|    policy_loss        | -0        |
|    value_loss         | 0.465     |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 64        |
|    ep_rew_mean        | 0         |
| time/                 |           |
|    fps         

[I 2023-10-31 21:54:43,668] Trial 0 finished with value: 0.0 and parameters: {'gamma': 0.007766388324533128, 'max_grad_norm': 4.8474070671681675, 'gae_lambda': 0.00217321112824313, 'exponent_n_steps': 8, 'lr': 0.13786081068620215, 'ent_coef': 1.3948922996470193e-06, 'ortho_init': False, 'net_arch': 'tiny', 'activation_fn': 'tanh'}. Best is trial 0 with value: 0.0.


Using cpu device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 30       |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 60000    |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | 0.0244   |
|    learning_rate      | 4.92e-05 |
|    n_updates          | 58       |
|    policy_loss        | 0.031    |
|    value_loss         | 0.0436   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 49.2     |
|    ep_rew_mean        | 0.35     |
| time/                 |          |
|    fps                | 4837     |
|    iterations         | 100      |
|    time_elapsed       | 21       |
|    total_timesteps    | 102400   |
| 

[I 2023-10-31 21:55:08,647] Trial 1 finished with value: 0.0 and parameters: {'gamma': 0.011581858761745256, 'max_grad_norm': 0.3556731284678197, 'gae_lambda': 0.019533437342307433, 'exponent_n_steps': 10, 'lr': 4.9182808011402786e-05, 'ent_coef': 4.0973433150248817e-08, 'ortho_init': False, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 0 with value: 0.0.


Using cpu device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 57.1      |
|    ep_rew_mean        | 0.111     |
| time/                 |           |
|    fps                | 3398      |
|    iterations         | 100       |
|    time_elapsed       | 0         |
|    total_timesteps    | 1600      |
| train/                |           |
|    entropy_loss       | -2.14e-10 |
|    explained_variance | 0         |
|    learning_rate      | 0.0381    |
|    n_updates          | 99        |
|    policy_loss        | -0        |
|    value_loss         | 0.00481   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 58.3      |
|    ep_rew_mean        | 0.0926    |
| time/                 |           |
|    fps         

[I 2023-10-31 21:55:40,944] Trial 2 finished with value: 0.0 and parameters: {'gamma': 0.004687854283560035, 'max_grad_norm': 0.6304394737081792, 'gae_lambda': 0.0025356020795575655, 'exponent_n_steps': 4, 'lr': 0.03814354585567485, 'ent_coef': 0.0002066303196566671, 'ortho_init': False, 'net_arch': 'tiny', 'activation_fn': 'tanh'}. Best is trial 0 with value: 0.0.


Using cpu device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 58.2      |
|    ep_rew_mean        | 0.0926    |
| time/                 |           |
|    fps                | 3931      |
|    iterations         | 100       |
|    time_elapsed       | 0         |
|    total_timesteps    | 3200      |
| train/                |           |
|    entropy_loss       | -2.89e-14 |
|    explained_variance | -1.19e-07 |
|    learning_rate      | 0.0647    |
|    n_updates          | 99        |
|    policy_loss        | -0        |
|    value_loss         | 5.72e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 60.9      |
|    ep_rew_mean        | 0.05      |
| time/                 |           |
|    fps         

[I 2023-10-31 21:56:10,843] Trial 3 finished with value: 0.0 and parameters: {'gamma': 0.008412595766767976, 'max_grad_norm': 1.756283620779071, 'gae_lambda': 0.007745150779740534, 'exponent_n_steps': 5, 'lr': 0.06474918285977753, 'ent_coef': 2.0200593339020013e-08, 'ortho_init': False, 'net_arch': 'small', 'activation_fn': 'relu'}. Best is trial 0 with value: 0.0.


Using cpu device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 49       |
|    ep_rew_mean        | 0.4      |
| time/                 |          |
|    fps                | 4631     |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 6400     |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -4.91    |
|    learning_rate      | 3.66e-05 |
|    n_updates          | 99       |
|    policy_loss        | -0.106   |
|    value_loss         | 0.044    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 48.5     |
|    ep_rew_mean        | 0.34     |
| time/                 |          |
|    fps                | 4654     |
| 

[I 2023-10-31 21:56:36,270] Trial 4 finished with value: 0.0 and parameters: {'gamma': 0.024721819971218598, 'max_grad_norm': 0.5807632068964226, 'gae_lambda': 0.03450851209437536, 'exponent_n_steps': 6, 'lr': 3.659609238185422e-05, 'ent_coef': 2.8926314207478015e-08, 'ortho_init': True, 'net_arch': 'tiny', 'activation_fn': 'relu'}. Best is trial 0 with value: 0.0.


Using cpu device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 64        |
|    ep_rew_mean        | 0         |
| time/                 |           |
|    fps                | 5239      |
|    iterations         | 100       |
|    time_elapsed       | 9         |
|    total_timesteps    | 51200     |
| train/                |           |
|    entropy_loss       | -8.26e-37 |
|    explained_variance | 0.999     |
|    learning_rate      | 0.861     |
|    n_updates          | 99        |
|    policy_loss        | -0        |
|    value_loss         | 0.146     |
-------------------------------------
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 30        |
|    mean_reward        | 0         |
| time/                 |           |
|    total_timest

[I 2023-10-31 21:56:59,281] Trial 5 finished with value: 0.0 and parameters: {'gamma': 0.00031150332857982223, 'max_grad_norm': 4.243918629113777, 'gae_lambda': 0.0010315554071207312, 'exponent_n_steps': 9, 'lr': 0.8607585575887486, 'ent_coef': 0.013918884550001644, 'ortho_init': True, 'net_arch': 'tiny', 'activation_fn': 'tanh'}. Best is trial 0 with value: 0.0.


Using cpu device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 43.3     |
|    ep_rew_mean        | 0.46     |
| time/                 |          |
|    fps                | 5077     |
|    iterations         | 100      |
|    time_elapsed       | 5        |
|    total_timesteps    | 25600    |
| train/                |          |
|    entropy_loss       | -1.27    |
|    explained_variance | 0.426    |
|    learning_rate      | 0.00215  |
|    n_updates          | 99       |
|    policy_loss        | 0.0416   |
|    value_loss         | 0.0243   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 31.1     |
|    ep_rew_mean        | 0.71     |
| time/                 |          |
|    fps                | 5046     |
| 

[I 2023-10-31 21:57:23,627] Trial 6 finished with value: 0.0 and parameters: {'gamma': 0.06325989424032957, 'max_grad_norm': 4.732864183942282, 'gae_lambda': 0.16150265932208213, 'exponent_n_steps': 8, 'lr': 0.0021524327354643354, 'ent_coef': 5.924502038979039e-06, 'ortho_init': False, 'net_arch': 'tiny', 'activation_fn': 'tanh'}. Best is trial 0 with value: 0.0.


Using cpu device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 61.5     |
|    ep_rew_mean        | 0.04     |
| time/                 |          |
|    fps                | 4951     |
|    iterations         | 100      |
|    time_elapsed       | 2        |
|    total_timesteps    | 12800    |
| train/                |          |
|    entropy_loss       | -1.1e-37 |
|    explained_variance | 0.566    |
|    learning_rate      | 0.279    |
|    n_updates          | 99       |
|    policy_loss        | -0       |
|    value_loss         | 0.395    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 59.7     |
|    ep_rew_mean        | 0.07     |
| time/                 |          |
|    fps                | 4982     |
| 

[I 2023-10-31 21:57:47,593] Trial 7 finished with value: 0.0 and parameters: {'gamma': 0.001404420646105219, 'max_grad_norm': 2.1218391640384398, 'gae_lambda': 0.003775851414188718, 'exponent_n_steps': 7, 'lr': 0.2785189513163106, 'ent_coef': 3.865421548103447e-06, 'ortho_init': False, 'net_arch': 'tiny', 'activation_fn': 'tanh'}. Best is trial 0 with value: 0.0.


Using cpu device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 64       |
|    ep_rew_mean        | 0        |
| time/                 |          |
|    fps                | 2429     |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 800      |
| train/                |          |
|    entropy_loss       | -0.00486 |
|    explained_variance | 0        |
|    learning_rate      | 0.0044   |
|    n_updates          | 99       |
|    policy_loss        | 4.94e-09 |
|    value_loss         | 9.23e-11 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 64        |
|    ep_rew_mean        | 0         |
| time/                 |           |
|    fps                | 2494    

[I 2023-10-31 21:58:34,498] Trial 8 finished with value: 0.0 and parameters: {'gamma': 0.001452177494538863, 'max_grad_norm': 2.7734815269913113, 'gae_lambda': 0.0010588798748320987, 'exponent_n_steps': 3, 'lr': 0.004400119837346594, 'ent_coef': 0.0001846281099469569, 'ortho_init': True, 'net_arch': 'small', 'activation_fn': 'relu'}. Best is trial 0 with value: 0.0.


Using cpu device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 60.9      |
|    ep_rew_mean        | 0.05      |
| time/                 |           |
|    fps                | 5026      |
|    iterations         | 100       |
|    time_elapsed       | 5         |
|    total_timesteps    | 25600     |
| train/                |           |
|    entropy_loss       | -6.02e-41 |
|    explained_variance | 0         |
|    learning_rate      | 0.981     |
|    n_updates          | 99        |
|    policy_loss        | -0        |
|    value_loss         | 7.53      |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 64       |
|    ep_rew_mean        | 0        |
| time/                 |          |
|    fps              

[I 2023-10-31 21:58:58,064] Trial 9 finished with value: 0.0 and parameters: {'gamma': 0.08995787380068657, 'max_grad_norm': 1.1465355959844217, 'gae_lambda': 0.006686814449879918, 'exponent_n_steps': 8, 'lr': 0.981354581164826, 'ent_coef': 7.445058539801516e-07, 'ortho_init': False, 'net_arch': 'tiny', 'activation_fn': 'tanh'}. Best is trial 0 with value: 0.0.


Using cpu device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 30        |
|    mean_reward        | 0         |
| time/                 |           |
|    total_timesteps    | 60000     |
| train/                |           |
|    entropy_loss       | -1.37e-16 |
|    explained_variance | 0.544     |
|    learning_rate      | 0.0148    |
|    n_updates          | 58        |
|    policy_loss        | -0        |
|    value_loss         | 0.00154   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 61.5      |
|    ep_rew_mean        | 0.04      |
| time/                 |           |
|    fps                | 4888      |
|    iterations         | 100       |
|    time_elapsed       | 20        |
|    total_timest

[I 2023-10-31 21:59:22,809] Trial 10 finished with value: 0.0 and parameters: {'gamma': 0.00012066402439795925, 'max_grad_norm': 3.0579064008976204, 'gae_lambda': 0.0025005281985875953, 'exponent_n_steps': 10, 'lr': 0.014764952722090349, 'ent_coef': 4.4729795021085463e-07, 'ortho_init': True, 'net_arch': 'small', 'activation_fn': 'relu'}. Best is trial 0 with value: 0.0.


Using cpu device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 30       |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 60000    |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | 0.0694   |
|    learning_rate      | 1.39e-05 |
|    n_updates          | 58       |
|    policy_loss        | 0.0338   |
|    value_loss         | 0.0419   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 50.4     |
|    ep_rew_mean        | 0.32     |
| time/                 |          |
|    fps                | 4843     |
|    iterations         | 100      |
|    time_elapsed       | 21       |
|    total_timesteps    | 102400   |
| 

[I 2023-10-31 21:59:47,776] Trial 11 finished with value: 0.0 and parameters: {'gamma': 0.0142190383809775, 'max_grad_norm': 0.3066533960598381, 'gae_lambda': 0.02125723216209317, 'exponent_n_steps': 10, 'lr': 1.3936120978492886e-05, 'ent_coef': 2.4326119142451555e-07, 'ortho_init': False, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 0 with value: 0.0.


Using cpu device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 48.7     |
|    ep_rew_mean        | 0.4      |
| time/                 |          |
|    fps                | 4682     |
|    iterations         | 100      |
|    time_elapsed       | 10       |
|    total_timesteps    | 51200    |
| train/                |          |
|    entropy_loss       | -1.56    |
|    explained_variance | 0.253    |
|    learning_rate      | 0.000294 |
|    n_updates          | 99       |
|    policy_loss        | -0.0233  |
|    value_loss         | 0.0349   |
------------------------------------
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 30       |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 60000    |
| 

[I 2023-10-31 22:00:14,213] Trial 12 finished with value: 0.0 and parameters: {'gamma': 0.02230368263092605, 'max_grad_norm': 1.2577880301514777, 'gae_lambda': 0.017009031672683923, 'exponent_n_steps': 9, 'lr': 0.0002943761509086802, 'ent_coef': 1.3749005540026068e-08, 'ortho_init': False, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 0 with value: 0.0.


Using cpu device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 46.9     |
|    ep_rew_mean        | 0.36     |
| time/                 |          |
|    fps                | 4143     |
|    iterations         | 100      |
|    time_elapsed       | 6        |
|    total_timesteps    | 25600    |
| train/                |          |
|    entropy_loss       | -1.6     |
|    explained_variance | 0.205    |
|    learning_rate      | 0.000304 |
|    n_updates          | 99       |
|    policy_loss        | 0.00434  |
|    value_loss         | 0.00984  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 50.5     |
|    ep_rew_mean        | 0.32     |
| time/                 |          |
|    fps                | 4267     |
| 

[I 2023-10-31 22:00:41,624] Trial 13 finished with value: 0.0 and parameters: {'gamma': 0.004943491706027309, 'max_grad_norm': 0.3441859378423065, 'gae_lambda': 0.04349885998994662, 'exponent_n_steps': 8, 'lr': 0.00030413788642229186, 'ent_coef': 1.2582406030704463e-07, 'ortho_init': False, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 0 with value: 0.0.


Using cpu device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 30        |
|    mean_reward        | 0         |
| time/                 |           |
|    total_timesteps    | 60000     |
| train/                |           |
|    entropy_loss       | -1e-18    |
|    explained_variance | -1.19e-07 |
|    learning_rate      | 0.146     |
|    n_updates          | 58        |
|    policy_loss        | -0        |
|    value_loss         | 0.00452   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 61.5      |
|    ep_rew_mean        | 0.04      |
| time/                 |           |
|    fps                | 4953      |
|    iterations         | 100       |
|    time_elapsed       | 20        |
|    total_timest

[I 2023-10-31 22:01:05,932] Trial 14 finished with value: 0.0 and parameters: {'gamma': 0.0024535731792239258, 'max_grad_norm': 4.910031296455579, 'gae_lambda': 0.010065378174088577, 'exponent_n_steps': 10, 'lr': 0.1455644775429977, 'ent_coef': 3.0142558066241973e-06, 'ortho_init': False, 'net_arch': 'tiny', 'activation_fn': 'tanh'}. Best is trial 0 with value: 0.0.


Using cpu device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 64        |
|    ep_rew_mean        | 0         |
| time/                 |           |
|    fps                | 4329      |
|    iterations         | 100       |
|    time_elapsed       | 2         |
|    total_timesteps    | 12800     |
| train/                |           |
|    entropy_loss       | -0.000467 |
|    explained_variance | 0.305     |
|    learning_rate      | 0.0127    |
|    n_updates          | 99        |
|    policy_loss        | 4.21e-06  |
|    value_loss         | 0.0134    |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 64       |
|    ep_rew_mean        | 0        |
| time/                 |          |
|    fps              

[I 2023-10-31 22:01:33,355] Trial 15 finished with value: 0.0 and parameters: {'gamma': 0.010075663958367786, 'max_grad_norm': 0.7442737996326273, 'gae_lambda': 0.004968999389097294, 'exponent_n_steps': 7, 'lr': 0.012652460656987649, 'ent_coef': 1.7752792057195865e-07, 'ortho_init': False, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 0 with value: 0.0.


Using cpu device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 47.1     |
|    ep_rew_mean        | 0.42     |
| time/                 |          |
|    fps                | 4495     |
|    iterations         | 100      |
|    time_elapsed       | 11       |
|    total_timesteps    | 51200    |
| train/                |          |
|    entropy_loss       | -1.53    |
|    explained_variance | 0.173    |
|    learning_rate      | 0.000876 |
|    n_updates          | 99       |
|    policy_loss        | -0.0901  |
|    value_loss         | 0.034    |
------------------------------------
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 30       |
|    mean_reward        | 0        |
| time/                 |          |
|    total_timesteps    | 60000    |
| 

[I 2023-10-31 22:01:59,838] Trial 16 finished with value: 0.0 and parameters: {'gamma': 0.03796731376311302, 'max_grad_norm': 0.4708597714505552, 'gae_lambda': 0.013024773078617333, 'exponent_n_steps': 9, 'lr': 0.0008757227712404402, 'ent_coef': 1.0108470357271522e-08, 'ortho_init': False, 'net_arch': 'small', 'activation_fn': 'tanh'}. Best is trial 0 with value: 0.0.


Using cpu device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 47.6     |
|    ep_rew_mean        | 0.4      |
| time/                 |          |
|    fps                | 4150     |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 6400     |
| train/                |          |
|    entropy_loss       | -1.59    |
|    explained_variance | 0.119    |
|    learning_rate      | 8.5e-05  |
|    n_updates          | 99       |
|    policy_loss        | 0.322    |
|    value_loss         | 0.0797   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 47.2     |
|    ep_rew_mean        | 0.38     |
| time/                 |          |
|    fps                | 4197     |
| 

[I 2023-10-31 22:02:26,563] Trial 17 finished with value: 0.3333333333333333 and parameters: {'gamma': 0.014799308736613791, 'max_grad_norm': 1.0267419222225287, 'gae_lambda': 0.028819325050136246, 'exponent_n_steps': 6, 'lr': 8.495909802433532e-05, 'ent_coef': 1.5789455898864403e-05, 'ortho_init': False, 'net_arch': 'tiny', 'activation_fn': 'tanh'}. Best is trial 17 with value: 0.3333333333333333.


Using cpu device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 60.2      |
|    ep_rew_mean        | 0.07      |
| time/                 |           |
|    fps                | 4493      |
|    iterations         | 100       |
|    time_elapsed       | 1         |
|    total_timesteps    | 6400      |
| train/                |           |
|    entropy_loss       | -0.000153 |
|    explained_variance | 0.913     |
|    learning_rate      | 0.0103    |
|    n_updates          | 99        |
|    policy_loss        | 6.38e-09  |
|    value_loss         | 4.53e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 57.7      |
|    ep_rew_mean        | 0.1       |
| time/                 |           |
|    fps         

[I 2023-10-31 22:02:53,078] Trial 18 finished with value: 0.0 and parameters: {'gamma': 0.020726745620378406, 'max_grad_norm': 0.9451635635578872, 'gae_lambda': 0.06533583823665852, 'exponent_n_steps': 6, 'lr': 0.010337760903370796, 'ent_coef': 1.9526714343855372e-05, 'ortho_init': True, 'net_arch': 'tiny', 'activation_fn': 'relu'}. Best is trial 17 with value: 0.3333333333333333.


Using cpu device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 56.2     |
|    ep_rew_mean        | 0.143    |
| time/                 |          |
|    fps                | 4224     |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 3200     |
| train/                |          |
|    entropy_loss       | -0.758   |
|    explained_variance | -0.337   |
|    learning_rate      | 0.00155  |
|    n_updates          | 99       |
|    policy_loss        | 0.236    |
|    value_loss         | 0.189    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 59.3     |
|    ep_rew_mean        | 0.1      |
| time/                 |          |
|    fps                | 4241     |
| 

[I 2023-10-31 22:03:22,297] Trial 19 finished with value: 0.0 and parameters: {'gamma': 0.005895639264222927, 'max_grad_norm': 1.6408086637454555, 'gae_lambda': 0.010419204002362827, 'exponent_n_steps': 5, 'lr': 0.0015528707234077053, 'ent_coef': 8.800320799540755e-05, 'ortho_init': False, 'net_arch': 'tiny', 'activation_fn': 'tanh'}. Best is trial 17 with value: 0.3333333333333333.


Using cpu device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 50.9     |
|    ep_rew_mean        | 0.274    |
| time/                 |          |
|    fps                | 4314     |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 3200     |
| train/                |          |
|    entropy_loss       | -1.58    |
|    explained_variance | -0.203   |
|    learning_rate      | 0.000259 |
|    n_updates          | 99       |
|    policy_loss        | 0.548    |
|    value_loss         | 0.231    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 47.8     |
|    ep_rew_mean        | 0.34     |
| time/                 |          |
|    fps                | 4323     |
| 

[I 2023-10-31 22:03:51,012] Trial 20 finished with value: 0.6666666666666666 and parameters: {'gamma': 0.03619853995916698, 'max_grad_norm': 0.8746145594736527, 'gae_lambda': 0.028187121356535644, 'exponent_n_steps': 5, 'lr': 0.00025939043596031973, 'ent_coef': 2.8784426309308087e-05, 'ortho_init': False, 'net_arch': 'tiny', 'activation_fn': 'tanh'}. Best is trial 20 with value: 0.6666666666666666.


Using cpu device
Creating environment from the given name 'Pogema-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 47.6     |
|    ep_rew_mean        | 0.388    |
| time/                 |          |
|    fps                | 4107     |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 3200     |
| train/                |          |
|    entropy_loss       | -1.6     |
|    explained_variance | -1.38    |
|    learning_rate      | 0.000189 |
|    n_updates          | 99       |
|    policy_loss        | -0.0785  |
|    value_loss         | 0.00574  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 48.1     |
|    ep_rew_mean        | 0.4      |
| time/                 |          |
|    fps                | 4181     |
| 

[I 2023-10-31 22:04:20,676] Trial 21 finished with value: 0.0 and parameters: {'gamma': 0.04369849934381889, 'max_grad_norm': 1.0320557281366587, 'gae_lambda': 0.027222553370946506, 'exponent_n_steps': 5, 'lr': 0.0001890419040917064, 'ent_coef': 3.0151527434143574e-05, 'ortho_init': False, 'net_arch': 'tiny', 'activation_fn': 'tanh'}. Best is trial 20 with value: 0.6666666666666666.


Number of finished trials:  22
Best trial:
  Value:  0.6666666666666666
  Params: 
    gamma: 0.03619853995916698
    max_grad_norm: 0.8746145594736527
    gae_lambda: 0.028187121356535644
    exponent_n_steps: 5
    lr: 0.00025939043596031973
    ent_coef: 2.8784426309308087e-05
    ortho_init: False
    net_arch: tiny
    activation_fn: tanh
  User attrs:
    gamma_: 0.963801460040833
    gae_lambda_: 0.9718128786434643
    n_steps: 32


### Train agent with best hyper parameters

In [31]:
a2c_model = A2C(
    "MlpPolicy",
    env,
    verbose=1,
    gamma=0.03619853995916698,
    max_grad_norm=0.8746145594736527,
    gae_lambda=0.028187121356535644,
    learning_rate=0.00025939043596031973,
    ent_coef=2.8784426309308087e-05,
    n_steps=32,
    #policy_kwargs={"net_arch": [{"pi": [64], "vf": [64]}], "activation_fn": nn.Tanh, "ortho_init": False}
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [32]:
# Train agent and save it
a2c_model.learn(int(1.2e5))
a2c_model.save("saved/a2c_baseline")

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 25.2     |
|    ep_rew_mean        | 0.3      |
| time/                 |          |
|    fps                | 781      |
|    iterations         | 100      |
|    time_elapsed       | 4        |
|    total_timesteps    | 3200     |
| train/                |          |
|    entropy_loss       | -1.6     |
|    explained_variance | -784     |
|    learning_rate      | 0.000259 |
|    n_updates          | 99       |
|    policy_loss        | 0.0966   |
|    value_loss         | 0.0192   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 25       |
|    ep_rew_mean        | 0.3      |
| time/                 |          |
|    fps                | 782      |
|    iterations         | 200      |
|    time_elapsed       | 8        |
|    total_timesteps    | 6400     |
| train/                |          |
|

#### Testing 

In [33]:
a2c_model = A2C.load("saved/a2c_baseline")

env.reset()

mean_reward, std_reward = evaluate_policy(a2c_model, env, deterministic=True, n_eval_episodes=20)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward:0.20 +/- 0.40


In [34]:
# RANDOM SEED
from IPython.display import SVG, display
from pogema.animation import AnimationMonitor, AnimationConfig

env = AnimationMonitor(env)

def evaluate_success_rate(model, env, num_episodes=100):
    success_count = 0
    step_array = []
    for i in range(num_episodes):
        print(f'---{i}---')
        obs = env.reset()

        # Check if observation is a tuple and extract the first element if true.
        if isinstance(obs, tuple):
            obs = obs[0]
        max_step = 100
        steps_taken = 0
        done = truncated = False
        while not done and max_step > 0:
            action, _ = model.predict(obs)
            next_obs, reward, done, truncated, info = env.step(action)
            print(action,max_step,success_count,done)
            max_step -= 1
            steps_taken += 1
            # Check if next_obs is a tuple and extract the first element if true.
            if isinstance(next_obs, tuple):
                next_obs = next_obs[0]
            obs = next_obs

            # Check if agent was successful in that episode.
            if done:
                success_count += 1
                step_array.append(steps_taken)
                env.save_animation(f"render{i}.svg", AnimationConfig(egocentric_idx=0))
                break

    success_rate = success_count / num_episodes
    return success_rate, step_array

success_rate,step_array = evaluate_success_rate(a2c_model, env)
print(f"Agent Success Rate: {success_rate * 100:.2f}%")
print(f"steps to termination : {step_array}")

---0---
2 100 0 False
1 99 0 False
2 98 0 False
1 97 0 False
4 96 0 False
3 95 0 False
2 94 0 False
3 93 0 False
3 92 0 False
3 91 0 False
3 90 0 True
---1---
1 100 1 False
2 99 1 False
1 98 1 False
1 97 1 False
1 96 1 False
2 95 1 False
1 94 1 False
3 93 1 False
1 92 1 False
2 91 1 False
2 90 1 False
4 89 1 False
2 88 1 False
1 87 1 False
1 86 1 False
1 85 1 False
3 84 1 False
4 83 1 False
3 82 1 False
1 81 1 False
1 80 1 False
2 79 1 False
4 78 1 False
1 77 1 False
3 76 1 False
3 75 1 False
1 74 1 False
2 73 1 False
2 72 1 False
2 71 1 False
1 70 1 False
1 69 1 False
1 68 1 False
2 67 1 False
2 66 1 False
2 65 1 False
2 64 1 False
1 63 1 False
1 62 1 False
2 61 1 False
1 60 1 False
2 59 1 False
3 58 1 False
2 57 1 False
3 56 1 False
4 55 1 False
1 54 1 False
1 53 1 False
1 52 1 False
3 51 1 False
1 50 1 False
2 49 1 False
2 48 1 False
3 47 1 False
1 46 1 False
1 45 1 False
2 44 1 False
1 43 1 False
2 42 1 False
1 41 1 False
2 40 1 False
2 39 1 False
2 38 1 False
4 37 1 False
0 36 1 F