In [29]:
import gymnasium as gym
from pogema import GridConfig
import pogema
from stable_baselines3 import DQN, A2C
from stable_baselines3.common.evaluation import evaluate_policy

%load_ext autoreload
%autoreload 2
%matplotlib inline


GRID_LEN = 18

def custom_step(self, action: list):
        assert len(action) == self.grid_config.num_agents
        rewards = []

        terminated = []

        self.move_agents(action)
        self.update_was_on_goal()

        for agent_idx in range(self.grid_config.num_agents):

            c_x, c_y = self.grid.positions_xy[agent_idx]
            f_x, f_y = self.grid.finishes_xy[agent_idx]

            #d = math.sqrt((c_x - f_x) ** 2 + (c_y - f_y) ** 2)    
            #reward = 1 - (d / (math.sqrt(2) * GRID_LEN))
            reward = - ( (abs(c_x - f_x) + abs(c_y - f_y)) / (2 * GRID_LEN) )
            #print(f"[CURR] {c_x}, {c_y} [FINISH] {f_x}, {f_y} [DIST] {d} [REWARD] {reward}")


            on_goal = self.grid.on_goal(agent_idx)
            if on_goal and self.grid.is_active[agent_idx]:
                print("FINISH", reward)
                rewards.append(100)
            else:
                rewards.append(reward)
            terminated.append(on_goal)

        for agent_idx in range(self.grid_config.num_agents):
            if self.grid.on_goal(agent_idx):
                self.grid.hide_agent(agent_idx)
                self.grid.is_active[agent_idx] = False

        infos = self._get_infos()

        observations = self._obs()
        truncated = [False] * self.grid_config.num_agents
        return observations, rewards, terminated, truncated, infos

pogema.envs.Pogema.step = custom_step

grid_config = GridConfig(
    size=8,
    density=0.3,
    num_agents=1,
    max_episode_steps=64,
    obs_radius=5
)

env = gym.make("Pogema-v0",grid_config=grid_config)

a2c_model = A2C(
    "MlpPolicy",
    env,
    verbose=1
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


#### Optuna Integration

In [30]:
""" Optuna example that optimizes the hyperparameters of
a reinforcement learning agent using A2C implementation from Stable-Baselines3
on a Gymnasium environment.

This is a simplified version of what can be found in https://github.com/DLR-RM/rl-baselines3-zoo.

You can run this example as follows:
    $ python sb3_simple.py

"""
from typing import Any
from typing import Dict

import gymnasium
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from stable_baselines3 import A2C
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
import torch
import torch.nn as nn


N_TRIALS = 100
N_STARTUP_TRIALS = 5
N_EVALUATIONS = 2
N_TIMESTEPS = int(5.0e5)
EVAL_FREQ = int(N_TIMESTEPS / N_EVALUATIONS)
N_EVAL_EPISODES = 3


DEFAULT_HYPERPARAMS = {
    "policy": "MlpPolicy",
    "verbose": 1,
    "env": env
}

def sample_a2c_params(trial: optuna.Trial) -> Dict[str, Any]:
    """Sampler for A2C hyperparameters."""
    gamma = 1.0 - trial.suggest_float("gamma", 0.0001, 0.1, log=True)
    learning_rate = trial.suggest_float("lr", 1e-5, 1, log=True)

    trial.set_user_attr("gamma", gamma)

    return {
        "gamma": gamma,
        "learning_rate": learning_rate,
    }


class TrialEvalCallback(EvalCallback):
    """Callback used for evaluating and reporting a trial."""

    def __init__(
        self,
        eval_env: gymnasium.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        self.trial = trial
        self.eval_idx = 0
        self.is_pruned = False

    def _on_step(self) -> bool:
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            super()._on_step()
            self.eval_idx += 1
            self.trial.report(self.last_mean_reward, self.eval_idx)
            # Prune trial if need.
            if self.trial.should_prune():
                self.is_pruned = True
                return False
        return True


def objective(trial: optuna.Trial) -> float:
    kwargs = DEFAULT_HYPERPARAMS.copy()
    # Sample hyperparameters.
    kwargs.update(sample_a2c_params(trial))
    # Create the RL model.
    model = A2C(**kwargs)
    # Create env used for evaluation.
    eval_env = Monitor(env)
    # Create the callback that will periodically evaluate and report the performance.
    eval_callback = TrialEvalCallback(
        eval_env, trial, n_eval_episodes=N_EVAL_EPISODES, eval_freq=EVAL_FREQ, deterministic=True
    )

    nan_encountered = False
    try:
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN.
        print(e)
        nan_encountered = True
    finally:
        # Free memory.
        model.env.close()
        eval_env.close()

    # Tell the optimizer that the trial failed.
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward


if __name__ == "__main__":
    # Set pytorch num threads to 1 for faster training.
    torch.set_num_threads(1)

    sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
    # Do not prune before 1/3 of the max budget is used.
    pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3)

    study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")
    try:
        study.optimize(objective, n_trials=N_TRIALS, n_jobs=4, timeout=600)
    except KeyboardInterrupt:
        pass

    print("Number of finished trials: ", len(study.trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    print("  User attrs:")
    for key, value in trial.user_attrs.items():
        print("    {}: {}".format(key, value))


[I 2023-11-11 17:51:10,668] A new study created in memory with name: no-name-f2098ac5-82ed-49dd-a4dc-aebf4935860c


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISHFINISH -0.0
 -0.0
FINISH -0.0
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 45        |
|    ep_rew_mean        | 10.3      |
| time/                 |           |
|    fps                | 259       |
|    iterations         | 100       |
|    time_elapsed       | 1         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -0.0096   |
|    explained_variance | 0.00568   |
|    learning_rate      | 0.00587   

[I 2023-11-11 18:26:01,298] Trial 2 finished with value: -8.712963 and parameters: {'gamma': 0.012967937258382623, 'lr': 0.0017287529740896933}. Best is trial 2 with value: -8.712963.


FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 24.9     |
|    ep_rew_mean        | 64.8     |
| time/                 |          |
|    fps                | 238      |
|    iterations         | 99900    |
|    time_elapsed       | 2091     |
|    total_timesteps    | 499500   |
| train/                |          |
|    entropy_loss       | -0.276   |
|    explained_variance | 1.79e-07 |
|    learning_rate      | 0.000211 |
|    n_updates          | 99899    |
|    policy_loss        | 0.034    |
|    value_loss         | 0.0234   |
------------------------------------
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
-------------------------------------
| rollout/              |           |
|    ep_len_mean   

[I 2023-11-11 18:26:02,843] Trial 0 finished with value: 27.046296333333334 and parameters: {'gamma': 0.0004387302355706096, 'lr': 0.0002112857978014993}. Best is trial 0 with value: 27.046296333333334.


-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 42.6      |
|    ep_rew_mean        | 41.3      |
| time/                 |           |
|    fps                | 238       |
|    iterations         | 99900     |
|    time_elapsed       | 2092      |
|    total_timesteps    | 499500    |
| train/                |           |
|    entropy_loss       | -1.39e-05 |
|    explained_variance | 0         |
|    learning_rate      | 0.00587   |
|    n_updates          | 99899     |
|    policy_loss        | -0        |
|    value_loss         | 0.173     |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 37.2     |
|    ep_rew_mean        | 47.3     |
| time/                 |          |
|    fps                | 237      |
|    iterations         | 99400    |
|    time_elapsed       | 2092     |
|    total_timesteps    | 497000   |
| train/             

[I 2023-11-11 18:26:03,539] Trial 1 finished with value: -7.092592666666666 and parameters: {'gamma': 0.0002659752638108986, 'lr': 0.005865580404816237}. Best is trial 0 with value: 27.046296333333334.


-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 38.8      |
|    ep_rew_mean        | 47.1      |
| time/                 |           |
|    fps                | 237       |
|    iterations         | 99500     |
|    time_elapsed       | 2092      |
|    total_timesteps    | 497500    |
| train/                |           |
|    entropy_loss       | -0.875    |
|    explained_variance | -0.000136 |
|    learning_rate      | 0.0001    |
|    n_updates          | 99499     |
|    policy_loss        | 0.0947    |
|    value_loss         | 0.0291    |
-------------------------------------
FINISH -0.0
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 39.8      |
|    ep_rew_mean        | 45.1      |
| time/                 |           |
|    fps                | 237       |
|    iterations         | 99600     |
|    time_elapsed       | 2093      |
|    total_timesteps    | 498000    |


[I 2023-11-11 18:26:04,645] Trial 3 finished with value: -5.314814666666667 and parameters: {'gamma': 0.0013670245766961074, 'lr': 0.00010019916354642707}. Best is trial 0 with value: 27.046296333333334.


Number of finished trials:  4
Best trial:
  Value:  27.046296333333334
  Params: 
    gamma: 0.0004387302355706096
    lr: 0.0002112857978014993
  User attrs:
    gamma: 0.9995612697644294


### Train agent with best hyper parameters

In [31]:
a2c_model = A2C(
    "MlpPolicy",
    env,
    verbose=1,
    gamma=1-0.0004387302355706096,#0.0001747891704351033,
    learning_rate=0.0002112857978014993#0.02847505337754138,
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [32]:
# Train agent and save it
a2c_model.learn(int(5.0e5))
a2c_model.save("saved/a2c_d_v2")

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 64       |
|    ep_rew_mean        | -12.2    |
| time/                 |          |
|    fps                | 1876     |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.61    |
|    explained_variance | -3.49    |
|    learning_rate      | 0.000211 |
|    n_updates          | 99       |
|    policy_loss        | -1.43    |
|    value_loss         | 1.02     |
------------------------------------
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 54.8     |
|    ep_rew_mean        | 13.3     |
| time/                 |          |
|    fps                | 2042     |
|    iterations         | 200      |
|    time_elapsed       | 0        |
|    total_timesteps    | 1

#### Testing 

In [37]:
a2c_model = A2C.load("saved/a2c_d_v2")

env.reset()

mean_reward, std_reward = evaluate_policy(a2c_model, env, deterministic=True, n_eval_episodes=20)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
mean_reward:39.55 +/- 54.70


In [38]:
# RANDOM SEED
from IPython.display import SVG, display
from pogema.animation import AnimationMonitor, AnimationConfig

env = AnimationMonitor(env)

def evaluate_success_rate(model, env, num_episodes=1000):
    success_count = 0
    step_array = []
    rewards_arr = []
    all_rewards_arr = []
    for i in range(num_episodes):
        print(f'---{i}---')
        obs = env.reset()

        # Check if observation is a tuple and extract the first element if true.
        if isinstance(obs, tuple):
            obs = obs[0]
        max_step = 64
        steps_taken = 0
        reward_acc = 0 
        done = truncated = False
        while not done and max_step > 0:
            action, _ = model.predict(obs)
            next_obs, reward, done, truncated, info = env.step(action)
            print(action,max_step,success_count,done, reward)
            reward_acc += reward
            max_step -= 1
            steps_taken += 1
            # Check if next_obs is a tuple and extract the first element if true.
            if isinstance(next_obs, tuple):
                next_obs = next_obs[0]
            obs = next_obs

            # Check if agent was successful in that episode.
            if done:
                success_count += 1
                print("acc reward", reward_acc)
                step_array.append(steps_taken)
                rewards_arr.append(reward_acc)
                env.save_animation(f"media_d/render{i}.svg", AnimationConfig(egocentric_idx=0))
                break
        
        all_rewards_arr.append(reward_acc)

    success_rate = success_count / num_episodes
    return success_rate, step_array, rewards_arr, all_rewards_arr

success_rate,step_array, reward_arr, all_rewards_arr = evaluate_success_rate(a2c_model, env)
print(f"Agent Success Rate: {success_rate * 100:.2f}%")
print(f"steps to termination : {step_array}")
print(f"Accumulated reward : {reward_acc_arr}")

---0---
3 64 0 False -0.05555555555555555
3 63 0 False -0.05555555555555555
3 62 0 False -0.05555555555555555
3 61 0 False -0.05555555555555555
3 60 0 False -0.05555555555555555
3 59 0 False -0.05555555555555555
3 58 0 False -0.05555555555555555
3 57 0 False -0.05555555555555555
3 56 0 False -0.05555555555555555
3 55 0 False -0.05555555555555555
3 54 0 False -0.05555555555555555
3 53 0 False -0.05555555555555555
3 52 0 False -0.05555555555555555
3 51 0 False -0.05555555555555555
3 50 0 False -0.05555555555555555
3 49 0 False -0.05555555555555555
3 48 0 False -0.05555555555555555
3 47 0 False -0.05555555555555555
3 46 0 False -0.05555555555555555
3 45 0 False -0.05555555555555555
3 44 0 False -0.05555555555555555
3 43 0 False -0.05555555555555555
3 42 0 False -0.05555555555555555
3 41 0 False -0.05555555555555555
3 40 0 False -0.05555555555555555
3 39 0 False -0.05555555555555555
3 38 0 False -0.05555555555555555
3 37 0 False -0.05555555555555555
3 36 0 False -0.05555555555555555
3 35 0

In [39]:
step_array

[7,
 3,
 2,
 4,
 3,
 2,
 1,
 2,
 7,
 5,
 7,
 50,
 6,
 4,
 2,
 2,
 2,
 2,
 7,
 2,
 3,
 3,
 2,
 8,
 4,
 4,
 2,
 5,
 4,
 7,
 5,
 3,
 11,
 5,
 8,
 10,
 4,
 60,
 6,
 5,
 1,
 3,
 6,
 13,
 2,
 2,
 2,
 11,
 2,
 5,
 3,
 5,
 35,
 6,
 2,
 6,
 5,
 11,
 2,
 15,
 8,
 3,
 2,
 20,
 3,
 4,
 4,
 3,
 5,
 2,
 6,
 8,
 36,
 4,
 2,
 10,
 20,
 4,
 1,
 5,
 1,
 2,
 5,
 6,
 32,
 15,
 1,
 2,
 2,
 36,
 5,
 6,
 63,
 6,
 2,
 3,
 2,
 5,
 6,
 3,
 2,
 6,
 3,
 45,
 13,
 3,
 6,
 8,
 7,
 6,
 2,
 11,
 4,
 4,
 18,
 7,
 1,
 2,
 6,
 6,
 2,
 4,
 7,
 3,
 1,
 2,
 6,
 10,
 8,
 21,
 18,
 23,
 5,
 1,
 3,
 5,
 2,
 25,
 14,
 6,
 2,
 3,
 57,
 2,
 18,
 4,
 1,
 2,
 62,
 5,
 5,
 4,
 4,
 62,
 20,
 2,
 4,
 13,
 4,
 4,
 1,
 4,
 3,
 4,
 6,
 3,
 7,
 2,
 6,
 9,
 1,
 4,
 4,
 9,
 3,
 1,
 4,
 3,
 8,
 3,
 13,
 9,
 10,
 9,
 1,
 4,
 1,
 1,
 6,
 11,
 3,
 1,
 16,
 2,
 2,
 3,
 7,
 2,
 2,
 2,
 2,
 5,
 6,
 6,
 24,
 15,
 13,
 1,
 1,
 1,
 1,
 3,
 11,
 29,
 3,
 9,
 2,
 5,
 2,
 5,
 3,
 1,
 8,
 4,
 3,
 29,
 1,
 4,
 10,
 21,
 2,
 1,
 37,
 2,
 10,
 4,
 10,
 2,


In [40]:
import numpy as np
np.mean(step_array)

7.406203840472673

In [41]:
len(all_rewards_arr)

1000

In [42]:
all_rewards_arr

[-3.5555555555555505,
 99.41666666666667,
 99.91666666666667,
 -5.361111111111109,
 99.97222222222223,
 99.83333333333333,
 -4.472222222222213,
 99.91666666666667,
 -3.7499999999999942,
 99.97222222222223,
 -3.9722222222222157,
 100,
 99.97222222222223,
 99.47222222222223,
 99.72222222222223,
 99.41666666666667,
 90.80555555555554,
 99.58333333333333,
 -5.361111111111109,
 99.83333333333333,
 99.97222222222223,
 -3.6388888888888835,
 -4.1111111111111045,
 -5.694444444444441,
 -10.861111111111118,
 99.97222222222223,
 99.97222222222223,
 99.97222222222223,
 99.44444444444444,
 99.97222222222223,
 99.91666666666667,
 99.91666666666667,
 -10.97222222222223,
 99.97222222222223,
 99.36111111111111,
 99.83333333333333,
 99.83333333333333,
 -14.30555555555555,
 99.97222222222223,
 99.72222222222223,
 99.83333333333333,
 -12.250000000000007,
 99.5,
 99.72222222222223,
 99.91666666666667,
 99.22222222222223,
 99.72222222222223,
 99.25,
 -5.499999999999991,
 98.80555555555556,
 99.83333333333333

In [43]:
len(reward_arr)

677

In [44]:
reward_arr

[99.41666666666667,
 99.91666666666667,
 99.97222222222223,
 99.83333333333333,
 99.91666666666667,
 99.97222222222223,
 100,
 99.97222222222223,
 99.47222222222223,
 99.72222222222223,
 99.41666666666667,
 90.80555555555554,
 99.58333333333333,
 99.83333333333333,
 99.97222222222223,
 99.97222222222223,
 99.97222222222223,
 99.97222222222223,
 99.44444444444444,
 99.97222222222223,
 99.91666666666667,
 99.91666666666667,
 99.97222222222223,
 99.36111111111111,
 99.83333333333333,
 99.83333333333333,
 99.97222222222223,
 99.72222222222223,
 99.83333333333333,
 99.5,
 99.72222222222223,
 99.91666666666667,
 99.22222222222223,
 99.72222222222223,
 99.25,
 98.80555555555556,
 99.83333333333333,
 95.08333333333333,
 99.63888888888889,
 99.72222222222223,
 100,
 99.91666666666667,
 99.75,
 98.52777777777777,
 99.97222222222223,
 99.97222222222223,
 99.97222222222223,
 98.69444444444444,
 99.97222222222223,
 99.72222222222223,
 99.91666666666667,
 99.72222222222223,
 96.77777777777777,
 99.6

In [45]:
from IPython.display import SVG, display
from utils.util import *

MODEL_NAME = f"saved/a2c_d_v2"
MAX_EPISODE_STEPS = 64
MAX_TRIALS = 1000
SAVE_METRICS_PATH = 'saved/evaluation_metrics.yml'


metrics = evaluate_metrics(a2c_model, env, MODEL_NAME, num_episodes=MAX_EPISODE_STEPS, num_trials=MAX_TRIALS, verbose=False, save_animation=False)
save_metrics(metrics, MODEL_NAME, SAVE_METRICS_PATH)
print(f"Agent Success Rate: {metrics['success_rate'] * 100:.2f}%")
print(f"Steps to termination : {metrics['step_array']}")
print(f"Average steps to termination : {metrics['ave_steps']}")

FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINISH -0.0
FINI