In [2]:
import gymnasium as gym
from pogema import GridConfig
import pogema
from stable_baselines3 import DQN, A2C
from stable_baselines3.common.evaluation import evaluate_policy

%load_ext autoreload
%autoreload 2
%matplotlib inline


GRID_LEN = 18

def custom_step(self, action: list):
        assert len(action) == self.grid_config.num_agents
        rewards = []

        terminated = []

        self.move_agents(action)
        self.update_was_on_goal()

        for agent_idx in range(self.grid_config.num_agents):

            c_x, c_y = self.grid.positions_xy[agent_idx]
            f_x, f_y = self.grid.finishes_xy[agent_idx]

            #d = math.sqrt((c_x - f_x) ** 2 + (c_y - f_y) ** 2)    
            #reward = 1 - (d / (math.sqrt(2) * GRID_LEN))
            reward = 1 - ( (abs(c_x - f_x) + abs(c_y - f_y)) / (2 * GRID_LEN) )
            #print(f"[CURR] {c_x}, {c_y} [FINISH] {f_x}, {f_y} [DIST] {d} [REWARD] {reward}")


            on_goal = self.grid.on_goal(agent_idx)
            if on_goal and self.grid.is_active[agent_idx]:
                print("FINISH", reward)
                rewards.append(100)
            else:
                rewards.append(reward)
            terminated.append(on_goal)

        for agent_idx in range(self.grid_config.num_agents):
            if self.grid.on_goal(agent_idx):
                self.grid.hide_agent(agent_idx)
                self.grid.is_active[agent_idx] = False

        infos = self._get_infos()

        observations = self._obs()
        truncated = [False] * self.grid_config.num_agents
        return observations, rewards, terminated, truncated, infos

pogema.envs.Pogema.step = custom_step

grid_config = GridConfig(
    size=8,
    density=0.3,
    num_agents=1,
    max_episode_steps=64,
    obs_radius=5
)

env = gym.make("Pogema-v0",grid_config=grid_config)

a2c_model = A2C(
    "MlpPolicy",
    env,
    verbose=1
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


#### Optuna Integration

In [3]:
""" Optuna example that optimizes the hyperparameters of
a reinforcement learning agent using A2C implementation from Stable-Baselines3
on a Gymnasium environment.

This is a simplified version of what can be found in https://github.com/DLR-RM/rl-baselines3-zoo.

You can run this example as follows:
    $ python sb3_simple.py

"""
from typing import Any
from typing import Dict

import gymnasium
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from stable_baselines3 import A2C
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
import torch
import torch.nn as nn


N_TRIALS = 100
N_STARTUP_TRIALS = 5
N_EVALUATIONS = 2
N_TIMESTEPS = int(5.0e5)
EVAL_FREQ = int(N_TIMESTEPS / N_EVALUATIONS)
N_EVAL_EPISODES = 3


DEFAULT_HYPERPARAMS = {
    "policy": "MlpPolicy",
    "verbose": 1,
    "env": env
}

def sample_a2c_params(trial: optuna.Trial) -> Dict[str, Any]:
    """Sampler for A2C hyperparameters."""
    gamma = 1.0 - trial.suggest_float("gamma", 0.0001, 0.1, log=True)
    learning_rate = trial.suggest_float("lr", 1e-5, 1, log=True)

    trial.set_user_attr("gamma", gamma)

    return {
        "gamma": gamma,
        "learning_rate": learning_rate,
    }


class TrialEvalCallback(EvalCallback):
    """Callback used for evaluating and reporting a trial."""

    def __init__(
        self,
        eval_env: gymnasium.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        self.trial = trial
        self.eval_idx = 0
        self.is_pruned = False

    def _on_step(self) -> bool:
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            super()._on_step()
            self.eval_idx += 1
            self.trial.report(self.last_mean_reward, self.eval_idx)
            # Prune trial if need.
            if self.trial.should_prune():
                self.is_pruned = True
                return False
        return True


def objective(trial: optuna.Trial) -> float:
    kwargs = DEFAULT_HYPERPARAMS.copy()
    # Sample hyperparameters.
    kwargs.update(sample_a2c_params(trial))
    # Create the RL model.
    model = A2C(**kwargs)
    # Create env used for evaluation.
    eval_env = Monitor(env)
    # Create the callback that will periodically evaluate and report the performance.
    eval_callback = TrialEvalCallback(
        eval_env, trial, n_eval_episodes=N_EVAL_EPISODES, eval_freq=EVAL_FREQ, deterministic=True
    )

    nan_encountered = False
    try:
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN.
        print(e)
        nan_encountered = True
    finally:
        # Free memory.
        model.env.close()
        eval_env.close()

    # Tell the optimizer that the trial failed.
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward


if __name__ == "__main__":
    # Set pytorch num threads to 1 for faster training.
    torch.set_num_threads(1)

    sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
    # Do not prune before 1/3 of the max budget is used.
    pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3)

    study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")
    try:
        study.optimize(objective, n_trials=N_TRIALS, n_jobs=4, timeout=600)
    except KeyboardInterrupt:
        pass

    print("Number of finished trials: ", len(study.trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    print("  User attrs:")
    for key, value in trial.user_attrs.items():
        print("    {}: {}".format(key, value))


  from .autonotebook import tqdm as notebook_tqdm
[I 2023-11-11 08:55:39,742] A new study created in memory with name: no-name-6983f7b3-a767-40b7-ba9e-70613b0dcbeb


Using cpu deviceUsing cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISHFINISH 1.0
 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 42.5     |
|    ep_rew_mean        | 63.9     |
| time/                 |          |
|    fps                | 253      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|

[I 2023-11-11 09:27:46,304] Trial 0 finished with value: 128.56481499999998 and parameters: {'gamma': 0.002948135982833872, 'lr': 0.004790797938243059}. Best is trial 0 with value: 128.56481499999998.


FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 34.9     |
|    ep_rew_mean        | 97.1     |
| time/                 |          |
|    fps                | 259      |
|    iterations         | 99900    |
|    time_elapsed       | 1926     |
|    total_timesteps    | 499500   |
| train/                |          |
|    entropy_loss       | -0.905   |
|    explained_variance | 0.000913 |
|    learning_rate      | 4.63e-05 |
|    n_updates          | 99899    |
|    policy_loss        | -0.193   |
|    value_loss         | 0.0873   |
------------------------------------
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
--------

[I 2023-11-11 09:27:47,395] Trial 1 finished with value: 101.60185200000001 and parameters: {'gamma': 0.020989671112955608, 'lr': 4.6321848520712615e-05}. Best is trial 0 with value: 128.56481499999998.


FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 17.6     |
|    ep_rew_mean        | 97.9     |
| time/                 |          |
|    fps                | 258      |
|    iterations         | 99800    |
|    time_elapsed       | 1927     |
|    total_timesteps    | 499000   |
| train/                |          |
|    entropy_loss       | -0.00454 |
|    explained_variance | 0.0164   |
|    learning_rate      | 0.000902 |
|    n_updates          | 99799    |
|    policy_loss        | 0.00777  |
|    value_loss         | 2.97e+03 |
------------------------------------
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 45       |
|    mean_reward        | 38.5     |
| time/                 |          |
|    total_timesteps 

[I 2023-11-11 09:27:47,818] Trial 2 finished with value: 38.518518666666665 and parameters: {'gamma': 0.06462349753742755, 'lr': 0.0008691353930263321}. Best is trial 0 with value: 128.56481499999998.


-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 20.7      |
|    ep_rew_mean        | 95.8      |
| time/                 |           |
|    fps                | 259       |
|    iterations         | 99900     |
|    time_elapsed       | 1928      |
|    total_timesteps    | 499500    |
| train/                |           |
|    entropy_loss       | -0.000606 |
|    explained_variance | nan       |
|    learning_rate      | 0.000902  |
|    n_updates          | 99899     |
|    policy_loss        | 1.81e-11  |
|    value_loss         | 0         |
-------------------------------------
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 43.3      |
|    mean_reward        | 73.7      |
| time/                 |           |
|    total_timesteps    | 500000    |
| train/        

[I 2023-11-11 09:27:48,241] Trial 3 finished with value: 73.65740733333334 and parameters: {'gamma': 0.09418004597652863, 'lr': 0.0009019527602572411}. Best is trial 0 with value: 128.56481499999998.


Number of finished trials:  4
Best trial:
  Value:  128.56481499999998
  Params: 
    gamma: 0.002948135982833872
    lr: 0.004790797938243059
  User attrs:
    gamma: 0.9970518640171662


### Train agent with best hyper parameters

In [6]:
a2c_model = A2C(
    "MlpPolicy",
    env,
    verbose=1,
    gamma=1-0.002948135982833872,#0.0001747891704351033,
    learning_rate=0.004790797938243059#0.02847505337754138,
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [7]:
# Train agent and save it
a2c_model.learn(int(5.0e5))
a2c_model.save("saved/a2c_d_v2")

FINISH 1.0
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 56.1     |
|    ep_rew_mean        | 59.4     |
| time/                 |          |
|    fps                | 2193     |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.00976 |
|    explained_variance | 0        |
|    learning_rate      | 0.00479  |
|    n_updates          | 99       |
|    policy_loss        | 0.00209  |
|    value_loss         | 3.96     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 60.1     |
|    ep_rew_mean        | 59.1     |
| time/                 |          |
|    fps                | 2241     |
|    iterations         | 200      |
|    time_elapsed       | 0        |
|    total_timesteps    | 1000     |
| train/                |  

#### Testing 

In [12]:
a2c_model = A2C.load("saved/a2c_d_v2")

env.reset()

mean_reward, std_reward = evaluate_policy(a2c_model, env, deterministic=True, n_eval_episodes=20)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward:53.17 +/- 5.37


In [13]:
# RANDOM SEED
from IPython.display import SVG, display
from pogema.animation import AnimationMonitor, AnimationConfig

env = AnimationMonitor(env)

def evaluate_success_rate(model, env, num_episodes=1000):
    success_count = 0
    step_array = []
    reward_acc_arr = []
    all_acc_arr = []
    for i in range(num_episodes):
        print(f'---{i}---')
        obs = env.reset()

        # Check if observation is a tuple and extract the first element if true.
        if isinstance(obs, tuple):
            obs = obs[0]
        max_step = 64
        steps_taken = 0
        reward_acc = 0 
        done = truncated = False
        while not done and max_step > 0:
            action, _ = model.predict(obs)
            next_obs, reward, done, truncated, info = env.step(action)
            print(action,max_step,success_count,done, reward)
            reward_acc += reward
            max_step -= 1
            steps_taken += 1
            # Check if next_obs is a tuple and extract the first element if true.
            if isinstance(next_obs, tuple):
                next_obs = next_obs[0]
            obs = next_obs

            # Check if agent was successful in that episode.
            if done:
                success_count += 1
                print("acc reward", reward_acc)
                step_array.append(steps_taken)
                reward_acc_arr.append(reward_acc)
                env.save_animation(f"media_d/render{i}.svg", AnimationConfig(egocentric_idx=0))
                break
        
        all_acc_arr.append(reward_acc)

    success_rate = success_count / num_episodes
    return success_rate, step_array, reward_acc_arr, all_acc_arr

success_rate,step_array, reward_acc_arr, failures = evaluate_success_rate(a2c_model, env)
print(f"Agent Success Rate: {success_rate * 100:.2f}%")
print(f"steps to termination : {step_array}")
print(f"Accumulated reward : {reward_acc_arr}")

---0---
4 64 0 False 0.9722222222222222
4 63 0 False 0.9722222222222222
4 62 0 False 0.9722222222222222
4 61 0 False 0.9722222222222222
4 60 0 False 0.9722222222222222
4 59 0 False 0.9722222222222222
4 58 0 False 0.9722222222222222
4 57 0 False 0.9722222222222222
4 56 0 False 0.9722222222222222
4 55 0 False 0.9722222222222222
4 54 0 False 0.9722222222222222
4 53 0 False 0.9722222222222222
4 52 0 False 0.9722222222222222
4 51 0 False 0.9722222222222222
4 50 0 False 0.9722222222222222
4 49 0 False 0.9722222222222222
4 48 0 False 0.9722222222222222
4 47 0 False 0.9722222222222222
4 46 0 False 0.9722222222222222
4 45 0 False 0.9722222222222222
4 44 0 False 0.9722222222222222
4 43 0 False 0.9722222222222222
4 42 0 False 0.9722222222222222
4 41 0 False 0.9722222222222222
4 40 0 False 0.9722222222222222
4 39 0 False 0.9722222222222222
4 38 0 False 0.9722222222222222
4 37 0 False 0.9722222222222222
4 36 0 False 0.9722222222222222
4 35 0 False 0.9722222222222222
4 34 0 False 0.9722222222222222


In [14]:
step_array

[2,
 1,
 3,
 1,
 1,
 4,
 2,
 3,
 3,
 1,
 2,
 1,
 3,
 1,
 1,
 3,
 2,
 2,
 2,
 1,
 1,
 2,
 3,
 1,
 1,
 1,
 2,
 1,
 1,
 3,
 2,
 4,
 1,
 1,
 1,
 1,
 2,
 1,
 5,
 2,
 1]

In [15]:
import numpy as np
np.mean(step_array)

1.853658536585366

In [36]:
len(failures)

1000

In [38]:
failures

[48.0,
 51.55555555555562,
 53.33333333333338,
 48.27777777777778,
 49.777777777777814,
 44.472222222222165,
 51.55555555555562,
 44.61111111111105,
 48.0,
 6.416666666666667,
 58.69444444444439,
 55.111111111111185,
 55.111111111111185,
 55.111111111111185,
 53.33333333333338,
 60.44444444444437,
 44.80555555555549,
 55.138888888888964,
 58.58333333333328,
 51.972222222222285,
 1.9722222222222223,
 62.22222222222218,
 51.55555555555562,
 44.444444444444386,
 48.083333333333336,
 55.111111111111185,
 60.388888888888815,
 53.33333333333338,
 51.55555555555562,
 44.444444444444386,
 55.111111111111185,
 56.916666666666586,
 53.33333333333338,
 53.33333333333338,
 49.944444444444485,
 48.0,
 56.88888888888881,
 51.55555555555562,
 62.22222222222218,
 60.44444444444437,
 1.9722222222222223,
 58.69444444444439,
 62.22222222222218,
 56.88888888888881,
 53.33333333333338,
 51.55555555555562,
 62.1944444444444,
 62.22222222222218,
 56.88888888888881,
 62.22222222222218,
 55.138888888888964,
 5

In [37]:
len(reward_acc_arr)

41

In [39]:
reward_acc_arr

[6.416666666666667,
 1.9722222222222223,
 1.9722222222222223,
 1.0,
 1.0,
 1.0,
 1.9722222222222223,
 1.0,
 1.0,
 1.0,
 4.722222222222222,
 1.9722222222222223,
 1.0,
 4.722222222222222,
 1.0,
 1.9722222222222223,
 1.9722222222222223,
 1.9722222222222223,
 1.0,
 2.9166666666666665,
 1.0,
 1.0,
 1.0,
 1.9722222222222223,
 1.0,
 1.0,
 3.8333333333333335,
 1.0,
 1.0,
 4.722222222222222,
 1.0,
 2.9166666666666665,
 1.0,
 1.0,
 4.722222222222222,
 1.0,
 5.583333333333333,
 1.0,
 1.0,
 1.0,
 2.9166666666666665]