In [24]:
import gymnasium as gym
from pogema import GridConfig
from stable_baselines3 import DQN, A2C
from stable_baselines3.common.evaluation import evaluate_policy

%load_ext autoreload
%autoreload 2
%matplotlib inline

grid_config = GridConfig(
    size=8,
    density=0.3,
    num_agents=1,
    max_episode_steps=30
)

env = gym.make("Pogema-v0",grid_config=grid_config)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### A2C

In [25]:
a2c_model = A2C(
    "MlpPolicy",
    env,
    verbose=1,
    gamma=0.99,
    learning_rate=0.0007,
    seed=42
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [26]:
# Evaluate untrained agent using separate env

val_env = gym.make("Pogema-v0",grid_config=grid_config)

val_a2c_model = A2C(
    "MlpPolicy",
    val_env,
    verbose=1,
    gamma=0.99,
    learning_rate=0.0007,
    seed=42
)

mean_reward, std_reward = evaluate_policy(
    val_a2c_model,
    val_a2c_model.get_env(),
    deterministic=True,
    n_eval_episodes=20,
)

print(f"mean_reward: {mean_reward:.2f} +/- {std_reward:.2f}")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
mean_reward: 0.00 +/- 0.00


In [27]:
# Train agent and save it
a2c_model.learn(total_timesteps=int(1.2e5))
a2c_model.save("saved/a2c_baseline")

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 27.7     |
|    ep_rew_mean        | 0.118    |
| time/                 |          |
|    fps                | 1820     |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.6     |
|    explained_variance | -16.6    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.021    |
|    value_loss         | 0.00201  |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 26.3      |
|    ep_rew_mean        | 0.158     |
| time/                 |           |
|    fps                | 1837      |
|    iterations         | 200       |
|    time_elapsed       | 0         |
|    total_timesteps    | 1000      |
| train/                |    

### Load trained agent and evaluate it

In [28]:
a2c_model = A2C.load("saved/a2c_baseline")

env.reset()

mean_reward, std_reward = evaluate_policy(a2c_model, env, deterministic=True, n_eval_episodes=20)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward:0.60 +/- 0.49


### Video Recording

In [32]:
# RANDOM SEED
from IPython.display import SVG, display
from pogema.animation import AnimationMonitor, AnimationConfig

env = AnimationMonitor(env)

def evaluate_success_rate(model, env, num_episodes=100):
    success_count = 0
    step_array = []
    for i in range(num_episodes):
        obs = env.reset()

        # Check if observation is a tuple and extract the first element if true.
        if isinstance(obs, tuple):
            obs = obs[0]
        max_step = 100
        steps_taken = 0
        done = truncated = False
        while not done and max_step > 0:
            action, _ = model.predict(obs)
            next_obs, reward, done, truncated, info = env.step(action)
            print(action,max_step,success_count,done)
            max_step -= 1
            steps_taken += 1
            # Check if next_obs is a tuple and extract the first element if true.
            if isinstance(next_obs, tuple):
                next_obs = next_obs[0]
            obs = next_obs

            # Check if agent was successful in that episode.
            if done:
                success_count += 1
                step_array.append(steps_taken)
                env.save_animation(f"render{i}.svg", AnimationConfig(egocentric_idx=0))
                break

    success_rate = success_count / num_episodes
    return success_rate, step_array

success_rate,step_array = evaluate_success_rate(a2c_model, env)
print(f"Agent Success Rate: {success_rate * 100:.2f}%")
print(f"steps to termination : {step_array}")

1 100 0 False
2 99 0 False
1 98 0 False
1 97 0 False
1 96 0 False
4 95 0 False
4 94 0 False
1 93 0 False
1 92 0 False
4 91 0 True
1 100 1 False
1 99 1 False
1 98 1 False
1 97 1 False
3 96 1 False
3 95 1 False
3 94 1 False
3 93 1 True
1 100 2 False
1 99 2 False
4 98 2 False
4 97 2 False
2 96 2 False
4 95 2 False
4 94 2 False
3 93 2 False
4 92 2 False
3 91 2 False
4 90 2 False
3 89 2 False
4 88 2 False
3 87 2 False
4 86 2 False
3 85 2 False
4 84 2 False
3 83 2 False
4 82 2 False
3 81 2 False
4 80 2 False
1 79 2 False
3 78 2 False
4 77 2 False
1 76 2 False
3 75 2 False
4 74 2 False
3 73 2 False
4 72 2 False
3 71 2 False
4 70 2 False
3 69 2 False
4 68 2 False
3 67 2 False
4 66 2 False
3 65 2 False
4 64 2 False
1 63 2 False
3 62 2 False
4 61 2 False
2 60 2 False
4 59 2 False
4 58 2 False
1 57 2 False
1 56 2 False
3 55 2 False
1 54 2 False
3 53 2 True
4 100 3 False
4 99 3 False
4 98 3 True
1 100 4 False
1 99 4 False
1 98 4 False
4 97 4 False
4 96 4 False
4 95 4 False
4 94 4 False
4 93 4 True