In [13]:
from utils.util import *
import utils
import gymnasium as gym
import pogema
from pogema import GridConfig
from stable_baselines3 import DQN, A2C
from stable_baselines3.common.evaluation import evaluate_policy

import math

%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
GRID_LEN = 18

In [16]:
pogema.envs.Pogema.step = custom_step

In [16]:
grid_config = GridConfig(
    size=8,
    density=0.3,
    num_agents=1,
    max_episode_steps=30
)

env = gym.make("Pogema-v0",grid_config=grid_config)

### A2C

In [17]:
a2c_model = A2C(
    "MlpPolicy",
    env,
    verbose=1,
    gamma=0.99,
    #learning_rate=0.0007,
    seed=42
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [18]:
# Evaluate untrained agent using separate env

val_env = gym.make("Pogema-v0",grid_config=grid_config)

val_a2c_model = A2C(
    "MlpPolicy",
    val_env,
    verbose=1,
    gamma=0.99,
    #learning_rate=0.0007,
    seed=42
)

mean_reward, std_reward = evaluate_policy(
    val_a2c_model,
    val_a2c_model.get_env(),
    deterministic=True,
    n_eval_episodes=20,
)

print(f"mean_reward: {mean_reward:.2f} +/- {std_reward:.2f}")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
mean_reward: 25.49 +/- 2.50


In [19]:
# Train agent and save it
a2c_model.learn(total_timesteps=int(1.2e5))
a2c_model.save("saved/a2c_baseline")

FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 26.2     |
|    ep_rew_mean        | 22.6     |
| time/                 |          |
|    fps                | 2019     |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.57    |
|    explained_variance | -0.00237 |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -7.32    |
|    value_loss         | 72.7     |
------------------------------------
FINISH 1.0
FINISH 1.0
FINISH 1.0
FINISH 1.0
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 25.7     |
|    ep_rew_mean        | 22       |
| time/                 |          |
|    fps                | 2108     |
|    iterations         | 200      |
| 

KeyboardInterrupt: 

### Load trained agent and evaluate it

In [10]:
a2c_model = A2C.load("saved/a2c_baseline")

env.reset()

mean_reward, std_reward = evaluate_policy(a2c_model, env, deterministic=True, n_eval_episodes=20)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward:26.04 +/- 1.94




### Video Recording

In [11]:
# RANDOM SEED
from IPython.display import SVG, display
from pogema.animation import AnimationMonitor, AnimationConfig

env = AnimationMonitor(env)

def evaluate_success_rate(model, env, num_episodes=100):
    success_count = 0
    step_array = []
    for i in range(num_episodes):
        obs = env.reset()

        # Check if observation is a tuple and extract the first element if true.
        if isinstance(obs, tuple):
            obs = obs[0]
        max_step = 100
        steps_taken = 0
        done = truncated = False
        while not done and max_step > 0:
            action, _ = model.predict(obs)
            next_obs, reward, done, truncated, info = env.step(action)
            print(action,max_step,success_count,done)
            max_step -= 1
            steps_taken += 1
            # Check if next_obs is a tuple and extract the first element if true.
            if isinstance(next_obs, tuple):
                next_obs = next_obs[0]
            obs = next_obs

            # Check if agent was successful in that episode.
            if done:
                success_count += 1
                step_array.append(steps_taken)
                env.save_animation(f"render{i}.svg", AnimationConfig(egocentric_idx=0))
                break

    success_rate = success_count / num_episodes
    return success_rate, step_array

success_rate,step_array = evaluate_success_rate(a2c_model, env)
print(f"Agent Success Rate: {success_rate * 100:.2f}%")
print(f"steps to termination : {step_array}")

1 100 0 False
2 99 0 False
1 98 0 False
2 97 0 False
1 96 0 False
2 95 0 False
1 94 0 False
2 93 0 False
1 92 0 False
2 91 0 False
1 90 0 False
2 89 0 False
1 88 0 False
2 87 0 False
1 86 0 False
2 85 0 False
1 84 0 False
2 83 0 False
1 82 0 False
2 81 0 False
1 80 0 False
2 79 0 False
1 78 0 False
2 77 0 False
1 76 0 False
2 75 0 False
1 74 0 False
2 73 0 False
1 72 0 False
2 71 0 False
1 70 0 False
2 69 0 False
1 68 0 False
2 67 0 False
1 66 0 False
2 65 0 False
1 64 0 False
2 63 0 False
1 62 0 False
2 61 0 False
1 60 0 False
2 59 0 False
1 58 0 False
2 57 0 False
1 56 0 False
2 55 0 False
1 54 0 False
2 53 0 False
1 52 0 False
2 51 0 False
1 50 0 False
2 49 0 False
1 48 0 False
2 47 0 False
1 46 0 False
2 45 0 False
1 44 0 False
2 43 0 False
1 42 0 False
2 41 0 False
1 40 0 False
2 39 0 False
1 38 0 False
2 37 0 False
1 36 0 False
2 35 0 False
1 34 0 False
2 33 0 False
1 32 0 False
2 31 0 False
1 30 0 False
2 29 0 False
1 28 0 False
2 27 0 False
1 26 0 False
2 25 0 False
1 24 0 Fals