# Pogema DQN Example

In [None]:
!pip install -q pogema==1.2.2
!pip install -q stable_baselines3==2.1.0

### DQN

Sources:

* https://github.com/Stable-Baselines-Team/rl-colab-notebooks/blob/sb3/dqn_sb3.ipynb
* https://github.com/araffin/rl-tutorial-jnrr19/blob/sb3/1_getting_started.ipynb

In [15]:
import gymnasium as gym
from pogema import GridConfig
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy

%load_ext autoreload
%autoreload 2
%matplotlib inline

grid_config = GridConfig(
    size=8,
    density=0.3,
    num_agents=1,
    max_episode_steps=30
)

env = gym.make("Pogema-v0",grid_config=grid_config)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
dqn_model = DQN(
    "MlpPolicy",
    env,
    verbose=1,
    train_freq=16,
    gradient_steps=8,
    gamma=0.99,
    exploration_fraction=0.2,
    exploration_final_eps=0.07,
    target_update_interval=600,
    learning_starts=1000,
    buffer_size=10000,
    batch_size=128,
    learning_rate=4e-3,
    policy_kwargs=dict(net_arch=[256, 256]),
    seed=42,
    tensorboard_log="./tensorboard"
)

In [None]:
# Evaluate untrained agent using separate env

val_env = gym.make("Pogema-v0",grid_config=grid_config)

val_dqn_model = DQN(
    "MlpPolicy",
    val_env,
    verbose=1,
    train_freq=16,
    gradient_steps=8,
    gamma=0.99,
    exploration_fraction=0.2,
    exploration_final_eps=0.07,
    target_update_interval=600,
    learning_starts=1000,
    buffer_size=10000,
    batch_size=128,
    learning_rate=4e-3,
    policy_kwargs=dict(net_arch=[256, 256]),
    seed=42,
)

mean_reward, std_reward = evaluate_policy(
    val_dqn_model,
    val_dqn_model.get_env(),
    deterministic=True,
    n_eval_episodes=20,
)

print(f"mean_reward: {mean_reward:.2f} +/- {std_reward:.2f}")

In [None]:
# Train agent and save it
dqn_model.learn(total_timesteps=int(1.2e5))
dqn_model.save("saved/dqn_baseline")

### Load trained agent and evaluate it

In [None]:
dqn_model = DQN.load("saved/dqn_baseline")

env.reset()

mean_reward, std_reward = evaluate_policy(dqn_model, env, deterministic=True, n_eval_episodes=20)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

### Video Recording

In [None]:
from IPython.display import SVG, display
from pogema.animation import AnimationMonitor, AnimationConfig

env = AnimationMonitor(env)

obs, info = env.reset()

# terminated = truncated = [False, ...]

# while not all(terminated) and not all(truncated):
#     # Use random policy to make actions
#     obs, reward, terminated, truncated, info = env.step([env.action_space.sample() for _ in range(grid_config.num_agents)])

#terminated = truncated = [False, ...]
terminated = truncated = False

while not terminated or not truncated:
    # Use random policy to make actions
    print(env.action_space.sample())
    obs, reward, terminated, truncated, info = env.step(env.action_space.sample())

In [None]:
env.save_animation("render.svg", AnimationConfig(egocentric_idx=0))
display(SVG('render.svg'))

#### Evaluation Metric
-  Agent success rate (How often agent reaches the goal state )
-  Steps to Termination (Avg steps to termination)

In [None]:
def evaluate_success_rate(model, env, num_episodes=10):
    success_count = 0
    step_array = []
    for _ in range(num_episodes):
        obs = env.reset()

        # Check if observation is a tuple and extract the first element if true.
        if isinstance(obs, tuple):
            obs = obs[0]
        max_step = 100
        steps_taken = 0
        done = truncated = False
        while not done and max_step > 0:
            action, _ = model.predict(obs)
            next_obs, reward, done, truncated, info = env.step(action)
            print(action,max_step,success_count,done)
            max_step -= 1
            steps_taken += 1
            # Check if next_obs is a tuple and extract the first element if true.
            if isinstance(next_obs, tuple):
                next_obs = next_obs[0]
            obs = next_obs

            # Check if agent was successful in that episode.
            if done:
                success_count += 1
                step_array.append(steps_taken)
                break

    success_rate = success_count / num_episodes
    return success_rate, step_array

success_rate,step_array = evaluate_success_rate(dqn_model, env)
print(f"Agent Success Rate: {success_rate * 100:.2f}%")
print(f"steps to termination : {step_array}")

### PPO

In [16]:
import gymnasium as gym
from pogema import GridConfig
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

%load_ext autoreload
%autoreload 2
%matplotlib inline

grid_config = GridConfig(
    size=8,
    density=0.3,
    num_agents=1,
    max_episode_steps=30
)

env = gym.make("Pogema-v0",grid_config=grid_config)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


  logger.warn(
  logger.warn(


In [17]:
ppo_model = PPO(
    "MlpPolicy",
    val_env,
    verbose=1,
    gamma=0.99,
    batch_size=128,
    learning_rate=4e-3,
    policy_kwargs=dict(net_arch=[256, 256]),
    seed=42,
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [18]:
# Evaluate untrained agent using separate env

val_env = gym.make("Pogema-v0",grid_config=grid_config)

val_ppo_model = PPO(
    "MlpPolicy",
    val_env,
    verbose=1,
    # train_freq=16,
    # gradient_steps=8,
    gamma=0.99,
    # exploration_fraction=0.2,
    # exploration_final_eps=0.07,
    # target_update_interval=600,
    # learning_starts=1000,
    # buffer_size=10000,
    batch_size=128,
    learning_rate=4e-3,
    policy_kwargs=dict(net_arch=[256, 256]),
    seed=42,
)

mean_reward, std_reward = evaluate_policy(
    val_ppo_model,
    val_ppo_model.get_env(),
    deterministic=True,
    n_eval_episodes=20,
)

print(f"mean_reward: {mean_reward:.2f} +/- {std_reward:.2f}")

  logger.warn(
  logger.warn(


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
mean_reward: 0.00 +/- 0.00


In [19]:
# Train agent and save it
ppo_model.learn(total_timesteps=int(1.2e5))
ppo_model.save("saved/ppo_baseline")

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 25       |
|    ep_rew_mean     | 0.244    |
| time/              |          |
|    fps             | 1529     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 25.1        |
|    ep_rew_mean          | 0.23        |
| time/                   |             |
|    fps                  | 965         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.095853485 |
|    clip_fraction        | 0.546       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.55       |
|    explained_variance   | -0.355      |
|    learning_rate        | 0.

#### Load trained agent and evaluate it

In [21]:
ppo_model = PPO.load("saved/ppo_baseline")

env.reset()

mean_reward, std_reward = evaluate_policy(ppo_model, env, deterministic=True, n_eval_episodes=20)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")



mean_reward:0.60 +/- 0.49


#### Evaluation Metric
-  Agent success rate (How often agent reaches the goal state )
-  Steps to Termination (Avg steps to termination)

In [22]:
def evaluate_success_rate(model, env, num_episodes=10):
    success_count = 0
    step_array = []
    for _ in range(num_episodes):
        obs = env.reset()

        # Check if observation is a tuple and extract the first element if true.
        if isinstance(obs, tuple):
            obs = obs[0]
        max_step = 100
        steps_taken = 0
        done = truncated = False
        while not done and max_step > 0:
            action, _ = model.predict(obs)
            next_obs, reward, done, truncated, info = env.step(action)
            print(action,max_step,success_count,done)
            max_step -= 1
            steps_taken += 1
            # Check if next_obs is a tuple and extract the first element if true.
            if isinstance(next_obs, tuple):
                next_obs = next_obs[0]
            obs = next_obs

            # Check if agent was successful in that episode.
            if done:
                success_count += 1
                step_array.append(steps_taken)
                break

    success_rate = success_count / num_episodes
    return success_rate, step_array

success_rate,step_array = evaluate_success_rate(ppo_model, env)
print(f"Agent Success Rate: {success_rate * 100:.2f}%")
print(f"steps to termination : {step_array}")

2 100 0 False
4 99 0 False
3 98 0 False
0 97 0 False
4 96 0 False
3 95 0 False
3 94 0 False
4 93 0 False
3 92 0 False
4 91 0 False
3 90 0 False
2 89 0 False
4 88 0 False
3 87 0 False
4 86 0 False
3 85 0 False
4 84 0 False
3 83 0 False
0 82 0 False
4 81 0 False
3 80 0 False
0 79 0 False
4 78 0 False
3 77 0 False
4 76 0 False
3 75 0 False
3 74 0 False
4 73 0 False
3 72 0 False
4 71 0 False
3 70 0 False
2 69 0 False
4 68 0 False
3 67 0 False
4 66 0 False
3 65 0 False
4 64 0 False
3 63 0 False
4 62 0 False
3 61 0 False
4 60 0 False
3 59 0 False
4 58 0 False
3 57 0 False
0 56 0 False
0 55 0 False
4 54 0 False
3 53 0 False
4 52 0 False
3 51 0 False
0 50 0 False
0 49 0 False
0 48 0 False
4 47 0 False
3 46 0 False
0 45 0 False
4 44 0 False
3 43 0 False
4 42 0 False
3 41 0 False
0 40 0 False
0 39 0 False
4 38 0 False
3 37 0 False
2 36 0 False
0 35 0 False
0 34 0 False
4 33 0 False
3 32 0 False
4 31 0 False
3 30 0 False
4 29 0 False
3 28 0 False
4 27 0 False
3 26 0 False
4 25 0 False
3 24 0 Fals