In [9]:
import gymnasium as gym
from pogema import GridConfig
from stable_baselines3 import DQN, A2C
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.a2c.policies import MlpPolicy

%load_ext autoreload
%autoreload 2
%matplotlib inline

grid_config = GridConfig(
    size=8,
    density=0.3,
    num_agents=1,
    max_episode_steps=30
)

env = gym.make("Pogema-v0",grid_config=grid_config)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


  logger.warn(
  logger.warn(


### A2C

In [16]:
a2c_model = A2C(
    "MlpPolicy",
    env,
    verbose=1,
    gamma=0.0005406382028272944,
    max_grad_norm=4.652733372309419,
    gae_lambda=0.02971819120861959,
    #exponent_n_steps=8,
    learning_rate=2.5286198276591433e-05,
    ent_coef=7.492474092174222e-07,
    #activation_fn="tanh", (belongs to MlpPolicy)
    #ortho_init=True,
    #activation_fn="tanh",
    #gamma_=0.9994593617971727,
    #gae_lambda_=0.9702818087913804,
    n_steps=256,
    seed=42
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [17]:
# Train agent and save it
a2c_model.learn(int(2e4))
a2c_model.save("saved/a2c_baseline")

### Load trained agent and evaluate it

In [18]:
a2c_model = A2C.load("saved/a2c_baseline")

env.reset()

mean_reward, std_reward = evaluate_policy(a2c_model, env, deterministic=True, n_eval_episodes=20)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward:0.00 +/- 0.00




### Video Recording

In [19]:
# RANDOM SEED
from IPython.display import SVG, display
from pogema.animation import AnimationMonitor, AnimationConfig

env = AnimationMonitor(env)

def evaluate_success_rate(model, env, num_episodes=100):
    success_count = 0
    step_array = []
    for i in range(num_episodes):
        obs = env.reset()

        # Check if observation is a tuple and extract the first element if true.
        if isinstance(obs, tuple):
            obs = obs[0]
        max_step = 100
        steps_taken = 0
        done = truncated = False
        while not done and max_step > 0:
            action, _ = model.predict(obs)
            next_obs, reward, done, truncated, info = env.step(action)
            print(action,max_step,success_count,done)
            max_step -= 1
            steps_taken += 1
            # Check if next_obs is a tuple and extract the first element if true.
            if isinstance(next_obs, tuple):
                next_obs = next_obs[0]
            obs = next_obs

            # Check if agent was successful in that episode.
            if done:
                success_count += 1
                step_array.append(steps_taken)
                env.save_animation(f"render{i}.svg", AnimationConfig(egocentric_idx=0))
                break

    success_rate = success_count / num_episodes
    return success_rate, step_array

success_rate,step_array = evaluate_success_rate(a2c_model, env)
print(f"Agent Success Rate: {success_rate * 100:.2f}%")
print(f"steps to termination : {step_array}")

2 100 0 False
0 99 0 False
4 98 0 False
0 97 0 False
4 96 0 False
0 95 0 False
2 94 0 False
0 93 0 False
3 92 0 False
1 91 0 False
3 90 0 False
0 89 0 False
2 88 0 False
4 87 0 False
4 86 0 False
4 85 0 False
1 84 0 False
3 83 0 False
4 82 0 False
4 81 0 False
4 80 0 False
1 79 0 False
4 78 0 False
1 77 0 False
3 76 0 False
0 75 0 False
1 74 0 False
1 73 0 False
0 72 0 False
1 71 0 False
2 70 0 False
4 69 0 False
1 68 0 False
2 67 0 False
0 66 0 False
2 65 0 False
3 64 0 False
1 63 0 False
3 62 0 False
0 61 0 False
2 60 0 False
2 59 0 False
1 58 0 False
0 57 0 False
4 56 0 False
2 55 0 False
0 54 0 False
0 53 0 False
0 52 0 False
0 51 0 False
4 50 0 False
1 49 0 False
0 48 0 False
3 47 0 False
1 46 0 False
0 45 0 False
0 44 0 False
4 43 0 False
1 42 0 False
4 41 0 False
0 40 0 False
0 39 0 False
1 38 0 False
4 37 0 False
4 36 0 False
2 35 0 False
4 34 0 False
1 33 0 False
3 32 0 False
3 31 0 False
1 30 0 False
1 29 0 False
0 28 0 False
4 27 0 False
0 26 0 False
4 25 0 False
2 24 0 Fals