# Multi-Agent Eval Walkthrough

In [1]:
import os

import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecEnv, VecMonitor, is_vecenv_wrapped
from sumo_rl import parallel_env
import supersuit as ss

In [2]:
from pettingzoo.test import parallel_api_test
from reward_functions import diff_wait_time

# Both scenario files live in the same RESCO grid4x4 directory.
scenario_dir = os.path.join("nets", "RESCO", "grid4x4")

env_params = dict(
    net_file=os.path.join(scenario_dir, "grid4x4.net.xml"),
    route_file=os.path.join(scenario_dir, "grid4x4_1.rou.xml"),
    num_seconds=3600,
    reward_fn=diff_wait_time,
    sumo_seed=42,
)
env = parallel_env(**env_params)

# Run the PettingZoo parallel-API test on the raw environment before wrapping it.
parallel_api_test(env, num_cycles=10)

# Maybe add frame-stacking here
# Wrapper chain, innermost first: PettingZoo env -> vector env -> 2 concatenated
# copies exposed with the Stable-Baselines3 base class -> VecMonitor.
env = VecMonitor(
    ss.concat_vec_envs_v1(
        ss.pettingzoo_env_to_vec_env_v1(env),
        2,
        num_cpus=1,
        base_class="stable_baselines3",
    )
)

In [3]:
from helper_functions import linear_schedule

# Using hyperparams for Atari (except for n_steps) from
# https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/hyperparams/ppo.yml
# NOTE(review): linear_schedule is imported above but never used in this cell;
# learning_rate and clip_range are passed as constants — confirm that is intended.
ppo_hyperparams = {
    "learning_rate": 2.5e-4,
    "n_steps": 1024,
    "batch_size": 256,
    "n_epochs": 4,
    "clip_range": 0.1,
    "ent_coef": 1e-3,
}

# The model is only constructed here; no training call appears in this cell.
model = PPO("MlpPolicy", env, verbose=1, **ppo_hyperparams)

Using cuda device


In [4]:
from stable_baselines3.common.monitor import Monitor

# Make sure we are working with a vectorised environment; a plain env is
# wrapped in a single-element DummyVecEnv.
if not isinstance(env, VecEnv):
    env = DummyVecEnv([lambda: env])  # type: ignore[list-item, return-value]

# The env counts as monitored if the vec-env stack contains a VecMonitor, or
# failing that, if the first sub-environment is wrapped in a Monitor.
has_vec_monitor = is_vecenv_wrapped(env, VecMonitor)
is_monitor_wrapped = has_vec_monitor or env.env_is_wrapped(Monitor)[0]

is_monitor_wrapped

True

In [5]:
# Total number of evaluation episodes to collect across the vectorised env.
n_eval_episodes = 1

# Finished-episode results are accumulated in these lists.
episode_rewards, episode_lengths = [], []

# Number of sub-environments exposed by the vector env (32 in the recorded run).
n_envs = env.num_envs
n_envs

32

In [6]:
# Episodes completed so far by each sub-environment.
episode_counts = np.zeros(n_envs, dtype="int")
# Divides episodes among different sub environments in the vector as evenly as possible
env_indices = np.arange(n_envs, dtype="int")
episode_count_targets = (n_eval_episodes + env_indices) // n_envs

print(episode_counts, episode_count_targets, sep="\n")

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]


In [7]:
# Running totals for the episode currently in progress in each sub-environment.
current_rewards = np.zeros(n_envs)
current_lengths = np.zeros(n_envs, dtype="int")

# No policy state yet, and every sub-environment is at the start of an episode.
states = None
episode_starts = np.full((env.num_envs,), True, dtype=bool)

# Begin new episodes and keep the initial observations.
observations = env.reset()

print(observations.shape, episode_starts, sep="\n")

(32, 33)
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True]


In [8]:
# Ask the policy for one deterministic action per sub-environment.
predict_kwargs = dict(state=states, episode_start=episode_starts, deterministic=True)
actions, states = model.predict(observations, **predict_kwargs)  # type: ignore[arg-type]

print(actions, len(actions), states, sep="\n")

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
32
None


In [9]:
# Advance every sub-environment by one step using the predicted actions.
step_outputs = env.step(actions)
new_observations, rewards, dones, infos = step_outputs

print(rewards, dones, sep="\n")

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [11]:
# Reach through the wrappers to the list of vector envs that were concatenated.
markov_vector_envs = env.unwrapped.vec_envs

# For each of them, collect the traffic_signals mapping of the underlying
# environment (later cells iterate over its .values()).
agents_by_env = []
for mve in markov_vector_envs:
    agents_by_env.append(mve.par_env.unwrapped.env.traffic_signals)

In [13]:
from collections import defaultdict

import traci
from helper_functions import get_total_waiting_time, get_tyre_pm

# Metrics for the first environment copy, summed over its traffic signals.
stats = defaultdict(float)

# getArrivedNumber() is a simulation-level query that does not depend on the
# traffic signal, so it is added exactly once. Adding it inside the loop
# multiplied the count by the number of signals.
# NOTE(review): traci.simulation goes through the module-level TraCI connection
# and is not tied to agents_by_env[0] — confirm it addresses the intended SUMO
# instance.
stats["arrived"] += traci.simulation.getArrivedNumber()

# Per-signal metrics are accumulated across all traffic signals of this copy.
for ts in agents_by_env[0].values():
    stats["avg_speed"] += ts.get_average_speed()
    stats["pressure"] += ts.get_pressure()
    stats["queued"] += ts.get_total_queued()
    stats["tyre_pm"] += get_tyre_pm(ts)
    stats["wait_time"] += get_total_waiting_time(ts)

stats

defaultdict(float,
            {'arrived': 0.0,
             'avg_speed': 15.513779840108208,
             'pressure': -1.0,
             'queued': 0.0,
             'tyre_pm': 11.223127148696221,
             'wait_time': 0.0})

In [14]:
# Metrics for the second environment copy, summed over its traffic signals.
stats = defaultdict(float)

# getArrivedNumber() is a simulation-level query that does not depend on the
# traffic signal, so it is added exactly once. Adding it inside the loop
# multiplied the count by the number of signals.
# NOTE(review): traci.simulation goes through the module-level TraCI connection
# and is not tied to agents_by_env[1] — confirm it addresses the intended SUMO
# instance.
stats["arrived"] += traci.simulation.getArrivedNumber()

# Per-signal metrics are accumulated across all traffic signals of this copy.
for ts in agents_by_env[1].values():
    stats["avg_speed"] += ts.get_average_speed()
    stats["pressure"] += ts.get_pressure()
    stats["queued"] += ts.get_total_queued()
    stats["tyre_pm"] += get_tyre_pm(ts)
    stats["wait_time"] += get_total_waiting_time(ts)

stats

defaultdict(float,
            {'arrived': 0.0,
             'avg_speed': 15.513779840108208,
             'pressure': -1.0,
             'queued': 0.0,
             'tyre_pm': 11.223127148696221,
             'wait_time': 0.0})

In [12]:
# Book-keeping for the single env.step() taken earlier. The step's rewards are
# accumulated together with the step count, as the later stepping cells do
# after every step; otherwise the first step's rewards would be missing from
# the episode totals.
current_rewards += rewards
current_lengths += 1
current_lengths

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [13]:
for i in range(n_envs):
    # Sub-environments that already reached their episode target are left alone.
    if episode_counts[i] >= episode_count_targets[i]:
        continue
    # unpack values so that the callback can access the local variables
    reward, done, info = rewards[i], dones[i], infos[i]
    episode_starts[i] = done

print(dones, episode_starts, sep="\n")

# Roll the observations forward for the next prediction.
observations = new_observations

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True False]


In [14]:
import traci

# Simulated seconds per env.step(); the recorded run advances 5 s per step.
seconds_per_step = 5
elapsed_seconds = traci.simulation.getTime()

# Steps still needed to stop one step short of num_seconds.
num_steps = (env_params["num_seconds"] - seconds_per_step - elapsed_seconds) / seconds_per_step
int(num_steps)

718

In [15]:
# Repeat predict -> step -> book-keeping for the remaining steps.
remaining_steps = int(num_steps)
predict_options = {"deterministic": True}

for _ in range(remaining_steps):
    actions, states = model.predict(
        observations,  # type: ignore[arg-type]
        state=states,
        episode_start=episode_starts,
        **predict_options,
    )
    new_observations, rewards, dones, infos = env.step(actions)

    # Accumulate per-environment step counts and rewards.
    current_lengths += 1
    current_rewards += rewards

    for i in range(n_envs):
        # Sub-environments that already reached their episode target are left alone.
        if episode_counts[i] >= episode_count_targets[i]:
            continue
        # unpack values so that the callback can access the local variables
        reward, done, info = rewards[i], dones[i], infos[i]
        episode_starts[i] = done

    observations = new_observations

print(dones, current_rewards, current_lengths, sep="\n")

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[ -41508.  -62784.  -76102. -156450.  -94771.   -2288.    -995. -144120.
   -9576.   -5076.  -49432. -131762.  -11418.  -75823.  -94679. -147589.
  -41508.  -62784.  -76102. -156450.  -94771.   -2288.    -995. -144120.
   -9576.   -5076.  -49432. -131762.  -11418.  -75823.  -94679. -147589.]
[719 719 719 719 719 719 719 719 719 719 719 719 719 719 719 719 719 719
 719 719 719 719 719 719 719 719 719 719 719 719 719 719]


In [16]:
# Simulation clock as reported through the module-level TraCI connection.
sim_time = traci.simulation.getTime()
sim_time

3595.0

In [128]:
# Underlying environment of the first entry in the concatenated vector env.
first_sumo_env = env.unwrapped.vec_envs[0].par_env.unwrapped.env
first_sumo_env.sim_max_time

3600

In [105]:
# Underlying environment of the first entry in the concatenated vector env.
first_sumo_env = env.unwrapped.vec_envs[0].par_env.unwrapped.env
# NOTE(review): _compute_dones is a private method of that environment; it is
# called here only to inspect the per-agent done flags.
first_sumo_env._compute_dones()

{'A0': False,
 'A1': False,
 'A2': False,
 'A3': False,
 'B0': False,
 'B1': False,
 'B2': False,
 'B3': False,
 'C0': False,
 'C1': False,
 'C2': False,
 'C3': False,
 'D0': False,
 'D1': False,
 'D2': False,
 'D3': False,
 '__all__': False}

In [17]:
# One more predict -> step -> book-keeping round.
predict_options = {"deterministic": True}
actions, states = model.predict(
    observations,  # type: ignore[arg-type]
    state=states,
    episode_start=episode_starts,
    **predict_options,
)
new_observations, rewards, dones, infos = env.step(actions)

# Accumulate per-environment step counts and rewards.
current_lengths += 1
current_rewards += rewards

for i in range(n_envs):
    # Sub-environments that already reached their episode target are left alone.
    if episode_counts[i] >= episode_count_targets[i]:
        continue
    # unpack values so that the callback can access the local variables
    reward, done, info = rewards[i], dones[i], infos[i]
    episode_starts[i] = done

print(dones, infos, current_rewards, current_lengths, sep="\n")

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[{'terminal_observation': array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 1.        , 1.        , 0.        ,
       0.41239002, 0.13746335, 0.        , 0.02749267, 0.        ,
       0.        , 0.28805867, 0.        , 0.        , 0.49755585,
       0.13093576, 0.        , 0.41239002, 0.13746335, 0.        ,
       0.        , 0.        , 0.        , 0.28805867, 0.        ,
       0.        , 0.49755585, 0.13093576], dtype=float32)}, {'terminal_observation': array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 1.        , 1.        , 0.        ,
       0.6048387 , 0.13746335, 0.        , 0.        , 0.        ,
       0.        , 0.4398827 , 0.05498534, 0.        , 0.39280728,
       0.07856146, 0.        , 0.6048387 , 0.13746335, 0.        ,
       0.        , 0.        , 0.        , 0.4398827 , 0.05498534,
       0.        ,

In [19]:
# Record the finished episode for every sub-environment that still owes one.
# The indices are selected explicitly instead of relying on a loop variable `i`
# left over from another cell. Incrementing episode_counts marks the episode as
# delivered, so re-running this cell does not append duplicates.
# NOTE(review): the recorded `dones` are all 0 at this point, so the episode is
# recorded manually here rather than on a done flag — confirm that is intended.
for env_idx in np.flatnonzero(episode_counts < episode_count_targets):
    episode_rewards.append(current_rewards[env_idx])
    episode_lengths.append(current_lengths[env_idx])
    episode_counts[env_idx] += 1

In [21]:
# Summarise the collected episode returns.
reward_array = np.asarray(episode_rewards)
mean_reward = reward_array.mean()
std_reward = reward_array.std()

mean_reward, std_reward

(-148099.0, 0.0)