# Grid2x2 - PettingZoo + RLlib

In [1]:
import os

import ray
from ray.tune.registry import register_env

In [2]:
import random
import numpy as np
import torch

# Single seed shared by every random number generator used in this notebook
# (Python's `random`, NumPy, PyTorch) and, further down, by SUMO and RLlib.
SEED = 23423  # default SUMO seed no.
random.seed(SEED)
np.random.seed(SEED)
# torch.manual_seed returns the global Generator; the trailing ';' stops the
# notebook from echoing its repr as cell output.
torch.manual_seed(SEED);

<torch._C.Generator at 0x23758ce12d0>

## Environment Setup

In [3]:
import supersuit as ss
from reward_functions import combined_reward

# Scenario identifier: used to locate the network/route files under
# nets/<env_name>/ and as the name the environment is registered under in RLlib.
env_name = "grid2x2"

In [4]:
from envs import MultiAgentSumoEnv
from observation import Grid2x2ObservationFunction

def env_creator(args):
    """Create a ``MultiAgentSumoEnv`` for the scenario named by ``env_name``.

    Args:
        args: Configuration object supplied by the caller. It is accepted for
            signature compatibility but is not read; every setting below is
            fixed in this notebook.

    Returns:
        A new ``MultiAgentSumoEnv`` instance.
    """
    # Network and route files both live in nets/<env_name>/.
    scenario_dir = os.path.join("nets", env_name)
    return MultiAgentSumoEnv(
        net_file=os.path.join(scenario_dir, f"{env_name}.net.xml"),
        route_file=os.path.join(scenario_dir, f"{env_name}.rou.xml"),
        num_seconds=3600,
        reward_fn=combined_reward,
        sumo_seed=SEED,  # same seed as the Python/NumPy/PyTorch generators
        observation_class=Grid2x2ObservationFunction,
        add_system_info=False,
    )

In [5]:
from ray.rllib.env.wrappers.multi_agent_env_compatibility import MultiAgentEnvCompatibility

# Start (or reuse) the local Ray instance. `ignore_reinit_error=True` makes this
# cell safe to re-run in a live kernel: a bare `ray.init()` raises if Ray has
# already been initialised.
ray.init(ignore_reinit_error=True)

# Register the environment under `env_name` so the algorithm config can refer to
# it by name. Each created env is wrapped in MultiAgentEnvCompatibility before
# being handed to RLlib.
register_env(env_name, lambda config: MultiAgentEnvCompatibility(env_creator(config)))

2023-05-31 23:25:58,555	INFO worker.py:1625 -- Started a local Ray instance.


## Training the RL Agent

In [6]:
from ray.rllib.algorithms.ppo import PPOConfig

# From https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/atari-ppo.yaml

# Sampling: 4 rollout workers, each contributing fragments of 128 steps.
rollout_settings = dict(
    num_rollout_workers=4,
    rollout_fragment_length=128,
)

# PPO optimisation hyperparameters used for this run.
# Alternatives kept for reference (from the original commented-out lines):
#   lambda_=0.95, kl_coeff=0.5, clip_param=0.1, vf_clip_param=10.0,
#   entropy_coeff=0.01, train_batch_size=5000, sgd_minibatch_size=500,
#   num_sgd_iter=10
# Other rollout/evaluation options that were left disabled:
#   num_envs_per_worker=5, batch_mode="truncate_episodes",
#   evaluation_duration=3600, evaluation_duration_unit="timesteps"
training_settings = dict(
    train_batch_size=512,
    lr=2e-5,
    gamma=0.99,
    lambda_=0.9,
    use_gae=True,
    clip_param=0.4,
    grad_clip=None,
    entropy_coeff=0.1,
    vf_loss_coeff=0.25,
    sgd_minibatch_size=64,
    num_sgd_iter=10,
)

# GPU count can be overridden through the RLLIB_NUM_GPUS environment variable;
# it defaults to 1 when the variable is unset.
gpu_count = int(os.environ.get("RLLIB_NUM_GPUS", "1"))

# Assemble the config one builder call at a time, in the same order as before.
config: PPOConfig = PPOConfig()
config = config.environment(env=env_name)
config = config.framework(framework="torch")
config = config.rollouts(**rollout_settings)
config = config.training(**training_settings)
config = config.evaluation(evaluation_num_workers=1)
config = config.debugging(log_level="WARN", seed=SEED)
config = config.resources(num_gpus=gpu_count)

In [7]:
# Instantiate the algorithm from the PPO config defined above; the rest of the
# notebook trains and evaluates through this `algo` object.
algo = config.build()

2023-05-31 23:26:02,938	INFO algorithm.py:527 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


[2m[36m(RolloutWorker pid=13784)[0m Step #0.00 (0ms ?*RT. ?UPS, TraCI: 8ms, vehicles TOT 0 ACT 0 BUF 0)                      


2023-05-31 23:26:13,983	INFO trainable.py:172 -- Trainable.setup took 11.048 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


[2m[36m(RolloutWorker pid=29436)[0m Step #0.00 (0ms ?*RT. ?UPS, TraCI: 12ms, vehicles TOT 0 ACT 0 BUF 0)                     [32m [repeated 4x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m


In [8]:
# Evaluate the newly built algorithm before the training loop below has run, so
# there is a baseline to compare the post-training evaluation against.
algo.evaluate()

Step #3600.00 (1ms ~= 1000.00*RT, ~182000.00UPS, TraCI: 199ms, vehicles TOT 2022 ACT 182 B34ms, vehicles TOT 3 ACT 3 BUF 0)      
Step #3600.00 (1ms ~= 1000.00*RT, ~228000.00UPS, TraCI: 226ms, vehicles TOT 1913 ACT 228 BOT 3 ACT 3 BUF 0)                      
Step #3600.00 (2ms ~= 500.00*RT, ~122500.00UPS, TraCI: 214ms, vehicles TOT 1458 ACT 245 BU32ms, vehicles TOT 3 ACT 3 BUF 0)      
Step #3600.00 (1ms ~= 1000.00*RT, ~162000.00UPS, TraCI: 212ms, vehicles TOT 1994 ACT 162 BOT 3 ACT 3 BUF 0)                      
Step #3600.00 (2ms ~= 500.00*RT, ~111500.00UPS, TraCI: 214ms, vehicles TOT 1585 ACT 223 BUOT 3 ACT 3 BUF 0)                      
Step #3600.00 (2ms ~= 500.00*RT, ~115000.00UPS, TraCI: 214ms, vehicles TOT 1451 ACT 230 BUOT 3 ACT 3 BUF 0)                      
Step #3600.00 (1ms ~= 1000.00*RT, ~220000.00UPS, TraCI: 231ms, vehicles TOT 1717 ACT 220 BOT 3 ACT 3 BUF 0)                      
Step #3600.00 (1ms ~= 1000.00*RT, ~229000.00UPS, TraCI: 208ms, vehicles TOT 1676 ACT 229 B

{'evaluation': {'episode_reward_max': -172951.6673387349,
  'episode_reward_min': -455934.2899789813,
  'episode_reward_mean': -313990.761392424,
  'episode_len_mean': 720.0,
  'episode_media': {},
  'episodes_this_iter': 10,
  'policy_reward_min': {},
  'policy_reward_max': {},
  'policy_reward_mean': {},
  'custom_metrics': {},
  'hist_stats': {'episode_reward': [-172951.6673387349,
    -218625.77240152424,
    -455934.2899789813,
    -176077.63037164957,
    -385718.7365816415,
    -442348.2225033426,
    -323648.8439603477,
    -337031.8295013124,
    -286961.70627487445,
    -340608.9150118316],
   'episode_lengths': [720, 720, 720, 720, 720, 720, 720, 720, 720, 720]},
  'sampler_perf': {'mean_raw_obs_processing_ms': 1.5209045298909696,
   'mean_inference_ms': 1.1370092642802263,
   'mean_action_processing_ms': 0.2551095682024177,
   'mean_env_wait_ms': 117.73392356546235,
   'mean_env_render_ms': 0.0},
  'num_faulty_episodes': 0,
  'connector_metrics': {'ObsPreprocessorConnector_

In [11]:
import os

NUM_ITERATIONS = 10   # total number of algo.train() calls
CHECKPOINT_EVERY = 5  # save a checkpoint after every N completed iterations
checkpoint_dir = os.path.join("ray_checkpoints", env_name)

for i in range(NUM_ITERATIONS):
    algo.train()
    print(f"Iteration {i+1} done!")

    # Test the number of *completed* iterations (i + 1), not the zero-based
    # index: `i % 5 == 0` saved after iterations 1 and 6 and never saved the
    # final model. This saves after iterations 5 and 10.
    if (i + 1) % CHECKPOINT_EVERY == 0:
        algo.save(checkpoint_dir)

Iteration 1 done!
Iteration 2 done!
Iteration 3 done!
Iteration 4 done!
Iteration 5 done!
Step #3600.00 (1ms ~= 1000.00*RT, ~157000.00UPS, TraCI: 113ms, vehicles TOT 2094 ACT 157 B35ms, vehicles TOT 3 ACT 3 BUF 0)      
Step #3600.00 (2ms ~= 500.00*RT, ~127000.00UPS, TraCI: 134ms, vehicles TOT 1521 ACT 254 BU35ms, vehicles TOT 3 ACT 3 BUF 0)      
Step #3600.00 (2ms ~= 500.00*RT, ~127500.00UPS, TraCI: 133ms, vehicles TOT 1903 ACT 255 BUOT 3 ACT 3 BUF 0)                      
Step #3600.00 (1ms ~= 1000.00*RT, ~271000.00UPS, TraCI: 103ms, vehicles TOT 1679 ACT 271 BOT 3 ACT 3 BUF 0)                      
Iteration 6 done!
Iteration 7 done!
Iteration 8 done!
Iteration 9 done!
Iteration 10 done!


In [12]:
# Evaluate again now that the training loop has run; compare the reported
# episode_reward_mean with the pre-training evaluation.
algo.evaluate()

Step #3600.00 (1ms ~= 1000.00*RT, ~125000.00UPS, TraCI: 168ms, vehicles TOT 2106 ACT 125 BOT 3 ACT 3 BUF 0)                      
Step #3600.00 (1ms ~= 1000.00*RT, ~186000.00UPS, TraCI: 177ms, vehicles TOT 1500 ACT 186 BOT 3 ACT 3 BUF 0)                      
Step #3600.00 (1ms ~= 1000.00*RT, ~88000.00UPS, TraCI: 153ms, vehicles TOT 2106 ACT 88 BUFOT 3 ACT 3 BUF 0)                      
Step #3600.00 (1ms ~= 1000.00*RT, ~230000.00UPS, TraCI: 177ms, vehicles TOT 1899 ACT 230 BOT 3 ACT 3 BUF 0)                      
Step #3600.00 (0ms ?*RT. ?UPS, TraCI: 160ms, vehicles TOT 2106 ACT 112 BUF 0)             OT 3 ACT 3 BUF 0)                      
Step #3600.00 (0ms ?*RT. ?UPS, TraCI: 154ms, vehicles TOT 2099 ACT 121 BUF 7)             OT 3 ACT 3 BUF 0)                      
Step #3600.00 (1ms ~= 1000.00*RT, ~110000.00UPS, TraCI: 158ms, vehicles TOT 2106 ACT 110 BOT 3 ACT 3 BUF 0)                      
Step #3600.00 (1ms ~= 1000.00*RT, ~215000.00UPS, TraCI: 175ms, vehicles TOT 2027 ACT 215 B

{'evaluation': {'episode_reward_max': -130306.71221711389,
  'episode_reward_min': -388866.99752421613,
  'episode_reward_mean': -182555.8182178615,
  'episode_len_mean': 720.0,
  'episode_media': {},
  'episodes_this_iter': 10,
  'policy_reward_min': {},
  'policy_reward_max': {},
  'policy_reward_mean': {},
  'custom_metrics': {},
  'hist_stats': {'episode_reward': [-145568.20744274557,
    -388866.99752421613,
    -148103.49036157245,
    -248713.81197247276,
    -138754.53009340414,
    -130306.71221711389,
    -139718.62651089474,
    -194547.91902648786,
    -146844.5645265106,
    -144133.32250319677],
   'episode_lengths': [720, 720, 720, 720, 720, 720, 720, 720, 720, 720]},
  'sampler_perf': {'mean_raw_obs_processing_ms': 1.4675512850909556,
   'mean_inference_ms': 1.0730825392010064,
   'mean_action_processing_ms': 0.24676591139288054,
   'mean_env_wait_ms': 93.95156260306916,
   'mean_env_render_ms': 0.0},
  'num_faulty_episodes': 0,
  'connector_metrics': {'ObsPreprocessorC