# Grid4x4 - PettingZoo + RLlib

In [1]:
import os

import ray
from ray import tune
from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv  # RLlib-PZ interface
from ray.tune.registry import register_env
import supersuit as ss

from helper_functions import make_parallel_env

In [2]:
import random
import numpy as np
import torch

# One seed shared by Python's `random`, NumPy and PyTorch in this (driver) process;
# the same value is also handed to SUMO through `sumo_seed` in `env_creator`.
# NOTE(review): these calls only seed the kernel process itself; code running in
# separate Ray worker processes is not affected by them.
SEED = 23423  # default SUMO seed no.
random.seed(SEED)
np.random.seed(SEED)
# Trailing `;` keeps the returned torch Generator repr out of the cell output.
torch.manual_seed(SEED);

<torch._C.Generator at 0x1ed06143970>

In [3]:
from observation import Grid4x4ObservationFunction
from reward_functions import combined_reward

def env_creator(args):
    """Build the wrapped Grid4x4 parallel environment.

    Args:
        args: Environment config supplied by the caller (the registered
            lambda forwards RLlib's env config here). It is currently
            unused: every setting below is fixed.

    Returns:
        The environment returned by ``make_parallel_env`` after passing it
        through ``ss.pad_observations_v0`` and ``ss.frame_stack_v1(..., 3)``.
    """
    scenario_dir = os.path.join("nets", "grid4x4")
    base_env = make_parallel_env(
        net_file=os.path.join(scenario_dir, "grid4x4.net.xml"),
        route_file=os.path.join(scenario_dir, "grid4x4_1.rou.xml"),
        num_seconds=3600,
        reward_fn=combined_reward,
        sumo_seed=SEED,
        observation_class=Grid4x4ObservationFunction,
    )
    # Pad observations first, then stack the 3 most recent frames.
    return ss.frame_stack_v1(ss.pad_observations_v0(base_env), 3)

In [4]:
# Start the local Ray runtime. `ignore_reinit_error=True` makes this cell safe
# to re-run in a live kernel: a second `ray.init()` would otherwise raise
# because Ray is already initialised.
ray.init(ignore_reinit_error=True)

# Name under which the environment is registered; the PPO config refers to it.
env_name = "grid4x4"

# Register a factory that builds the PettingZoo parallel env via `env_creator`
# and wraps it in RLlib's ParallelPettingZooEnv adapter.
register_env(env_name, lambda config: ParallelPettingZooEnv(env_creator(config)))

2023-05-28 12:43:33,965	INFO worker.py:1625 -- Started a local Ray instance.


In [5]:
from ray.rllib.algorithms.ppo import PPOConfig

# Hyperparameters from RLlib's tuned PPO example:
# https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/atari-ppo.yaml

config = (
    PPOConfig()
    .environment(env=env_name)
    .framework(framework="torch")
    .rollouts(
        # 10 workers x 5 envs x 100 steps = 5000 env steps per sampling round,
        # which matches `train_batch_size` below.
        rollout_fragment_length=100,
        num_rollout_workers=10,
        num_envs_per_worker=5,
        batch_mode="truncate_episodes",
    )
    .training(
        lambda_=0.95,
        kl_coeff=0.5,
        clip_param=0.1,
        vf_clip_param=10.0,
        entropy_coeff=0.01,
        train_batch_size=5000,
        sgd_minibatch_size=500,
        num_sgd_iter=10,
    )
    .evaluation(evaluation_num_workers=1)
    # Pass the notebook seed to RLlib as well: seeding `random`/NumPy/torch in
    # the kernel does not reach the rollout workers, which are separate
    # processes. Without this, runs are not reproducible.
    .debugging(log_level="INFO", seed=SEED)
    # GPU count for the learner; override with the RLLIB_NUM_GPUS env variable
    # (e.g. RLLIB_NUM_GPUS=0 on a CPU-only machine). Defaults to 1.
    .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "1")))
)

In [6]:
# Instantiate the PPO algorithm described by `config`. The recorded output
# below shows the rollout workers (and their SUMO instances) starting up here.
algo = config.build()



[2m[36m(RolloutWorker pid=22460)[0m Step #0.00 (0ms ?*RT. ?UPS, TraCI: 81ms, vehicles TOT 0 ACT 0 BUF 0)                     


[2m[36m(RolloutWorker pid=24732)[0m 2023-05-28 12:43:51,204	INFO policy.py:1285 -- Policy (worker=3) running on CPU.
[2m[36m(RolloutWorker pid=24732)[0m 2023-05-28 12:43:51,204	INFO torch_policy_v2.py:110 -- Found 0 visible cuda devices.
[2m[36m(RolloutWorker pid=24732)[0m 2023-05-28 12:43:51,217	INFO util.py:118 -- Using connectors:
[2m[36m(RolloutWorker pid=24732)[0m 2023-05-28 12:43:51,217	INFO util.py:119 --     AgentConnectorPipeline
[2m[36m(RolloutWorker pid=24732)[0m         ObsPreprocessorConnector
[2m[36m(RolloutWorker pid=24732)[0m         StateBufferConnector
[2m[36m(RolloutWorker pid=24732)[0m         ViewRequirementAgentConnector
[2m[36m(RolloutWorker pid=24732)[0m 2023-05-28 12:43:51,217	INFO util.py:120 --     ActionConnectorPipeline
[2m[36m(RolloutWorker pid=24732)[0m         ConvertToNumpyConnector
[2m[36m(RolloutWorker pid=24732)[0m         NormalizeActionsConnector
[2m[36m(RolloutWorker pid=24732)[0m         ImmutableActionsConnector


[2m[36m(RolloutWorker pid=23232)[0m Step #0.00 (0ms ?*RT. ?UPS, TraCI: 62ms, vehicles TOT 0 ACT 0 BUF 0)                     [32m [repeated 32x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m


2023-05-28 12:43:57,874	INFO worker_set.py:312 -- Inferred observation/action spaces from remote worker (local worker has no env): {'default_policy': (Box(0.0, 1.0, (495,), float32), Discrete(8)), '__env__': (Box(0.0, 1.0, (495,), float32), Discrete(8))}
2023-05-28 12:43:57,911	INFO policy.py:1285 -- Policy (worker=local) running on 1 GPUs.
2023-05-28 12:43:57,912	INFO torch_policy_v2.py:110 -- Found 1 visible cuda devices.
2023-05-28 12:43:59,799	INFO util.py:118 -- Using connectors:
2023-05-28 12:43:59,800	INFO util.py:119 --     AgentConnectorPipeline
        ObsPreprocessorConnector
        StateBufferConnector
        ViewRequirementAgentConnector
2023-05-28 12:43:59,800	INFO util.py:120 --     ActionConnectorPipeline
        ConvertToNumpyConnector
        NormalizeActionsConnector
        ImmutableActionsConnector
2023-05-28 12:43:59,801	INFO rollout_worker.py:2000 -- Built policy map: <PolicyMap lru-caching-capacity=100 policy-IDs=['default_policy']>
2023-05-28 12:43:59,801	INF

[2m[36m(RolloutWorker pid=14240)[0m Step #0.00 (0ms ?*RT. ?UPS, TraCI: 53ms, vehicles TOT 0 ACT 0 BUF 0)                     [32m [repeated 18x across cluster][0m


[2m[36m(RolloutWorker pid=11512)[0m 2023-05-28 12:43:51,453	INFO policy.py:1285 -- Policy (worker=6) running on CPU.[32m [repeated 9x across cluster][0m
[2m[36m(RolloutWorker pid=11512)[0m 2023-05-28 12:43:51,453	INFO torch_policy_v2.py:110 -- Found 0 visible cuda devices.[32m [repeated 9x across cluster][0m
[2m[36m(RolloutWorker pid=11512)[0m 2023-05-28 12:43:51,461	INFO util.py:118 -- Using connectors:[32m [repeated 9x across cluster][0m
[2m[36m(RolloutWorker pid=11512)[0m 2023-05-28 12:43:51,461	INFO util.py:119 --     AgentConnectorPipeline[32m [repeated 9x across cluster][0m
[2m[36m(RolloutWorker pid=11512)[0m         ObsPreprocessorConnector[32m [repeated 9x across cluster][0m
[2m[36m(RolloutWorker pid=11512)[0m         StateBufferConnector[32m [repeated 9x across cluster][0m
[2m[36m(RolloutWorker pid=11512)[0m         ViewRequirementAgentConnector[32m [repeated 9x across cluster][0m
[2m[36m(RolloutWorker pid=11512)[0m 2023-05-28 12:43:51,461

[2m[36m(RolloutWorker pid=14240)[0m Step #0.00 (0ms ?*RT. ?UPS, TraCI: 56ms, vehicles TOT 0 ACT 0 BUF 0)                     [32m [repeated 4x across cluster][0m


2023-05-28 12:44:13,768	INFO worker_set.py:312 -- Inferred observation/action spaces from remote worker (local worker has no env): {'default_policy': (Box(0.0, 1.0, (495,), float32), Discrete(8)), '__env__': (Box(0.0, 1.0, (495,), float32), Discrete(8))}
2023-05-28 12:44:13,776	INFO policy.py:1285 -- Policy (worker=local) running on 1 GPUs.
2023-05-28 12:44:13,777	INFO torch_policy_v2.py:110 -- Found 1 visible cuda devices.
2023-05-28 12:44:13,800	INFO util.py:118 -- Using connectors:
2023-05-28 12:44:13,800	INFO util.py:119 --     AgentConnectorPipeline
        ObsPreprocessorConnector
        StateBufferConnector
        ViewRequirementAgentConnector
2023-05-28 12:44:13,801	INFO util.py:120 --     ActionConnectorPipeline
        ConvertToNumpyConnector
        NormalizeActionsConnector
        ImmutableActionsConnector
2023-05-28 12:44:13,802	INFO rollout_worker.py:2000 -- Built policy map: <PolicyMap lru-caching-capacity=100 policy-IDs=['default_policy']>
2023-05-28 12:44:13,802	INF

In [7]:
# NOTE: this notebook relies on a manual, local edit to the installed Ray package:
# ParallelPettingZooEnv.reset() was changed at lines 202-206 of
# C:\Users\admin\anaconda3\envs\marl\lib\site-packages\ray\rllib\env\wrappers\pettingzoo_env.py
# TODO(review): the edit itself is not recorded in this notebook, so a fresh
# environment will not reproduce it; document the change or move it into
# project code.
# NOTE(review): the saved output reports episodes_this_iter: 0 and NaN reward
# metrics, i.e. no evaluation episode was recorded; confirm the evaluation setup.

algo.evaluate()

2023-05-28 12:44:13,860	INFO algorithm.py:935 -- Evaluating current state of PPO for 10 episodes.
[2m[36m(RolloutWorker pid=14240)[0m 2023-05-28 12:44:13,862	INFO rollout_worker.py:909 -- Generating sample batch of size 5
[2m[36m(RolloutWorker pid=14240)[0m 2023-05-28 12:44:13,862	INFO rollout_worker.py:909 -- Generating sample batch of size 5
[2m[36m(RolloutWorker pid=14240)[0m 2023-05-28 12:44:13,862	INFO rollout_worker.py:909 -- Generating sample batch of size 5
[2m[36m(RolloutWorker pid=14240)[0m 2023-05-28 12:44:13,862	INFO rollout_worker.py:909 -- Generating sample batch of size 5
[2m[36m(RolloutWorker pid=14240)[0m 2023-05-28 12:44:13,862	INFO rollout_worker.py:909 -- Generating sample batch of size 5
[2m[36m(RolloutWorker pid=14240)[0m 2023-05-28 12:44:13,862	INFO rollout_worker.py:909 -- Generating sample batch of size 5
[2m[36m(RolloutWorker pid=14240)[0m 2023-05-28 12:44:13,862	INFO rollout_worker.py:909 -- Generating sample batch of size 5
[2m[36m(Rol

{'evaluation': {'episode_reward_max': nan,
  'episode_reward_min': nan,
  'episode_reward_mean': nan,
  'episode_len_mean': nan,
  'episode_media': {},
  'episodes_this_iter': 0,
  'policy_reward_min': {},
  'policy_reward_max': {},
  'policy_reward_mean': {},
  'custom_metrics': {},
  'hist_stats': {'episode_reward': [], 'episode_lengths': []},
  'sampler_perf': {},
  'num_faulty_episodes': 0,
  'connector_metrics': {},
  'num_agent_steps_sampled_this_iter': 0,
  'num_env_steps_sampled_this_iter': 0,
  'timesteps_this_iter': 0}}

In [8]:
from ray.tune.logger import pretty_print

# Run one training iteration and print its result in a readable form.
# (The saved output shows 5000 env steps sampled and trained in this iteration.)
result = algo.train()
print(pretty_print(result))

[2m[36m(RolloutWorker pid=22460)[0m 2023-05-28 12:48:15,818	INFO rollout_worker.py:909 -- Generating sample batch of size 500


[2m[36m(RolloutWorker pid=25156)[0m Step #0.00 (0ms ?*RT. ?UPS, TraCI: 264930ms, vehicles TOT 0 ACT 0 BUF 0)                 [32m [repeated 6x across cluster][0m


[2m[36m(RolloutWorker pid=22460)[0m 2023-05-28 12:53:54,532	INFO rollout_worker.py:950 -- Completed sample batch:
[2m[36m(RolloutWorker pid=22460)[0m 
[2m[36m(RolloutWorker pid=22460)[0m { 'count': 500,
[2m[36m(RolloutWorker pid=22460)[0m   'policy_batches': { 'default_policy': { 'action_dist_inputs': np.ndarray((8000, 8), dtype=float32, min=-0.008, max=0.008, mean=-0.0),
[2m[36m(RolloutWorker pid=22460)[0m                                           'action_logp': np.ndarray((8000,), dtype=float32, min=-2.086, max=-2.073, mean=-2.079),
[2m[36m(RolloutWorker pid=22460)[0m                                           'actions': np.ndarray((8000,), dtype=int64, min=0.0, max=7.0, mean=3.455),
[2m[36m(RolloutWorker pid=22460)[0m                                           'advantages': np.ndarray((8000,), dtype=float32, min=-276.407, max=283.89, mean=-90.413),
[2m[36m(RolloutWorker pid=22460)[0m                                           'agent_index': np.ndarray((8000,), d

agent_timesteps_total: 80000
connector_metrics: {}
counters:
  num_agent_steps_sampled: 80000
  num_agent_steps_trained: 80000
  num_env_steps_sampled: 5000
  num_env_steps_trained: 5000
custom_metrics: {}
date: 2023-05-28_12-54-09
done: false
episode_len_mean: .nan
episode_media: {}
episode_reward_max: .nan
episode_reward_mean: .nan
episode_reward_min: .nan
episodes_this_iter: 0
episodes_total: 0
hostname: JM-M16
info:
  learner:
    default_policy:
      custom_metrics: {}
      diff_num_grad_updates_vs_sampler_policy: 799.5
      learner_stats:
        cur_kl_coeff: 0.5
        cur_lr: 5.000000000000001e-05
        entropy: 2.0785656407475472
        entropy_coeff: 0.009999999999999998
        kl: 0.000886725546912217
        policy_loss: -0.002734486180188469
        total_loss: 9.884481087327003
        vf_explained_var: -0.0008695485442876816
        vf_loss: 9.907557814121246
      model: {}
      num_grad_updates_lifetime: 800.5
  num_agent_steps_sampled: 80000
  num_agent_step

In [10]:
# Evaluate again after the training iteration.
# NOTE(review): the saved output still shows episodes_this_iter: 0 and NaN
# metrics, so this evaluation also recorded no completed episodes.
algo.evaluate()

2023-05-28 12:56:53,654	INFO algorithm.py:935 -- Evaluating current state of PPO for 10 episodes.


Step #2000.00 (2ms ~= 500.00*RT, ~106500.00UPS, TraCI: 1646ms, vehicles TOT 907 ACT 213 BU1212ms, vehicles TOT 1 ACT 1 BUF 0)    




{'evaluation': {'episode_reward_max': nan,
  'episode_reward_min': nan,
  'episode_reward_mean': nan,
  'episode_len_mean': nan,
  'episode_media': {},
  'episodes_this_iter': 0,
  'policy_reward_min': {},
  'policy_reward_max': {},
  'policy_reward_mean': {},
  'custom_metrics': {},
  'hist_stats': {'episode_reward': [], 'episode_lengths': []},
  'sampler_perf': {},
  'num_faulty_episodes': 0,
  'connector_metrics': {},
  'num_agent_steps_sampled_this_iter': 0,
  'num_env_steps_sampled_this_iter': 0,
  'timesteps_this_iter': 0}}

In [11]:
# Shut down the Ray instance started earlier in this notebook by ray.init().
ray.shutdown()