In [1]:
# Third party imports.
import gym
from gym.spaces import Discrete, MultiDiscrete
from ipywidgets import Output
from IPython import display
import numpy as np
import os
from starlette.requests import Request
import time

# Ray imports.
import ray
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.policy.policy import PolicySpec
from ray import serve
from ray import tune

### Running on Anyscale

Let's connect to an existing, already running cluster (1 GPU, 16 CPUs) via `ray.init(address=...)`.

In [2]:
ray.init(
    # Attach to the already running Anyscale cluster "cluster-12"
    # (it lives in my account) instead of starting a new one.
    address="anyscale://cluster-12",

    # Python dependencies (e.g. tensorflow) that get installed on the
    # cluster before Ray connects; everything the code needs must be listed.
    runtime_env=dict(pip="./requirements.txt"),

    # The current directory is uploaded to Anyscale so that the code
    # in it can be run on the cluster.
    project_dir=".",

    # Optional, currently disabled:
    # cloud="anyscale_default_cloud",
)

[1m[36m(anyscale +0.1s)[0m Loaded Anyscale authentication token from ~/.anyscale/credentials.json
[1m[36m(anyscale +0.1s)[0m Loaded Anyscale authentication token from ~/.anyscale/credentials.json
[1m[36m(anyscale +0.9s)[0m .anyscale.yaml found in project_dir. Directory is attached to a project.
[1m[36m(anyscale +1.7s)[0m Using project (name: cuj_rllib, project_dir: /Users/sven/Dropbox/Projects/anyscale_projects/cuj-rl-in-production, id: prj_84JWkW5F1TqLJwhSqLDadyML).
[1m[36m(anyscale +3.1s)[0m cluster cluster-12 is currently running, the cluster will not be restarted.


The 'pip' or 'conda' field was specified in the runtime env, so it may take some time to install the environment before Ray connects.


[2m[36m(pid=runtime_env)[0m 2021-12-01 09:17:12,040	INFO conda.py:219 -- Setting up conda environment with {'_ray_commit': '72fdf3be605e4aadfd9a8c3f01dd7843c01cb8fd', 'env_vars': {'RAY_SERVE_ROOT_URL': 'https://serve-session-qkbdqbk6fq939rimgf52smke.i.anyscaleuserdata.com'}, 'excludes': ['.git', '__pycache__', 'venv', '/Users/sven/Dropbox/Projects/anyscale_projects/cuj-rl-in-production/.anyscale.yaml', '/Users/sven/Dropbox/Projects/anyscale_projects/cuj-rl-in-production/session-default.yaml'], 'pip': ['ray[rllib]', 'jupyter', 'tblib', 'fastapi', 'uvicorn', 'anyscale', 'requests', 'torch', 'tensorflow', 'gsutil'], 'uris': ['gcs://_ray_pkg_9d.zip']}
[2m[36m(pid=runtime_env)[0m 2021-12-01 09:17:13,162	INFO conda.py:243 -- Finished setting up runtime environment at /tmp/ray/session_2021-12-01_04-27-46_331452_161/runtime_resources/conda/ray-1fdfbc69e9e1c069ec07b30674483e16d97bdd20
[1m[36m(anyscale +28.8s)[0m Connected to cluster-12, see: https://console.anyscale.com/projects/prj_84

AnyscaleClientContext(dashboard_url='https://session-qkbdqbk6fq939rimgf52smke.i.anyscaleuserdata.com/auth/?token=688034c9-c779-4257-b833-e7dc6d33a430&redirect_to=dashboard', python_version='3.8.5', ray_version='1.8.0', ray_commit='72fdf3be605e4aadfd9a8c3f01dd7843c01cb8fd', protocol_version='2021-09-22', _num_clients=1, _context_to_restore=None)

### Coding/defining our "problem" via an RL environment.

We will use the following (adversarial) multi-agent environment throughout this demo.

<img src="img/environment.png" width=800>

In [3]:
# Let's code our multi-agent environment.

class MultiAgentArena(MultiAgentEnv):
    """Adversarial two-agent grid world.

    Both agents move on a `height` x `width` grid. Positions are stored
    row-major as `[row, col]` lists.

    Rewards per timestep:
        agent1: -1.0 on a collision, +1.0 when entering a field it has not
            visited yet in this episode, -0.5 otherwise.
        agent2: +1.0 on a collision, -0.1 otherwise.

    Observations (per agent): `np.array([own_pos, other_pos])`, where each
    position is flattened to `row * width + col`.

    Supported `config` keys (all optional):
        "width" (default 10), "height" (default 10): Grid dimensions.
        "ts" (default 100): Number of timesteps after which an episode ends.
        "render": If truthy, an ipywidgets `Output` is created and displayed
            for use with `render()`.
    """

    def __init__(self, config=None):
        config = config or {}
        # Dimensions of the grid.
        self.width = config.get("width", 10)
        self.height = config.get("height", 10)

        # End an episode after this many timesteps.
        self.timestep_limit = config.get("ts", 100)

        # Each agent sees two flattened grid positions: (own, other).
        num_fields = self.width * self.height
        self.observation_space = MultiDiscrete([num_fields, num_fields])
        # 0=up, 1=right, 2=down, 3=left.
        self.action_space = Discrete(4)

        # Initialize all per-episode state.
        self.reset()

        # For rendering: only create (and display) the output widget on request.
        self.out = None
        if config.get("render"):
            self.out = Output()
            display.display(self.out)

    def reset(self):
        """Resets all per-episode state and returns the initial observation dict."""
        # Row-major coords: [row, col].
        self.agent1_pos = [0, 0]  # upper left corner
        self.agent2_pos = [self.height - 1, self.width - 1]  # lower right corner

        # Accumulated rewards in this episode.
        self.agent1_R = 0.0
        self.agent2_R = 0.0

        # Fields agent1 has visited so far (its start field counts as visited).
        self.agent1_visited_fields = {tuple(self.agent1_pos)}

        # How many timesteps have we done in this episode.
        self.timesteps = 0

        # Did we have a collision in the most recent step?
        self.collision = False
        # How many collisions in total have we had in this episode?
        self.num_collisions = 0

        # Return the initial observation in the new episode.
        return self._get_obs()

    def step(self, action: dict):
        """Advances the env by one timestep.

        Args:
            action: Dict with one action per agent, e.g.
                `{"agent1": action_for_agent1, "agent2": action_for_agent2}`.

        Returns:
            Tuple of (observations, rewards, dones, infos). The first three
            are dicts keyed by agent name (`dones` additionally carries the
            `__all__` key); `infos` is always an empty dict.
        """
        # Increase our timestep counter by 1.
        self.timesteps += 1
        # An episode is "done" when we reach the timestep limit.
        is_done = self.timesteps >= self.timestep_limit

        # Agent2 always moves first.
        # events is a subset of {"collision", "agent1_new_field"}.
        events = self._move(self.agent2_pos, action["agent2"], is_agent1=False)
        events |= self._move(self.agent1_pos, action["agent1"], is_agent1=True)

        # Useful for rendering.
        self.collision = "collision" in events
        if self.collision:
            self.num_collisions += 1

        # Get observations (based on new agent positions).
        obs = self._get_obs()

        # Determine rewards based on the collected events. For agent1, a
        # collision takes precedence over a newly visited field.
        if self.collision:
            r1 = -1.0
        elif "agent1_new_field" in events:
            r1 = 1.0
        else:
            r1 = -0.5
        r2 = 1.0 if self.collision else -0.1

        self.agent1_R += r1
        self.agent2_R += r2

        rewards = {
            "agent1": r1,
            "agent2": r2,
        }

        # Generate a `done` dict (per-agent and total).
        dones = {
            "agent1": is_done,
            "agent2": is_done,
            # Special `__all__` key indicates that the episode is done for all agents.
            "__all__": is_done,
        }

        return obs, rewards, dones, {}  # <- info dict (not needed here).

    def _get_obs(self):
        """Returns the obs dict: agent name -> np.array([own_pos, other_pos]).

        Each position is the agent's [row, col] flattened to
        `row * width + col`. `_move` keeps columns within `[0, width)`, so
        no wrap-around is needed here.
        """
        ag1_discrete_pos = self.agent1_pos[0] * self.width + self.agent1_pos[1]
        ag2_discrete_pos = self.agent2_pos[0] * self.width + self.agent2_pos[1]
        return {
            "agent1": np.array([ag1_discrete_pos, ag2_discrete_pos]),
            "agent2": np.array([ag2_discrete_pos, ag1_discrete_pos]),
        }

    def _move(self, coords, action, is_agent1):
        """Moves one agent in place and reports what happened.

        Args:
            coords: The agent's `[row, col]` list. It is mutated in place
                (`step` passes `self.agent1_pos` / `self.agent2_pos`).
            action: 0=up, 1=right, 2=down, 3=left. Any other value leaves
                the position unchanged.
            is_agent1: True if the moving agent is agent1, False for agent2.

        Returns:
            A set of event strings:
            {"collision"} if the target field is occupied by the other
            agent (the move is undone);
            {"agent1_new_field"} if agent1 ended up on a field it had not
            visited yet in this episode;
            an empty set otherwise.
        """
        orig_coords = coords[:]
        # Change the row: 0=up (-1), 2=down (+1)
        coords[0] += -1 if action == 0 else 1 if action == 2 else 0
        # Change the column: 1=right (+1), 3=left (-1)
        coords[1] += 1 if action == 1 else -1 if action == 3 else 0

        # Solve collisions.
        # Make sure we don't end up on the other agent's position.
        # If we would, don't move (we are blocked).
        other_pos = self.agent2_pos if is_agent1 else self.agent1_pos
        if coords == other_pos:
            coords[0], coords[1] = orig_coords
            # Agent2 blocked agent1 (agent1 tried to run into agent2)
            # OR agent2 bumped into agent1 (agent2 tried to run into agent1).
            return {"collision"}

        # No agent blocking -> check walls and clamp to the grid.
        if coords[0] < 0:
            coords[0] = 0
        elif coords[0] >= self.height:
            coords[0] = self.height - 1
        if coords[1] < 0:
            coords[1] = 0
        elif coords[1] >= self.width:
            coords[1] = self.width - 1

        # If agent1 -> report a new field if one was covered.
        if is_agent1 and tuple(coords) not in self.agent1_visited_fields:
            self.agent1_visited_fields.add(tuple(coords))
            return {"agent1_new_field"}
        # No new field for agent1 (or the mover was agent2).
        return set()

    def render(self, mode=None):
        """Prints an ASCII picture of the grid plus reward/collision stats.

        Legend: "1"/"2" = agents, "." = field already visited by agent1.
        If an output widget was created in the constructor, it is cleared
        first. Everything is written via `print`, so call this inside
        `with env.out:` to direct the text into that widget. Sleeps for
        0.25 seconds at the end to slow down consecutive frames.

        Args:
            mode: Unused.
        """
        if self.out is not None:
            self.out.clear_output(wait=True)

        print("_" * (self.width + 2))
        for r in range(self.height):
            print("|", end="")
            for c in range(self.width):
                if self.agent1_pos == [r, c]:
                    print("1", end="")
                elif self.agent2_pos == [r, c]:
                    print("2", end="")
                elif (r, c) in self.agent1_visited_fields:
                    print(".", end="")
                else:
                    print(" ", end="")
            print("|")
        print("‾" * (self.width + 2))
        print("!!Collision!!" if self.collision else "")
        print("R1={: .1f}".format(self.agent1_R))
        print("R2={: .1f} ({} collisions)".format(self.agent2_R, self.num_collisions))
        print()
        time.sleep(0.25)


# Build an arena with rendering switched on and start a fresh episode.
env = MultiAgentArena(config={"render": True})
obs = env.reset()

# Scripted joint actions, one dict per timestep
# (0=up, 1=right, 2=down, 3=left).
scripted_actions = (
    {"agent1": 2, "agent2": 0},  # Agent1 moves down, Agent2 moves up.
    {"agent1": 1, "agent2": 3},  # Agent1 moves right, Agent2 moves left.
    {"agent1": 1, "agent2": 3},  # Agent1 moves right, Agent2 moves left.
    {"agent1": 2, "agent2": 0},  # Agent1 moves down, Agent2 moves up.
)

# Everything printed inside this context goes into the env's output widget.
with env.out:
    for joint_action in scripted_actions:
        obs, rewards, dones, infos = env.step(action=joint_action)
        env.render()


# Positions are the env's row-major [row, col] lists.
print("Agent1's x/y position={}".format(env.agent1_pos))
print("Agent2's x/y position={}".format(env.agent2_pos))
print("Env timesteps={}".format(env.timesteps))


Output()

Agent1's x/y position=[2, 2]
Agent2's x/y position=[7, 7]
Env timesteps=4


### Configuring our Trainer

In [4]:
# RLlib Trainer config used for the training run.
TRAINER_CFG = {
    # Using our environment class defined above.
    "env": MultiAgentArena,
    # Use `framework=torch` here for PyTorch.
    "framework": "tf",

    # Run on 1 GPU on the "learner".
    "num_gpus": 1,
    # Use 15 ray-parallelized environment workers,
    # which collect samples to learn from. Each worker gets assigned
    # 1 CPU.
    "num_workers": 15,
    # Each of the 15 workers has 10 environment copies ("vectorization")
    # for faster (batched) forward passes.
    "num_envs_per_worker": 10,

    # Multi-agent setup: 2 policies.
    "multiagent": {
        "policies": {"policy1", "policy2"},
        # agent1 -> policy1, any other agent ID (here: agent2) -> policy2.
        # RLlib may pass further arguments besides the agent ID (e.g. the
        # episode and the worker); they are accepted and ignored so the
        # function works with both the one-argument and the extended call.
        "policy_mapping_fn": lambda agent_id, *args, **kwargs: (
            "policy1" if agent_id == "agent1" else "policy2"
        ),
    },
}

### Training our 2 Policies (policy1 for agent1, policy2 for agent2)

In [5]:
results = tune.run(
    # RLlib Trainer class (we use the "PPO" algorithm today).
    PPOTrainer,
    # Give our experiment a name (we will find results/checkpoints
    # under this name in the server's `~/ray_results/` dir).
    name="CUJ-RL",
    # The RLlib config (defined in a cell above).
    config=TRAINER_CFG,
    # Take a snapshot every 2 iterations.
    checkpoint_freq=2,
    # Plus one at the very end of training.
    checkpoint_at_end=True,
    # Stop after 20 training iterations.
    stop={"training_iteration": 20},
    # Define what we are comparing for, when we search for the
    # "best" checkpoint at the end.
    metric="episode_reward_mean",
    mode="max",
)

print("Best checkpoint: ", results.best_checkpoint)


[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:24:41 (running for 00:00:00.14)
[2m[36m(run pid=None)[0m Memory usage on this node: 4.7/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 0/16 CPUs, 0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Result logdir: /home/ray/ray_results/CUJ-RL
[2m[36m(run pid=None)[0m Number of trials: 1/1 (1 PENDING)
[2m[36m(run pid=None)[0m +---------------------------------+----------+-------+
[2m[36m(run pid=None)[0m | Trial name                      | status   | loc   |
[2m[36m(run pid=None)[0m |---------------------------------+----------+-------|
[2m[36m(run pid=None)[0m | PPO_MultiAgentArena_9155f_00000 | PENDING  |       |
[2m[36m(run pid=None)[0m +---------------------------------+----------+-------+
[2m[36m(run pid=None)[0m 
[2m[36m(run pid=No

[2m[36m(pid=None)[0m 2021-12-01 09:24:47,812	INFO trainer.py:753 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution
[2m[36m(pid=None)[0m 2021-12-01 09:24:47,813	INFO ppo.py:166 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(pid=None)[0m 2021-12-01 09:24:47,813	INFO trainer.py:770 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:25:09 (running for 00:00:28.45)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.2/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Result logdir: /home/ray/ray_results/CUJ-RL
[2m[36m(run pid=None)[0m Number of trials: 1/1 (1 RUNNING)
[2m[36m(run pid=None)[0m +---------------------------------+----------+---------------------+
[2m[36m(run pid=None)[0m | Trial name                      | status   | loc                 |
[2m[36m(run pid=None)[0m |---------------------------------+----------+---------------------|
[2m[36m(run pid=None)[0m | PPO_MultiAgentArena_9155f_00000 | RUNNING  | 172.31.43.185:58807 |
[2m[36m(run pid=None)[0m +---------------------------------+-------

[2m[36m(pid=None)[0m 2021-12-01 09:25:09,716	INFO trainable.py:110 -- Trainable.setup took 21.905 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:25:10 (running for 00:00:29.47)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.2/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Result logdir: /home/ray/ray_results/CUJ-RL
[2m[36m(run pid=None)[0m Number of trials: 1/1 (1 RUNNING)
[2m[36m(run pid=None)[0m +---------------------------------+----------+---------------------+
[2m[36m(run pid=None)[0m | Trial name                      | status   | loc                 |
[2m[36m(run pid=None)[0m |---------------------------------+----------+---------------------|
[2m[36m(run pid=None)[0m | PPO_MultiAgentArena_9155f_00000 | RUNNING  | 172.31.43.185:58807 |
[2m[36m(run pid=None)[0m +---------------------------------+-------

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:25:31 (running for 00:00:49.91)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.2/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Result logdir: /home/ray/ray_results/CUJ-RL
[2m[36m(run pid=None)[0m Number of trials: 1/1 (1 RUNNING)
[2m[36m(run pid=None)[0m +---------------------------------+----------+---------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
[2m[36m(run pid=None)[0m | Trial name                      | status   | loc                 |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
[2m[36m(run pid=None)[0m |---------

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:25:51 (running for 00:01:10.55)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.2/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=-5.1979999999999915 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:26:01 (running for 00:01:20.57)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.2/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=-5.1979999999999915 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no

[2m[36m(run pid=None)[0m Result for PPO_MultiAgentArena_9155f_00000:
[2m[36m(run pid=None)[0m   agent_timesteps_total: 46800
[2m[36m(run pid=None)[0m   custom_metrics: {}
[2m[36m(run pid=None)[0m   date: 2021-12-01_09-26-08
[2m[36m(run pid=None)[0m   done: false
[2m[36m(run pid=None)[0m   episode_len_mean: 100.0
[2m[36m(run pid=None)[0m   episode_media: {}
[2m[36m(run pid=None)[0m   episode_reward_max: 11.700000000000017
[2m[36m(run pid=None)[0m   episode_reward_mean: -5.039999999999991
[2m[36m(run pid=None)[0m   episode_reward_min: -25.500000000000032
[2m[36m(run pid=None)[0m   episodes_this_iter: 0
[2m[36m(run pid=None)[0m   episodes_total: 150
[2m[36m(run pid=None)[0m   experiment_id: 9b88d1fc0f0740009943392898e1ec20
[2m[36m(run pid=None)[0m   hostname: ip-172-31-43-185
[2m[36m(run pid=None)[0m   info:
[2m[36m(run pid=None)[0m     learner:
[2m[36m(run pid=None)[0m       policy1:
[2m[36m(run pid=None)[0m         learner_stats:
[2

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:26:17 (running for 00:01:36.00)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.2/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=-5.039999999999991 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:26:27 (running for 00:01:46.02)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.2/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=-5.039999999999991 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:26:32 (running for 00:01:51.24)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.2/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=-2.183999999999988 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:26:42 (running for 00:02:01.26)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.2/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=-2.183999999999988 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:26:47 (running for 00:02:06.42)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.2/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=-2.0009999999999866 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:26:57 (running for 00:02:16.44)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.3/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=-2.0009999999999866 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no

[2m[36m(run pid=None)[0m Result for PPO_MultiAgentArena_9155f_00000:
[2m[36m(run pid=None)[0m   agent_timesteps_total: 93600
[2m[36m(run pid=None)[0m   custom_metrics: {}
[2m[36m(run pid=None)[0m   date: 2021-12-01_09-27-06
[2m[36m(run pid=None)[0m   done: false
[2m[36m(run pid=None)[0m   episode_len_mean: 100.0
[2m[36m(run pid=None)[0m   episode_media: {}
[2m[36m(run pid=None)[0m   episode_reward_max: 23.099999999999962
[2m[36m(run pid=None)[0m   episode_reward_mean: 1.5220000000000091
[2m[36m(run pid=None)[0m   episode_reward_min: -16.79999999999999
[2m[36m(run pid=None)[0m   episodes_this_iter: 150
[2m[36m(run pid=None)[0m   episodes_total: 450
[2m[36m(run pid=None)[0m   experiment_id: 9b88d1fc0f0740009943392898e1ec20
[2m[36m(run pid=None)[0m   hostname: ip-172-31-43-185
[2m[36m(run pid=None)[0m   info:
[2m[36m(run pid=None)[0m     learner:
[2m[36m(run pid=None)[0m       policy1:
[2m[36m(run pid=None)[0m         learner_stats:
[

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:27:13 (running for 00:02:32.34)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.2/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=1.5220000000000091 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:27:23 (running for 00:02:42.36)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.2/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=1.5220000000000091 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:27:29 (running for 00:02:48.27)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.2/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=1.8660000000000079 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:27:39 (running for 00:02:58.28)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.3/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=1.8660000000000079 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_

[2m[36m(run pid=None)[0m Result for PPO_MultiAgentArena_9155f_00000:
[2m[36m(run pid=None)[0m   agent_timesteps_total: 124800
[2m[36m(run pid=None)[0m   custom_metrics: {}
[2m[36m(run pid=None)[0m   date: 2021-12-01_09-27-45
[2m[36m(run pid=None)[0m   done: false
[2m[36m(run pid=None)[0m   episode_len_mean: 100.0
[2m[36m(run pid=None)[0m   episode_media: {}
[2m[36m(run pid=None)[0m   episode_reward_max: 27.29999999999993
[2m[36m(run pid=None)[0m   episode_reward_mean: 4.014000000000002
[2m[36m(run pid=None)[0m   episode_reward_min: -15.899999999999988
[2m[36m(run pid=None)[0m   episodes_this_iter: 150
[2m[36m(run pid=None)[0m   episodes_total: 600
[2m[36m(run pid=None)[0m   experiment_id: 9b88d1fc0f0740009943392898e1ec20
[2m[36m(run pid=None)[0m   hostname: ip-172-31-43-185
[2m[36m(run pid=None)[0m   info:
[2m[36m(run pid=None)[0m     learner:
[2m[36m(run pid=None)[0m       policy1:
[2m[36m(run pid=None)[0m         learner_stats:
[

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:27:55 (running for 00:03:13.86)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.2/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=4.014000000000002 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_f

[2m[36m(run pid=None)[0m Result for PPO_MultiAgentArena_9155f_00000:
[2m[36m(run pid=None)[0m   agent_timesteps_total: 140400
[2m[36m(run pid=None)[0m   custom_metrics: {}
[2m[36m(run pid=None)[0m   date: 2021-12-01_09-28-03
[2m[36m(run pid=None)[0m   done: false
[2m[36m(run pid=None)[0m   episode_len_mean: 100.0
[2m[36m(run pid=None)[0m   episode_media: {}
[2m[36m(run pid=None)[0m   episode_reward_max: 25.199999999999896
[2m[36m(run pid=None)[0m   episode_reward_mean: 4.2810000000000015
[2m[36m(run pid=None)[0m   episode_reward_min: -15.899999999999988
[2m[36m(run pid=None)[0m   episodes_this_iter: 0
[2m[36m(run pid=None)[0m   episodes_total: 600
[2m[36m(run pid=None)[0m   experiment_id: 9b88d1fc0f0740009943392898e1ec20
[2m[36m(run pid=None)[0m   hostname: ip-172-31-43-185
[2m[36m(run pid=None)[0m   info:
[2m[36m(run pid=None)[0m     learner:
[2m[36m(run pid=None)[0m       policy1:
[2m[36m(run pid=None)[0m         learner_stats:
[

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:28:10 (running for 00:03:29.56)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.2/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=4.2810000000000015 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:28:20 (running for 00:03:39.58)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.2/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=4.2810000000000015 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:28:26 (running for 00:03:45.15)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.2/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=7.78199999999999 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_fi

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:28:36 (running for 00:03:55.16)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.3/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=7.78199999999999 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_fi

[2m[36m(run pid=None)[0m Result for PPO_MultiAgentArena_9155f_00000:
[2m[36m(run pid=None)[0m   agent_timesteps_total: 171600
[2m[36m(run pid=None)[0m   custom_metrics: {}
[2m[36m(run pid=None)[0m   date: 2021-12-01_09-28-41
[2m[36m(run pid=None)[0m   done: false
[2m[36m(run pid=None)[0m   episode_len_mean: 100.0
[2m[36m(run pid=None)[0m   episode_media: {}
[2m[36m(run pid=None)[0m   episode_reward_max: 29.099999999999923
[2m[36m(run pid=None)[0m   episode_reward_mean: 7.202999999999994
[2m[36m(run pid=None)[0m   episode_reward_min: -13.49999999999998
[2m[36m(run pid=None)[0m   episodes_this_iter: 0
[2m[36m(run pid=None)[0m   episodes_total: 750
[2m[36m(run pid=None)[0m   experiment_id: 9b88d1fc0f0740009943392898e1ec20
[2m[36m(run pid=None)[0m   hostname: ip-172-31-43-185
[2m[36m(run pid=None)[0m   info:
[2m[36m(run pid=None)[0m     learner:
[2m[36m(run pid=None)[0m       policy1:
[2m[36m(run pid=None)[0m         learner_stats:
[2m

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:28:51 (running for 00:04:10.48)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.3/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=7.202999999999994 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_f

[2m[36m(run pid=None)[0m Result for PPO_MultiAgentArena_9155f_00000:
[2m[36m(run pid=None)[0m   agent_timesteps_total: 187200
[2m[36m(run pid=None)[0m   custom_metrics: {}
[2m[36m(run pid=None)[0m   date: 2021-12-01_09-29-00
[2m[36m(run pid=None)[0m   done: false
[2m[36m(run pid=None)[0m   episode_len_mean: 100.0
[2m[36m(run pid=None)[0m   episode_media: {}
[2m[36m(run pid=None)[0m   episode_reward_max: 37.499999999999915
[2m[36m(run pid=None)[0m   episode_reward_mean: 12.037999999999975
[2m[36m(run pid=None)[0m   episode_reward_min: -17.999999999999986
[2m[36m(run pid=None)[0m   episodes_this_iter: 150
[2m[36m(run pid=None)[0m   episodes_total: 900
[2m[36m(run pid=None)[0m   experiment_id: 9b88d1fc0f0740009943392898e1ec20
[2m[36m(run pid=None)[0m   hostname: ip-172-31-43-185
[2m[36m(run pid=None)[0m   info:
[2m[36m(run pid=None)[0m     learner:
[2m[36m(run pid=None)[0m       policy1:
[2m[36m(run pid=None)[0m         learner_stats:


[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:29:07 (running for 00:04:26.42)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.2/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=12.037999999999975 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:29:17 (running for 00:04:36.44)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.3/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=12.037999999999975 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:29:23 (running for 00:04:41.80)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.3/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=11.816999999999977 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:29:33 (running for 00:04:51.82)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.3/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=11.816999999999977 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:29:39 (running for 00:04:57.73)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.3/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=17.83799999999995 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_f

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:29:49 (running for 00:05:07.74)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.3/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=17.83799999999995 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_f

[2m[36m(run pid=None)[0m Result for PPO_MultiAgentArena_9155f_00000:
[2m[36m(run pid=None)[0m   agent_timesteps_total: 234000
[2m[36m(run pid=None)[0m   custom_metrics: {}
[2m[36m(run pid=None)[0m   date: 2021-12-01_09-29-56
[2m[36m(run pid=None)[0m   done: false
[2m[36m(run pid=None)[0m   episode_len_mean: 100.0
[2m[36m(run pid=None)[0m   episode_media: {}
[2m[36m(run pid=None)[0m   episode_reward_max: 31.499999999999915
[2m[36m(run pid=None)[0m   episode_reward_mean: 18.314999999999948
[2m[36m(run pid=None)[0m   episode_reward_min: -3.2999999999999794
[2m[36m(run pid=None)[0m   episodes_this_iter: 0
[2m[36m(run pid=None)[0m   episodes_total: 1050
[2m[36m(run pid=None)[0m   experiment_id: 9b88d1fc0f0740009943392898e1ec20
[2m[36m(run pid=None)[0m   hostname: ip-172-31-43-185
[2m[36m(run pid=None)[0m   info:
[2m[36m(run pid=None)[0m     learner:
[2m[36m(run pid=None)[0m       policy1:
[2m[36m(run pid=None)[0m         learner_stats:


[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:30:04 (running for 00:05:23.40)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.2/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=18.314999999999948 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:30:14 (running for 00:05:33.42)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.3/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=18.314999999999948 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:30:20 (running for 00:05:38.77)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.3/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=20.341999999999935 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:30:30 (running for 00:05:48.79)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.2/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=20.341999999999935 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:30:35 (running for 00:05:54.63)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.3/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=21.71999999999993 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_f

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:30:45 (running for 00:06:04.65)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.3/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=21.71999999999993 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_f

[2m[36m(run pid=None)[0m Result for PPO_MultiAgentArena_9155f_00000:
[2m[36m(run pid=None)[0m   agent_timesteps_total: 280800
[2m[36m(run pid=None)[0m   custom_metrics: {}
[2m[36m(run pid=None)[0m   date: 2021-12-01_09-30-53
[2m[36m(run pid=None)[0m   done: false
[2m[36m(run pid=None)[0m   episode_len_mean: 100.0
[2m[36m(run pid=None)[0m   episode_media: {}
[2m[36m(run pid=None)[0m   episode_reward_max: 36.2999999999999
[2m[36m(run pid=None)[0m   episode_reward_mean: 18.68999999999994
[2m[36m(run pid=None)[0m   episode_reward_min: -11.999999999999982
[2m[36m(run pid=None)[0m   episodes_this_iter: 150
[2m[36m(run pid=None)[0m   episodes_total: 1350
[2m[36m(run pid=None)[0m   experiment_id: 9b88d1fc0f0740009943392898e1ec20
[2m[36m(run pid=None)[0m   hostname: ip-172-31-43-185
[2m[36m(run pid=None)[0m   info:
[2m[36m(run pid=None)[0m     learner:
[2m[36m(run pid=None)[0m       policy1:
[2m[36m(run pid=None)[0m         learner_stats:
[

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:31:02 (running for 00:06:20.73)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.3/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=18.68999999999994 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_f

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:31:12 (running for 00:06:30.75)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.3/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=18.68999999999994 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_f

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:31:17 (running for 00:06:36.70)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.3/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=19.100999999999942 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_

[2m[36m(run pid=None)[0m == Status ==
[2m[36m(run pid=None)[0m Current time: 2021-12-01 09:31:28 (running for 00:06:46.73)
[2m[36m(run pid=None)[0m Memory usage on this node: 11.3/119.9 GiB
[2m[36m(run pid=None)[0m Using FIFO scheduling algorithm.
[2m[36m(run pid=None)[0m Resources requested: 16.0/16 CPUs, 1.0/1 GPUs, 0.0/74.36 GiB heap, 0.0/35.86 GiB objects (0.0/1.0 accelerator_type:M60)
[2m[36m(run pid=None)[0m Current best trial: 9155f_00000 with episode_reward_mean=19.100999999999942 and parameters={'num_workers': 15, 'num_envs_per_worker': 10, 'create_env_on_driver': False, 'rollout_fragment_length': 26, 'batch_mode': 'truncate_episodes', 'gamma': 0.99, 'lr': 5e-05, 'train_batch_size': 4000, 'model': {'_use_default_native_models': False, '_disable_preprocessor_api': False, 'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'post_fcnet_hiddens': [], 'post_fcnet_activation': 'relu', 'free_log_std': False, 'no_

[2m[36m(pid=None)[0m [2021-12-01 09:31:32,370 E 58909 59316] raylet_client.cc:159: IOError: Broken pipe [RayletClient] Failed to disconnect from raylet.
[2m[36m(pid=None)[0m 2021-12-01 09:31:32,371	ERROR worker.py:425 -- SystemExit was raised from the worker
[2m[36m(pid=None)[0m Traceback (most recent call last):
[2m[36m(pid=None)[0m   File "python/ray/_raylet.pyx", line 692, in ray._raylet.task_execution_handler
[2m[36m(pid=None)[0m   File "python/ray/_raylet.pyx", line 521, in ray._raylet.execute_task
[2m[36m(pid=None)[0m   File "python/ray/_raylet.pyx", line 558, in ray._raylet.execute_task
[2m[36m(pid=None)[0m   File "python/ray/_raylet.pyx", line 565, in ray._raylet.execute_task
[2m[36m(pid=None)[0m   File "python/ray/_raylet.pyx", line 569, in ray._raylet.execute_task
[2m[36m(pid=None)[0m   File "python/ray/_raylet.pyx", line 519, in ray._raylet.execute_task.function_executor
[2m[36m(pid=None)[0m   File "/tmp/ray/session_2021-12-01_04-27-46_331452_16

Best checkpoint:  /home/ray/ray_results/CUJ-RL/PPO_MultiAgentArena_9155f_00000_0_2021-12-01_09-24-41/checkpoint_000020/checkpoint-20


### Restoring from a checkpoint

In [6]:
# Path of the checkpoint file on this machine (re-used below when restoring the trainer).
local_checkpoint = "/Users/sven/Downloads/checkpoint-20-2"

# Sanity check: report when the checkpoint file is actually present at that path.
checkpoint_is_local = os.path.isfile(local_checkpoint)
if checkpoint_is_local:
    print("yes, checkpoint files are on local machine ('Downloads' folder)")

yes, checkpoint files are on local machine ('Downloads' folder)


In [7]:
# Re-create the trained PPOTrainer locally (on this laptop) so that it can be run
# through a new environment and demonstrate that the agents' policies are useful.

# Shallow copy of the training config with `num_gpus` and `num_workers` forced to 0.
cpu_config = dict(TRAINER_CFG, num_gpus=0, num_workers=0)

new_trainer = PPOTrainer(config=cpu_config)
# `restore()` loads the weights of the learnt policies from the checkpoint file.
new_trainer.restore(local_checkpoint)

Install gputil for GPU system monitoring.


### Running inference locally

In [10]:
# Roll out a single episode in a rendering-enabled arena, with both agents' actions
# computed by the restored trainer.
env = MultiAgentArena(config={"render": True})

# NOTE(review): `env.out` is presumably the ipywidgets `Output` the env renders into;
# confirm against the `MultiAgentArena` definition.
with env.out:

    obs = env.reset()
    env.render()

    while True:
        # NOTE(review): agent1 acts with `explore=True`, agent2 with `explore=False`;
        # confirm that this asymmetry is intended for the demo.
        a1 = new_trainer.compute_single_action(obs["agent1"], policy_id="policy1", explore=True)
        a2 = new_trainer.compute_single_action(obs["agent2"], policy_id="policy2", explore=False)

        obs, rewards, dones, _ = env.step({"agent1": a1, "agent2": a2})

        env.render()

        # Truthiness check instead of `is True`: an identity comparison is never
        # satisfied by a truthy non-`bool` flag (e.g. `numpy.bool_`), which would
        # turn this into an endless loop.
        if dones["agent1"]:
            break


Output()

### Inference using Ray Serve

In [None]:
@serve.deployment(route_prefix="/multi-agent-arena")
class ServeRLlibTrainer:
    """Ray Serve deployment that serves actions computed by a restored PPOTrainer.

    Each request carries one observation per agent; the response carries one
    action per agent, computed by "policy1" (agent1) and "policy2" (agent2).
    """

    def __init__(self, config, checkpoint_path):
        """Build a PPOTrainer and load its state from a checkpoint.

        Args:
            config: Trainer config used to construct the PPOTrainer.
            checkpoint_path: Path of the checkpoint to restore the trainer from.
        """
        # Use the config passed to this deployment rather than a notebook-global
        # one, so the deployment does not depend on state outside its arguments.
        self.trainer = PPOTrainer(config=config)
        self.trainer.restore(checkpoint_path)

    async def __call__(self, request: Request):
        """Compute one action per agent for the observations in the request.

        Args:
            request: Request whose JSON body contains the keys
                "observation_agent1" and "observation_agent2".

        Returns:
            A dict of the form `{"action": {"agent1": int, "agent2": int}}`.
        """
        json_input = await request.json()

        # Compute and return the action for the given observation.
        obs1 = json_input["observation_agent1"]
        obs2 = json_input["observation_agent2"]
        a1 = self.trainer.compute_single_action(obs1, policy_id="policy1")
        a2 = self.trainer.compute_single_action(obs2, policy_id="policy2")

        # Cast to plain `int` before returning the actions in the response dict.
        return {"action": {"agent1": int(a1), "agent2": int(a2)}}


In [None]:
# Start Ray Serve and keep the object it returns in `client`.
client = serve.start()
# Deploy `ServeRLlibTrainer`; the two arguments are forwarded to its constructor as
# `config` and `checkpoint_path`.
# NOTE(review): `results.best_checkpoint` is presumably the path printed as
# "Best checkpoint" after training -- confirm it is reachable from the Serve replica.
ServeRLlibTrainer.deploy(cpu_config, results.best_checkpoint)