# Install RLlib and PettingZoo

In [None]:
# May need to restart the kernel if anything new is installed
%pip install ray[rllib]
%pip install pettingzoo[all]

In [2]:
# If this succeeds, your rllib / pettingzoo install should be all set
# Make sure you also have pytorch or tensorflow working
from ray import tune
from ray.rllib.examples.env.rock_paper_scissors import RockPaperScissors
from ray.rllib.examples.policy.rock_paper_scissors_dummies import AlwaysSameHeuristic
from ray.rllib.agents.registry import get_agent_class

from gym.spaces import Discrete

# Minimal MARL Example
## Trivial agent

In [3]:
# Rock-paper-scissors with the template (single-policy) agent
config = dict(
    env=RockPaperScissors,
    framework="torch",
    num_envs_per_worker=4,
)

# Stopping criteria handed to Tune: iterate until either is achieved
stop = dict(
    training_iteration=150,
    timesteps_total=100000,
    episode_reward_mean=1000.0,
)

In [4]:
# Train the policy with the "PG" trainer using the config and stop
# criteria defined in the previous cell
results = tune.run(
    "PG",
    config=config,
    stop=stop,
    verbose=1,
)

2021-01-06 01:46:12,862	INFO tune.py:448 -- Total run time: 29.99 seconds (20.48 seconds for the tuning loop).


## Multiple policies

In [7]:
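# Patched local copy of a faulty library policy (not part of the agent implementation).
# NOTE: this redefinition shadows the AlwaysSameHeuristic imported at the top of the notebook.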
import gym
import numpy as np
import random

from ray.rllib.policy.policy import Policy
from ray.rllib.policy.view_requirement import ViewRequirement

class AlwaysSameHeuristic(Policy):
    """Heuristic policy: choose one random move and repeat it all episode.

    The move is drawn in ``get_initial_state`` and stored as the policy's
    state; ``compute_actions`` returns that state as the action and passes
    the state through unchanged.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # NOTE: exploration was force-removed here because it was breaking
        # without tensorflow.
        # Declare "state_in_0" as a view on "state_out_0" with shift=-1
        # (presumably the previous step's state output - confirm against the
        # ViewRequirement documentation).
        state_view = ViewRequirement(
            "state_out_0",
            shift=-1,
            space=gym.spaces.Box(0, 100, shape=(), dtype=np.int32),
        )
        self.view_requirements.update({"state_in_0": state_view})

    def get_initial_state(self):
        """Return a one-element state list holding a randomly chosen move."""
        moves = [
            RockPaperScissors.ROCK,
            RockPaperScissors.PAPER,
            RockPaperScissors.SCISSORS,
        ]
        return [random.choice(moves)]

    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        info_batch=None,
                        episodes=None,
                        **kwargs):
        """Return the stored move as the action batch.

        Only ``state_batches`` is used; its first element doubles as the
        actions. Returns a tuple of (actions, unchanged state batches,
        empty extra-info dict).
        """
        actions = state_batches[0]
        extra_info = {}
        return actions, state_batches, extra_info

In [8]:
# Rock-paper-scissors: a learned policy against a deterministic agent
config = dict(
    env=RockPaperScissors,
    env_config=dict(sheldon_cooper=False),
    num_gpus=1,
    num_envs_per_worker=4,
    rollout_fragment_length=10,
    multiagent=dict(
        # Only the "learned" policy is listed for training.
        policies_to_train=["learned"],
        policies=dict(
            always_same=(AlwaysSameHeuristic, Discrete(3), Discrete(3), {}),
            learned=(None, Discrete(3), Discrete(3), dict(framework="torch")),
        ),
        # "player1" maps to the learned policy; every other agent id maps
        # to the fixed-move heuristic.
        policy_mapping_fn=(
            lambda agent_id:
            "always_same" if agent_id != "player1" else "learned"
        ),
    ),
    framework="torch",
)

In [12]:
# Iterate until either of the stopping criteria is met
def train(trainer, env, stop_iters=150, stop_timesteps=100000,
          stop_reward=1000.0):
    """Run training iterations until the reward margin is reached.

    Each iteration calls ``trainer.train()`` and prints the per-policy mean
    rewards from the result dict.

    Args:
        trainer: Object exposing ``train()`` that returns a result dict with
            the keys ``"policy_reward_mean"`` and ``"timesteps_total"``.
        env: Object exposing ``player1_score`` and ``player2_score``; their
            difference is the reward margin that is checked.
        stop_iters: Maximum number of training iterations.
        stop_timesteps: Stop iterating once ``timesteps_total`` exceeds this.
        stop_reward: Required ``player1_score - player2_score`` margin.

    Returns:
        None, as soon as the score margin exceeds ``stop_reward``.

    Raises:
        ValueError: If the iteration or timestep budget is exhausted before
            the score margin exceeds ``stop_reward``.
    """
    for _ in range(stop_iters):
        results = trainer.train()
        print(results["policy_reward_mean"])
        # Timestep budget exhausted: leave the loop and report failure below.
        if results["timesteps_total"] > stop_timesteps:
            break
        # Reward margin reached: success.
        elif env.player1_score - env.player2_score > stop_reward:
            return
    # The original message referenced `args.stop_reward`, but `args` is not
    # defined anywhere in this notebook, which turned the intended ValueError
    # into a NameError. Use the explicit parameter instead.
    raise ValueError(
        "Desired reward difference ({}) not reached! Only got to {}.".format(
            stop_reward, env.player1_score - env.player2_score))

# Train the policy
# Look up the trainer class registered under the name "PG" and build it from
# the multi-agent config defined above.
cls = get_agent_class("PG")
trainer = cls(config=config)
# Grab the environment held by the trainer's local worker; train() reads its
# player1_score / player2_score to decide whether the reward margin is met.
env = trainer.workers.local_worker().env
# Raises ValueError if the reward margin is not reached within the budget.
train(trainer, env)

{'learned': -0.4, 'always_same': 0.4}
{'learned': -0.2, 'always_same': 0.2}
{'learned': 0.11666666666666667, 'always_same': -0.11666666666666667}
{'learned': 0.275, 'always_same': -0.275}
{'learned': 0.43, 'always_same': -0.43}
{'learned': 0.85, 'always_same': -0.85}
{'learned': 1.22, 'always_same': -1.22}
{'learned': 1.55, 'always_same': -1.55}
{'learned': 1.98, 'always_same': -1.98}
{'learned': 2.22, 'always_same': -2.22}
{'learned': 2.55, 'always_same': -2.55}
{'learned': 2.98, 'always_same': -2.98}
{'learned': 3.35, 'always_same': -3.35}
{'learned': 3.94, 'always_same': -3.94}
{'learned': 4.69, 'always_same': -4.69}
{'learned': 5.43, 'always_same': -5.43}
{'learned': 5.99, 'always_same': -5.99}
{'learned': 6.53, 'always_same': -6.53}
{'learned': 6.96, 'always_same': -6.96}
{'learned': 7.45, 'always_same': -7.45}
{'learned': 7.8, 'always_same': -7.8}
{'learned': 8.15, 'always_same': -8.15}
{'learned': 8.58, 'always_same': -8.58}
{'learned': 8.78, 'always_same': -8.78}
{'learned': 8.