# Install RLlib

In [None]:
# Install RLlib and PettingZoo with the %pip magic.
# May need to restart the kernel if anything new is installed
# NOTE(review): versions are not pinned; the saved training output further down
# is dated 2020-11-18 - consider pinning to the releases the notebook was
# written against.
%pip install ray[rllib]
%pip install pettingzoo[all]

In [1]:
# If this succeeds, your rllib / pettingzoo install should be all set
# Make sure you also have pytorch or tensorflow working
from ray import tune
from ray.rllib.examples.env.rock_paper_scissors import RockPaperScissors
from ray.rllib.examples.policy.rock_paper_scissors_dummies import AlwaysSameHeuristic

# Minimal MARL Example
## Trivial agent

In [2]:
# Single-policy setup: rock-paper-scissors with the template agent
config = dict(
    env=RockPaperScissors,
    framework="torch",
    num_envs_per_worker=4,
)

# Stopping criteria: training ends as soon as any one of them is reached
stop = dict(
    training_iteration=150,
    timesteps_total=100000,
    episode_reward_mean=1000.0,
)

In [3]:
# Launch the "PG" run through Tune using the config and stopping criteria
# defined in the previous cell
results = tune.run(
    "PG",
    config=config,
    stop=stop,
    verbose=1,
)

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PG_RockPaperScissors_adba3_00000,TERMINATED,,125,25.808,100000,0,0,0,10


2020-11-18 17:52:35,824	INFO tune.py:439 -- Total run time: 40.91 seconds (30.63 seconds for the tuning loop).


## Multiple policies (does not work yet)

In [None]:
# Workaround (this cell should be hidden): local copy of AlwaysSameHeuristic with the exploration setup removed, because it was breaking without TensorFlow
import gym
import numpy as np
import random

from ray.rllib.examples.env.rock_paper_scissors import RockPaperScissors
from ray.rllib.policy.policy import Policy
from ray.rllib.policy.view_requirement import ViewRequirement

class AlwaysSameHeuristic(Policy):
    """Pick a random move and stick with it for the entire episode.

    The chosen move is carried in the policy's single state slot:
    ``get_initial_state`` draws it and ``compute_actions`` returns it both as
    the action batch and, unchanged, as the next state.

    This is a fixed, non-learning policy. The training-related hooks
    (``learn_on_batch``, ``get_weights``, ``set_weights``) are overridden as
    no-ops because the base ``Policy`` does not implement them, and RLlib may
    call them on every policy when training or syncing weights.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # [Force-removed exploration - was breaking without tensorflow]
        # Feed the previous step's "state_out_0" back in as "state_in_0"
        # (shift=-1), so the move chosen at episode start keeps being passed
        # to compute_actions.
        self.view_requirements.update({
            "state_in_0": ViewRequirement(
                "state_out_0",
                shift=-1,
                space=gym.spaces.Box(0, 100, shape=(), dtype=np.int32))
        })

    def get_initial_state(self):
        """Return a one-element state list holding a randomly chosen move."""
        return [
            random.choice([
                RockPaperScissors.ROCK, RockPaperScissors.PAPER,
                RockPaperScissors.SCISSORS
            ])
        ]

    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        info_batch=None,
                        episodes=None,
                        **kwargs):
        """Replay the stored move.

        Returns a tuple ``(actions, state_outs, extra_info)`` where the
        actions are ``state_batches[0]``, the state is passed through
        unchanged, and the extra-info dict is empty. Observations and all
        other arguments are ignored.
        """
        return state_batches[0], state_batches, {}

    def learn_on_batch(self, samples):
        """No-op: this heuristic policy has nothing to learn."""
        pass

    def get_weights(self):
        """No-op: this heuristic policy has no weights (returns None)."""
        pass

    def set_weights(self, weights):
        """No-op: this heuristic policy has no weights to update."""
        pass

In [None]:
# Rock-paper-scissors: a learned policy against a deterministic agent.
# The observation/action spaces are built via `gym.spaces.Discrete`, because
# `gym` is imported in the policy cell above while a bare `Discrete` is never
# imported anywhere in this notebook.
config = {
    "env": RockPaperScissors,
    "env_config": {
        # NOTE(review): presumably toggles an extended variant of the game -
        # confirm against the RockPaperScissors env.
        "sheldon_cooper": False,
    },
    # NOTE(review): requests one GPU; the single-policy example above does not
    # set this - confirm a GPU is available or set to 0.
    "num_gpus": 1,
    "num_envs_per_worker": 4,
    "rollout_fragment_length": 10,
    "multiagent": {
        # Only the "learned" policy is trained; "always_same" stays fixed.
        "policies_to_train": ["learned"],
        # Each entry: (policy class, observation space, action space, config)
        "policies": {
            "always_same": (AlwaysSameHeuristic, gym.spaces.Discrete(3),
                            gym.spaces.Discrete(3), {}),
            "learned": (None, gym.spaces.Discrete(3), gym.spaces.Discrete(3),
                        {"framework": "torch"}),
        },
        # "player1" uses the learned policy; any other agent id gets the
        # heuristic.
        "policy_mapping_fn": lambda p: "learned" if p == "player1" else "always_same"
    },
    "framework": "torch",
}

In [None]:
# Iterate until one of the stopping criteria is met
def train(trainer, env, stop_iters=150, stop_timesteps=100000,
          stop_reward=1000.0):
    """Run training iterations until the reward target or a budget is hit.

    Args:
        trainer: Object whose ``train()`` returns a result dict containing
            ``"episode_reward_mean"`` and ``"timesteps_total"``.
        env: Object exposing ``player1_score`` and ``player2_score``.
        stop_iters: Maximum number of ``trainer.train()`` calls.
        stop_timesteps: Timestep budget; exceeding it ends the loop.
        stop_reward: Required ``player1_score - player2_score`` margin.

    Returns:
        None, as soon as the score difference exceeds ``stop_reward``.

    Raises:
        ValueError: If the iteration or timestep budget runs out before the
            score difference exceeds ``stop_reward``.
    """
    for _ in range(stop_iters):
        results = trainer.train()
        print(results["episode_reward_mean"])
        # Timestep budget exhausted: stop iterating and report failure below.
        if results["timesteps_total"] > stop_timesteps:
            break
        # Reward difference reached: all good.
        if env.player1_score - env.player2_score > stop_reward:
            return
    raise ValueError(
        "Desired reward difference ({}) not reached! Only got to {}.".format(
            stop_reward, env.player1_score - env.player2_score))

# Train the policy
# `get_agent_class` is not imported anywhere else in the notebook, so it is
# imported here to keep this cell runnable on a fresh kernel.
from ray.rllib.agents.registry import get_agent_class

cls = get_agent_class("PG")
trainer = cls(config=config)
# The score counters checked by train() are read from the local worker's env.
env = trainer.workers.local_worker().env
train(trainer, env)