# Notebook 06. End-to-end demo: Learning a multiplayer game and an in-game item price recommendation system with RLlib, Ray Tune, and Ray Serve

© 2019-2022, Anyscale. All Rights Reserved <br>
📖 [Back to Table of Contents](./ex_00_rllib_notebooks_table_of_contents.ipynb)<br>
⬅️ [Previous notebook](./ex_05_rllib_and_ray_serve.ipynb) <br>


In this notebook, you will learn how to:
* Recycle our multi-player game from a previous notebook in this tutorial
* The game will be interrupted in the middle of an episode by an in-game item sale (a power-up is offered to both players at a price determined by a trained RecSys model served via Ray Serve)
* A user model decides whether to buy the item or not
* The game continues with or without the bought item

In [None]:
# Import required packages.

import gym
import numpy as np
import os
import pandas
import requests
import time

import ray
from ray import serve
from ray import tune
from ray.rllib.algorithms.crr import CRRConfig
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.examples.env.random_env import RandomEnv

from multi_agent_arena.multi_agent_arena import MultiAgentArena

# If Ray is already initialized in this process, shut it down first so the
# cells below start from a fresh Ray instance.
if ray.is_initialized():
    ray.shutdown()

# Show the installed library versions (handy when reproducing or debugging).
print(f"gym: {gym.__version__}")
print(f"ray: {ray.__version__}")

# !ale-import-roms --import-from-pkg atari_py.atari_roms

## Modifying our Game

So far, we have been using our own custom `MultiAgentEnv` sub-class to define our game and asked RLlib to train two policies (one for each agent/player in the game) on how to play the game close to optimal.

In this end-to-end example, we would like to extend this idea and include an in-game power-up (item) sale in the middle of the episode.
The type of the offered item is fixed and always the same for both players. Buying it will allow the respective agent to move twice as fast as before.
Remember that each episode had a fixed number of timesteps (configurable via the `timestep_limit` constructor argument). We will now add some logic such that the game will pause after half of this number of timesteps and ask the in-game price recommendation (RecSys) service for the price at which to offer the item to each player.

<img src="images/multi_agent_arena_3.png" width=800 />


In [None]:
# Configuration of a throw-away PPO run on a random environment. Its only
# purpose is to write RecSys (price recommender) offline data to disk.

# Settings handed to `RandomEnv`.
recsys_env_config = {
    # Observation space: agent1 total reward, agent2 total reward
    "observation_space": gym.spaces.Box(-100, 100.0, (2, ), np.float32),
    # Price for the offered item (between $0 and $100).
    "action_space": gym.spaces.Box(0.0, 100.0, (1,), np.float32),
    "reward_space": gym.spaces.Box(0.0, 1.0, (), np.float32),
    "p_done": 0.0,
    # One-step episode len:
    # reset() -> obs=game state
    # step(action=recommended price) -> reward=bought or not + done?
    "max_episode_len": 1,
}

dummy_config = (
    PPOConfig()
    .environment(env=RandomEnv, env_config=recsys_env_config)
    # Sampled experiences are written into this output directory.
    .offline_data(output="offline_rl_data")
)

# Uncomment to train and generate the json output.
"""
algo = dummy_config.build()

for _ in range(4):
    algo.train()
"""

# Last expression of the cell -> the notebook displays the config object.
dummy_config

In [None]:
# Let's first take a look at some of this (JSON) data using pandas:
json_file = "offline_rl_data/in_game_item_price_recsys.json"
# Don't forget `lines=True`: the file holds one JSON record per line.
# (Per the original note, each line is one "rollout" of 4 timesteps --
# TODO confirm against the generated file.)
dataframe = pandas.read_json(json_file, lines=True)
# Display the first few records.
dataframe.head()

In [None]:
# Configure the offline RL algorithm (CRR) that learns the price recommender
# from the previously recorded JSON data.
crr_config = CRRConfig()

# No environment is attached (`env=None`), so the observation and action
# spaces are passed explicitly. They are taken from the dummy config that
# describes the recorded data.
crr_config.environment(
    env=None,
    observation_space=dummy_config.env_config["observation_space"],
    action_space=dummy_config.env_config["action_space"],
)

crr_config.offline_data(
    input_="dataset",
    input_config={
        # Path of the recorded RecSys data, resolved relative to the current
        # working directory (same file as previewed with pandas above).
        "paths": os.path.join(os.getcwd(), "offline_rl_data/in_game_item_price_recsys.json"),
        "format": "json",
    },
    # NOTE(review): declares the recorded actions as already normalized --
    # confirm that this matches how the offline data was generated.
    actions_in_input_normalized=True,
)

# Train with the PyTorch implementation.
crr_config.framework("torch")

In [None]:
# Train CRR on the offline data with Ray Tune.
results = tune.run(
    # Registered name for the CRR Algorithm.
    "CRR",
    # Use our config -> converted to python dict.
    config=crr_config.to_dict(),
    # Stopping criteria -> As we are learning from dummy data, just train for a few iterations.
    stop={
        "training_iteration": 3,
    },
    # Create a checkpoint every 3 iterations. With the stopping criterion
    # above, that is a single checkpoint, at the final iteration.
    checkpoint_freq=3,
    # Directory (relative to the working directory) for the results.
    local_dir="results",
    verbose=1,
)


In [None]:
# Get the best trial (there is only one) and last checkpoint.
# No metric/mode is passed to `get_best_trial()`; this relies on the run
# consisting of exactly one trial.
best_trial = results.get_best_trial()
# The checkpoint is handed to the Serve deployment further below.
last_checkpoint = results.get_last_checkpoint(trial=best_trial)
print(f"Last checkpoint from training: {last_checkpoint}")

In [None]:
@serve.deployment(route_prefix="/in-game-recommendations")
class ServeModel:
    """Ray Serve deployment that answers price requests with a trained algorithm.

    Requests must carry a JSON body of the form `{"observation": [...]}`; the
    reply is `{"action": <action computed by the algorithm>}`.
    """

    def __init__(self, config, checkpoint) -> None:
        # Build a fresh (untrained) algorithm from the config, then load the
        # trained state from the checkpoint before exposing it.
        algo = config.build()
        algo.restore(checkpoint)
        self.algo = algo

    async def __call__(self, request):
        payload = await request.json()
        # The observation arrives as a plain list -> back to a numpy array.
        observation = np.array(payload["observation"])
        # `explore=False`: ask for the action without exploration behavior.
        return {
            "action": self.algo.compute_single_action(observation, explore=False)
        }

# Bind the constructor arguments (config + trained checkpoint) to the
# deployment and start serving it.
serve_model = ServeModel.bind(crr_config, last_checkpoint)
serve.run(serve_model)
    
# That's it: Deployment created!

In [None]:

# Convenience function to send action requests to the service.
def get_price(
    rewards1,
    rewards2,
    *,
    url="http://localhost:8000/in-game-recommendations",
    timeout=30.0,
):
    """Ask the price recommendation service for an item price.

    Args:
        rewards1: First entry of the observation sent to the service.
        rewards2: Second entry of the observation sent to the service.
        url: Endpoint of the price service. Defaults to the local route
            used in this notebook.
        timeout: Seconds to wait for the service before giving up.

    Returns:
        The first element of the "action" field in the service's JSON reply
        (the recommended price).

    Raises:
        requests.exceptions.Timeout: If the service does not answer in time.
        requests.exceptions.HTTPError: If the service replies with an HTTP
            error status.
    """
    # Convert via numpy to a plain list of Python numbers (needed for the
    # JSON/http transfer, also when numpy scalars are passed in).
    obs = np.array([rewards1, rewards2]).tolist()
    # Without a timeout, a hanging service would block the caller forever.
    resp = requests.get(url, json={"observation": obs}, timeout=timeout)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError below.
    resp.raise_for_status()
    return resp.json()["action"][0]

# Test our deployment: request a price for the observation [0.0, -10.0].
# As the last expression of the cell, the returned price is displayed.
get_price(0.0, -10.0)


In [None]:
class MultiAgentArenaWithItemSale(MultiAgentArena):
    """`MultiAgentArena` with an in-game item sale halfway through each episode.

    At timestep `timestep_limit // 2`, a price is requested via `get_price`
    for each agent, and a simple hard-coded user model decides whether that
    agent's user buys the power-up:

    * agent1's user buys if the price is below `AGENT1_MAX_PRICE`; agent1
      then moves before agent2 until the next `reset()`.
    * agent2's user buys if the price is below `AGENT2_MAX_PRICE`; agent2
      then moves twice per timestep until the next `reset()`.

    Only when an item is actually bought is its price (divided by 10)
    subtracted from the buying agent's reward, in the timestep of the sale.
    """

    # Highest prices (exclusive) at which the simulated users still buy.
    AGENT1_MAX_PRICE = 50.0
    AGENT2_MAX_PRICE = 45.0

    def __init__(self, config=None):
        super().__init__(config=config)

        # The sale interrupts the episode after half of its timesteps.
        self.sell_item_at_ts = self.timestep_limit // 2
        # Nobody owns a power-up yet. Set here as well as in `reset()` so the
        # flags exist even if `step()` is called before `reset()`.
        self.agent1_moves_first = False
        self.agent2_double_speed = False

    def reset(self):
        """Reset the arena and revoke power-ups bought in the last episode."""
        obs = super().reset()
        self.agent1_moves_first = False
        self.agent2_double_speed = False
        return obs

    def _sell_items(self):
        """Offer the power-up to both users; return the prices actually paid."""
        # Send price requests to our price service. Each request lists the
        # respective agent's own total reward first.
        price_agent1_item = get_price(self.agent1_R, self.agent2_R)
        price_agent2_item = get_price(self.agent2_R, self.agent1_R)

        paid_agent1 = paid_agent2 = 0.0
        # User model agent1: buys if the item price is low enough.
        if price_agent1_item < self.AGENT1_MAX_PRICE:
            print("User1 bought power-up!")
            # Short pause -- presumably so the message is noticed in the demo.
            time.sleep(1.0)
            self.agent1_moves_first = True
            paid_agent1 = price_agent1_item
        # User model agent2: buys if the item price is low enough.
        if price_agent2_item < self.AGENT2_MAX_PRICE:
            print("User2 bought power-up!")
            time.sleep(1.0)
            self.agent2_double_speed = True
            paid_agent2 = price_agent2_item
        return paid_agent1, paid_agent2

    def step(self, action: dict):
        """Advance the game by one timestep, running the item sale when due.

        Args:
            action: Dict with the keys "agent1" and "agent2", holding each
                agent's move for this timestep.

        Returns:
            Tuple of (result of `self._get_obs()`, rewards dict, dones dict,
            empty info dict).
        """
        # Increase our time steps counter by 1.
        self.timesteps += 1
        # An episode is "done" when we reach the time step limit.
        is_done = self.timesteps >= self.timestep_limit

        ######################
        # NEW BEHAVIOR
        ######################
        # Amounts paid in this timestep; stays 0.0 unless an item is bought.
        paid_agent1 = paid_agent2 = 0.0
        # It's time to do the item sale.
        if self.timesteps == self.sell_item_at_ts:
            paid_agent1, paid_agent2 = self._sell_items()

        # Who moves first?
        # events = [collision|agent1_new_field]
        if self.agent1_moves_first:
            events = self._move(self.agent1_pos, action["agent1"], is_agent1=True)
            events |= self._move(self.agent2_pos, action["agent2"], is_agent1=False)
            # Agent2 is allowed to move twice (double the speed).
            if self.agent2_double_speed:
                events |= self._move(self.agent2_pos, action["agent2"], is_agent1=False)
        else:
            events = self._move(self.agent2_pos, action["agent2"], is_agent1=False)
            # Agent2 is allowed to move twice (double the speed).
            if self.agent2_double_speed:
                events |= self._move(self.agent2_pos, action["agent2"], is_agent1=False)
            events |= self._move(self.agent1_pos, action["agent1"], is_agent1=True)

        # Determine rewards based on the collected events AND on the prices
        # paid. A declined offer costs nothing.
        r1 = -1.0 if "collision" in events else 1.0 if "agent1_new_field" in events else -0.5
        r2 = 1.0 if "collision" in events else -0.1
        r1 -= paid_agent1 / 10.0
        r2 -= paid_agent2 / 10.0
        self.agent1_R += r1
        self.agent2_R += r2
        ######################
        # END: NEW BEHAVIOR
        ######################

        rewards = {
            "agent1": r1,
            "agent2": r2,
        }

        # Generate a `done` dict (per-agent and total).
        dones = {
            "agent1": is_done,
            "agent2": is_done,
            # special `__all__` key indicates that the episode is done for all agents.
            "__all__": is_done,
        }

        # Useful for rendering.
        self.collision = "collision" in events
        if self.collision:
            self.num_collisions += 1

        return self._get_obs(), rewards, dones, {}  # <- info dict (not needed here).

    

In [None]:
# Small 5x5 arena with rendering enabled; with a limit of 10 timesteps, the
# item sale happens at timestep 5.
env = MultiAgentArenaWithItemSale(config={"render": True, "width": 5, "height": 5, "timestep_limit": 10})
obs = env.reset()

# Scripted joint actions, played one per timestep.
scripted_actions = [
    {"agent1": 2, "agent2": 0},  # Agent1 moves down, Agent2 moves up.
    {"agent1": 1, "agent2": 3},  # Agent1 moves right, Agent2 moves left.
    {"agent1": 1, "agent2": 3},  # Agent1 moves right, Agent2 moves left.
    {"agent1": 2, "agent2": 0},  # Agent1 moves down, Agent2 moves up.
    {"agent1": 3, "agent2": 0},  # Agent1 moves left, Agent2 moves up.
    {"agent1": 2, "agent2": 0},  # Agent1 moves down, Agent2 moves up.
]

with env.out:
    for joint_action in scripted_actions:
        # Apply the joint action, then draw the resulting game state.
        obs, rewards, dones, infos = env.step(action=joint_action)
        env.render()


<img src="images/end_to_end_example.png" width=800 />
