In [14]:
from Environments.random_maze import maze_game
import numpy as np
import matplotlib.pyplot as plt
from gymnasium import spaces
from gymnasium.spaces import Dict
from ray.tune.registry import register_env
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.models import MODEL_DEFAULTS
from ray.rllib.models import ModelCatalog
import copy
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.framework import try_import_tf, try_import_torch
from ray.rllib.utils.torch_utils import FLOAT_MIN
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
from ray import air, tune

**Advanced RL Algorithms and RLLib!** 

The goal of this second notebook is to show you how I go about solving RL problems, using RLLib and more advanced algorithms. Hopefully it will give you an insight into types of problems that are encountered when working in RL and how they are solved. 

As a bit of a showcase, we are going to make the maze game from earlier harder, and discuss how to solve it. 

As in the previous notebook, we will start by looking at the problem. You will notice that once again it is a maze game; however, this one will be a fair bit harder to solve. Run the cell below a few times. What do you notice?

In [2]:
# Build a maze environment and show what reset() hands back for the first state.
example = maze_game()
reset_output = example.reset()
reset_output

({'maze': array([[1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1],
         [1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1],
         [1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1],
         [1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1],
         [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1],
         [1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
         [1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1],
         [1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1],
         [1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1]], dtype=int8),
  'agent': array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0

This version of the maze game is random! We will need a high level of generalisation to solve this problem! (the action space is the same)

You will also notice that the state is split up by a dictionary. The base form (before wrapping) should present as much data as possible, we can always use wrappers later to reduce the data. 

Like all good ML, we will start with a baseline. We will produce a wrapper that stacks the walls and the agent position into 2 binary planes that will make up the state:

In [3]:
class stacked_maze(maze_game):
    """Baseline maze environment whose state is a stack of binary planes.

    Plane 0 marks the cells of ``self.maze`` equal to ``self.wall`` and
    plane 1 marks the cells equal to ``self.agent``. The declared observation
    space assumes a 12x12 maze.
    """

    def __init__(self):
        """Initialise the parent maze, then replace its observation space."""
        super().__init__()
        # The stacked state only holds 0/1 entries, so MultiBinary describes it.
        self.observation_space = spaces.MultiBinary((12, 12, 2))

    def create_state(self):
        """Return the wall and agent planes stacked along the last axis.

        The goal is deliberately left out: in this version it never changes.
        """
        planes = [
            np.array(self.maze == marker, dtype=np.int8)
            for marker in (self.wall, self.agent)
        ]
        return np.stack(planes, axis=-1)

Now we can introduce RLLib. We start by registering the environment:

In [4]:
# RLLib refers to environments by name, so the env has to be registered
# under a name before a config can use it.
def env_creator(env_config):
    """Build a new stacked_maze; the supplied env_config is not used."""
    return stacked_maze()


register_env(name="stacked_maze", env_creator=env_creator)

Now we will define the model that we want to use. More on this config can be found here: https://github.com/ray-project/ray/blob/469f4d296a112f2ade556ea586a0f05811b34d32/rllib/models/catalog.py#L52

We are using some convolution layers to reduce the dimensionality of the matrix inputs. These will get flattened and passed to the dense layers given in 'fcnet_hiddens'.

In [5]:
# Start from a shallow copy of RLLib's default model settings and override
# only the entries we care about.
model = copy.copy(MODEL_DEFAULTS)
model['fcnet_hiddens'] = [256, 128]
model['fcnet_activation'] = 'relu'
# Four conv layers with 3x3 kernels and stride 1; only the filter count grows.
model['conv_filters'] = [[n_filters, [3, 3], 1] for n_filters in (4, 8, 12, 16)]

We now create a config that will produce a trainer for learning on the "stacked_maze" environment. We won't worry too much about the learning parameters right now. At this stage I am mainly checking that my MDP is working.

In [6]:
# Baseline PPO setup on the old ModelV2 stack (RLModule / Learner APIs are
# switched off). The learning parameters are not tuned yet.
config = (
    PPOConfig()
    .rl_module(_enable_rl_module_api=False)
    .environment(env="stacked_maze", disable_env_checking=True)
    .rollouts(num_rollout_workers=4, num_envs_per_worker=1)
    .training(
        _enable_learner_api=False,
        train_batch_size=2000,
        gamma=0.995,
        model=model,
        lr=0.001,
    )
    .framework('torch')
)
trainer = config.build()

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
2023-10-25 11:58:27,927	INFO tensorboardx.py:48 -- pip install "ray[tune]" to see TensorBoard files.
2023-10-25 11:58:31,691	INFO worker.py:1621 -- Started a local Ray instance.
2023-10-25 11:58:43,359	INFO trainable.py:172 -- Trainable.setup took 15.429 seconds. If your trainable is slow to initialize

Now the trainer is built, we can start training! We are simply powering this with a for loop.

In [7]:
def print_results(results_dict):
    """Print a one-line reward summary for a single training iteration.

    Args:
        results_dict: Mapping that must contain "training_iteration",
            "episode_reward_mean", "episode_reward_max" and
            "episode_reward_min" (a KeyError is raised otherwise).
    """
    metric_keys = (
        "training_iteration",
        "episode_reward_mean",
        "episode_reward_max",
        "episode_reward_min",
    )
    # Look the metrics up in a fixed order and unpack them in one go.
    train_iter, r_mean, r_max, r_min = (results_dict[key] for key in metric_keys)
    print(f"{train_iter:4d} \tr_mean: {r_mean:.1f} \tr_max: {r_max:.1f} \tr_min: {r_min: .1f}")

# Run 50 training iterations, printing the reward summary after each one.
n_iterations = 50
for i in range(n_iterations):
    iteration_result = trainer.train()
    print_results(iteration_result)



   1 	r_mean: -50.0 	r_max: -50.0 	r_min: -50.0
   2 	r_mean: -32.1 	r_max: 22.9 	r_min: -50.0
   3 	r_mean: -32.7 	r_max: 22.9 	r_min: -50.0
   4 	r_mean: -37.0 	r_max: 22.9 	r_min: -50.0
   5 	r_mean: -39.6 	r_max: 22.9 	r_min: -50.0
   6 	r_mean: -41.4 	r_max: 22.9 	r_min: -50.0
   7 	r_mean: -42.6 	r_max: 22.9 	r_min: -50.0
   8 	r_mean: -43.5 	r_max: 22.9 	r_min: -50.0
   9 	r_mean: -44.2 	r_max: 22.9 	r_min: -50.0
  10 	r_mean: -44.8 	r_max: 22.9 	r_min: -50.0
  11 	r_mean: -45.3 	r_max: 22.9 	r_min: -50.0
  12 	r_mean: -45.7 	r_max: 22.9 	r_min: -50.0
  13 	r_mean: -46.0 	r_max: 22.9 	r_min: -50.0
  14 	r_mean: -46.3 	r_max: 22.9 	r_min: -50.0
  15 	r_mean: -46.5 	r_max: 22.9 	r_min: -50.0
  16 	r_mean: -46.8 	r_max: 22.9 	r_min: -50.0
  17 	r_mean: -46.9 	r_max: 22.9 	r_min: -50.0
  18 	r_mean: -47.1 	r_max: 22.9 	r_min: -50.0
  19 	r_mean: -47.3 	r_max: 22.9 	r_min: -50.0
  20 	r_mean: -47.4 	r_max: 22.9 	r_min: -50.0
  21 	r_mean: -47.5 	r_max: 22.9 	r_min: -50.0
  22 	r_mean

So this didn't get a very good result, but it seems to have learnt something! So it's now time to consider what hyperparameters we are using. In RL, we typically have 5-10 different hyperparameters to balance, which makes running hyperparameter searches very time consuming. 

In [8]:
# Same environment and model as the baseline, with hand-picked PPO settings:
# larger batches, a lower learning rate, and explicit clipping / SGD / entropy
# parameters.
config = (
    PPOConfig()
    .rl_module(_enable_rl_module_api=False)
    .environment(env="stacked_maze", disable_env_checking=True)
    .rollouts(num_rollout_workers=4, num_envs_per_worker=1)
    .training(
        _enable_learner_api=False,
        train_batch_size=15000,
        gamma=0.995,
        model=model,
        lr=0.0003,
        clip_param=0.15,
        num_sgd_iter=4,
        sgd_minibatch_size=3000,
        entropy_coeff=0.0001,
    )
    .framework('torch')
)
trainer = config.build()

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
2023-10-25 12:03:47,779	INFO trainable.py:172 -- Trainable.setup took 11.322 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


In [9]:
# Run 50 training iterations, printing the reward summary after each one.
n_iterations = 50
for i in range(n_iterations):
    iteration_result = trainer.train()
    print_results(iteration_result)



   1 	r_mean: -45.9 	r_max: 12.9 	r_min: -50.0
   2 	r_mean: -33.6 	r_max: 40.5 	r_min: -50.0
   3 	r_mean: -24.3 	r_max: 40.5 	r_min: -50.0
   4 	r_mean: -16.7 	r_max: 42.9 	r_min: -50.0
   5 	r_mean: -18.7 	r_max: 42.9 	r_min: -50.0
   6 	r_mean: -21.3 	r_max: 42.9 	r_min: -50.0
   7 	r_mean: -21.0 	r_max: 39.4 	r_min: -50.0
   8 	r_mean: -21.0 	r_max: 43.9 	r_min: -50.0
   9 	r_mean: -14.5 	r_max: 43.9 	r_min: -50.0
  10 	r_mean: -13.3 	r_max: 43.9 	r_min: -50.0
  11 	r_mean: -7.4 	r_max: 42.8 	r_min: -50.0
  12 	r_mean: -3.6 	r_max: 42.8 	r_min: -50.0
  13 	r_mean: -5.4 	r_max: 41.2 	r_min: -50.0
  14 	r_mean: -14.9 	r_max: 39.8 	r_min: -50.0
  15 	r_mean: -10.5 	r_max: 44.0 	r_min: -50.0
  16 	r_mean: -6.9 	r_max: 44.0 	r_min: -50.0
  17 	r_mean: -3.9 	r_max: 44.0 	r_min: -50.0
  18 	r_mean: -6.1 	r_max: 43.7 	r_min: -50.0
  19 	r_mean: -4.6 	r_max: 45.0 	r_min: -50.0
  20 	r_mean: -8.0 	r_max: 45.0 	r_min: -50.0
  21 	r_mean: -9.8 	r_max: 44.7 	r_min: -50.0
  22 	r_mean: -8.2 	r_

In [10]:
class random_maze_action_mask(maze_game):
    """Maze environment whose state pairs the stacked planes with an action mask.

    The state is a dict with two entries:
      * 'observations': wall and agent planes stacked along the last axis
        (the declared space assumes a 12x12 maze);
      * 'action_mask': a length-4 int8 vector, 1 where the neighbouring cell
        for that action is inside the maze and is not a wall.
    """

    def __init__(self):
        """Initialise the parent maze, then declare the Dict observation space."""
        super().__init__()
        self.observation_space = spaces.Dict({
            'observations': spaces.MultiBinary((12, 12, 2)),
            'action_mask': spaces.MultiBinary(4),
        })

    def create_state(self):
        """Return the current state as an observations / action_mask dict."""
        wall_plane = np.array(self.maze == self.wall, dtype=np.int8)
        agent_plane = np.array(self.maze == self.agent, dtype=np.int8)
        return {
            'observations': np.stack([wall_plane, agent_plane], axis=-1),
            'action_mask': self.get_action_mask(),
        }

    def get_action_mask(self):
        """Return an int8 vector flagging which of the 4 actions are available.

        Each action is tied to one neighbour of (self.x, self.y):
        0 -> (x, y-1), 1 -> (x+1, y), 2 -> (x, y+1), 3 -> (x-1, y).
        NOTE(review): presumably this matches the move order used by
        maze_game's step logic -- confirm against the environment.
        """
        x, y = self.x, self.y
        last_x = self.maze.shape[0] - 1
        last_y = self.maze.shape[1] - 1
        # (action index, "not on that edge" test, neighbouring cell)
        candidates = (
            (0, y != 0, (x, y - 1)),
            (1, x != last_x, (x + 1, y)),
            (2, y != last_y, (x, y + 1)),
            (3, x != 0, (x - 1, y)),
        )
        action_mask = np.zeros(4, dtype=np.int8)
        for action, off_edge, cell in candidates:
            # Only index the maze when the edge test passed.
            if off_edge and self.maze[cell] != self.wall:
                action_mask[action] = 1
        return action_mask

def env_creator(env_config):
    """Build a new random_maze_action_mask; the supplied env_config is not used."""
    return random_maze_action_mask()


# Make the masked environment available to RLLib under this name.
register_env(name="random_maze_action_mask", env_creator=env_creator)

In [11]:
torch, nn = try_import_torch()


class ActionMaskModel(TorchModelV2, nn.Module):
    """Torch model that masks unavailable actions out of the policy logits.

    Requires a Dict observation space with "observations" and "action_mask"
    entries. Logits come from an internal fully connected network fed with
    "observations"; the log of "action_mask" (clamped at FLOAT_MIN) is then
    added to them.
    """

    def __init__(
        self,
        obs_space,
        action_space,
        num_outputs,
        model_config,
        name,
        **kwargs,
    ):
        """Validate the observation space and build the internal network."""
        # Use `original_space` when obs_space carries one, otherwise obs_space itself.
        orig_space = getattr(obs_space, "original_space", obs_space)
        required_keys = ("action_mask", "observations")
        assert isinstance(orig_space, Dict) and all(
            key in orig_space.spaces for key in required_keys
        )

        TorchModelV2.__init__(
            self, obs_space, action_space, num_outputs, model_config, name, **kwargs
        )
        nn.Module.__init__(self)

        # The internal network only ever sees the "observations" sub-space.
        internal_name = name + "_internal"
        self.internal_model = TorchFC(
            orig_space["observations"],
            action_space,
            num_outputs,
            model_config,
            internal_name,
        )

    def forward(self, input_dict, state, seq_lens):
        """Return (masked logits, state); `state` is passed through untouched."""
        obs = input_dict["obs"]
        action_mask = obs["action_mask"]
        # Unmasked logits from the internal network.
        logits, _ = self.internal_model({"obs": obs["observations"]})
        # log(1) = 0 leaves a logit as is; log(0) = -inf is clamped to FLOAT_MIN.
        inf_mask = torch.log(action_mask).clamp(min=FLOAT_MIN)
        return logits + inf_mask, state

    def value_function(self):
        """Delegate the value estimate to the internal network."""
        return self.internal_model.value_function()

In [12]:
# Register the masking model under a name and point the model config at it.
ModelCatalog.register_custom_model("RandomMazeModel", ActionMaskModel)
model = copy.copy(MODEL_DEFAULTS)
model['custom_model'] = "RandomMazeModel"
model['custom_model_config'] = {}

# Same PPO settings as the previous run, now on the action-masked environment.
config = (
    PPOConfig()
    .rl_module(_enable_rl_module_api=False)
    .environment(env="random_maze_action_mask", disable_env_checking=True)
    .rollouts(num_rollout_workers=4, num_envs_per_worker=1)
    .training(
        _enable_learner_api=False,
        train_batch_size=15000,
        gamma=0.995,
        model=model,
        lr=0.0003,
        clip_param=0.15,
        num_sgd_iter=4,
        sgd_minibatch_size=3000,
        entropy_coeff=0.0001,
    )
    .framework('torch')
)
trainer = config.build()

  self.internal_model = TorchFC(


In [13]:
# Run 50 training iterations, printing the reward summary after each one.
n_iterations = 50
for i in range(n_iterations):
    iteration_result = trainer.train()
    print_results(iteration_result)

[2m[36m(RolloutWorker pid=8430)[0m   ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


   1 	r_mean: -17.6 	r_max: 34.9 	r_min: -50.0
   2 	r_mean: -15.2 	r_max: 44.9 	r_min: -50.0
   3 	r_mean: -6.6 	r_max: 45.3 	r_min: -50.0
   4 	r_mean: 14.8 	r_max: 47.1 	r_min: -50.0
   5 	r_mean: 10.6 	r_max: 47.1 	r_min: -50.0
   6 	r_mean: 0.4 	r_max: 46.3 	r_min: -50.0
   7 	r_mean: 6.2 	r_max: 46.9 	r_min: -50.0
   8 	r_mean: 11.4 	r_max: 46.7 	r_min: -50.0
   9 	r_mean: 15.5 	r_max: 46.7 	r_min: -50.0
  10 	r_mean: 8.9 	r_max: 46.1 	r_min: -50.0
  11 	r_mean: 5.4 	r_max: 46.3 	r_min: -50.0
  12 	r_mean: 1.5 	r_max: 46.9 	r_min: -50.0
  13 	r_mean: 6.4 	r_max: 46.9 	r_min: -50.0
  14 	r_mean: 11.4 	r_max: 47.5 	r_min: -50.0
  15 	r_mean: 5.2 	r_max: 47.5 	r_min: -50.0
  16 	r_mean: 4.9 	r_max: 47.3 	r_min: -50.0
  17 	r_mean: 6.6 	r_max: 46.7 	r_min: -50.0
  18 	r_mean: 2.1 	r_max: 47.3 	r_min: -50.0
  19 	r_mean: -0.8 	r_max: 47.7 	r_min: -50.0
  20 	r_mean: 1.9 	r_max: 47.7 	r_min: -50.0
  21 	r_mean: 8.8 	r_max: 47.5 	r_min: -50.0
  22 	r_mean: 5.8 	r_max: 47.5 	r_min: -50.0

In [20]:
# Grid over batch size (2 values), SGD iterations (3) and minibatch size (2):
# 12 combinations. With num_samples=3 below, Tune repeats the grid three
# times, i.e. 36 trials, each stopped at 1.5e6 timesteps.
config = (
    PPOConfig()
    .rl_module(_enable_rl_module_api=False)
    .environment(env="random_maze_action_mask", disable_env_checking=True)
    .rollouts(num_rollout_workers=4, num_envs_per_worker=1)
    .training(
        _enable_learner_api=False,
        train_batch_size=tune.grid_search([12000, 18000]),
        gamma=0.995,
        model=model,
        lr=0.0003,
        clip_param=0.15,
        num_sgd_iter=tune.grid_search([3, 5, 7]),
        sgd_minibatch_size=tune.grid_search([2000, 4000]),
        entropy_coeff=0.0001,
    )
    .framework('torch')
)

# Results are written to ./results/ppo_hyperparam_search.
run_config = air.RunConfig(
    stop={"timesteps_total": 1.5e6},
    storage_path="./results",
    name="ppo_hyperparam_search",
)
tuner = tune.Tuner(
    "PPO",
    tune_config=tune.TuneConfig(num_samples=3),
    param_space=config.to_dict(),
    run_config=run_config,
)

results = tuner.fit()

0,1
Current time:,2023-10-25 12:51:07
Running for:,00:23:48.95
Memory:,10.5/16.0 GiB

Trial name,status,loc,num_sgd_iter,sgd_minibatch_size,train_batch_size,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_random_maze_action_mask_74bab_00001,RUNNING,127.0.0.1:19055,5,2000,12000,101.0,661.39,1212000.0,-4.105,47.9,-50.0,296.56
PPO_random_maze_action_mask_74bab_00002,PENDING,,7,2000,12000,,,,,,,
PPO_random_maze_action_mask_74bab_00003,PENDING,,3,4000,12000,,,,,,,
PPO_random_maze_action_mask_74bab_00004,PENDING,,5,4000,12000,,,,,,,
PPO_random_maze_action_mask_74bab_00005,PENDING,,7,4000,12000,,,,,,,
PPO_random_maze_action_mask_74bab_00006,PENDING,,3,2000,18000,,,,,,,
PPO_random_maze_action_mask_74bab_00007,PENDING,,5,2000,18000,,,,,,,
PPO_random_maze_action_mask_74bab_00008,PENDING,,7,2000,18000,,,,,,,
PPO_random_maze_action_mask_74bab_00009,PENDING,,3,4000,18000,,,,,,,
PPO_random_maze_action_mask_74bab_00010,PENDING,,5,4000,18000,,,,,,,


[2m[36m(PPO pid=14587)[0m Install gputil for GPU system monitoring.
[2m[36m(RolloutWorker pid=14630)[0m   ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
[2m[36m(PPO pid=19055)[0m Install gputil for GPU system monitoring.
[2m[36m(RolloutWorker pid=19105)[0m   ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
2023-10-25 12:51:10,513	INFO tune.py:1148 -- Total run time: 1432.09 seconds (1428.93 seconds for the tuning loop).
Resume experiment with: Tuner.restore(path="/Users/adamprice/RL Master Class/RLMasterClass/results/ppo_hyperparam_search", trainable=...)


RuntimeError: Uploading, downloading, and deleting from cloud storage requires pyarrow to be installed. Install with: `pip install pyarrow`. Subsequent calls to cloud operations will be ignored.