Source: RLlib training documentation — https://docs.ray.io/en/latest/rllib/rllib-training.html

## Example: Preprocessing observations for feeding into a model

In [1]:
import gym
env = gym.make("Pong-v0")

# Preprocessors are RLlib's mechanism for transforms such as one-hot encoding
# and the flattening of tuple and dict observations.
from ray.rllib.models.preprocessors import get_preprocessor
# Look up the preprocessor class for this observation space, then build an
# instance of it for the same space.
preprocessor_cls = get_preprocessor(env.observation_space)
prep = preprocessor_cls(env.observation_space)
# <ray.rllib.models.preprocessors.GenericPixelPreprocessor object at 0x7fc4d049de80>

# Observations should be preprocessed prior to feeding into a model.
# Reset once and reuse that observation, so the raw and the transformed shape
# describe the same frame.
obs = env.reset()
# A notebook cell only displays its last expression, so the raw shape has to
# be printed explicitly to show up in the output.
print(obs.shape)
# (210, 160, 3)
prep.transform(obs).shape
# (84, 84, 3)

A.L.E: Arcade Learning Environment (version 0.7.5+db37282)
[Powered by Stella]
  from .autonotebook import tqdm as notebook_tqdm


(84, 84, 3)

## Example: Querying a policy’s action distribution



In [1]:
import ray

In [2]:
import tensorflow

In [3]:
import torch

In [4]:
import numpy as np
from ray.rllib.agents.ppo import PPOTrainer

# Trainer settings used for this example, kept in one place.
ppo_config = {"framework": "tf2", "num_workers": 0}
trainer = PPOTrainer(env="CartPole-v0", config=ppo_config)

# Keep a handle on the trainer's policy for the cells that follow.
policy = trainer.get_policy()

Metal device set to: Apple M1


In [5]:
# Forward pass through the policy's model to obtain its output logits.
# Complex observations have to be preprocessed first, as in the code block above.
ppo_obs_batch = np.array([[0.1, 0.2, 0.3, 0.4]])
# The call returns two values; only the first (the logits) is kept.
logits, _ = policy.model({"obs": ppo_obs_batch})

In [6]:
# Version shown in the RLlib docs (kept for reference; the cell above calls the model directly instead):
# logits, _ = policy.model.from_batch({"obs": np.array([[0.1, 0.2, 0.3, 0.4]])})

In [7]:
# Show which action-distribution class this policy uses.
policy.dist_class

ray.rllib.models.tf.tf_action_dist.Categorical

In [8]:
# Build the policy's action distribution from the logits computed by the forward pass.
dist = policy.dist_class(logits, policy.model)

In [9]:
# Sample an action from the distribution.
dist.sample()

<tf.Tensor: shape=(1,), dtype=int64, numpy=array([1])>

In [10]:
# Log-likelihood of action 1 under the distribution (one-element batch).
dist.logp([1])

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([-0.6939949], dtype=float32)>

In [11]:
# Value-function output of the model; presumably the estimate for the most
# recent forward pass — confirm against the RLlib ModelV2 docs.
policy.model.value_function()

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([-0.00134783], dtype=float32)>

In [12]:
# Print the layer summary of the model's underlying base (Keras) model.
policy.model.base_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 observations (InputLayer)      [(None, 4)]          0           []                               
                                                                                                  
 fc_1 (Dense)                   (None, 256)          1280        ['observations[0][0]']           
                                                                                                  
 fc_value_1 (Dense)             (None, 256)          1280        ['observations[0][0]']           
                                                                                                  
 fc_2 (Dense)                   (None, 256)          65792       ['fc_1[0][0]']                   
                                                                                              

## Example: Getting Q values

In [21]:
# Obtain the DQN model by going through the trainer's policy.
import numpy as np
from ray.rllib.agents.dqn import DQNTrainer

dqn_config = {"framework": "tf2"}
trainer = DQNTrainer(env="CartPole-v0", config=dqn_config)
dqn_policy = trainer.get_policy()
model = dqn_policy.model
# Last expression of the cell: displays the model object's repr.
model

<ray.rllib.models.catalog.FullyConnectedNetwork_as_DistributionalQTFModel at 0x14ee40bb0>

In [22]:
# Reference output for the previous cell: <ray.rllib.models.catalog.FullyConnectedNetwork_as_DistributionalQModel ...>

In [23]:
# Forward pass to get the base model output. Complex observations have to be
# preprocessed first; examples/saving_experiences.py contains an example of that.
dqn_obs_batch = np.array([[0.1, 0.2, 0.3, 0.4]])
model_out = model({"obs": dqn_obs_batch})
# The call result is indexable; show the shape of its first element.
model_out[0].shape

TensorShape([1, 256])

In [24]:
# Reference output for the previous cell (the run above printed TensorShape([1, 256])):
# TensorShape([Dimension(1), Dimension(256)])

# Access the base Keras models (all default models have a base)
# and print the layer summary.
model.base_model.summary()

Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 observations (InputLayer)      [(None, 4)]          0           []                               
                                                                                                  
 fc_1 (Dense)                   (None, 256)          1280        ['observations[0][0]']           
                                                                                                  
 fc_out (Dense)                 (None, 256)          65792       ['fc_1[0][0]']                   
                                                                                                  
 value_out (Dense)              (None, 1)            257         ['fc_1[0][0]']                   
                                                                                            

In [25]:
"""
Model: "model"
_______________________________________________________________________
Layer (type)                Output Shape    Param #  Connected to
=======================================================================
observations (InputLayer)   [(None, 4)]     0
_______________________________________________________________________
fc_1 (Dense)                (None, 256)     1280     observations[0][0]
_______________________________________________________________________
fc_out (Dense)              (None, 256)     65792    fc_1[0][0]
_______________________________________________________________________
value_out (Dense)           (None, 1)       257      fc_1[0][0]
=======================================================================
Total params: 67,329
Trainable params: 67,329
Non-trainable params: 0
______________________________________________________________________________
"""

# Query the Q value model (specific to DQN) with the forward-pass result
# and print the first element of what it returns.
q_value_outputs = model.get_q_value_distributions(model_out)
print(q_value_outputs[0])

tf.Tensor([[-0.13385144  0.20218928]], shape=(1, 2), dtype=float32)


In [26]:
# Reference output for the previous cell's print (numbers differ between runs):
# <tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[-0.05483027,  0.18754935]], dtype=float32)>

# Print the layer summary of the model's Q-value head.
model.q_value_head.summary()

Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 model_out (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 hidden_0 (Dense)               (None, 256)          65792       ['model_out[0][0]']              
                                                                                                  
 dense_6 (Dense)                (None, 2)            514         ['hidden_0[0][0]']               
                                                                                                  
 tf.ones_like_4 (TFOpLambda)    (None, 2)            0           ['dense_6[0][0]']                
                                                                                            

In [27]:
# Query the state value model (specific to DQN) with the forward-pass result.
state_value = model.get_state_value(model_out)
print(state_value)
# tf.Tensor([[0.09381643]], shape=(1, 1), dtype=float32)

tf.Tensor([[0.18455854]], shape=(1, 1), dtype=float32)


In [28]:
# Print the layer summary of the model's state-value head.
model.state_value_head.summary()

Model: "model_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model_out (InputLayer)      [(None, 256)]             0         
                                                                 
 dense_7 (Dense)             (None, 256)               65792     
                                                                 
 dense_8 (Dense)             (None, 1)                 257       
                                                                 
Total params: 66,049
Trainable params: 66,049
Non-trainable params: 0
_________________________________________________________________
