In [2]:
import os
os.environ['TF_USE_LEGACY_KERAS'] = '1'

In [3]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np

from tf_agents.specs import array_spec
from tf_agents.specs import tensor_spec
from tf_agents.networks import network

from tf_agents.policies import py_policy
from tf_agents.policies import random_py_policy
from tf_agents.policies import scripted_py_policy

from tf_agents.policies import tf_policy
from tf_agents.policies import random_tf_policy
from tf_agents.policies import actor_policy
from tf_agents.policies import q_policy
from tf_agents.policies import greedy_policy

from tf_agents.trajectories import time_step as ts

2025-01-07 17:44:12.167342: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-07 17:44:12.294755: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-07 17:44:12.616014: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-07 17:44:12.616084: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-07 17:44:12.637700: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

## Python Policies
The interface for Python policies is defined in policies/py_policy.PyPolicy. The main methods are:

## Example 1: Random Python Policy

A simple example of a PyPolicy is the RandomPyPolicy, which generates random actions for the discrete/continuous given action_spec. The input time_step is ignored.


In [4]:
action_spec = array_spec.BoundedArraySpec((2,), np.int32, -10, 10)
my_random_py_policy = random_py_policy.RandomPyPolicy(time_step_spec=None, action_spec=action_spec)
time_step = None
action_spec = my_random_py_policy.action(time_step)
print(action_spec)
action_spec = my_random_py_policy.action(time_step)
print(action_spec)

PolicyStep(action=array([ 8, -3], dtype=int32), state=(), info=())
PolicyStep(action=array([-6,  6], dtype=int32), state=(), info=())


## Example 2: Scripted Python Policy

A scripted policy plays back a script of actions represented as a list of (num_repeats, action) tuples. Every time the action function is called, it returns the next action from the list until the specified number of repeats is done, and then moves on to the next action in the list. The reset method can be called to start executing from the beginning of the list.

In [8]:
action_spec = array_spec.BoundedArraySpec((2,), np.int32, -10, 10)
action_script = [
    ( 1, np.array( [2, 5], dtype=np.int32 )),
    ( 0, np.array( [0, 0], dtype=np.int32 )),
    ( 2, np.array( [1, 2], dtype=np.int32 )),
    ( 1, np.array( [2, 5], dtype=np.int32 )),
]

my_scripted_py_policy = scripted_py_policy.ScriptedPyPolicy(
    time_step_spec=None, action_spec=action_spec, action_script=action_script
)

policy_state = my_scripted_py_policy.get_initial_state()
time_step = None

print('Executing scripted policy...')
action_step = my_scripted_py_policy.action(time_step, policy_state)
print(action_step)
action_step= my_scripted_py_policy.action(time_step, action_step.state)
print(action_step)
action_step = my_scripted_py_policy.action(time_step, action_step.state)
print(action_step)

print('\nResetting my_scripted_py_policy...')
policy_state = my_scripted_py_policy.get_initial_state()
action_step = my_scripted_py_policy.action(time_step, policy_state)
print(action_step)

Executing scripted policy...
PolicyStep(action=array([2, 5], dtype=int32), state=[0, 1], info=())
PolicyStep(action=array([1, 2], dtype=int32), state=[2, 1], info=())
PolicyStep(action=array([1, 2], dtype=int32), state=[2, 2], info=())

Resetting my_scripted_py_policy...
PolicyStep(action=array([2, 5], dtype=int32), state=[0, 1], info=())


## TensorFlow Policies
TensorFlow policies follow the same interface as Python policies. Let us look at a few examples.

## Example 1: Random TF Policy
A RandomTFPolicy can be used to generate random actions according to a given discrete/continuous action_spec. The input time_step is ignored.

In [11]:
action_spec = tensor_spec.BoundedTensorSpec((2,), tf.float32, minimum=-1, maximum=3)
input_tensor_spec = tensor_spec.TensorSpec((2,), tf.float32)
time_step_spec = ts.time_step_spec(input_tensor_spec)

my_random_tf_policy = random_tf_policy.RandomTFPolicy(
    action_spec=action_spec, time_step_spec=time_step_spec
)
observation = tf.ones(time_step_spec.observation.shape)
time_step = ts.restart(observation)
action_step = my_random_tf_policy.action(time_step)

print(f'Action: {action_step.action}')

Action: [-0.21865416  1.0914135 ]


## Example 2: Actor Policy
An actor policy can be created using either a network that maps time_steps to actions or a network that maps time_steps to distributions over actions.

### Using an action network: 
Let us define a network as follows:

In [24]:
class ActionNet(network.Network):
    def __init__(self, input_tensor_spec, output_tensor_spec):
        super(ActionNet, self).__init__(
            input_tensor_spec=input_tensor_spec,
            state_spec=(),
            name='ActionNet'
        )
        self._output_tensor_spec = output_tensor_spec
        self._sub_layers = [
            tf.keras.layers.Dense(
                action_spec.shape.num_elements(), activation=tf.nn.tanh
            )
        ]

        def call(self, observations, step_type, network_state):
            del step_type

            output = tf.cast(observations, dtype=tf.float32)
            for layer in self._sub_layers:
                output = layer(output)

            actions = tf.reshape(output, [-1] + self._output_tensor_spec.shape.as_list())          
            return actions, network_state

In [25]:
input_tensor_spec = tensor_spec.TensorSpec((4,), tf.float32)
time_step_spec = ts.time_step_spec(input_tensor_spec)
action_spec = tensor_spec.BoundedTensorSpec((3,), tf.float32, minimum=-1, maximum=1)
action_net = ActionNet(input_tensor_spec, action_spec)

my_actor_policy = actor_policy.ActorPolicy(
    time_step_spec=time_step_spec,
    action_spec=action_spec,
    actor_network=action_net
)

ValueError: not enough values to unpack (expected 2, got 1)
  In call to configurable 'ActorPolicy' (<class 'tf_agents.policies.actor_policy.ActorPolicy'>)