<a href="https://colab.research.google.com/github/rajdeepd/tensorflow_2.0_book_code/blob/master/ch09/Sample_One_SAC_Agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!sudo apt-get install -y xvfb ffmpeg
!pip install 'imageio==2.4.0'
#!pip install pyvirtualdisplay
!pip install tf-agents

Reading package lists... Done
Building dependency tree       
Reading state information... Done
ffmpeg is already the newest version (7:3.4.8-0ubuntu0.2).
The following NEW packages will be installed:
  xvfb
0 upgraded, 1 newly installed, 0 to remove and 31 not upgraded.
Need to get 784 kB of archives.
After this operation, 2,270 kB of additional disk space will be used.
Ign:1 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 xvfb amd64 2:1.19.6-1ubuntu4.8
Err:1 http://security.ubuntu.com/ubuntu bionic-updates/universe amd64 xvfb amd64 2:1.19.6-1ubuntu4.8
  404  Not Found [IP: 91.189.88.152 80]
E: Failed to fetch http://security.ubuntu.com/ubuntu/pool/universe/x/xorg-server/xvfb_1.19.6-1ubuntu4.8_amd64.deb  404  Not Found [IP: 91.189.88.152 80]
E: Unable to fetch some archives, maybe run apt-get update or try with --fix-missing?
Collecting imageio==2.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/ac/64/8e2bb6aac43d6ed7c2d9514320b43d5e80c00f150ee2b9408aee24

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from absl.testing import parameterized
import tensorflow as tf  # pylint: disable=g-explicit-tensorflow-version-import

import tf_agents
import sys
from tf_agents.agents.dqn import dqn_agent
from tf_agents.networks import network
from tf_agents.networks import q_network
from tf_agents.networks import sequential
from tf_agents.networks import test_utils as networks_test_utils
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import policy_step
from tf_agents.trajectories import test_utils as trajectories_test_utils
from tf_agents.trajectories import time_step as ts
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
from tf_agents.utils import test_utils
from tf_agents.agents.sac.sac_agent import SacAgent

In [3]:
class _MockDistribution(object):
  """Minimal stand-in for a distribution that always yields one fixed action.

  `MyActorPolicy.distribution` wraps its constant action in this object so
  that callers can use `sample()` / `log_prob()` on the result.
  """

  def __init__(self, action):
    # The action returned verbatim by every call to `sample`.
    self._action = action

  def sample(self):
    """Returns the fixed action this object was constructed with."""
    return self._action

  def log_prob(self, sample):
    """Returns a constant placeholder log-probability, ignoring `sample`."""
    del sample
    return tf.constant(10., shape=[1])


class MyActorPolicy(object):
  """Stub actor policy that always emits the maximum of the action range.

  The constructor accepts `time_step_spec`, `action_spec`, `actor_network`
  and `training`, but only `action_spec` is actually used.
  """

  def __init__(self,
               time_step_spec,
               action_spec,
               actor_network,
               training=False):
    """Stores the constant action taken from `action_spec`.

    Args:
      time_step_spec: Ignored.
      action_spec: (Possibly nested) bounded action spec; the `maximum` of its
        first flattened entry becomes the constant action.
      actor_network: Ignored.
      training: Ignored.
    """
    del time_step_spec
    del actor_network
    del training
    single_action_spec = tf.nest.flatten(action_spec)[0]
    # Action is maximum of action range.
    self._action = single_action_spec.maximum
    self._action_spec = action_spec
    self.info_spec = ()

  def action(self, time_step):
    """Returns a PolicyStep whose action is a constant `[batch_size, 1]` tensor.

    Args:
      time_step: Object with an `observation` attribute; the first dimension
        of `observation.shape` is used as the batch size.
    """
    observation = time_step.observation
    batch_size = observation.shape[0]
    action = tf.constant(self._action, dtype=tf.float32, shape=[batch_size, 1])
    return policy_step.PolicyStep(action=action)

  def distribution(self, time_step, policy_state=()):
    """Returns a PolicyStep wrapping the constant action in a mock distribution.

    Args:
      time_step: Forwarded to `action`.
      policy_state: Ignored.
    """
    del policy_state
    action = self.action(time_step).action
    return policy_step.PolicyStep(action=_MockDistribution(action))

  def get_initial_state(self, batch_size):
    """Returns an empty tuple for any `batch_size`."""
    del batch_size
    return ()

In [9]:
class MyCriticNet(network.Network):
  """Toy critic: Q(s, a) is the sum of a state term and an action term.

  Both terms come from single-unit Dense layers with constant initializers.
  The state layer's kernel starts as [[0], [1]] and the action layer's kernel
  as [[1]], both with zero bias. An optional `shared_layer` is applied to the
  state term only.
  """

  def __init__(self, l2_regularization_weight=0.0, shared_layer=None):
    """Builds the two Dense layers.

    Args:
      l2_regularization_weight: L2 factor applied to both Dense kernels.
      shared_layer: Optional layer applied to the output of the state layer;
        the same object is handed on to copies made with `copy`.
    """
    observation_spec = tensor_spec.TensorSpec([2], tf.float32)
    action_spec = tensor_spec.TensorSpec([1], tf.float32)
    super().__init__(
        input_tensor_spec=(observation_spec, action_spec),
        state_spec=(),
        name=None)

    def single_unit_dense(kernel_values):
      # One output unit, constant kernel, zero bias, L2-regularized kernel.
      return tf.keras.layers.Dense(
          1,
          kernel_regularizer=tf.keras.regularizers.l2(l2_regularization_weight),
          kernel_initializer=tf.constant_initializer(kernel_values),
          bias_initializer=tf.constant_initializer([[0]]))

    self._l2_regularization_weight = l2_regularization_weight
    self._value_layer = single_unit_dense([[0], [1]])
    self._shared_layer = shared_layer
    self._action_layer = single_unit_dense([[1]])

  def copy(self, name=''):
    """Returns a new MyCriticNet with the same settings; `name` is ignored."""
    del name
    ctor_kwargs = dict(
        l2_regularization_weight=self._l2_regularization_weight,
        shared_layer=self._shared_layer)
    return MyCriticNet(**ctor_kwargs)

  def call(self, inputs, step_type, network_state=()):
    """Computes flattened Q-values for an `(observation, actions)` pair.

    Args:
      inputs: Tuple `(observation, actions)`; only the first flattened entry
        of each is used, cast to float32.
      step_type: Ignored.
      network_state: Returned unchanged.

    Returns:
      A `(q_value, network_state)` tuple where `q_value` is reshaped to rank 1.
    """
    del step_type
    observation, actions = inputs
    flat_actions = tf.cast(tf.nest.flatten(actions)[0], tf.float32)
    flat_states = tf.cast(tf.nest.flatten(observation)[0], tf.float32)

    state_term = self._value_layer(flat_states)
    if self._shared_layer:
      state_term = self._shared_layer(state_term)
    action_term = self._action_layer(flat_actions)
    # With the initial kernels the state term equals the second observation
    # feature, so a bigger state scores higher.
    return tf.reshape(state_term + action_term, [-1]), network_state

In [10]:
def create_sequential_critic_net(l2_regularization_weight=0.0,
                                 shared_layer=None):
  """Builds a critic from `sequential.Sequential` and `nest_map.NestMap`.

  The observation goes through a single-unit Dense layer (kernel [[0], [1]],
  zero bias), optionally followed by `shared_layer`. The action goes through
  a single-unit Dense layer (kernel [[1]], zero bias). The two outputs are
  summed and reshaped to rank 1.

  Args:
    l2_regularization_weight: L2 factor applied to both Dense kernels.
    shared_layer: Optional layer chained after the observation Dense layer.

  Returns:
    A `sequential.Sequential` network taking an `(observation, action)` tuple.
  """
  # `nest_map` is not imported by the notebook's import cell; import it here
  # so this function does not fail with a NameError.
  from tf_agents.networks import nest_map

  value_layer = tf.keras.layers.Dense(
      1,
      kernel_regularizer=tf.keras.regularizers.l2(l2_regularization_weight),
      kernel_initializer=tf.initializers.constant([[0], [1]]),
      bias_initializer=tf.initializers.constant([[0]]))
  if shared_layer:
    value_layer = sequential.Sequential([value_layer, shared_layer])

  action_layer = tf.keras.layers.Dense(
      1,
      kernel_regularizer=tf.keras.regularizers.l2(l2_regularization_weight),
      kernel_initializer=tf.initializers.constant([[1]]),
      bias_initializer=tf.initializers.constant([[0]]))

  def sum_value_and_action_out(value_and_action_out):
    # Add the per-branch outputs and flatten the result to rank 1.
    value_out, action_out = value_and_action_out
    return tf.reshape(value_out + action_out, [-1])

  return sequential.Sequential([
      # NestMap applies value_layer to the observation and action_layer to the
      # action of the (observation, action) input tuple.
      nest_map.NestMap((value_layer, action_layer)),
      tf.keras.layers.Lambda(sum_value_and_action_out)
  ])


In [13]:
from tf_agents.networks import actor_distribution_network
from tf_agents.agents.sac import tanh_normal_projection_network


class SacAgentSample(test_utils.TestCase):
  """Sample showing how to construct and train a `SacAgent`.

  The class derives from `test_utils.TestCase`, but its `run*` methods are
  meant to be called by hand after `setUp()` has built the specs.
  """

  def setUp(self):
    """Creates the observation, time-step and action specs used by the runs."""
    super(SacAgentSample, self).setUp()
    self._obs_spec = tensor_spec.BoundedTensorSpec([2],
                                                   tf.float32,
                                                   minimum=0,
                                                   maximum=1)
    self._time_step_spec = ts.time_step_spec(self._obs_spec)
    self._action_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1)

  def runCreateAgent(self, create_critic_net_fn, skip_in_tf1):
    """Constructs a `SacAgent` with no actor network and a stub actor policy.

    Args:
      create_critic_net_fn: Zero-argument callable returning the critic
        network, e.g. `MyCriticNet` or `create_sequential_critic_net`.
      skip_in_tf1: If True and eager execution has not been enabled, the run
        is skipped (sequential networks are not supported in TF1).
    """
    if skip_in_tf1 and not common.has_eager_been_enabled():
      self.skipTest('Skipping test: sequential networks not supported in TF1')

    critic_network = create_critic_net_fn()

    # `actor_policy_ctor` has to be passed inside the SacAgent(...) call; the
    # stub policy stands in for the missing actor network.
    SacAgent(
        self._time_step_spec,
        self._action_spec,
        critic_network=critic_network,
        actor_network=None,
        actor_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
        critic_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
        alpha_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
        actor_policy_ctor=MyActorPolicy)

  def runAgentTrajectoryTrain(self):
    """Trains a `SacAgent` for one step on sampled experience and prints the loss."""
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        self._obs_spec,
        self._action_spec,
        fc_layer_params=(10,),
        continuous_projection_net=tanh_normal_projection_network
        .TanhNormalProjectionNetwork)

    agent = SacAgent(
        self._time_step_spec,
        self._action_spec,
        critic_network=MyCriticNet(),
        actor_network=actor_net,
        actor_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
        critic_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
        alpha_optimizer=tf.compat.v1.train.AdamOptimizer(0.001))

    # Spec describing one trajectory element; reward is a scalar in [0, 1].
    trajectory_spec = trajectory.Trajectory(
        step_type=self._time_step_spec.step_type,
        observation=self._time_step_spec.observation,
        action=self._action_spec,
        policy_info=(),
        next_step_type=self._time_step_spec.step_type,
        reward=tensor_spec.BoundedTensorSpec(
            [], tf.float32, minimum=0.0, maximum=1.0, name='reward'),
        discount=self._time_step_spec.discount)

    # Sample experience matching the spec, with outer dims (3, 2).
    sample_trajectory_experience = tensor_spec.sample_spec_nest(
        trajectory_spec, outer_dims=(3, 2))
    loss_info = agent.train(sample_trajectory_experience)
    tf.print(loss_info)

In [15]:
# Drive the sample by hand rather than through a unittest runner, so setUp()
# has to be called explicitly to build the specs the run* methods read.
sac_agent_1 = SacAgentSample()
sac_agent_1.setUp()
# Build an agent with the Network-based critic; no TF1 skip is requested.
sac_agent_1.runCreateAgent(create_critic_net_fn=MyCriticNet, skip_in_tf1=False)
# Run one training step on sampled experience and print the resulting loss.
sac_agent_1.runAgentTrajectoryTrain()

LossInfo(loss=3.35792017, extra=SacLossInfo(critic_loss=2.24238253, actor_loss=1.11553776, alpha_loss=0))
