<a href="https://colab.research.google.com/github/rajdeepd/tensorflow_2.0_book_code/blob/master/ch09/Sample_DQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!sudo apt-get install -y xvfb ffmpeg
!pip install 'imageio==2.4.0'
#!pip install pyvirtualdisplay
!pip install tf-agents

Reading package lists... Done
Building dependency tree       
Reading state information... Done
ffmpeg is already the newest version (7:3.4.8-0ubuntu0.2).
The following NEW packages will be installed:
  xvfb
0 upgraded, 1 newly installed, 0 to remove and 31 not upgraded.
Need to get 784 kB of archives.
After this operation, 2,270 kB of additional disk space will be used.
Ign:1 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 xvfb amd64 2:1.19.6-1ubuntu4.8
Err:1 http://security.ubuntu.com/ubuntu bionic-updates/universe amd64 xvfb amd64 2:1.19.6-1ubuntu4.8
  404  Not Found [IP: 91.189.88.152 80]
E: Failed to fetch http://security.ubuntu.com/ubuntu/pool/universe/x/xorg-server/xvfb_1.19.6-1ubuntu4.8_amd64.deb  404  Not Found [IP: 91.189.88.152 80]
E: Unable to fetch some archives, maybe run apt-get update or try with --fix-missing?
Collecting imageio==2.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/ac/64/8e2bb6aac43d6ed7c2d9514320b43d5e80c00f150ee2b9408aee24

In [22]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from absl.testing import parameterized
import tensorflow as tf  # pylint: disable=g-explicit-tensorflow-version-import

import tf_agents
import sys
from tf_agents.agents.dqn import dqn_agent
from tf_agents.networks import network
from tf_agents.networks import q_network
from tf_agents.networks import sequential
from tf_agents.networks import test_utils as networks_test_utils
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import policy_step
from tf_agents.trajectories import test_utils as trajectories_test_utils
from tf_agents.trajectories import time_step as ts
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
from tf_agents.utils import test_utils

In [19]:
class DummyNet(network.Network):
  """Tiny network made of a single Dense layer with constant initial weights.

  The layer has one output unit per action, so its outputs can be read as
  per-action values whose numbers are easy to derive by hand.
  """

  def __init__(self,
               observation_spec,
               action_spec,
               l2_regularization_weight=0.0,
               name=None):
    """Creates the network.

    Args:
      observation_spec: Spec of the observations fed to the network.
      action_spec: Bounded action spec; `maximum - minimum + 1` gives the
        number of output units.
      l2_regularization_weight: L2 factor applied to the Dense kernel.
      name: Optional network name.
    """
    super(DummyNet, self).__init__(
        observation_spec, state_spec=(), name=name)
    num_actions = action_spec.maximum - action_spec.minimum + 1

    # Constant initializers make every output reproducible by hand.
    kernel_init = tf.constant_initializer([[num_actions, 1], [1, 1]])
    bias_init = tf.constant_initializer([[1], [1]])
    l2_penalty = tf.keras.regularizers.l2(l2_regularization_weight)

    dense = tf.keras.layers.Dense(
        num_actions,
        kernel_regularizer=l2_penalty,
        kernel_initializer=kernel_init,
        bias_initializer=bias_init)

    # Custom layers are stored on the instance so that they can be serialized
    # through the Checkpointable API.
    self._dummy_layers = [dense]

  def call(self, inputs, step_type=None, network_state=()):
    """Casts `inputs` to float32 and feeds them through the stored layers.

    Args:
      inputs: Observations to evaluate.
      step_type: Ignored.
      network_state: Passed through unchanged.

    Returns:
      A tuple `(outputs, network_state)`.
    """
    del step_type  # Not used by this network.
    outputs = tf.cast(inputs, tf.float32)
    for dense in self._dummy_layers:
      outputs = dense(outputs)
    return outputs, network_state

In [13]:
class DqnAgentBase(tf.test.TestCase, parameterized.TestCase):
  """Shared fixture holding the observation, time-step and action specs."""

  def setUp(self):
    """Builds the specs used by the agent samples derived from this class."""
    super(DqnAgentBase, self).setUp()

    # Observations are float32 vectors of length 2.
    observation_spec = tensor_spec.TensorSpec(shape=[2], dtype=tf.float32)
    # Scalar int32 action bounded to the range [0, 1].
    action_spec = tensor_spec.BoundedTensorSpec(
        shape=(), dtype=tf.int32, minimum=0, maximum=1)

    self._observation_spec = observation_spec
    self._time_step_spec = ts.time_step_spec(observation_spec)
    self._action_spec = action_spec

### DqnAgent 
#### With Changed Optimal Actions

 Using the kernel initializer `[[2, 1], [1, 1]]` and bias initializer
 `[[1], [1]]` from DummyNet above, we can calculate the following values:
 
 ```
 Q-value for first observation/action pair: 2 * 1 + 1 * 2 + 1 = 5
 Q-value for second observation/action pair: 1 * 3 + 1 * 4 + 1 = 8
 (Here we use the second column of the kernel, [1, 1], since the action chosen
 for the second observation is 1 rather than 0; the kernel is symmetric, so its
 second row holds the same values.)
 ```

 For the target Q-values, note that the next observations are `[-5, 6]` and `[-7, 8]`
 (their first components are `-5` and `-7` rather than `5` and `7`), so it is better to use action `1` with a kernel of `[1, 1]` instead of action `0` with a kernel of `[2, 1]`.

 ```
 Target Q-value for first next_observation: 1 * -5 + 1 * 6 + 1 = 2
 Target Q-value for second next_observation: 1 * -7 + 1 * 8 + 1 = 2
 TD targets: 10 + 0.9 * 2 = 11.8 and 20 + 0.9 * 2 = 21.8
 TD errors: 11.8 - 5 = 6.8 and 21.8 - 8 = 13.8
 TD loss: 6.3 and 13.3 (Huber loss subtracts 0.5)
 Overall loss: (6.3 + 13.3) / 2 = 9.8
 ```


In [20]:
class DqnAgentSampleOne(DqnAgentBase):
  """Worked example: loss of a DdqnAgent on a hand-computable two-item batch.

  The specs used here are created by DqnAgentBase.setUp(), so setUp() must
  have been called before runLossWithChangedOptimalActions().
  """

  def runLossWithChangedOptimalActions(self):
    """Computes the agent loss for a fixed batch, prints it and checks it.

    Returns:
      The evaluated loss value.

    Raises:
      AssertionError: If the loss differs from the hand-derived value 9.8
        worked out in the notebook text.
    """
    q_net = DummyNet(self._observation_spec, self._action_spec)
    # Only the loss is computed below and no training step is run, so the
    # agent is built without an optimizer.
    agent = dqn_agent.DdqnAgent(
        self._time_step_spec,
        self._action_spec,
        q_network=q_net,
        optimizer=None)

    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_steps = ts.restart(observations, batch_size=2)

    actions = tf.constant([0, 1], dtype=tf.int32)
    action_steps = policy_step.PolicyStep(actions)

    rewards = tf.constant([10, 20], dtype=tf.float32)
    discounts = tf.constant([0.9, 0.9], dtype=tf.float32)

    # The first component of each next observation is negative (-5 and -7).
    next_observations = tf.constant([[-5, 6], [-7, 8]], dtype=tf.float32)
    next_time_steps = ts.transition(next_observations, rewards, discounts)

    experience = trajectories_test_utils.stacked_trajectory_from_transition(
        time_steps, action_steps, next_time_steps)

    loss, _ = agent._loss(experience)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    loss_value = self.evaluate(loss)
    tf.print("loss:", loss, output_stream=sys.stdout)

    # Overall loss derived by hand: (6.3 + 13.3) / 2 = 9.8.
    expected_loss = 9.8
    self.assertAllClose(loss_value, expected_loss)
    return loss_value


In [23]:
# Drive the sample by hand instead of through a test runner: create the
# instance, call setUp() explicitly so the specs it defines exist, then
# compute and print the loss.
sampleOne = DqnAgentSampleOne()
sampleOne.setUp()
sampleOne.runLossWithChangedOptimalActions()

loss: 9.79999924


### Multiple Episodes