In [5]:
!cp /content/drive/MyDrive/Research/Implementation/simtest/InterruptEnvironment.py .
!cp /content/drive/MyDrive/Research/Implementation/simtest/InterruptDriver.py .
!cp /content/drive/MyDrive/Research/Implementation/simtest/TransitionBuffer.py .

In [None]:
!sudo apt-get update
!sudo apt-get install -y xvfb ffmpeg freeglut3-dev

!pip install tf-agents[reverb]
!pip install pyglet
!pip install pygame


In [7]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import annotations

import random
import math
import abc
import tensorflow as tf
import numpy as np

import reverb

from tf_agents import environments
from tf_agents import specs
from tf_agents import trajectories
from tf_agents import networks
from tf_agents import agents
from tf_agents import utils
from tf_agents import policies
from tf_agents import replay_buffers
from tf_agents import drivers
from tf_agents import typing


from InterruptEnvironment import InterruptEnvironment as InterruptEnv
from InterruptDriver import InterruptDriver
from TransitionBuffer import TransitionBuffer

In [8]:
# --- Optimisation -----------------------------------------------------------
# Step size passed to the Adam optimizer.
learning_rate = 1e-3

# --- Collection / replay ----------------------------------------------------
# NOTE(review): apart from `learning_rate`, none of the constants in this cell
# are referenced by the visible cells of this notebook (the buffer size and
# the training loop use literal values) — confirm whether they are still needed.
replay_buffer_max_length = 1_000
initial_collect_steps = 100
collect_steps_per_iteration = 1
batch_size = 4

# --- Training schedule ------------------------------------------------------
num_iterations = 5_000
log_interval = 200
eval_interval = 1_000
num_eval_episodes = 10

In [9]:
class SimulatorEnv(InterruptEnv):
  """Interrupt-based environment backed by a TF-Agents Gym environment.

  Each call to `_resume` moves between interrupt kinds:

  * `actionRequired`: the supplied action is applied to the wrapped environment
    and a `transitionCompleted` interrupt carrying the previous time step, the
    action step and the next time step is returned.
  * `transitionCompleted`: the transition counter is incremented; the result is
    either a `terminated` interrupt (counter reached `max_transitions`) or a new
    `actionRequired` interrupt (after resetting the wrapped environment when the
    episode ended).
  * `terminated`: resuming is an error.
  """

  def __init__(self, env_name: str = 'CartPole-v0',
               max_transitions: int = 1_000_000) -> None:
    """Loads the wrapped Gym environment and initialises the base class.

    Args:
      env_name: Gym id passed to `suite_gym.load`.
      max_transitions: number of completed transitions after which `_resume`
        returns a `terminated` interrupt.
    """
    # Attributes are assigned before the base initializer runs, as the
    # original code did for `self.env`.
    self.env = environments.suite_gym.load(env_name)
    self._max_transitions = max_transitions
    self._counter = 0
    super().__init__()

  def observation_spec(self):
    """Returns the observation spec of the wrapped environment."""
    return self.env.observation_spec()

  def action_spec(self):
    """Returns the action spec of the wrapped environment."""
    return self.env.action_spec()

  def _reset(self):
    """Resets the wrapped environment and the transition counter.

    Returns:
      An `actionRequired` interrupt holding the initial time step.
    """
    time_step = self.env.reset()
    interrupt = InterruptEnv.Interrupt(time_step, actionRequired=True)
    self._counter = 0
    return interrupt

  def _resume(self, actionStep: trajectories.policy_step.PolicyStep = None):
    """Advances the simulation from the current interrupt.

    Args:
      actionStep: policy step whose `.action` is applied to the wrapped
        environment; required when the current interrupt is `actionRequired`.

    Returns:
      The next `InterruptEnv.Interrupt`.

    Raises:
      ValueError: the current interrupt requires an action and none was given.
      RuntimeError: the simulation has already terminated, or the current
        interrupt is of none of the kinds handled here.
    """
    current = self.current_interrupt()

    if current.actionRequired():
      if actionStep is None:
        raise ValueError("When interrupt is of type action, actionstep cannot be None")
      # Read the pre-step time step before stepping the wrapped environment.
      timeStep = current.timeStep()
      nextTimeStep = self.env.step(actionStep.action)
      return InterruptEnv.Interrupt(timeStep, transitionCompleted=True,
                                    nextTimeStep=nextTimeStep, actionStep=actionStep)

    if current.transitionCompleted():
      self._counter += 1
      nextTimeStep = current.nextTimeStep()
      if self._counter >= self._max_transitions:
        return InterruptEnv.Interrupt(nextTimeStep, terminated=True)
      if nextTimeStep.step_type == trajectories.time_step.StepType.LAST:
        # Episode ended: start a fresh one before asking for the next action.
        return InterruptEnv.Interrupt(self.env.reset(), actionRequired=True)
      return InterruptEnv.Interrupt(nextTimeStep, actionRequired=True)

    if current.terminated():
      raise RuntimeError("Simulation has already ended")

    # Previously this case fell through and returned None implicitly.
    raise RuntimeError("Cannot resume: current interrupt is of an unknown kind")

In [10]:
# Environment instance shared by the agent spec lookup and the drivers below.
env = SimulatorEnv()
# Reset once and keep the returned interrupt; `interrupt` is not read again by
# the visible cells.
interrupt = env.reset()

In [11]:
# Display the TimeStep tensor spec that the DQN agent is later constructed with.
env.time_step_tensor_spec()

TimeStep(
{'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'observation': BoundedTensorSpec(shape=(4,), dtype=tf.float32, name='observation', minimum=array([-4.8000002e+00, -3.4028235e+38, -4.1887903e-01, -3.4028235e+38],
      dtype=float32), maximum=array([4.8000002e+00, 3.4028235e+38, 4.1887903e-01, 3.4028235e+38],
      dtype=float32)),
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')})

# Agent

In [12]:
# Widths of the hidden Dense layers of the Q-network (one layer per entry).
fc_layer_params = (50, 20)
action_tensor_spec = env.action_tensor_spec()
# Count of discrete actions, treating the spec's minimum and maximum as
# inclusive bounds.
num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1

# Factory for the hidden layers of the Q-network: every hidden layer shares the
# same activation and kernel initializer, only the width differs.
def dense_layer(num_units):
  """Returns a ReLU-activated Keras Dense layer with `num_units` units.

  The kernel is initialised with `VarianceScaling(scale=2.0, mode='fan_in',
  distribution='truncated_normal')`.
  """
  initializer = tf.keras.initializers.VarianceScaling(
      scale=2.0, mode='fan_in', distribution='truncated_normal')
  relu = tf.keras.activations.relu
  return tf.keras.layers.Dense(
      num_units, activation=relu, kernel_initializer=initializer)

# The Q-network stacks one hidden Dense layer per entry of `fc_layer_params`
# and finishes with a linear Dense layer of `num_actions` units, so the output
# holds one Q-value per available action.
dense_layers = list(map(dense_layer, fc_layer_params))
q_values_layer = tf.keras.layers.Dense(num_actions, activation=None)
q_net = networks.sequential.Sequential([*dense_layers, q_values_layer])

In [13]:
# Adam optimizer using the `learning_rate` hyperparameter.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Variable handed to the agent as its train-step counter.
train_step_counter = tf.Variable(0)

# DQN agent built from the environment's tensor specs and the Q-network, with
# an element-wise squared TD-error loss.
agent = agents.dqn.dqn_agent.DqnAgent(
    time_step_spec=env.time_step_tensor_spec(),
    action_spec=env.action_tensor_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=utils.common.element_wise_squared_loss,
    train_step_counter=train_step_counter,
)

agent.initialize()

# Random policy test

In [14]:
# A second environment instance, used here only as the source of tensor specs
# for the uniformly random policy.
test_environment = SimulatorEnv()
random_policy = policies.random_tf_policy.RandomTFPolicy(
    time_step_spec=test_environment.time_step_tensor_spec(),
    action_spec=test_environment.action_tensor_spec(),
)

# Creating objects from Custom Driver and Buffer

In [15]:
# Buffer whose `observer` is handed to the drivers and which is sampled for
# training.
# NOTE(review): 5000 is presumably the capacity — confirm against
# TransitionBuffer; the `replay_buffer_max_length` hyperparameter is not used.
buffer = TransitionBuffer(5000)

In [16]:
# Driver wired to `env`, the random policy (wrapped as an eager Python policy)
# and the buffer's observer.
# NOTE(review): what exactly the observer receives is defined by
# InterruptDriver / TransitionBuffer — confirm there.
TestDriver = InterruptDriver(
    env,
    policies.py_tf_eager_policy.PyTFEagerPolicy(
      random_policy, use_tf_function=True),
    [buffer.observer])

## Test the driver and buffer

In [17]:
# Run the random-policy driver so the buffer holds data before sampling.
# NOTE(review): the unit of `100` (steps vs. interrupts) is defined by
# InterruptDriver.run — confirm.
TestDriver.run(100)

()

In [18]:
# Draw a sample of 5 experiences to inspect the structure the buffer returns
# (the output below shows a Trajectory).
buffer.sampleExperiences(5)

Trajectory(
{'action': <tf.Tensor: shape=(5, 2), dtype=int64, numpy=
array([[0, 0],
       [1, 1],
       [1, 1],
       [0, 0],
       [0, 0]])>,
 'discount': <tf.Tensor: shape=(5, 2), dtype=float32, numpy=
array([[1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.]], dtype=float32)>,
 'next_step_type': <tf.Tensor: shape=(5, 2), dtype=int32, numpy=
array([[1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1]], dtype=int32)>,
 'observation': <tf.Tensor: shape=(5, 2, 4), dtype=float32, numpy=
array([[[-0.05088438, -0.7365883 ,  0.02508327,  1.2102034 ],
        [-0.06561615, -0.932025  ,  0.04928734,  1.5106399 ]],

       [[-0.05193746, -0.98285425,  0.09335595,  1.5283283 ],
        [-0.07159454, -0.7889742 ,  0.12392252,  1.2661815 ]],

       [[ 0.05612925, -0.53697044, -0.16607322,  0.09783456],
        [ 0.04538984, -0.33990592, -0.16411653, -0.24229668]],

       [[ 0.00962541,  0.40473774,  0.00678688, -0.6193964 ],
        [ 0.01772016,  0.2

# Train

In [19]:
def compute_avg_return(environment, policy, num_episodes=10):
  """Averages the undiscounted episode return of `policy` over several episodes.

  Args:
    environment: object whose `reset()` and `step(action)` return a time step
      exposing `is_last()` and `reward`.
    policy: object whose `action(time_step)` result carries an `.action`.
    num_episodes: how many episodes to roll out.

  Returns:
    Element 0 of `.numpy()` called on the averaged return.
  """

  def _rollout_return():
    # Play one episode until its last step, summing the rewards on the way.
    step = environment.reset()
    collected = 0.0
    while not step.is_last():
      chosen = policy.action(step)
      step = environment.step(chosen.action)
      collected += step.reward
    return collected

  episode_returns = [_rollout_return() for _ in range(num_episodes)]
  mean_return = sum(episode_returns, 0.0) / num_episodes
  # NOTE(review): the `[0]` index presumably relies on rewards carrying a
  # leading batch dimension of size 1 — confirm against the environment used.
  return mean_return.numpy()[0]


In [20]:

# (Optional) Optimize by wrapping the training step in a graph using TF function.
# The public `train` method is wrapped, as in the TF-Agents DQN tutorial:
# reassigning the private `agent._train` after construction is not guaranteed
# to be picked up by `agent.train`, and it reaches into the agent's private API.
agent.train = utils.common.function(agent.train)

# Reset the train step.
agent.train_step_counter.assign(0)

# Create a driver to collect experience with the agent's collect policy.
collect_driver = InterruptDriver(
    env,
    policies.py_tf_eager_policy.PyTFEagerPolicy(
      agent.collect_policy, use_tf_function=True),
    [buffer.observer])

# Loop settings (values unchanged from the previously hard-coded literals).
train_iterations = 5000
collect_runs_per_iteration = 10  # argument passed to collect_driver.run
train_sample_size = 200          # argument passed to buffer.sampleExperiences
loss_log_interval = 100
return_eval_interval = 500

# Evaluate on a dedicated environment, created once. Evaluating on `env.env`
# would reset and step the very Gym instance the collect driver is in the
# middle of an episode on, so the next collected transition would no longer
# match the environment's state.
train_eval_env = environments.tf_py_environment.TFPyEnvironment(
    SimulatorEnv().env)

for iteration in range(train_iterations):

  # Collect fresh experience, then train on a sample from the buffer.
  collect_driver.run(collect_runs_per_iteration)

  experience = buffer.sampleExperiences(train_sample_size)
  train_loss = agent.train(experience, None).loss
  if iteration % loss_log_interval == 0:
    print(train_loss)
  if iteration % return_eval_interval == 0:
    print(compute_avg_return(train_eval_env, agent.policy))

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))
tf.Tensor(1.7149928, shape=(), dtype=float32)
11.7
tf.Tensor(14.041291, shape=(), dtype=float32)
tf.Tensor(6.1856856, shape=(), dtype=float32)
tf.Tensor(4.8558803, shape=(), dtype=float32)
tf.Tensor(1.1513597, shape=(), dtype=float32)
tf.Tensor(4.047057, shape=(), dtype=float32)
26.6
tf.Tensor(23.691156, shape=(), dtype=float32)
tf.Tensor(17.496176, shape=(), dtype=float32)
tf.Tensor(16.048256, shape=(), dtype=float32)
tf.Tensor(14.338504, shape=(), dtype=float32)
tf.Tensor(39.40449, shape=(), dtype=float32)
177.0
tf.Tensor(219.54013, shape=(), dtype=float32)
tf.Tensor(109.87901, shape=(), dtype=float32)
tf.Tensor(30.679028, shape=(), dtype=float32)
tf.Tensor(31.65066, shape=(), dtype=float32)
tf.Tensor(202.21959, shape=(), dtype=float32)
106.5
tf.Tensor(1

In [23]:
# Final evaluation of the greedy policy over the default number of episodes.
# NOTE(review): `env.env` is the same Gym instance the collect driver uses, so
# evaluating here changes its state; use a separate environment if collection
# is to continue afterwards.
eval_env = environments.tf_py_environment.TFPyEnvironment(env.env)
compute_avg_return(eval_env, agent.policy)

125.7