In [1]:
"""
This Notebook closely follows the guide presented by Tensorflow here:
https://www.tensorflow.org/agents/tutorials/1_dqn_tutorial
"""

'\nThis Notebook closely follows the guide presented by Tensorflow here:\nhttps://www.tensorflow.org/agents/tutorials/1_dqn_tutorial\n'

In [2]:
""" Imports: """
# Tensorflow Stuff:
import tensorflow as tf
from tensorflow import keras
from keras import layers
from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import py_driver
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import sequential
from tf_agents.policies import py_tf_eager_policy
from tf_agents.policies import random_tf_policy
from tf_agents.policies.epsilon_greedy_policy import EpsilonGreedyPolicy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.specs import tensor_spec
from tf_agents.utils import common
from tf_agents.metrics import py_metrics
from tf_agents.drivers import dynamic_step_driver

import imageio
import IPython
import time
import matplotlib.pyplot as plt

In [None]:
# BM stuff:
from BMEnvironment import BMEnv
from templates.Seesaw_Template import Seesaw
from templates.Tower_Template import Tower
from templates.Blank_Template import Blank
from templates.Roof_Template import Roof
from templates.Shield_Template import Shield

from templates.PureAvoidance_Template import PureAvoidance
from observations.DoubleRaycastObservation import DoubleRayCastObservation
from rewards.NewHeightReward import NewHeightReward

In [None]:
def create_policy_eval_video(policy, env: tf_py_environment.TFPyEnvironment, py_env: BMEnv, filename, num_episodes=1, fps=60, render_obs=False):
  """ Creates a video where actions are mapped out according to the given policy

  Args:
    policy: object whose `action(time_step)` returns a policy step with an
      `.action` field.
    env: TF environment that is reset and stepped.
    py_env: environment whose `render(mode=...)` supplies the frames.
      Presumably the Python environment wrapped by `env` -- verify at the
      call site, otherwise the frames will not follow the rollout.
    filename: output path WITHOUT extension; ".mp4" is appended.
    num_episodes: number of episodes to record back to back.
    fps: frame rate handed to the imageio writer.
    render_obs: if True render with mode 'render_observation', otherwise
      with mode 'render_normal'.
  """
  mode = 'render_observation' if render_obs else 'render_normal'

  filename = filename + ".mp4"
  with imageio.get_writer(filename, fps=fps) as video:
    for _ in range(num_episodes):
      time_step = env.reset()
      # First frame of the episode, before any action has been applied.
      video.append_data(py_env.render(mode=mode))
      while not time_step.is_last():
        action_step = policy.action(time_step)
        # Step with the action itself rather than the whole policy step,
        # consistent with compute_avg_return in this notebook.
        time_step = env.step(action_step.action)
        video.append_data(py_env.render(mode=mode))

In [None]:
""" Hyper-parameters """

# --- Data collection -------------------------------------------------------
num_iterations = 15                 # passes of the outer training loop
initial_collect_steps = 10          # steps of the random policy used to seed the buffer
collect_steps_per_iteration = 500   # num_steps of the collect driver per iteration
replay_buffer_max_length = 5000     # max_length of the replay buffer

# --- Optimisation ----------------------------------------------------------
batch_size = 500                    # sample_batch_size of the replay dataset
learning_rate = 0.05                # Adam learning rate
gamma = 0.99                        # passed to DqnAgent as gamma
grad_clip = 10                      # passed to DqnAgent as gradient_clipping
target_update_period = 1            # passed to DqnAgent as target_update_period

# --- Exploration: epsilon = start_epsilon * exp_epsilon_decay_base ** iteration
start_epsilon = 0.15
exp_epsilon_decay_base = 0.99

# --- Evaluation / logging --------------------------------------------------
num_eval_episodes = 2               # episodes per call to compute_avg_return
eval_interval = 5                   # evaluate when train step % eval_interval == 0
log_interval = 5                    # print the loss when train step % log_interval == 0

In [None]:
""" Create Environments: """

# Reward and observation functions. The same two instances are handed to every
# environment built below.
# NOTE(review): if either object keeps per-episode state, sharing it between
# the training and evaluation environments may couple them -- confirm.
reward_fn = NewHeightReward()
observation_fn = DoubleRayCastObservation()

# Variant ranges used for training and for evaluation:
variant_range_train = (0.0, 0.5)
variant_range_test = (0.2, 0.2)


def _make_seesaw_env(variant_range):
    """ Build a BMEnv on a fresh Seesaw template with the shared reward/observation functions. """
    return BMEnv(
        template=Seesaw(),
        variantRange=variant_range,
        reward_fn=reward_fn,
        observation_fn=observation_fn,
    )


# Generic environment over the (0.0, 1.0) variant range; later cells read its
# action spec when sizing the Q-network.
env = _make_seesaw_env((0.0, 1.0))

# Python environments for training and evaluation:
train_py_env = _make_seesaw_env(variant_range_train)
eval_py_env = _make_seesaw_env(variant_range_test)

# TF wrappers around the Python environments:
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)


In [None]:
""" Create Agent Q-Net: """

# Width of each hidden layer, in order:
h_layers = (64, 64)

# Number of discrete actions = size of the inclusive [minimum, maximum] range
# of the generic environment's action spec:
action_tensor_spec = tensor_spec.from_spec(env.action_spec())
num_actions = 1 + action_tensor_spec.maximum - action_tensor_spec.minimum


def dense_layer(num_units):
    """ Build one ReLU hidden layer of `num_units` units for the Q-net """
    kernel_init = tf.keras.initializers.VarianceScaling(
        scale=2.0,
        mode='fan_in',
        distribution='truncated_normal',
    )
    return layers.Dense(
        units=num_units,
        activation=tf.keras.activations.relu,
        kernel_initializer=kernel_init,
    )

# One hidden layer per entry of h_layers:
dense_layers = list(map(dense_layer, h_layers))

# Linear output layer with one unit per action, small uniform weights and a
# constant bias of -0.1:
output_kernel_init = tf.keras.initializers.RandomUniform(minval=-0.03, maxval=0.03)
output_bias_init = keras.initializers.Constant(-0.1)
output_layer = layers.Dense(
    units=num_actions,
    activation=None,
    kernel_initializer=output_kernel_init,
    bias_initializer=output_bias_init,
)

# Hidden layers followed by the output layer:
q_net = sequential.Sequential([*dense_layers, output_layer])


In [None]:
""" Create Agent: """

# Adam optimizer driven by the configured learning rate:
optimizer = keras.optimizers.Adam(learning_rate=learning_rate)

# Counter handed to the agent as its train step counter:
train_step_counter = tf.Variable(0)

# DQN agent built on the training environment's specs and the Q-network.
# target_update_tau is fixed at 1.0; the remaining settings come from the
# hyper-parameter cell.
agent = dqn_agent.DqnAgent(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    gamma=gamma,
    target_update_tau=1.0,
    target_update_period=target_update_period,
    gradient_clipping=grad_clip,
    train_step_counter=train_step_counter,
)

agent.initialize()

In [None]:
def compute_avg_return(environment, policy, num_episodes=10):
    """
    Roll out `policy` in `environment` and return the mean reward per step.

    NOTE: the summed reward is divided by the total number of steps taken
    over all episodes, not by `num_episodes`. The result is therefore an
    average per-step reward rather than an average per-episode return.

    Args:
        environment: object providing `reset()` and `step(action)`, both
            returning a time step with `is_last()` and `reward`.
        policy: object whose `action(time_step)` returns a policy step with
            an `.action` field.
        num_episodes: number of episodes to roll out.

    Returns:
        Element 0 of the averaged reward converted with `.numpy()`
        (presumably the single entry of a batch of size 1 -- confirm), or
        0.0 when no step was taken at all.
    """
    total_return = 0.0
    iterations = 0
    for _ in range(num_episodes):

        time_step = environment.reset()
        episode_return = 0.0

        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
            iterations += 1
        total_return += episode_return

    # No steps (e.g. num_episodes == 0, or every episode ended on reset):
    # nothing to average, and dividing would raise ZeroDivisionError.
    if iterations == 0:
        return 0.0

    avg_return = total_return / iterations
    return avg_return.numpy()[0]

In [None]:
""" Replay Buffer """

table_name = 'uniform_table'

# Tensor spec of the items the agent's collect policy produces:
collect_data_spec = tensor_spec.from_spec(agent.collect_data_spec)

# Uniform replay buffer sized to the training environment's batch size:
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_max_length,
)

In [None]:
""" Drivers: """
metric = tf_metrics.AverageReturnMetric()

# Every collected step updates the metric and is appended to the replay buffer:
observers = [metric, replay_buffer.add_batch]

# Seed the replay buffer with initial_collect_steps steps of a random policy:
random_policy = random_tf_policy.RandomTFPolicy(
    action_spec=agent.action_spec,
    time_step_spec=train_env.time_step_spec(),
)
initial_collect_driver = dynamic_step_driver.DynamicStepDriver(
    env=train_env,
    policy=random_policy,
    observers=observers,
    num_steps=initial_collect_steps,
)
initial_collect_driver.run(time_step=train_env.reset())

# Dataset of batches of 2-step sequences sampled from the buffer, with prefetching:
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=2,
)
dataset = dataset.prefetch(3)

iterator = iter(dataset)

In [None]:
# NOTE(review): this cell contains no code; the collect driver is constructed
# inside the "Train Agent" cell that follows.
""" Initialise collect driver: """


In [None]:
""" Train Agent: """

starttime = time.time()

# Wrap the training step with common.function.
agent.train = common.function(agent.train)

agent.train_step_counter.assign(0)

# Baseline score of the untrained policy. Measured on eval_env so that it is
# comparable with every later entry appended to `returns`.
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]

time_step = train_env.reset()

# Exploration rate for data collection. The agent hands its epsilon to the
# collect policy when the agent is constructed, so assigning to
# agent._epsilon_greedy afterwards does not change the policy that is actually
# collecting. Instead an epsilon-greedy policy is built here whose epsilon is
# read from a variable each time it acts; updating the variable is what makes
# the decay take effect.
epsilon_var = tf.Variable(start_epsilon, dtype=tf.float32, trainable=False)
decaying_collect_policy = EpsilonGreedyPolicy(agent.policy, epsilon=lambda: epsilon_var)

collect_driver = dynamic_step_driver.DynamicStepDriver( env=train_env,
                                                        observers=observers,
                                                        policy=decaying_collect_policy,
                                                        num_steps=collect_steps_per_iteration   )

for iteration in range(1, num_iterations + 1):

    # Exponential decay: start_epsilon * base ** iteration.
    epsilon = start_epsilon * pow(exp_epsilon_decay_base, iteration)
    epsilon_var.assign(epsilon)

    # run() returns (final_time_step, policy_state); carry the time step into
    # the next iteration so collection continues where it stopped.
    time_step, _ = collect_driver.run(time_step=time_step)

    # One gradient step on a batch sampled from the replay buffer.
    experience, _ = next(iterator)
    train_loss = agent.train(experience).loss

    step = agent.train_step_counter.numpy()

    if step % log_interval == 0:
        print('iteration = {0}: loss = {1}'.format(step, train_loss))

    if step % eval_interval == 0:
        avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
        print('step = {0}: Average Return = {1}'.format(step, avg_return))
        returns.append(avg_return)

timetaken = time.time() - starttime
print("Time taken: " + str(timetaken))


In [None]:
# x positions: 0 plus every eval_interval-th iteration up to num_iterations.
iterations = range(0, num_iterations + 1, eval_interval)

fig, ax = plt.subplots()
ax.plot(iterations, returns)
ax.set_ylabel('Average Return')
ax.set_xlabel('Iterations')

In [None]:
# Record one episode on the evaluation environment.
# NOTE(review): this records agent.collect_policy, whereas the evaluation in
# the training cell uses agent.policy -- confirm which one is intended here.
create_policy_eval_video(
    policy=agent.collect_policy,
    env=eval_env,
    py_env=eval_py_env,
    filename="../training_videos/Shield-27-05-2022-5",
    num_episodes=1,
    render_obs=False,
)