### How to do it

1. Install prerequisites and import required libraries

In [1]:
# System packages, left commented out; presumably needed for the virtual
# display (xvfb) and for mp4 encoding (ffmpeg) -- uncomment on a fresh machine.
#!sudo apt-get install -y xvfb ffmpeg
!pip install gym
# imageio is pinned to an exact version.
# NOTE(review): the recorded output of this cell shows pip failing to replace
# an existing imageio 2.5.0, so the pin did not take effect in that run.
!pip install 'imageio==2.4.0'
!pip install PILLOW
!pip install pyglet
!pip install pyvirtualdisplay
!pip install tf-agents

Collecting imageio==2.4.0
  Using cached imageio-2.4.0-py3-none-any.whl
Installing collected packages: imageio
  Attempting uninstall: imageio
    Found existing installation: imageio 2.5.0
[31mERROR: Cannot uninstall 'imageio'. It is a distutils installed project and thus we cannot accurately determine which files belong to it which would lead to only a partial uninstall.[0m[31m


In [4]:
from __future__ import absolute_import, division, print_function

import base64
import imageio
import IPython
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import PIL.Image
import pyvirtualdisplay

import tensorflow as tf
from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import q_network
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
# Switch TensorFlow to its 2.x behavior through the compat.v1 entry point.
tf.compat.v1.enable_v2_behavior()

# Set up a virtual display for rendering OpenAI gym environments.
# NOTE(review): this binds the global name `display`; inside a notebook that
# may hide IPython's own `display` helper -- confirm nothing later needs it.
display = pyvirtualdisplay.Display(visible=0, size=(1400, 900)).start()

We set some hyperparameters for our problem

In [5]:
# Number of passes through the training loop (one collect + one train each).
num_iterations = 20_000

# Steps gathered with the random policy to seed the replay buffer.
initial_collect_steps = 100
# Environment steps gathered with the collect policy on every iteration.
collect_steps_per_iteration = 1
# Passed to the replay buffer as `max_length`.
replay_buffer_max_length = 100_000

# Parameters of the neural network at the core of the agent
batch_size = 64  # `sample_batch_size` used when sampling the replay buffer
learning_rate = 1e-3  # learning rate handed to the Adam optimizer
log_interval = 200  # print the training loss every `log_interval` train steps

num_eval_episodes = 10  # episodes averaged in each evaluation
eval_interval = 1000  # evaluate the policy every `eval_interval` train steps

Define functions for our problem

In [6]:
def compute_avg_return(environment, policy, num_episodes=10):
    """Average the undiscounted episode return of `policy` on `environment`.

    Each episode starts with `environment.reset()` and runs until the current
    time step reports `is_last()`; the rewards seen along the way are summed.

    Args:
        environment: object exposing `reset()` and `step(action)`, both
            returning a time step with `is_last()` and `reward`.
        policy: object whose `action(time_step)` result carries `.action`.
        num_episodes: number of episodes to run and average over.

    Returns:
        Element 0 of the averaged return after calling `.numpy()` on it.
    """

    def _play_one_episode():
        # Sum the rewards of a single episode, from reset to the last step.
        step = environment.reset()
        collected = 0.0
        while True:
            if step.is_last():
                break
            chosen = policy.action(step)
            step = environment.step(chosen.action)
            collected += step.reward
        return collected

    grand_total = 0.0
    for _episode in range(num_episodes):
        grand_total += _play_one_episode()

    mean_return = grand_total / num_episodes
    return mean_return.numpy()[0]

The following code collects experience for a single step, and then repeats that step to aggregate data over several steps

In [7]:
def collect_step(environment, policy, buffer):
    """Take one step in `environment` under `policy` and store the transition.

    Args:
        environment: environment exposing `current_time_step()` and
            `step(action)`.
        policy: object whose `action(time_step)` result carries `.action`.
        buffer: replay buffer receiving the transition through `add_batch`.
    """
    before = environment.current_time_step()
    decision = policy.action(before)
    after = environment.step(decision.action)

    # Bundle (time step, action step, next time step) into a trajectory and
    # append it to the replay buffer.
    buffer.add_batch(trajectory.from_transition(before, decision, after))
    
def collect_data(env, policy, buffer, steps):
    """Run `collect_step` `steps` times so `buffer` receives that many transitions.

    Args:
        env: environment forwarded to `collect_step`.
        policy: policy forwarded to `collect_step`.
        buffer: replay buffer forwarded to `collect_step`.
        steps: number of single-step collections to perform.
    """
    for _step_number in range(steps):
        collect_step(environment=env, policy=policy, buffer=buffer)

We use this function to produce a video that enables us to visualize what our agent is actually doing

In [29]:
def embed_mp4(filename):
    """Embed an mp4 file in the notebook as an inline HTML5 video.

    The file is read in full, base64-encoded and placed in a `data:` URI
    inside a `<video>` tag.

    Args:
        filename: path of the mp4 file to embed.

    Returns:
        An `IPython.display.HTML` object wrapping the `<video>` tag.
    """
    # Read through a context manager so the file handle is closed right away
    # instead of staying open until the garbage collector reclaims it.
    with open(filename, 'rb') as video_file:
        video = video_file.read()
    b64 = base64.b64encode(video)
    tag = '''
    <video width="640" height="480" controls>
      <source src="data:video/mp4;base64,{0}" type="video/mp4">
    Your browser does not support the video tag.
    </video>'''.format(b64.decode())

    return IPython.display.HTML(tag)

def create_policy_eval_video(policy, filename, num_episodes=5, fps=30,
                             tf_env=None, py_env=None):
    """Record `policy` acting in an environment and embed the resulting mp4.

    Args:
        policy: policy whose `action(time_step)` result carries `.action`.
        filename: output path without extension; ".mp4" is appended.
        num_episodes: number of episodes to record.
        fps: frame rate passed to the video writer.
        tf_env: environment that is reset and stepped. Defaults to the
            notebook-level `eval_env`.
        py_env: environment whose `render()` supplies the frames. Defaults to
            the notebook-level `eval_py_env`. It should be the environment
            wrapped by `tf_env`; otherwise the rendered frames will not
            correspond to the steps being taken.

    Returns:
        The result of `embed_mp4` for the written file.
    """
    # Resolve the defaults at call time: the notebook defines `eval_env` and
    # `eval_py_env` in a later cell than this function.
    if tf_env is None:
        tf_env = eval_env
    if py_env is None:
        py_env = eval_py_env

    filename = filename + ".mp4"
    with imageio.get_writer(filename, fps=fps) as video:
        for _ in range(num_episodes):
            time_step = tf_env.reset()
            # Capture the initial frame before any action is taken.
            video.append_data(py_env.render())
            while not time_step.is_last():
                action_step = policy.action(time_step)
                time_step = tf_env.step(action_step.action)
                video.append_data(py_env.render())
    return embed_mp4(filename)

We now set up our environment

In [9]:
# Gym environment id; the training and evaluation environments below are
# loaded from this same name.
env_name = 'CartPole-v0'
env = suite_gym.load(env_name)
# Last expression of the cell, so the initial TimeStep is shown as its output.
env.reset()

TimeStep(
{'discount': array(1., dtype=float32),
 'observation': array([-0.00604147, -0.01121802,  0.01685468,  0.04287687], dtype=float32),
 'reward': array(0., dtype=float32),
 'step_type': array(0, dtype=int32)})

We also split the training and evaluation environments and apply wrappers to them

In [10]:
# Two separate Python environments: one for collecting training data and one
# for evaluation (the evaluation one is also rendered for the videos).
train_py_env = suite_gym.load(env_name)
eval_py_env = suite_gym.load(env_name)

# TFPyEnvironment wrappers; these are what the agent, the policies and the
# helper functions above step through.
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

We will now define the network which acts as the backbone of the learning algorithm for our agent:

In [11]:
# Passed to QNetwork as `fc_layer_params`; presumably one entry per fully
# connected hidden layer (here a single layer of 100 units) -- see TF-Agents docs.
fc_layer_params = (100,)

# Q-network sized from the training environment's observation and action specs.
q_net = q_network.QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=fc_layer_params)

# Adam optimizer using the learning rate from the hyperparameter cell.
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

# Handed to the agent as `train_step_counter`; the training loop reads it to
# decide when to log and when to evaluate.
train_step_counter = tf.Variable(0)

Now we instantiate a DQN agent

In [12]:
# DQN agent built from the training environment's specs, the Q-network, the
# optimizer and the step counter created above; TD errors are scored with an
# element-wise squared loss.
agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)

# Must run before the agent's policies are used or the agent is trained.
agent.initialize()

Set up the policies

In [13]:
# Convenience aliases for the agent's two policies.
# NOTE: the cells below read `agent.policy` and `agent.collect_policy`
# directly rather than going through these names.
eval_policy = agent.policy
collect_policy = agent.collect_policy

We also use a random policy as a benchmark

In [14]:
# Random policy built from the training environment's specs; used below to
# seed the replay buffer and as the baseline in the comparison video.
random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(), train_env.action_spec())

Create an example environment

In [16]:
# Stand-alone example environment. It is loaded from `env_name` (set in the
# environment cell above) instead of repeating the id literal, so every
# environment in the notebook follows a single setting.
example_environment = tf_py_environment.TFPyEnvironment(suite_gym.load(env_name))

# Reset it to obtain the first time step.
time_step = example_environment.reset()

The replay buffer tracks data collected from the environment, and is used for training

In [17]:
# Replay buffer whose element layout follows the agent's `collect_data_spec`.
# Its batch size mirrors the training environment's, and its capacity comes
# from `replay_buffer_max_length` in the hyperparameter cell.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_max_length)

We use our random policy here to seed the replay buffer with some initial experience before training starts

In [18]:
# Fill the replay buffer with `initial_collect_steps` random-policy steps so
# the first training batches have data to sample from.
collect_data(train_env, random_policy, replay_buffer, initial_collect_steps)

We access the replay buffer through a dataset pipeline that samples batches of experience for training

In [19]:
# Dataset view over the replay buffer: each element is a batch of `batch_size`
# samples, and `num_steps=2` makes every sample span two steps (presumably
# adjacent ones, as needed for the DQN target -- see TF-Agents docs).
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=2).prefetch(3)

# Iterator consumed by the training loop through `next(iterator)`.
iterator = iter(dataset)

Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


During training, we alternate between collecting data from the environment and using that data to train our agent

In [23]:
# Wrap the train method with `common.function` (presumably to run it as a
# compiled TensorFlow function -- see TF-Agents docs).
# NOTE(review): re-running this cell wraps the already wrapped method again.
agent.train = common.function(agent.train)

# Reset the train step
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
# History of evaluation results, starting with the untrained policy.
returns = [avg_return]

for _ in range(num_iterations):
    
    # Collect a few steps using collect_policy and save to the replay buffer
    collect_data(train_env, agent.collect_policy, replay_buffer, collect_steps_per_iteration)
    
    # Sample a batch of data from the buffer and update the agent's network
    experience, unused_info = next(iterator)
    train_loss = agent.train(experience).loss
    
    # Current value of the agent's train step counter, used for the
    # logging and evaluation schedules below.
    step = agent.train_step_counter.numpy()
    
    # Report the training loss every `log_interval` steps.
    if step % log_interval == 0:
        print('step = {0}: loss={1}'.format(step, train_loss))
        
    # Evaluate the current policy every `eval_interval` steps and record it.
    if step % eval_interval == 0:
        avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
        print('step = {0}: Average Return = {1}'.format(step, avg_return))
        returns.append(avg_return)

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))
step = 200: loss=6.787779808044434
step = 400: loss=18.682899475097656
step = 600: loss=61.56465148925781
step = 800: loss=145.1483154296875
step = 1000: loss=11.662643432617188
step = 1000: Average Return = 52.599998474121094
step = 1200: loss=16.647485733032227
step = 1400: loss=301.2387390136719
step = 1600: loss=122.9173355102539
step = 1800: loss=384.86639404296875
step = 2000: loss=149.59683227539062
step = 2000: Average Return = 20.600000381469727
step = 2200: loss=290.2342529296875
step = 2400: loss=121.64823150634766
step = 2600: loss=158.031982421875
step = 2800: loss=213.07003784179688
step = 3000: loss=155.46835327148438
step = 3000: Average Return = 27.100000381469727
step = 3200: loss=43.943504333496094
step = 3400: loss=37.83335876464844
ste

We can now observe both the random policy agent and our trained agent!

In [30]:
# Baseline: record the random policy; the file is written as "random_agent.mp4"
# and the embedded player is the cell's output.
create_policy_eval_video(random_policy, "random_agent")



In [31]:
# Record the trained agent's policy for comparison; the file is written as
# "trained_agent.mp4" and the embedded player is the cell's output.
create_policy_eval_video(agent.policy, "trained_agent")

