In [1]:
import base64
import imageio
import IPython
import matplotlib
import matplotlib.pyplot as plt
import PIL.Image
import pyvirtualdisplay
import tensorflow as tf

In [2]:
from tf_agents.agents.dqn import dqn_agent

In [3]:
from tf_agents.agents.dqn import q_network

In [4]:
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.environments import trajectory
from tf_agents.metrics import metric_utils, tf_metrics
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.utils import common

In [5]:
# Switch TensorFlow to its 2.x behavior through the v1 compatibility API.
# Later cells call `.numpy()` directly on tensors/variables.
tf.compat.v1.enable_v2_behavior()

In [6]:
# Start a non-visible 1400x900 virtual display (presumably so the environment
# can be rendered without a physical screen — TODO confirm it is still needed,
# the only render call in view is commented out).
# NOTE(review): the name `display` shadows IPython's built-in `display()` helper
# for the rest of the session; consider renaming if no other cell relies on it.
display = pyvirtualdisplay.Display(visible=0, size=(1400, 900)).start()

In [7]:
# Hyperparameters: every tunable value used by the cells below.
env_name = 'CartPole-v0'  # Gym environment id passed to suite_gym.load
num_iterations = 20000  # number of iterations of the training loop
initial_collect_steps = 1000  # random-policy steps collected before training starts
collect_steps_per_iteration = 1  # environment steps collected before each train call
replay_buffer_capacity = 100000  # max_length given to the replay buffer
fc_layer_params = (100,)  # forwarded to QNetwork as fc_layer_params

batch_size = 64  # sample_batch_size used when reading from the replay buffer
learning_rate = 1e-3  # learning rate of the Adam optimizer
log_interval = 200  # print the training loss every this many train steps
num_eval_episodes = 10  # episodes averaged in each policy evaluation
eval_interval = 1000  # evaluate the policy every this many train steps

In [8]:
# Load one Python environment for interactive inspection; the visible training
# cells create their own separate environment instances.
env = suite_gym.load(env_name)

In [9]:
env.reset()

TimeStep(step_type=array(0, dtype=int32), reward=array(0., dtype=float32), discount=array(1., dtype=float32), observation=array([ 0.03654184,  0.02037561, -0.001049  , -0.03980718], dtype=float32))

In [10]:
# PIL.Image.fromarray(env.render())

In [11]:
env.time_step_spec().observation

BoundedArraySpec(shape=(4,), dtype=dtype('float32'), name=None, minimum=[-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], maximum=[4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38])

In [12]:
env.action_spec()

BoundedArraySpec(shape=(), dtype=dtype('int64'), name=None, minimum=0, maximum=1)

In [13]:
# Reset the inspection environment and keep the first time step it returns.
time_step = env.reset()

In [14]:
time_step

TimeStep(step_type=array(0, dtype=int32), reward=array(0., dtype=float32), discount=array(1., dtype=float32), observation=array([-0.02990607, -0.03645543,  0.03585297,  0.02890254], dtype=float32))

In [15]:
# One of the two integer actions allowed by the action spec shown above
# (minimum=0, maximum=1).
action = 1

In [16]:
# Apply the chosen action once and keep the resulting time step for inspection.
next_time_step = env.step(action)

In [17]:
next_time_step

TimeStep(step_type=array(1, dtype=int32), reward=array(1., dtype=float32), discount=array(1., dtype=float32), observation=array([-0.03063518,  0.15813452,  0.03643103, -0.25225627], dtype=float32))

In [18]:
# Two independent instances of the same environment: one used for training and
# data collection, the other reserved for evaluation. Each Python environment
# is wrapped in a TFPyEnvironment right after it is loaded.
train_py_env = suite_gym.load(env_name)
train_env = tf_py_environment.TFPyEnvironment(train_py_env)

eval_py_env = suite_gym.load(env_name)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

In [19]:
# Build the Q-network from the training environment's observation and action
# specs; the hidden-layer configuration comes from `fc_layer_params`.
observation_spec = train_env.observation_spec()
action_spec = train_env.action_spec()

q_net = q_network.QNetwork(
    observation_spec,
    action_spec,
    fc_layer_params=fc_layer_params,
)

In [20]:
# Step counter handed to the agent; the training cell reads it back through
# `tf_agent.train_step_counter`.
train_step_counter = tf.Variable(0)
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

# Keyword options for the DQN agent, grouped so the constructor call stays short.
agent_options = dict(
    q_network=q_net,
    optimizer=optimizer,
    train_step_counter=train_step_counter,
    td_errors_loss_fn=dqn_agent.element_wise_squared_loss,
)

tf_agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    **agent_options
)
tf_agent.initialize()

In [21]:
# Alias for the agent's policy attribute (the visible training cell uses
# `tf_agent.policy` directly rather than this name).
# NOTE(review): the recorded output of the `eval_policy` display cell shows a
# bound method, not a policy object — in that tf_agents version `policy` may
# need to be called; confirm against the installed version.
eval_policy = tf_agent.policy

In [22]:
# Alias for the agent's collect-policy attribute (the visible training cell uses
# `tf_agent.collect_policy` directly rather than this name).
# NOTE(review): the recorded output of the `collect_policy` display cell shows a
# bound method, not a policy object — in that tf_agents version `collect_policy`
# may need to be called; confirm against the installed version.
collect_policy = tf_agent.collect_policy

In [53]:
eval_policy

<bound method TFAgent.policy of <tf_agents.agents.dqn.dqn_agent.DqnAgent object at 0x140806630>>

In [54]:
collect_policy

<bound method TFAgent.collect_policy of <tf_agents.agents.dqn.dqn_agent.DqnAgent object at 0x140806630>>

In [23]:
# Random policy built from the training environment's specs; used below for the
# baseline evaluation and for the initial replay-buffer fill.
random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                               train_env.action_spec())

In [24]:
def compute_avg_return(environment, policy, num_episodes=10):
    """Return the average undiscounted episode return of `policy`.

    Runs `num_episodes` complete episodes on `environment`, sums the reward of
    every step within each episode, and averages the per-episode totals.

    Args:
        environment: Object whose `reset()` and `step(action)` return a time
            step exposing `is_last()` and `reward`.
        policy: Object whose `action(time_step)` returns a value with an
            `.action` attribute, which is passed to `environment.step`.
        num_episodes: Number of episodes to run and average over.

    Returns:
        Element 0 of the averaged return after converting it with `.numpy()`.

    Raises:
        ZeroDivisionError: If `num_episodes` is 0.
    """
    total_return = 0.0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0.0
        # Step until the environment reports the final step of the episode.
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return
    avg_return = total_return / num_episodes
    # The accumulated reward carries a leading dimension; return its only entry.
    return avg_return.numpy()[0]

# Baseline: average return of the random policy on the evaluation environment.
compute_avg_return(eval_env, random_policy, num_eval_episodes)

20.6

In [25]:
# Replay buffer holding the experience collected from the training environment.
# Its data spec is the agent's `collect_data_spec`.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=tf_agent.collect_data_spec,
    # Batch size of the training environment — not the sampling `batch_size`
    # used later when the buffer is read as a dataset.
    batch_size=train_env.batch_size,
    max_length=replay_buffer_capacity
)

In [26]:
def collect_step(environment, policy, buffer=None):
    """Take one step in `environment` with `policy` and store the transition.

    Args:
        environment: Environment exposing `current_time_step()` and
            `step(action)`.
        policy: Object whose `action(time_step)` returns a value with an
            `.action` attribute.
        buffer: Object with an `add_batch(trajectory)` method that receives the
            transition. Defaults to the notebook-level `replay_buffer`, which
            keeps existing two-argument calls working unchanged.
    """
    if buffer is None:
        # Fall back to the shared notebook buffer (the original behavior).
        buffer = replay_buffer
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    # Bundle (time step, action, next time step) into one trajectory record.
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    buffer.add_batch(traj)
    
# Seed the replay buffer with `initial_collect_steps` steps taken by the random
# policy before any training happens.
for _ in range(initial_collect_steps):
    collect_step(train_env, random_policy)

In [27]:
# Expose the replay buffer as a dataset: each element is a batch of
# `batch_size` samples, every sample spanning 2 steps (`num_steps=2`).
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=2,
)
# Prefetch up to 3 elements.
dataset = dataset.prefetch(3)

# Iterator consumed by the training loop, one batch per `next()` call.
iterator = iter(dataset)

In [29]:
%%time

# Training loop: alternate data collection and one training call per iteration,
# logging the loss and periodically evaluating the policy.
# NOTE(review): the log recorded for this cell stops mid-line at step 4400 and
# the plotting cell's recorded output is `NameError: name 'returns' is not
# defined`, so the saved session did not keep this cell's results. Possible
# causes to check (unconfirmed): a kernel restart, or an IPython release in
# which names assigned under `%%time` were not kept. Re-run from a fresh kernel.

# Wrap the agent's train method with TF-Agents' `common.function`.
# NOTE(review): re-running this cell wraps the already-wrapped method again.
tf_agent.train = common.function(tf_agent.train)

# Restart the step count so the interval checks below start from zero.
tf_agent.train_step_counter.assign(0)

# Evaluate the untrained policy once; this becomes the first entry of `returns`.
avg_return = compute_avg_return(eval_env, tf_agent.policy, num_eval_episodes)
returns = [avg_return]

for _ in range(num_iterations):
    # Add new experience to the replay buffer using the agent's collect policy.
    for _ in range(collect_steps_per_iteration):
        collect_step(train_env, tf_agent.collect_policy)
        
    # Draw one batch from the dataset iterator and run one training call on it.
    experience, unused_info = next(iterator)
    train_loss = tf_agent.train(experience)
    
    # Current value of the agent's step counter, used for both interval checks.
    step = tf_agent.train_step_counter.numpy()
    
    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss.loss))
    
    if step % eval_interval == 0:
        # Periodic evaluation on the separate evaluation environment.
        avg_return = compute_avg_return(eval_env, tf_agent.policy, num_eval_episodes)
        print('step = {0}: Average Return = {1}'.format(step, avg_return))
        returns.append(avg_return)

step = 200: loss = 21.531078338623047
step = 400: loss = 56.50679397583008
step = 600: loss = 35.09357452392578
step = 800: loss = 32.73316192626953
step = 1000: loss = 34.38996887207031
step = 1000: Average Return = 198.39999389648438
step = 1200: loss = 82.58395385742188
step = 1400: loss = 132.54940795898438
step = 1600: loss = 124.92987823486328
step = 1800: loss = 47.248104095458984
step = 2000: loss = 30.976490020751953
step = 2000: Average Return = 59.29999923706055
step = 2200: loss = 336.6656188964844
step = 2400: loss = 105.05303955078125
step = 2600: loss = 30.639545440673828
step = 2800: loss = 99.55473327636719
step = 3000: loss = 602.5145263671875
step = 3000: Average Return = 61.400001525878906
step = 3200: loss = 32.09312438964844
step = 3400: loss = 646.1357421875
step = 3600: loss = 47.03107833862305
step = 3800: loss = 961.888671875
step = 4000: loss = 95.11054992675781
step = 4000: Average Return = 57.20000076293945
step = 4200: loss = 362.79473876953125
step = 4400

In [31]:
# Plot the average evaluation return against the training step.
# `returns` (filled by the training cell) holds one entry for step 0 plus one
# per completed evaluation, spaced `eval_interval` steps apart. Deriving the
# x-axis from its length keeps x and y the same size even when training was
# stopped before `num_iterations`; for a complete run the values are unchanged.
steps = range(0, len(returns) * eval_interval, eval_interval)
plt.plot(steps, returns)
plt.ylabel('Average Return')
plt.xlabel('Step')
# Trailing semicolon suppresses the echoed return value of `ylim` in the notebook.
plt.ylim(top=250);

NameError: name 'returns' is not defined