In [1]:
import tensorflow as tf
import numpy as np
from tensorflow import keras
import matplotlib.pyplot as plt
import os

from keras.optimizers.schedules import PolynomialDecay

from tf_agents.environments import suite_gym
from tf_agents.environments.tf_py_environment import TFPyEnvironment
from tf_agents.networks.q_network import QNetwork
from tf_agents.agents.dqn.dqn_agent import DqnAgent
from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer
from tf_agents.metrics import tf_metrics
from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver
from tf_agents.policies.random_tf_policy import RandomTFPolicy
from tf_agents.policies.policy_saver import PolicySaver
from tf_agents.utils.common import function, element_wise_squared_loss

In [2]:
env_name = "LunarLander-v2"

#Agent parameters
learning_rate = 1e-3
update_period = 1200
discount_factor = 1
step_counter = tf.Variable(0)

#Buffer params
replay_buffer_capacity = 10000

#Training params
num_iterations = 150000

#Dataset params
batch_size = 64

# Environment

In [3]:
train_env_py = suite_gym.load(env_name)
eval_env_py = suite_gym.load(env_name)

In [4]:
train_env = TFPyEnvironment(train_env_py)
eval_env = TFPyEnvironment(eval_env_py)

In [5]:
train_env.observation_spec()

BoundedTensorSpec(shape=(8,), dtype=tf.float32, name='observation', minimum=array(-3.4028235e+38, dtype=float32), maximum=array(3.4028235e+38, dtype=float32))

In [6]:
train_env.action_spec()

BoundedTensorSpec(shape=(), dtype=tf.int64, name='action', minimum=array(0, dtype=int64), maximum=array(3, dtype=int64))

# Network

In [5]:
fc_layer_params = (64, 32)

q_net = QNetwork(train_env.observation_spec(),
                 train_env.action_spec(),
                 fc_layer_params = fc_layer_params,
                 activation_fn = tf.keras.activations.relu,
                 kernel_initializer = "he_normal" )

# Agent

In [6]:
optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate)

epsilon_fn = PolynomialDecay(initial_learning_rate = 1.0,
                             decay_steps = 6,
                             end_learning_rate = 0.01)

agent = DqnAgent(train_env.time_step_spec(),
                 train_env.action_spec(),
                 q_network = q_net,
                 optimizer = optimizer,
                 target_update_period = update_period,
                 td_errors_loss_fn = element_wise_squared_loss, #keras.losses.Huber(reduction="none"),
                 gamma = discount_factor,
                 train_step_counter = step_counter,
                 epsilon_greedy = lambda : epsilon_fn(step_counter)) 

agent.initialize()

# Replay Buffer

In [7]:
replay_buffer = TFUniformReplayBuffer(data_spec = agent.collect_data_spec,
                                      batch_size= train_env.batch_size,
                                      max_length= replay_buffer_capacity)

In [8]:
observer = replay_buffer.add_batch

# Metrics

In [9]:
training_metrics = [tf_metrics.AverageReturnMetric(),
                 tf_metrics.NumberOfEpisodes()]
#Might need a logger to print them to screen (cf. logging in Geron book)

# Drivers

In [10]:
random_policy = RandomTFPolicy(time_step_spec = train_env.time_step_spec(),
                               action_spec = train_env.action_spec())

In [13]:
initial_collect_driver = DynamicStepDriver(train_env,
                                           random_policy,
                                           observers = [observer] + training_metrics,
                                           num_steps = replay_buffer_capacity)

In [15]:
good_policy = tf.saved_model.load("Good policies/policy_120k_huber")

pre_trained_driver = DynamicStepDriver(train_env,
                                       good_policy, 
                                       observers = [observer] + training_metrics,
                                       num_steps = replay_buffer_capacity)

In [11]:
collect_driver = DynamicStepDriver(train_env,
                                   agent.collect_policy,
                                   observers = [observer] + training_metrics,
                                   num_steps = 1)

In [18]:
#initial_collect_driver.run()
pre_trained_driver.run()

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.while_loop(c, b, vars, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.while_loop(c, b, vars))


(TimeStep(
 {'discount': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>,
  'observation': <tf.Tensor: shape=(1, 8), dtype=float32, numpy=
 array([[ 0.23598842, -0.03832475,  0.01568582, -0.00230626, -0.26576924,
         -0.00681765,  1.        ,  1.        ]], dtype=float32)>,
  'reward': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([-0.41610548], dtype=float32)>,
  'step_type': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1])>}),
 ())

# Dataset

In [12]:
dataset = replay_buffer.as_dataset(sample_batch_size = batch_size,
                                   num_steps = 2,
                                   num_parallel_calls = 3).prefetch(3)

Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


# Training Loop

In [13]:
collect_driver.run = function(collect_driver.run)
agent.train = function(agent.train)

In [14]:
policy_dir = os.path.join(os.curdir, "Saved policies")
policy_dir

'.\\Saved policies'

In [15]:
saver = PolicySaver(agent.policy)

In [23]:
agent.train_step_counter.assign(0)
iterator = iter(dataset)

time_step = None
policy_state = agent.policy.get_initial_state(train_env.batch_size)
iterator = iter(dataset)

returns = [-200]

for _ in range(num_iterations):
    time_step, policy_state = collect_driver.run(time_step, policy_state)
    experience, info = next(iterator)
    train_loss = agent.train(experience)
    
    step = agent.train_step_counter.numpy()

    if step % 1000 == 0:
        avg_return = training_metrics[0].result()
        print('step = {0}: Average Return = {1}'.format(step, avg_return))
        returns.append(avg_return)

    if step % 10000 == 0:
        iteration = step // 1000
        folder = os.path.join(policy_dir, "policy_%dk" % iteration)
        saver.save(folder)

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))
step = 1000: Average Return = -481.74420166015625
step = 2000: Average Return = -445.8077087402344
step = 3000: Average Return = -382.081787109375
step = 4000: Average Return = -328.1892395019531
step = 5000: Average Return = -227.13723754882812
step = 6000: Average Return = -194.12179565429688
step = 7000: Average Return = -284.3909606933594
step = 8000: Average Return = -274.2027893066406
step = 9000: Average Return = -258.9452209472656




step = 10000: Average Return = -249.7990264892578





FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.



FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.


INFO:tensorflow:Assets written to: .\Saved policies\policy_10k\assets


INFO:tensorflow:Assets written to: .\Saved policies\policy_10k\assets


step = 11000: Average Return = -243.97607421875
step = 12000: Average Return = -246.990966796875
step = 13000: Average Return = -238.73770141601562
step = 14000: Average Return = -251.58181762695312
step = 15000: Average Return = -246.9851837158203
step = 16000: Average Return = -191.25204467773438
step = 17000: Average Return = -134.7073516845703
step = 18000: Average Return = -125.92742919921875
step = 19000: Average Return = -110.41261291503906




step = 20000: Average Return = -114.60330963134766

FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.



FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.


INFO:tensorflow:Assets written to: .\Saved policies\policy_20k\assets


INFO:tensorflow:Assets written to: .\Saved policies\policy_20k\assets


step = 21000: Average Return = -87.09349060058594
step = 22000: Average Return = -70.41943359375
step = 23000: Average Return = -71.07567596435547
step = 24000: Average Return = -71.41526794433594
step = 25000: Average Return = -67.64946746826172
step = 26000: Average Return = -67.89710235595703
step = 27000: Average Return = -85.71308898925781
step = 28000: Average Return = -91.66325378417969
step = 29000: Average Return = -82.61857604980469




step = 30000: Average Return = -63.46210861206055

FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.



FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.


INFO:tensorflow:Assets written to: .\Saved policies\policy_30k\assets


INFO:tensorflow:Assets written to: .\Saved policies\policy_30k\assets


step = 31000: Average Return = -60.508934020996094
step = 32000: Average Return = -54.559974670410156
step = 33000: Average Return = -41.631160736083984
step = 34000: Average Return = -37.62888717651367
step = 35000: Average Return = -36.056373596191406
step = 36000: Average Return = -33.602577209472656
step = 37000: Average Return = -29.799129486083984
step = 38000: Average Return = -42.70119094848633
step = 39000: Average Return = -49.44880294799805




step = 40000: Average Return = -46.10763931274414

FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.



FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.


INFO:tensorflow:Assets written to: .\Saved policies\policy_40k\assets


INFO:tensorflow:Assets written to: .\Saved policies\policy_40k\assets


step = 41000: Average Return = -13.04986572265625
step = 42000: Average Return = -10.316640853881836
step = 43000: Average Return = -16.830333709716797
step = 44000: Average Return = -17.36126136779785
step = 45000: Average Return = -17.694995880126953
step = 46000: Average Return = -14.84802532196045
step = 47000: Average Return = -17.70541763305664
step = 48000: Average Return = -50.867984771728516
step = 49000: Average Return = -69.00650787353516




step = 50000: Average Return = -75.0960922241211

FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.



FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.


INFO:tensorflow:Assets written to: .\Saved policies\policy_50k\assets


INFO:tensorflow:Assets written to: .\Saved policies\policy_50k\assets


step = 51000: Average Return = -50.33705520629883
step = 52000: Average Return = -42.97129821777344
step = 53000: Average Return = -41.043251037597656
step = 54000: Average Return = -38.51504135131836
step = 55000: Average Return = -39.636226654052734
step = 56000: Average Return = -32.91533660888672
step = 57000: Average Return = -30.798364639282227
step = 58000: Average Return = 1.129174828529358
step = 59000: Average Return = -6.2528228759765625




step = 60000: Average Return = -1.9891357421875

FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.



FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.


INFO:tensorflow:Assets written to: .\Saved policies\policy_60k\assets


INFO:tensorflow:Assets written to: .\Saved policies\policy_60k\assets


step = 61000: Average Return = -2.3950538635253906
step = 62000: Average Return = 0.39813536405563354
step = 63000: Average Return = -4.060652256011963
step = 64000: Average Return = -6.930148124694824
step = 65000: Average Return = -8.85755443572998
step = 66000: Average Return = -2.6926300525665283
step = 67000: Average Return = -3.2231431007385254
step = 68000: Average Return = 1.2235870361328125
step = 69000: Average Return = -6.8290557861328125




step = 70000: Average Return = -9.151216506958008

FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.



FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.


INFO:tensorflow:Assets written to: .\Saved policies\policy_70k\assets


INFO:tensorflow:Assets written to: .\Saved policies\policy_70k\assets


step = 71000: Average Return = -7.913212776184082
step = 72000: Average Return = -18.00091552734375
step = 73000: Average Return = -80.1270980834961
step = 74000: Average Return = -76.35243225097656
step = 75000: Average Return = -76.50553131103516
step = 76000: Average Return = -60.067420959472656
step = 77000: Average Return = -82.22793579101562
step = 78000: Average Return = -78.3037109375
step = 79000: Average Return = -79.99504852294922




step = 80000: Average Return = -75.5724868774414

FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.



FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.


INFO:tensorflow:Assets written to: .\Saved policies\policy_80k\assets


INFO:tensorflow:Assets written to: .\Saved policies\policy_80k\assets


step = 81000: Average Return = -61.09189987182617
step = 82000: Average Return = -52.80987548828125
step = 83000: Average Return = 26.14951515197754
step = 84000: Average Return = 26.25705337524414
step = 85000: Average Return = 10.225651741027832
step = 86000: Average Return = 10.747103691101074
step = 87000: Average Return = 29.091785430908203
step = 88000: Average Return = 24.863452911376953
step = 89000: Average Return = 58.98150634765625




step = 90000: Average Return = 73.89456176757812

FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.



FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.


INFO:tensorflow:Assets written to: .\Saved policies\policy_90k\assets


INFO:tensorflow:Assets written to: .\Saved policies\policy_90k\assets


step = 91000: Average Return = 69.96000671386719
step = 92000: Average Return = 40.6036376953125
step = 93000: Average Return = 35.51144790649414
step = 94000: Average Return = 39.113494873046875
step = 95000: Average Return = 32.342201232910156
step = 96000: Average Return = 10.394719123840332
step = 97000: Average Return = -15.435979843139648
step = 98000: Average Return = -28.872112274169922
step = 99000: Average Return = 4.4522809982299805




step = 100000: Average Return = 14.322553634643555

FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.



FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.


INFO:tensorflow:Assets written to: .\Saved policies\policy_100k\assets


INFO:tensorflow:Assets written to: .\Saved policies\policy_100k\assets


step = 101000: Average Return = 12.718243598937988
step = 102000: Average Return = 9.209807395935059
step = 103000: Average Return = -3.3507606983184814
step = 104000: Average Return = -8.317544937133789
step = 105000: Average Return = -9.435965538024902
step = 106000: Average Return = -50.687644958496094
step = 107000: Average Return = -87.17062377929688
step = 108000: Average Return = -69.53208923339844
step = 109000: Average Return = -73.13108825683594




step = 110000: Average Return = -73.86192321777344

FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.



FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.


INFO:tensorflow:Assets written to: .\Saved policies\policy_110k\assets


INFO:tensorflow:Assets written to: .\Saved policies\policy_110k\assets


step = 111000: Average Return = -69.89509582519531
step = 112000: Average Return = -47.453651428222656
step = 113000: Average Return = -66.34281921386719
step = 114000: Average Return = -51.89763641357422
step = 115000: Average Return = -42.19669723510742
step = 116000: Average Return = -43.089500427246094
step = 117000: Average Return = -33.55915069580078
step = 118000: Average Return = -21.841938018798828
step = 119000: Average Return = -40.536048889160156




step = 120000: Average Return = -28.78645896911621

FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.



FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.


INFO:tensorflow:Assets written to: .\Saved policies\policy_120k\assets


INFO:tensorflow:Assets written to: .\Saved policies\policy_120k\assets


step = 121000: Average Return = -39.6362419128418
step = 122000: Average Return = -32.17646026611328
step = 123000: Average Return = -33.39101028442383
step = 124000: Average Return = -58.63360595703125
step = 125000: Average Return = -56.294525146484375
step = 126000: Average Return = -54.98761749267578
step = 127000: Average Return = -72.94304656982422
step = 128000: Average Return = -62.06354904174805
step = 129000: Average Return = -68.3818359375




step = 130000: Average Return = -63.244117736816406

FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.



FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.


INFO:tensorflow:Assets written to: .\Saved policies\policy_130k\assets


INFO:tensorflow:Assets written to: .\Saved policies\policy_130k\assets


step = 131000: Average Return = -69.79014587402344
step = 132000: Average Return = -82.70357513427734
step = 133000: Average Return = -72.41386413574219
step = 134000: Average Return = -87.46299743652344
step = 135000: Average Return = -61.382476806640625
step = 136000: Average Return = -60.29169845581055
step = 137000: Average Return = -44.928951263427734
step = 138000: Average Return = -44.17757797241211
step = 139000: Average Return = -62.03868865966797




step = 140000: Average Return = -67.98761749267578





FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.



FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.


INFO:tensorflow:Assets written to: .\Saved policies\policy_140k\assets


INFO:tensorflow:Assets written to: .\Saved policies\policy_140k\assets


step = 141000: Average Return = -51.26951217651367
step = 142000: Average Return = -70.74075317382812
step = 143000: Average Return = -51.87975311279297
step = 144000: Average Return = -84.71998596191406
step = 145000: Average Return = -63.541847229003906
step = 146000: Average Return = -72.99394226074219
step = 147000: Average Return = -87.15099334716797
step = 148000: Average Return = -89.19763946533203
step = 149000: Average Return = -104.53459167480469




step = 150000: Average Return = -94.71683502197266

FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.



FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.


INFO:tensorflow:Assets written to: .\Saved policies\policy_150k\assets


INFO:tensorflow:Assets written to: .\Saved policies\policy_150k\assets


# Evaluation and visualization

In [None]:
plt.plot(returns)
plt.grid()

In [18]:
policy = tf.saved_model.load("Good policies/policy_120k_huber")

In [None]:
policy = agent.policy

In [22]:
episode_rewards = []
for _ in range(4):
    reward = 0.0
    time_step = eval_env.reset()
    while not time_step.is_last():
        eval_env_py.render()
        action_step = policy.action(time_step)
        time_step = eval_env.step(action_step)
        reward += time_step.reward
    episode_rewards.append(reward)

In [23]:
eval_env_py.close()

In [None]:
tf.reduce_mean(episode_rewards)

In [None]:
tf.math.reduce_std(episode_rewards)

In [None]:
tf.reduce_min(episode_rewards)

In [None]:
tf.reduce_min(returns)

In [None]:
tf.reduce_mean(returns)

In [None]:
tf.math.reduce_std(returns)