[Reinforcement Learning TF-Agents](https://colab.research.google.com/drive/1FXh1BQgMI5xE1yIV1CQ25TyRVcxvqlbH?usp=sharing)

In [None]:
import tensorflow as tf
from tensorflow import keras

import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
# nice plot figures
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

import matplotlib.animation as animation
# smooth animations
mpl.rc('animation', html='jshtml')

import PIL
import os

import gym
import tf_agents
from tf_agents.environments import suite_atari, suite_gym

from tf_agents.environments.atari_preprocessing import AtariPreprocessing
from tf_agents.environments.atari_wrappers import FrameStack4
from tf_agents.environments.tf_py_environment import TFPyEnvironment

from tf_agents.networks.q_network import QNetwork
from tf_agents.agents.dqn.dqn_agent import DqnAgent
from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer
from tf_agents.metrics import tf_metrics
from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver
from tf_agents.policies.random_tf_policy import RandomTFPolicy
from tf_agents.utils.common import function

In [None]:
# functions to plot animations on a per frame basis
def update_scene(num, frames, patch):
    """Show frame number `num` in the image artist `patch`.

    Used as the per-frame callback of `FuncAnimation`; returns a 1-tuple
    holding the artist that was updated.
    """
    current_frame = frames[num]
    patch.set_data(current_frame)
    return (patch,)

def plot_animation(frames, repeat=False, interval=40):
    """Build a matplotlib animation that plays `frames` one after another.

    Args:
        frames: indexable sequence of images accepted by `plt.imshow`.
        repeat: forwarded to `FuncAnimation` (loop when the sequence ends).
        interval: forwarded to `FuncAnimation` (delay between frames).

    Returns:
        The `FuncAnimation` object.
    """
    figure = plt.figure()
    image = plt.imshow(frames[0])
    plt.axis('off')
    animation_options = dict(
        fargs=(frames, image),
        frames=len(frames),
        repeat=repeat,
        interval=interval,
    )
    movie = animation.FuncAnimation(figure, update_scene, **animation_options)
    # Close the current figure so only the animation itself gets displayed.
    plt.close()
    return movie

# save an agent's demo (after training)
# Frames rendered so far; filled by `save_frames`.
saved_frames = []
def save_frames(trajectory):
    """Observer callback: render the current frame and store it.

    `trajectory` is ignored. The frame comes from the module-level `tf_env`
    (first wrapped Python environment), not from an argument.
    """
    frame = tf_env.pyenv.envs[0].render(mode="rgb_array")
    saved_frames.append(frame)

def play_game_demo(tf_env, the_agent, obs_list, n_steps):
    """Run the agent's policy for `n_steps` steps while recording frames.

    Args:
        tf_env: environment handed to the driver.
        the_agent: agent whose `.policy` is used to pick actions.
        obs_list: list of extra observers, called after `save_frames`.
        n_steps: number of steps the driver runs.
    """
    demo_observers = [save_frames] + obs_list
    demo_driver = DynamicStepDriver(
        tf_env,
        the_agent.policy,
        observers=demo_observers,
        num_steps=n_steps)
    # The final time step and policy state are not needed by the caller.
    demo_driver.run()

def save_animated_gif(frames): # saved_frames is passed in
    """Write the first 150 frames to images/rl/breakout.gif as a looping GIF.

    Args:
        frames: sequence of image arrays accepted by `PIL.Image.fromarray`
            (typically the module-level `saved_frames` list).
    """
    # `import PIL` alone does not guarantee the Image submodule is loaded.
    import PIL.Image

    image_path = os.path.join("images", "rl", "breakout.gif")
    # Create the output directory up front; saving fails if it is missing.
    os.makedirs(os.path.dirname(image_path), exist_ok=True)
    frame_images = [PIL.Image.fromarray(frame) for frame in frames[:150]]
    frame_images[0].save(image_path, format='GIF',
                         append_images=frame_images[1:],
                         save_all=True,
                         duration=30,
                         loop=0)
    
# %%html
# <img src="images/rl/breakout.gif" /> runs the gif in a jupyter/colab environment

In [None]:
# 8

# LunarLander needs the Box2D extra to be installed first:
#   pip install gym[box2d]
# Throwaway environment used only to inspect the observation and action spaces.
test_env = gym.make("LunarLander-v2")

In [None]:
test_env # the repr shows the environment is wrapped in a TimeLimit wrapper

<TimeLimit<LunarLander<LunarLander-v2>>>

In [None]:
test_env.reset() # returns the initial observation: an array of 8 float32 values

array([ 0.00610943,  1.4206164 ,  0.6188056 ,  0.43093634, -0.00707254,
       -0.14016892,  0.        ,  0.        ], dtype=float32)


From the source code, we can see that each 8D observation (x, y, h, v, a, w, l, r) corresponds to:

+ x,y: the coordinates of the spaceship. It starts at a random location near (0, 1.4) and must land near the target at (0, 0).
+ h,v: the horizontal and vertical speed of the spaceship. It starts with a small random speed.
+ a,w: the spaceship's angle and angular velocity.
+ l,r: whether the left or right leg touches the ground (1.0) or not (0.0).

In [None]:
print(test_env.observation_space) # unbounded float32 Box with 8 components
print(test_env.action_space, test_env.action_space.n) # Discrete space with 4 possible actions


Box(-inf, inf, (8,), float32)
Discrete(4) 4


According to the environment's page (https://gym.openai.com/envs/LunarLander-v2/), these actions are:

+ do nothing
+ fire left orientation engine
+ fire main engine
+ fire right orientation engine

In [None]:
# PG REINFORCE algorithm

# Start from a clean Keras state and seed both RNGs for reproducibility.
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

# Network sizes come straight from the environment's spaces.
n_inputs = test_env.observation_space.shape[0]
n_outputs = test_env.action_space.n

# Policy network: three ReLU hidden layers of 32 units followed by a softmax
# that outputs one probability per action.
model = keras.models.Sequential(
    [keras.layers.Dense(32, activation="relu", input_shape=[n_inputs])]
    + [keras.layers.Dense(32, activation="relu") for _ in range(2)]
    + [keras.layers.Dense(n_outputs, activation="softmax")]
)

In [None]:
# play multiple episodes, exploring the environment randomly and recording 
# gradients and rewards

def play_one_step(env, obs, model, loss_fn):
  """Sample one action from the policy, apply it, and record its gradients.

  Args:
    env: Gym environment whose `step` returns (obs, reward, done, info).
    obs: current observation (a single, un-batched array).
    model: Keras model mapping a batch of observations to action probabilities.
    loss_fn: loss taking (sampled action, predicted probabilities).

  Returns:
    (next_obs, reward, done, grads), where `grads` are the gradients of the
    loss with respect to `model.trainable_variables`, treating the sampled
    action as the target.
  """
  with tf.GradientTape() as tape:
    probas = model(obs[np.newaxis])
    # tf.random.categorical samples from log-probabilities; epsilon avoids log(0).
    logits = tf.math.log(probas + keras.backend.epsilon())
    action = tf.random.categorical(logits, num_samples=1)
    loss = tf.reduce_mean(loss_fn(action, probas))
  grads = tape.gradient(loss, model.trainable_variables)
  # `action` has shape [1, 1]; hand the environment the plain integer action.
  obs, reward, done, _info = env.step(action[0, 0].numpy())
  return obs, reward, done, grads

def play_multiple_episodes(env, n_episodes, n_max_steps, model, loss_fn):
  """Play several episodes and collect every step's reward and gradients.

  Args:
    env: Gym environment; reset at the start of each episode.
    n_episodes: number of episodes to play.
    n_max_steps: upper bound on the number of steps per episode.
    model: policy model passed through to `play_one_step`.
    loss_fn: loss function passed through to `play_one_step`.

  Returns:
    (all_rewards, all_grads): two lists with one entry per episode, each
    entry being the list of per-step rewards / per-step gradient lists.
  """
  all_rewards, all_grads = [], []
  for episode in range(n_episodes):
    current_rewards, current_grads = [], []
    obs = env.reset()
    for step in range(n_max_steps):
      obs, reward, done, grads = play_one_step(env, obs, model, loss_fn)
      current_rewards.append(reward)
      current_grads.append(grads)
      if done:
        break
    # Record this episode before the next one resets the per-episode lists.
    all_rewards.append(current_rewards)
    all_grads.append(current_grads)
  return all_rewards, all_grads

In [None]:
# compute sum of future discounted rewards and standardize to differentiate
# good and bad decisions

def discount_rewards(discounted, discount_rate):
  """Return the discounted sum of future rewards for each step.

  Args:
    discounted: sequence of per-step rewards for one episode (not modified).
    discount_rate: factor applied to each later step's return.

  Returns:
    A float NumPy array of the same length where element `i` equals
    reward[i] + discount_rate * result[i + 1].
  """
  # Work on a float copy: an integer array would silently truncate the
  # fractional discounted values written back into it.
  discounted = np.array(discounted, dtype=float)
  # Walk backwards so each step can reuse the already-discounted next step.
  for step in range(len(discounted) - 2, -1, -1):
    discounted[step] += discounted[step + 1] * discount_rate
  return discounted

def discount_and_normalize_rewards(all_rewards, discount_rate):
  """Discount each episode's rewards, then standardize across all episodes.

  Args:
    all_rewards: list of per-episode reward sequences.
    discount_rate: discount factor passed to `discount_rewards`.

  Returns:
    A list with one array per episode: the discounted rewards minus the mean
    over all steps of all episodes, divided by their standard deviation.
  """
  per_episode_returns = []
  for episode_rewards in all_rewards:
    per_episode_returns.append(discount_rewards(episode_rewards, discount_rate))

  # Statistics are computed over every step of every episode pooled together.
  pooled = np.concatenate(per_episode_returns)
  mean_return = pooled.mean()
  std_return = pooled.std()

  normalized = []
  for episode_returns in per_episode_returns:
    normalized.append((episode_returns - mean_return) / std_return)
  return normalized

In [None]:
# Hyper-parameters for the policy-gradient run; the names match the
# parameters of train() and are presumably meant to be passed to it.
n_iterations = 200  # number of training iterations (policy updates)
n_episodes_per_update = 16  # episodes played before each update
n_max_steps = 1000  # cap on the number of steps in one episode
discount_rate = 0.99  # discount factor applied to future rewards

In [None]:
env = gym.make("LunarLander-v2")

# `learning_rate` is the supported keyword; the `lr` alias is deprecated.
optimizer = keras.optimizers.Nadam(learning_rate=0.005)
# The model outputs one probability per action, so a categorical
# cross-entropy applies. The target is a single action index rather than a
# one-hot vector, hence the sparse variant.
loss_fn = keras.losses.sparse_categorical_crossentropy
env.seed(42)

[42]

In [None]:
# this will take very long, so I'm not calling it for the sake of my computer's mental health
def train(n_iterations, env, n_episodes_per_update, n_max_steps, model, loss_fn, discount_rate):
  """Train `model` with the REINFORCE policy-gradient algorithm.

  Each iteration plays `n_episodes_per_update` episodes, weights every step's
  gradients by its normalized discounted return, averages them per trainable
  variable and applies one update with the module-level `optimizer`.

  Args:
    n_iterations: number of policy updates to perform.
    env: Gym environment to play in.
    n_episodes_per_update: episodes played before each update.
    n_max_steps: cap on the number of steps per episode.
    model: Keras policy model, updated in place.
    loss_fn: loss used to compute the per-step gradients.
    discount_rate: discount factor for future rewards.

  Returns:
    List with the mean undiscounted episode reward of every iteration
    (useful for plotting the learning curve).
  """
  mean_rewards = []
  for iteration in range(n_iterations):
    all_rewards, all_grads = play_multiple_episodes(env, n_episodes_per_update, n_max_steps, model, loss_fn)

    # for plotting the learning curve with undiscounted rewards
    mean_reward = sum(map(sum, all_rewards)) / n_episodes_per_update
    # "\r" returns to the start of the line so the progress text is overwritten in place
    print("\rIteration: {}/{}, mean reward: {:.1f}  ".format(
            iteration + 1, n_iterations, mean_reward), end="")
    mean_rewards.append(mean_reward)

    all_discounted_rewards = discount_and_normalize_rewards(all_rewards, discount_rate)
    all_mean_grads = []
    for var_index in range(len(model.trainable_variables)):
      mean_grads = tf.reduce_mean(
              [final_reward * all_grads[episode_index][step][var_index]
              for episode_index, final_rewards in enumerate(all_discounted_rewards)
                  for step, final_reward in enumerate(final_rewards)], axis=0)
      # One averaged gradient per trainable variable, in variable order.
      all_mean_grads.append(mean_grads)
    optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))
  return mean_rewards

In [None]:
# 9 TF-Agents SpaceInvaders-v4

# Load the Atari game through TF-Agents with two Gym wrappers applied:
# AtariPreprocessing and FrameStack4 (presumably frame preprocessing and
# stacking of 4 frames — see the TF-Agents docs for the exact behavior).
environment_name = "SpaceInvaders-v4"
env = suite_atari.load(
    environment_name,
    max_episode_steps=27000,
    gym_env_wrappers=[AtariPreprocessing, FrameStack4]
)
env

<tf_agents.environments.atari_wrappers.AtariTimeLimit at 0x7f7f337f0950>

+ environment ✓
+ driver ✓
+ observer(s) ✓
+ replay buffer ✓
+ dataset ✓
+ agent with collect policy ✓
+ DQN ✓
+ training loop ✓

In [None]:
# environment officially built: wrap the Python environment in a
# TFPyEnvironment, which is what the network, agent and drivers below use
tf_env = TFPyEnvironment(env)

In [None]:
# Architecture of the Q-network.
dropout_params = [0.4]  # one dropout value for the single fully connected layer
fc_params = [512]  # one fully connected layer with 512 units
# One tuple per convolutional layer; presumably (filters, kernel_size, stride)
# — confirm against the QNetwork documentation.
conv_params = [(32, (8, 8), 5),
               (64, (4, 4), 4),
               (64, (3, 3), 1),]
# Observations arrive as uint8; cast to float32 and scale by 1/255.
preprocessing_layer = keras.layers.Lambda(lambda obs: tf.cast(obs, np.float32) / 255.) # uint8 beforehand

# Q-network built from the environment's observation and action specs.
dqn = QNetwork(
    tf_env.observation_spec(),
    tf_env.action_spec(),
    preprocessing_layers=preprocessing_layer,
    conv_layer_params=conv_params,
    fc_layer_params=fc_params,
    dropout_layer_params=dropout_params,
    activation_fn=keras.activations.relu,
)

In [None]:
# dqn agent with collect policy officially built
update_period = 4  # number of collect steps per driver run (see the drivers below)
train_step = tf.Variable(0)  # training-step counter handed to the agent
# Despite its name this is a decay *schedule*, not a policy: a PolynomialDecay
# (normally a learning-rate schedule) reused to decay epsilon from 1.0 to
# 0.01 over 250000 // update_period training steps.
epsilon_greedy_policy = keras.optimizers.schedules.PolynomialDecay( 
    initial_learning_rate=1.0, 
    decay_steps=250000 // update_period, 
    end_learning_rate=0.01, 
)

dqn_agent = DqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    q_network=dqn,
    optimizer=keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9, momentum=0.0, epsilon=1e-07, centered=False),
    train_step_counter=train_step,
    gamma=0.99, 
    td_errors_loss_fn=keras.losses.Huber(reduction="none"),
    target_update_period=2000,
    # Evaluated lazily, so epsilon follows the current value of train_step.
    epsilon_greedy=lambda: epsilon_greedy_policy(train_step)
)
dqn_agent.initialize()

In [None]:
# uniform replay buffer officially built: it stores items matching the
# agent's collect_data_spec, with the same batch size as the environment
replay_buffer = TFUniformReplayBuffer(
    dqn_agent.collect_data_spec,
    batch_size = tf_env.batch_size,
    max_length=100000,
)

In [None]:
# Observer that writes collected experience into the replay buffer; it is
# simply the buffer's bound add_batch method.
replay_buffer_observer = replay_buffer.add_batch

In [None]:
# observers + metrics officially built: metric objects that are passed to
# the collect driver as observers
training_metrics = [
  tf_metrics.AverageEpisodeLengthMetric(),
  tf_metrics.AverageReturnMetric(),
  tf_metrics.NumberOfEpisodes(),
  tf_metrics.EnvironmentSteps(),
]

In [None]:
class ShowProgress:
    """Observer that prints an in-place "<count>/<total>" progress line."""

    def __init__(self, total):
        # `total` is only used for display; counting does not stop at it.
        self.total = total
        self.counter = 0

    def __call__(self, trajectory):
        """Count `trajectory` unless it is a boundary; print when the count is a multiple of 100."""
        counts_as_step = not trajectory.is_boundary()
        if counts_as_step:
            self.counter += 1
        if self.counter % 100 != 0:
            return
        print("\r{}/{}".format(self.counter, self.total), end="")

# driver officially created
# The replay-buffer observer must be included: without it the experience
# gathered by the collect policy is never stored, and training would only
# ever sample the random warm-up data.
driver = DynamicStepDriver(
    tf_env,
    dqn_agent.collect_policy,
    observers=[replay_buffer_observer] + training_metrics + [ShowProgress(2000)],
    num_steps=update_period
)

In [None]:
# Random policy used to pre-fill the replay buffer before training starts.
random_policy = RandomTFPolicy(
    tf_env.time_step_spec(),
    tf_env.action_spec()
)

# Collect as many warm-up steps as the progress display announces; reusing
# update_period here would gather only 4 steps before training.
n_warmup_steps = 2000
initial_driver = DynamicStepDriver(
    tf_env,
    random_policy,
    observers=[replay_buffer.add_batch, ShowProgress(n_warmup_steps)],
    num_steps=n_warmup_steps
)

final_time_step, final_policy_state = initial_driver.run()

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.while_loop(c, b, vars, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.while_loop(c, b, vars))


In [None]:
# dataset officially built: a tf.data view over the replay buffer that
# yields batches of 64 samples with num_steps=2 (presumably 2 consecutive
# steps per sample — confirm against the as_dataset docs) and prefetches
# 3 batches
dataset = replay_buffer.as_dataset(
    sample_batch_size=64,
    num_steps=2,
    num_parallel_calls=3, 
).prefetch(3)

Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


In [None]:
# Wrap the two hot-path methods with tf_agents.utils.common.function and
# rebind them in place (presumably to compile them into TF graphs for speed).
driver.run = function(driver.run)
dqn_agent.train = function(dqn_agent.train)

# I would train it, but my computer suffers from dementia
# training loop officially built
def training(n_iterations, agent, driver, tf_env, dataset):
  """Alternate between collecting experience and training the agent.

  Args:
    n_iterations: number of collect-then-train iterations.
    agent: agent providing `collect_policy` and `train`.
    driver: collect driver; `run(time_step, policy_state)` returns the
      updated (time_step, policy_state) pair.
    tf_env: environment, used here only for its batch size.
    dataset: iterable yielding (trajectories, buffer_info) pairs.
  """
  time_step = None
  # Seed the loop with the collect policy's initial state; it is threaded
  # through every driver.run call below.
  policy_state = agent.collect_policy.get_initial_state(tf_env.batch_size)
  iterator = iter(dataset)  # create the iterator once, outside the loop
  for iteration in range(n_iterations):
    time_step, policy_state = driver.run(time_step, policy_state)
    trajectories, buffer_info = next(iterator)
    train_loss = agent.train(trajectories)