In [1]:
#!python -m atari_py.import_roms .\ROMS

In [2]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import os

In [3]:
from tf_agents.environments.wrappers import ActionRepeat
from tf_agents.environments import suite_gym
from tf_agents.environments import suite_atari
from tf_agents.environments.tf_py_environment import TFPyEnvironment
from tf_agents.environments.atari_preprocessing import AtariPreprocessing
from tf_agents.environments.atari_wrappers import FrameStack4
from tf_agents.networks.q_network import QNetwork
from tf_agents.networks.categorical_q_network import CategoricalQNetwork
from tf_agents.agents.dqn.dqn_agent import DqnAgent
from tf_agents.agents.categorical_dqn.categorical_dqn_agent import CategoricalDqnAgent
from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer
from tf_agents.metrics import tf_metrics
from tf_agents.policies.random_tf_policy import RandomTFPolicy
from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver
from tf_agents.utils.common import element_wise_squared_loss
from tf_agents.utils.common import function
from tf_agents.utils.common import Checkpointer
from tf_agents.policies.policy_saver import PolicySaver

In [4]:
global_step = tf.Variable(0)
env_name = "Pong-v0"

#QNetwork parameters
num_atoms = 51
conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]
fc_layer_params = (512,)

#DQN Agent parameters
learning_rate = 5e-4
target_update_period = 2000
gamma = .92
epsilon = 0
n_step_update = 2

#Replay Buffer parameters
replay_buffer_capacity = 30000

#Training parameters
num_training_steps = 1000000

# Make the environment

In [5]:
train_env_py = suite_atari.load(env_name,
                                gym_env_wrappers = [AtariPreprocessing, FrameStack4])
eval_env_py = suite_atari.load(env_name,
                                gym_env_wrappers = [AtariPreprocessing, FrameStack4])

In [6]:
train_env_py.get_action_meanings()

['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']

In [7]:
train_env = TFPyEnvironment(train_env_py)
eval_env = TFPyEnvironment(eval_env_py)

In [8]:
train_env.time_step_spec()._fields

('step_type', 'reward', 'discount', 'observation')

In [9]:
train_env.observation_spec()

BoundedTensorSpec(shape=(84, 84, 4), dtype=tf.uint8, name='observation', minimum=array(0, dtype=uint8), maximum=array(255, dtype=uint8))

In [10]:
train_env.action_spec()

BoundedTensorSpec(shape=(), dtype=tf.int64, name='action', minimum=array(0, dtype=int64), maximum=array(5, dtype=int64))

# Make the Network

In [11]:
preprocessing_layer = keras.layers.Lambda(lambda obs : tf.cast(obs, np.float32)/255.0)

q_net = QNetwork(train_env.observation_spec(), 
                 train_env.action_spec(),
                 preprocessing_layers = preprocessing_layer,
                 conv_layer_params = conv_layer_params,
                 fc_layer_params = fc_layer_params)

Alternatively we can use a categorical DQN agent to speed up training and make it more stable.

In [12]:
# preprocessing_layer = keras.layers.Lambda(lambda obs : tf.cast(obs, np.float32)/255.0)

# cat_q_net = CategoricalQNetwork(train_env.observation_spec(),
#                                 train_env.action_spec(),
#                                 preprocessing_layers = preprocessing_layer,
#                                 conv_layer_params = conv_layer_params,
#                                 num_atoms = num_atoms,
#                                 fc_layer_params = fc_layer_params)

# Create the agent 

In [13]:
optimizer = keras.optimizers.Adam(learning_rate = learning_rate)

epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=.26, 
    decay_steps= 25,
    end_learning_rate=0.01)

agent = DqnAgent(train_env.time_step_spec(),
                 train_env.action_spec(),
                 q_network = q_net,
                 optimizer = optimizer,
                 target_update_period = target_update_period,
                 td_errors_loss_fn = keras.losses.Huber(reduction="none"),
                 gamma = gamma,
                 train_step_counter = global_step,
                 epsilon_greedy = lambda : epsilon_fn(global_step))

agent.initialize()

In [14]:
# optimizer = keras.optimizers.Adam(learning_rate = learning_rate)

# epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
#     initial_learning_rate=.31, 
#     decay_steps= 30,
#     end_learning_rate=0.01)

# agent = CategoricalDqnAgent(train_env.time_step_spec(),
#                             train_env.action_spec(),
#                             categorical_q_network = cat_q_net,
#                             optimizer = optimizer,
#                             min_q_value = -20,
#                             max_q_value = 20,
#                             n_step_update = n_step_update,
#                             td_errors_loss_fn = element_wise_squared_loss, #keras.losses.Huber(reduction="none"),
#                             gamma = gamma,
#                             epsilon_greedy = epsilon,
#                             train_step_counter = global_step)

# agent.initialize()

# Create the Replay Buffer

In [15]:
replay_buffer = TFUniformReplayBuffer(data_spec = agent.collect_data_spec,
                                      batch_size = train_env.batch_size,
                                      max_length = 5000)

observer = replay_buffer.add_batch

# Metrics

In [16]:
training_metrics = [
    tf_metrics.NumberOfEpisodes(),
    tf_metrics.EnvironmentSteps(),
    tf_metrics.AverageReturnMetric(),
    tf_metrics.AverageEpisodeLengthMetric()]

# Drivers

In [17]:
random_policy = RandomTFPolicy(train_env.time_step_spec(), train_env.action_spec())

In [18]:
initial_driver = DynamicStepDriver(train_env,
                                   policy = random_policy,
                                   observers = [observer] + training_metrics,
                                   num_steps = 20000)

In [19]:
collect_driver = DynamicStepDriver(train_env,
                                   policy = agent.collect_policy,
                                   observers = [observer] + training_metrics,
                                   num_steps = 4)

In [20]:
initial_driver.run()

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.while_loop(c, b, vars, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.while_loop(c, b, vars))


(TimeStep(
 {'discount': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>,
  'observation': <tf.Tensor: shape=(1, 84, 84, 4), dtype=uint8, numpy=
 array([[[[ 52,  52,  52,  52],
          [ 52,  52,  52,  52],
          [ 52,  52,  52,  52],
          ...,
          [ 87,  87,  87,  87],
          [ 87,  87,  87,  87],
          [ 87,  87,  87,  87]],
 
         [[ 87,  87,  87,  87],
          [ 87,  87,  87,  87],
          [ 87,  87,  87,  87],
          ...,
          [ 87,  87,  87,  87],
          [ 87,  87,  87,  87],
          [ 87,  87,  87,  87]],
 
         [[ 87,  87,  87,  87],
          [ 87,  87,  87,  87],
          [ 87,  87,  87,  87],
          ...,
          [ 87,  87,  87,  87],
          [ 87,  87,  87,  87],
          [ 87,  87,  87,  87]],
 
         ...,
 
         [[236, 236, 236, 236],
          [236, 236, 236, 236],
          [236, 236, 236, 236],
          ...,
          [236, 236, 236, 236],
          [236, 236, 236, 236],
         

# Dataset

In [21]:
dataset = replay_buffer.as_dataset(sample_batch_size = 64,
                                   num_steps = 2,
                                   num_parallel_calls = 3).prefetch(3)

Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


# Training Loop

In [22]:
policy_dir = os.path.join(os.curdir, "Saved policies pongDQN")
policy_dir

'.\\Saved policies pongDQN'

In [23]:
saver = PolicySaver(agent.policy)

In [24]:
collect_driver.run = function(collect_driver.run)
agent.train = function(agent.train)

In [25]:
agent.train_step_counter.assign(0)

<tf.Variable 'UnreadVariable' shape=() dtype=int32, numpy=0>

In [26]:
time_step = None
policy_state = agent.policy.get_initial_state(train_env.batch_size)
iterator = iter(dataset)

returns = []

for _ in range(num_training_steps):
    time_step, policy_state = collect_driver.run(time_step, policy_state)
    experience, info = next(iterator)
    train_loss = agent.train(experience)
    
    step = agent.train_step_counter.numpy()

    if step % 1000 == 0:
        avg_return = training_metrics[2].result()
        avg_length = training_metrics[3].result()
        print('step = {0}: Average Return = {1} --- Average length: {2}'.format(step, avg_return, avg_length))
        returns.append(avg_return)
        
#     if step % 10000 == 0:
#         iteration = step // 1000
#         folder = os.path.join(policy_dir, "policy_%dk" % iteration)
#         saver.save(folder)

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))
step = 1000: Average Return = -21.0 --- Average length: 260.79998779296875
step = 2000: Average Return = -20.899999618530273 --- Average length: 266.20001220703125
step = 3000: Average Return = -20.799999237060547 --- Average length: 300.5
step = 4000: Average Return = -20.700000762939453 --- Average length: 313.6000061035156
step = 5000: Average Return = -20.399999618530273 --- Average length: 325.8999938964844
step = 6000: Average Return = -20.399999618530273 --- Average length: 375.79998779296875
step = 7000: Average Return = -20.700000762939453 --- Average length: 358.70001220703125
step = 8000: Average Return = -20.100000381469727 --- Average length: 372.20001220703125
step = 9000: Average Return = -19.0 --- Average length: 490.1000061035156
step = 10

KeyboardInterrupt: 

# Evaluation and visualization

In [None]:
plt.figure(figsize=(10, 4))
plt.plot(returns)
plt.grid()

In [None]:
policy = tf.saved_model.load("Saved policies pong/policy_1570k")

In [None]:
episode_rewards = []

for _ in range(200):
    reward = 0.0
    time_step = eval_env.reset()
    
    while not time_step.is_last():
        action_step = policy.action(time_step)
        time_step = eval_env.step(action_step)
        reward += time_step.reward
    episode_rewards.append(reward)

In [None]:
x = np.array(episode_rewards).reshape(-1)

In [None]:
tf.reduce_mean(episode_rewards)

In [None]:
tf.math.reduce_std(episode_rewards)

In [None]:
n, bins, patches = plt.hist(x, 50, density=True, facecolor='g', alpha=0.75)

plt.grid(True)
plt.show()