## Setup

In [306]:
import base64

import gym
import imageio
import IPython
import matplotlib.pyplot as plt
import numpy as np
import PIL.Image
import pyvirtualdisplay
import tensorflow as tf
from gym import spaces
from tf_agents.agents.dqn import dqn_agent
from tf_agents.agents.ppo import ppo_agent
from tf_agents.drivers import py_driver
from tf_agents.environments import FlattenObservationsWrapper
from tf_agents.environments import ObservationFilterWrapper
from tf_agents.environments import PyEnvironmentBaseWrapper
from gym import ObservationWrapper
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.networks import sequential
from tf_agents.policies import py_tf_eager_policy
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.specs import tensor_spec
from tf_agents.utils import common
from tf_agents.utils.nest_utils import batch_nested_array

In [307]:
# Set up a virtual display for rendering OpenAI gym environments.
display = pyvirtualdisplay.Display(visible=0, size=(1400, 900)).start()

In [308]:
tf.version.VERSION

'2.12.0'

## Hyperparameters

In [309]:
num_iterations = 20_000  # @param {type:"integer"}

initial_collect_steps = 100  # @param {type:"integer"}
collect_steps_per_iteration = 1  # @param {type:"integer"}
replay_buffer_max_length = 100000  # @param {type:"integer"}

batch_size = 64  # @param {type:"integer"}
learning_rate = 5e-4  # 1e-3  # @param {type:"number"}
log_interval = 1000  # @param {type:"integer"}

num_eval_episodes = 10  # @param {type:"integer"}
eval_interval = 5000  # @param {type:"integer"}

## Environment

In [325]:
class SimplifiedObservations(gym.ObservationWrapper):
# class SimplifiedObservations(PyEnvironmentBaseWrapper):
    def __int__(self, env):
        print(f"SimplifiedObservations")
        super().__init__(env)
        self.observation_space = spaces.Discrete(45)
        print(f"Observation space: {self.observation_space}")
        raise Exception(f"SimplifiedObservations")

    def observation(self, observation):
        print("Observing")
        return observation[0] + observation[1] + observation[2]

In [326]:
env_name = 'Blackjack-v1'
env = suite_gym.load(
    env_name,
    # env_wrappers=[SimplifiedObservations],
    gym_env_wrappers=[SimplifiedObservations],
    # env_wrappers=[FlattenObservationsWrapper],
)
print(env.observation_spec())

# print(env.observation_spec().values())
# env = FlattenObservationsWrapper(env)

(BoundedArraySpec(shape=(), dtype=dtype('int64'), name='observation/tuple_0', minimum=0, maximum=31), BoundedArraySpec(shape=(), dtype=dtype('int64'), name='observation/tuple_1', minimum=0, maximum=10), BoundedArraySpec(shape=(), dtype=dtype('int64'), name='observation/tuple_2', minimum=0, maximum=1))


In [327]:
env.reset()
PIL.Image.fromarray(env.render())

Observing


TypeError: If shallow structure is a sequence, input must also be a sequence. Input has type: <class 'int'>.

In [313]:
print('Observation Spec:')
print(env.time_step_spec().observation)

Observation Spec:
(BoundedArraySpec(shape=(), dtype=dtype('int64'), name='observation/tuple_0', minimum=0, maximum=31), BoundedArraySpec(shape=(), dtype=dtype('int64'), name='observation/tuple_1', minimum=0, maximum=10), BoundedArraySpec(shape=(), dtype=dtype('int64'), name='observation/tuple_2', minimum=0, maximum=1))


In [314]:
print('Reward Spec:')
print(env.time_step_spec().reward)

Reward Spec:
ArraySpec(shape=(), dtype=dtype('float32'), name='reward')


In [315]:
print('Action Spec:')
print(env.action_spec())

Action Spec:
BoundedArraySpec(shape=(), dtype=dtype('int64'), name='action', minimum=0, maximum=1)


In [316]:
time_step = env.reset()
print('Time step:')
print(time_step)

action = np.array(1, dtype=np.int32)

next_time_step = env.step(action)
print('Next time step:')
print(next_time_step)

Time step:
TimeStep(
{'discount': array(1., dtype=float32),
 'observation': (array(10), array(10), array(0)),
 'reward': array(0., dtype=float32),
 'step_type': array(0, dtype=int32)})
Next time step:
TimeStep(
{'discount': array(1., dtype=float32),
 'observation': (array(17), array(10), array(0)),
 'reward': array(0., dtype=float32),
 'step_type': array(1, dtype=int32)})


In [317]:
train_py_env = suite_gym.load(env_name)
eval_py_env = suite_gym.load(env_name)

In [318]:
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

## Agent

In [319]:
fc_layer_params = (100, 50)
action_tensor_spec = tensor_spec.from_spec(env.action_spec())
num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1
print(f"action tensor spec: {action_tensor_spec}\n")
print(f"num actions {num_actions}\n")


# Define a helper function to create Dense layers configured with the right
# activation and kernel initializer.
def dense_layer(num_units):
    return tf.keras.layers.Dense(
        num_units,
        activation=tf.keras.activations.relu,
        kernel_initializer=tf.keras.initializers.VarianceScaling(
            scale=2.0, mode='fan_in', distribution='truncated_normal'
        )
    )


# QNetwork consists of a sequence of Dense layers followed by a dense layer
# with `num_actions` units to generate one q_value per available action as
# its output.
dense_layers = [dense_layer(num_units) for num_units in fc_layer_params]
input_layer = tf.keras.layers.InputLayer(input_shape=(3,))
q_values_layer = tf.keras.layers.Dense(
    num_actions,
    activation=None,
    kernel_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.03, maxval=0.03
    ),
    bias_initializer=tf.keras.initializers.Constant(-0.2)
)
q_net = sequential.Sequential(
    [input_layer] + dense_layers + [q_values_layer],
    input_spec=env.observation_spec()
)
print(f"input {q_net.input_spec}")
print(f"layer 0 {q_net.layers[0].name}")
print(f"layer 1 {q_net.layers[1].name}")
print(f"layer 2 {q_net.layers[2].name}")

action tensor spec: BoundedTensorSpec(shape=(), dtype=tf.int64, name='action', minimum=array(0), maximum=array(1))

num actions 2

input None
layer 0 input_19
layer 1 dense_178
layer 2 dense_179


In [320]:
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

train_step_counter = tf.Variable(0)

# agent = ppo_agent.PPOAgent(
#     train_env.time_step_spec(),
#     train_env.action_spec(),
#     q_network=q_net,
#     optimizer=optimizer,
#     td_errors_loss_fn=common.element_wise_squared_loss,
#     train_step_counter=train_step_counter,
#     target_update_tau=0.1,  # soft updates
#     gamma=1,  # discount factor
# )

agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter,
    target_update_tau=0.1,  # soft updates
    gamma=1,  # discount factor
)

agent.initialize()

ValueError: Exception encountered when calling layer 'sequential_57' (type Sequential).

Layer "dense_178" expects 1 input(s), but it received 3 input tensors. Inputs received: [<tf.Tensor: shape=(1,), dtype=int64, numpy=array([7])>, <tf.Tensor: shape=(1,), dtype=int64, numpy=array([8])>, <tf.Tensor: shape=(1,), dtype=int64, numpy=array([0])>]

Call arguments received by layer 'sequential_57' (type Sequential):
  • inputs=('tf.Tensor(shape=(1,), dtype=int64)', 'tf.Tensor(shape=(1,), dtype=int64)', 'tf.Tensor(shape=(1,), dtype=int64)')
  • network_state=()
  • kwargs={'step_type': 'tf.Tensor(shape=(1,), dtype=int32)', 'training': 'None'}
  In call to configurable 'DqnAgent' (<class 'tf_agents.agents.dqn.dqn_agent.DqnAgent'>)

## Policies

In [None]:
eval_policy = agent.policy
collect_policy = agent.collect_policy

In [None]:
random_policy = random_tf_policy.RandomTFPolicy(
    train_env.time_step_spec(),
    train_env.action_spec()
)

In [None]:
example_environment = tf_py_environment.TFPyEnvironment(
    suite_gym.load(env_name)
)

In [None]:
time_step = example_environment.reset()

In [None]:
random_policy.action(time_step)

## Metrics and Evaluation

In [None]:
def compute_avg_return(environment, policy, num_episodes=10):
    total_return = 0.0
    for _ in range(num_episodes):

        time_step = environment.reset()
        episode_return = 0.0

        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return

    avg_return = total_return / num_episodes
    return avg_return.numpy()[0]

# See also the metrics module for standard implementations of different metrics.
# https://github.com/tensorflow/agents/tree/master/tf_agents/metrics

In [None]:
compute_avg_return(eval_env, random_policy, num_eval_episodes)

## Replay Buffer

In [None]:
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    agent.collect_data_spec,
    batch_size=1,
    max_length=replay_buffer_max_length
)

# rb_observer = replay_buffer.add_batch
rb_observer = lambda x: replay_buffer.add_batch(batch_nested_array(x))

# rb_observer = reverb_utils.ReverbAddTrajectoryObserver(
#   replay_buffer.py_client,
#   table_name,
#   sequence_length=2)


In [None]:
agent.collect_data_spec

In [None]:
# noinspection PyProtectedMember
agent.collect_data_spec._fields

## Data Collection

In [None]:
py_driver.PyDriver(
    env,
    py_tf_eager_policy.PyTFEagerPolicy(
        random_policy, use_tf_function=True),
    [rb_observer],
    max_steps=initial_collect_steps
).run(train_py_env.reset())

In [None]:
# For the curious:
# Uncomment to peel one of these off and inspect it.
# iter(replay_buffer.as_dataset()).next()

In [None]:
# Dataset generates trajectories with shape [Bx2x...]
replayBufferDataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=2
).prefetch(3)

replayBufferDataset

In [None]:
replayBufferIterator = iter(replayBufferDataset)
print(replayBufferIterator)

In [None]:
# For the curious:
# Uncomment to see what the dataset iterator is feeding to the agent.
# Compare this representation of replay data
# to the collection of individual trajectories shown earlier.

# iterator.next()

## Training the Agent

In [None]:
try:
    %%time
except:
    pass

# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)

# Reset the train step.
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]

# Reset the environment.
time_step = train_py_env.reset()

# Create a driver to collect experience.
collect_driver = py_driver.PyDriver(
    env,
    py_tf_eager_policy.PyTFEagerPolicy(
        agent.collect_policy, use_tf_function=True),
    [rb_observer],
    max_steps=collect_steps_per_iteration)

for _ in range(num_iterations):

    # Collect a few steps and save to the replay buffer.
    time_step, _ = collect_driver.run(time_step)

    # Sample a batch of data from the buffer and update the agent's network.
    experience, unused_info = next(replayBufferIterator)
    train_loss = agent.train(experience).loss

    step = agent.train_step_counter.numpy()

    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss))

    if step % eval_interval == 0:
        avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
        print('step = {0}: Average Return = {1}'.format(step, avg_return))
        returns.append(avg_return)

## Visualization

In [None]:
iterations = range(0, num_iterations + 1, eval_interval)
plt.plot(iterations, returns)
plt.ylabel('Average Return')
plt.xlabel('Iterations')
plt.ylim(top=250)

In [None]:
def embed_mp4(filename):
    """Embeds an mp4 file in the notebook."""
    video = open(filename, 'rb').read()
    b64 = base64.b64encode(video)
    tag = '''
  <video width="640" height="480" controls>
    <source src="data:video/mp4;base64,{0}" type="video/mp4">
  Your browser does not support the video tag.
  </video>'''.format(b64.decode())

    return IPython.display.HTML(tag)

In [None]:
def create_policy_eval_video(policy, filename, num_episodes=5, fps=30):
    filename = filename + ".mp4"
    with imageio.get_writer(filename, fps=fps) as video:
        for _ in range(num_episodes):
            time_step = eval_env.reset()
            video.append_data(eval_py_env.render())
            while not time_step.is_last():
                action_step = policy.action(time_step)
                time_step = eval_env.step(action_step.action)
                video.append_data(eval_py_env.render())
    return embed_mp4(filename)


create_policy_eval_video(agent.policy, "trained-agent")

In [None]:
create_policy_eval_video(random_policy, "random-agent")