In [None]:
import numpy as np
import json
import tensorflow as tf
import os

import time
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import gym

import tensorflow as tf

# Limit TensorFlow's memory use on the first GPU by exposing it as a single
# virtual device with memory_limit=2048 (megabytes, per the TensorFlow docs).
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2048)],
        )
    except RuntimeError as err:
        # Report the problem but keep going with TensorFlow's default setup.
        print(err)

print("GPUs: ", len(gpus))
tf.config.experimental.list_logical_devices('GPU')


from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts
from tf_agents.networks.q_network import QNetwork
from tf_agents.agents.dqn.dqn_agent import DqnAgent
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.metrics import tf_metrics
from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver
from tf_agents.policies.random_tf_policy import RandomTFPolicy
from tf_agents.utils.common import function
from tf_agents.eval.metric_utils import log_metrics
import logging


import tensorflow.keras as keras

# Explicitly request TF 2.x behavior through the v1 compatibility API.
# NOTE(review): this runs after the GPU setup and the tf_agents imports;
# confirm it is still needed on the TensorFlow version in use.
tf.compat.v1.enable_v2_behavior()
import time

from my_gym import envs

In [None]:
#env = gym.make('GatekeeperEnv-v0')
# Load the 'GatekeeperEnv-v0' environment through TF-Agents' gym suite
# (presumably registered by `from my_gym import envs` — confirm), then wrap
# the Python environment in a TFPyEnvironment.
env = suite_gym.load('GatekeeperEnv-v0')
tf_env = tf_py_environment.TFPyEnvironment(env)

In [None]:
# Show the observation spec that the Q-network is later built against.
tf_env.observation_spec()

In [None]:
def image_layers():
    """Build the Keras preprocessing model for the (2, 2)-shaped input.

    The model adds a trailing channel axis, divides the values by 255,
    applies one ``Conv2D`` layer (4 filters, kernel size 3, "same" padding,
    ReLU activation) and flattens the result.

    Returns:
        keras.models.Model: functional model mapping the raw input to the
        flattened convolution output.
    """
    grid_input = keras.layers.Input(shape=(2, 2))
    # Conv2D needs a channel axis: (2, 2) -> (2, 2, 1).
    with_channel = keras.layers.Reshape((2, 2, 1))(grid_input)
    # Divide by 255 (presumably the maximum cell value — TODO confirm).
    # NOTE(review): an earlier variant cast the input to float32 before dividing.
    rescale = keras.layers.Lambda(lambda x: x / 255.)
    scaled = rescale(with_channel)
    conv = keras.layers.Conv2D(
        filters=4, kernel_size=3, padding="same", activation='relu')
    conv_out = conv(scaled)
    features = keras.layers.Flatten()(conv_out)
    return keras.models.Model(inputs=grid_input, outputs=features)
    
def input_vect_layers():
    """Build the Keras preprocessing model for the length-1 vector input.

    Returns:
        keras.models.Model: functional model that divides its ``(1,)``-shaped
        input by 15.
    """
    vector_input = keras.layers.Input(shape=(1,))
    # Divide by 15 (presumably the largest value of this feature — TODO confirm).
    rescale = keras.layers.Lambda(lambda x: x / 15.)
    scaled = rescale(vector_input)
    return keras.models.Model(inputs=vector_input, outputs=scaled)

In [None]:
# Q-network: one preprocessing model per observation component, their outputs
# concatenated along the last axis, followed by one 30-unit dense layer.
preprocessing_layers = (image_layers(), input_vect_layers())
preprocessing_combiner = tf.keras.layers.Concatenate(axis=-1)
fc_layer_params = (30,)

q_net = QNetwork(
    tf_env.observation_spec(),
    tf_env.action_spec(),
    preprocessing_layers=preprocessing_layers,
    preprocessing_combiner=preprocessing_combiner,
    fc_layer_params=fc_layer_params,
)

In [None]:
# Agent
# Step counter handed to the agent as `train_step_counter` and read by the
# epsilon schedule below.
train_step = tf.Variable(0)
update_period = 8 # train the model every x steps

# Use `learning_rate`: `lr` is only a deprecated alias in tf.keras optimizers
# and newer Keras releases ignore or reject it.
optimizer = keras.optimizers.RMSprop(learning_rate=2.5e-4, rho=0.95, momentum=0.0,
                                     epsilon=0.00001, centered=True)

# PolynomialDecay is used here as an exploration (epsilon) schedule, not as a
# learning-rate schedule: epsilon goes from 1.0 to 0.01 over
# 10000 // update_period training steps.
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0,  # initial ε
    decay_steps=10000 // update_period,
    end_learning_rate=0.01)  # final ε

agent = DqnAgent(tf_env.time_step_spec(),
                 tf_env.action_spec(),
                 q_network=q_net,
                 optimizer=optimizer,
                 target_update_period=16,
                 td_errors_loss_fn=keras.losses.Huber(reduction="none"),
                 gamma=0.99,  # discount factor
                 train_step_counter=train_step,
                 # Evaluated lazily so epsilon follows the current train_step.
                 epsilon_greedy=lambda: epsilon_fn(train_step))
agent.initialize()


In [None]:
# Replay buffer storing data shaped like the agent's collect_data_spec,
# batched like the environment, with max_length=10000.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=10000,
)

In [None]:
#observer
#An observer is just a function (or any callable object) that takes a trajectory argument,
#so the replay buffer's bound add_batch() method can be used directly as an observer.
replay_buffer_observer = replay_buffer.add_batch

In [None]:
# Training metrics; they are attached to the collect driver as observers.
# Other cells read entries by position, so keep the order stable.
training_metrics = [
    tf_metrics.NumberOfEpisodes(),
    tf_metrics.EnvironmentSteps(),
    tf_metrics.AverageReturnMetric(),
    tf_metrics.AverageEpisodeLengthMetric(),
]

In [None]:
#custom observer
class ShowProgress:
    """Observer that prints an in-place ``counter/total`` progress line.

    Instances are callable with a trajectory, so they can be placed in a
    driver's ``observers`` list.

    Args:
        total: Value printed after the slash in the progress line (the
            caller passes the number of steps it is going to collect).
    """

    def __init__(self, total):
        self.counter = 0  # number of non-boundary trajectories seen so far
        self.total = total

    def __call__(self, trajectory):
        """Count ``trajectory`` unless it is a boundary; report every 100 counted."""
        if trajectory.is_boundary():
            # Boundary trajectories are not counted. Returning here also keeps
            # the progress line from being re-printed while the counter sits
            # unchanged on a multiple of 100 (including the initial 0).
            return
        self.counter += 1
        if self.counter % 100 == 0:
            print("\r{}/{}".format(self.counter, self.total), end="")

In [None]:
# Collect driver: steps the environment with the agent's collect policy and
# forwards every trajectory to the replay buffer and the training metrics.
collect_driver = DynamicStepDriver(
    tf_env,
    agent.collect_policy,
    observers=[replay_buffer_observer, *training_metrics],
    num_steps=update_period,  # collect x steps for each training iteration
)

In [None]:
# Warm-up: run a random policy for 5000 steps so the replay buffer already
# holds experience before training starts.
random_collect_policy = RandomTFPolicy(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
)

init_driver = DynamicStepDriver(
    tf_env,
    random_collect_policy,
    observers=[replay_buffer.add_batch, ShowProgress(5000)],
    num_steps=5000,
)

final_time_step, final_policy_state = init_driver.run()

In [None]:
# Expose the replay buffer through the tf.data API: sample_batch_size=16 and
# num_steps=2 per sample, read with 3 parallel calls and prefetched.
dataset = replay_buffer.as_dataset(
    sample_batch_size=16,
    num_steps=2,
    num_parallel_calls=3,
).prefetch(3)

In [None]:
# Wrap the two hot-path callables with tf_agents.utils.common.function so
# they run as TensorFlow functions, which speeds up training.
# NOTE(review): re-running this cell wraps the already-wrapped callables again.
collect_driver.run = function(collect_driver.run)
agent.train = function(agent.train)

In [None]:
def train_agent(n_iterations):
    """Alternate experience collection and agent training.

    Each iteration runs ``collect_driver`` once (continuing from the previous
    time step and policy state), draws one batch from ``dataset`` and applies
    one ``agent.train`` update. The current loss is shown on an in-place
    status line; every 1000 iterations (starting with iteration 0) all
    training metrics are printed, one ``name: value`` pair per line.

    Relies on the notebook-level ``collect_driver``, ``dataset``, ``agent``
    and ``training_metrics`` objects.

    Args:
        n_iterations: Number of collect-then-train iterations to run.
    """
    time_step = None
    policy_state = ()#agent.collect_policy.get_initial_state(tf_env.batch_size)
    iterator = iter(dataset)
    for iteration in range(n_iterations):
        time_step, policy_state = collect_driver.run(time_step, policy_state)
        # The dataset yields (trajectories, buffer_info); only the first is used.
        trajectories, _ = next(iterator)
        train_loss = agent.train(trajectories)
        print("\r{} loss:{:.5f}".format(
            iteration, train_loss.loss.numpy()), end="")
        if iteration % 1000 == 0:
            # End the "\r" status line first so the metrics are not glued
            # onto the loss text.
            print()
            for metric in training_metrics:
                print("{}: {}".format(metric.name, metric.result()))

In [None]:
# Run 50000 collect-then-train iterations (long-running cell).
train_agent(50000)

In [None]:
# NOTE(review): a bare name — this only displays the function object and logs
# nothing. Possibly a leftover; call log_metrics(training_metrics) if logging
# was intended.
log_metrics

In [None]:
# Current value of training_metrics[2], the AverageReturnMetric.
print(training_metrics[2].result())