In [1]:
import numpy as np
import json
import tensorflow as tf
import os

import time
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import gym

import tensorflow as tf

# If at least one physical GPU is visible, expose the first one as a single
# virtual device created with memory_limit=128 so this notebook does not take
# the whole card.
gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus) > 0:
    try:
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=128)],
        )
    except RuntimeError as err:
        # Best effort: the failure is reported and the notebook carries on.
        print(err)

# Report how many physical GPUs TensorFlow detected.
print("GPUs: ", len(tf.config.experimental.list_physical_devices('GPU')))
tf.config.experimental.list_logical_devices('GPU')


from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts
from tf_agents.networks.q_network import QNetwork
from tf_agents.agents.dqn.dqn_agent import DqnAgent
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.metrics import tf_metrics
from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver
from tf_agents.policies.random_tf_policy import RandomTFPolicy
from tf_agents.utils.common import function, element_wise_squared_loss
from tf_agents.eval.metric_utils import log_metrics
import logging

import tensorflow.keras as keras

# Explicitly request TensorFlow 2 behavior through the v1 compatibility API.
tf.compat.v1.enable_v2_behavior()
import time
import json

from my_gym import envs

# One shared seed for both NumPy's and TensorFlow's global random generators.
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)

GPUs:  0


In [2]:
# Load the hyperparameter dictionary used throughout the notebook. The cells
# below read the sections 'model', 'agent', 'rbuffer', 'collect_driver' and
# 'rnd_policy' from it.
# The file is opened in a `with` block so the handle is closed as soon as the
# JSON has been parsed instead of being left to the garbage collector.
# NOTE(review): the path is absolute and machine-specific; adjust it when
# running the notebook elsewhere.
with open("/home/pico/uni/romi/gatekeeper_rl_env/params.json") as params_file:
    pm = json.load(params_file)

In [3]:
# Load 'GatekeeperEnv-v0' through the TF-Agents gym suite and wrap the
# resulting Python environment in a TFPyEnvironment; `tf_env` is what the
# network, agent and drivers below are built against.
# NOTE(review): the environment id is presumably registered by the
# `from my_gym import envs` import in the first cell -- confirm.
# Plain-gym alternative, left disabled:
#env = gym.make('GatekeeperEnv-v0')
env = suite_gym.load('GatekeeperEnv-v0')
tf_env = tf_py_environment.TFPyEnvironment(env)

In [4]:
tf_env.observation_spec()

(BoundedTensorSpec(shape=(2, 2), dtype=tf.float32, name='observation/tuple_0', minimum=array(0., dtype=float32), maximum=array(255., dtype=float32)),
 BoundedTensorSpec(shape=(), dtype=tf.int64, name='observation/tuple_1', minimum=array(0), maximum=array(15)))

In [5]:
def image_layers():
    """Build the Keras sub-model that encodes the (2, 2) image observation.

    Pipeline: reshape to (2, 2, 1), divide by 255, one ReLU Conv2D layer
    (kernel size 3, "same" padding, filter count read from
    ``pm['model']['conv2d_1_filters']``), then flatten.

    Returns:
        A ``keras.models.Model`` whose input has shape (2, 2) and whose output
        is the flattened convolution result.
    """
    image_in = keras.layers.Input(shape=(2, 2))
    # Conv2D is applied to a (2, 2, 1) tensor, so append a singleton last axis.
    with_channel = keras.layers.Reshape((2, 2, 1))(image_in)
    # 255 is the maximum of this observation component in the environment spec.
    scaled = keras.layers.Lambda(lambda pixels: pixels / 255.)(with_channel)
    conv_out = keras.layers.Conv2D(
        filters=pm['model']['conv2d_1_filters'],
        kernel_size=3,
        padding="same",
        activation='relu',
    )(scaled)
    flat = keras.layers.Flatten()(conv_out)
    return keras.models.Model(inputs=image_in, outputs=flat)
    
def input_vect_layers():
    """Build the Keras sub-model for the scalar observation component.

    The model only divides its input by 15, which is the maximum of that
    component in the environment's observation spec.

    Returns:
        A ``keras.models.Model`` with an input of shape (1,) and the scaled
        value as output.
    """
    scalar_in = keras.layers.Input(shape=(1,))
    scaled = keras.layers.Lambda(lambda value: value / 15.)(scalar_in)
    return keras.models.Model(inputs=scalar_in, outputs=scaled)

In [6]:
# Q-network.
# One preprocessing model per element of the observation tuple (image first,
# scalar second); their outputs are concatenated along the last axis before
# the fully connected part configured by pm['model']['fc_layer_params'].
preprocessing_layers = (image_layers(), input_vect_layers())
preprocessing_combiner = tf.keras.layers.Concatenate(axis=-1)
# Wrapped in a one-element tuple before being handed to QNetwork.
fc_layer_params = (pm['model']['fc_layer_params'],)

q_net = QNetwork(
    tf_env.observation_spec(),
    tf_env.action_spec(),
    preprocessing_layers=preprocessing_layers,
    preprocessing_combiner=preprocessing_combiner,
    fc_layer_params=fc_layer_params,
)



In [7]:
# DQN agent.
# `train_step` is handed to the agent as its train_step_counter and is also
# the argument the epsilon schedule is evaluated with.
train_step = tf.Variable(0)
optimizer = keras.optimizers.Adam(learning_rate=pm['model']['learning_rate'])

# Exploration rate schedule. A learning-rate schedule class is reused here, so
# the "learning_rate" keyword names actually hold epsilon values: it decays
# from 1.0 to 0.01 over pm['agent']['decay_steps'] steps.
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0,
    decay_steps=pm['agent']['decay_steps'],
    end_learning_rate=0.01,
)

agent = DqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    target_update_period=pm['agent']['target_update_period'],
    td_errors_loss_fn=element_wise_squared_loss,
    gamma=pm['agent']['gamma'],  # discount factor
    train_step_counter=train_step,
    # Passed as a callable so epsilon is looked up from the current counter.
    epsilon_greedy=lambda: epsilon_fn(train_step),
)
agent.initialize()




In [8]:
# Uniform replay buffer. It stores items matching the agent's collect data
# spec, uses the environment's batch size, and its max_length comes from
# pm['rbuffer']['max_length'].
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=pm['rbuffer']['max_length'],
)

In [9]:
# Observer that feeds the replay buffer.
# An observer is just a function (or any callable object) that takes a
# trajectory argument, so the replay buffer's bound add_batch() method can be
# used as one directly.
replay_buffer_observer = replay_buffer.add_batch

In [10]:
# Metric objects that are registered as additional observers on the collect
# driver (episode count, environment steps, average return, average episode
# length).
training_metrics = [
    tf_metrics.NumberOfEpisodes(),
    tf_metrics.EnvironmentSteps(),
    tf_metrics.AverageReturnMetric(),
    tf_metrics.AverageEpisodeLengthMetric(),
]

In [11]:
#custom observer
class ShowProgress:
    """Observer that prints a ``counter/total`` progress line.

    ``counter`` counts the non-boundary trajectories seen so far and ``total``
    is the figure printed after the slash.
    """

    def __init__(self, total):
        """Start counting from zero towards ``total``."""
        self.total = total
        self.counter = 0

    def __call__(self, trajectory):
        """Count ``trajectory`` unless it is a boundary one, then maybe report.

        The progress line is written (carriage return first, no newline)
        whenever the counter is a multiple of 100 after this call.
        """
        at_boundary = trajectory.is_boundary()
        if not at_boundary:
            self.counter += 1
        if self.counter % 100 != 0:
            return
        print("\r{}/{}".format(self.counter, self.total), end="")

In [12]:
# Collect driver: runs the agent's collect policy in the environment for
# `update_period` steps per call, feeding every trajectory to the replay
# buffer observer and to the training metrics.
update_period = pm['collect_driver']['num_steps']  # steps collected per training iteration
collect_driver = DynamicStepDriver(
    tf_env,
    agent.collect_policy,
    observers=[replay_buffer_observer, *training_metrics],
    num_steps=update_period,
)

In [13]:
# Warm-up: fill the replay buffer with experience from a uniformly random
# policy before any training happens. ShowProgress prints how far along the
# pm['rnd_policy']['num_steps'] steps the driver is.
random_collect_policy = RandomTFPolicy(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
)

ns = pm['rnd_policy']['num_steps']
init_driver = DynamicStepDriver(
    tf_env,
    random_collect_policy,
    observers=[replay_buffer.add_batch, ShowProgress(ns)],
    num_steps=ns,
)

final_time_step, final_policy_state = init_driver.run()

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.while_loop(c, b, vars, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.while_loop(c, b, vars))


Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.while_loop(c, b, vars, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.while_loop(c, b, vars))


5000/5000

In [14]:
# Expose the replay buffer through the tf.data API. Batches contain
# pm['rbuffer']['sample_batch_size'] samples, each requested with num_steps=2;
# sampling uses 3 parallel calls and 3 batches are prefetched.
dataset = replay_buffer.as_dataset(
    sample_batch_size=pm['rbuffer']['sample_batch_size'],
    num_steps=2,
    num_parallel_calls=3,
).prefetch(3)

Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


In [15]:
# Wrap the two hot entry points with tf_agents.utils.common.function so they
# run as TensorFlow functions, which speeds up training.
# NOTE: this rebinds the attributes in place, so re-running the cell wraps the
# already wrapped callables again.
agent.train = function(agent.train)
collect_driver.run = function(collect_driver.run)

In [16]:
def train_agent(n_iterations):
    """Run ``n_iterations`` rounds of "collect experience, then train once".

    Each round calls the module-level ``collect_driver`` (resuming from the
    time step and policy state returned by the previous round), draws the next
    batch from the module-level ``dataset`` and passes it to ``agent.train``.
    The iteration index and loss are written to a single console line that is
    overwritten in place.

    Args:
        n_iterations: number of collect/train rounds to perform.
    """
    driver_time_step = None
    driver_policy_state = ()
    batches = iter(dataset)
    for step in range(n_iterations):
        driver_time_step, driver_policy_state = collect_driver.run(
            driver_time_step, driver_policy_state)
        # The second element yielded by the dataset (buffer info) is not used.
        experience, _ = next(batches)
        loss_info = agent.train(experience)
        message = "\r{} loss:{:.5f}".format(step, loss_info.loss.numpy())
        print(message, end="")

In [17]:
# Train for 10000 collect/train iterations.
train_agent(10000)

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))


Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))


9999 loss:2.9036981

In [29]:
#custom observer
class testAgent:
    """Debugging observer that prints every trajectory it receives."""

    def __init__(self):
        # Set once here; nothing in this class updates it afterwards.
        self.counter = 0

    def __call__(self, trajectory):
        """Print ``trajectory``; boundary trajectories are printed as well."""
        print(trajectory)
        

In [30]:
# Display the replay buffer's current frame count.
replay_buffer.num_frames()

<tf.Tensor: shape=(), dtype=int64, numpy=10000>

In [31]:
# Driver for inspecting the trained policy: it advances the environment one
# step per run() call using agent.policy and prints each trajectory through
# the testAgent observer.
test_driver = DynamicStepDriver(
    tf_env,
    agent.policy,
    observers=[testAgent()],
    num_steps=1,
)

# Start from a freshly reset environment and show the underlying env's map.
state = tf_env.reset()
print(env.map)

[[b'-' b'-' b'D' b'G']
 [b'-' b'-' b'-' b'R']
 [b'+' b'-' b'+' b'+']
 [b'-' b'-' b'-' b'-']]


In [32]:
# Step the policy forward at most 20 times, printing the scalar observation
# component before the first step and after every step.
print(state.observation[1].numpy()[0])
for _step in range(20):
    state, _ = test_driver.run(state)
    print(state.observation[1].numpy()[0])
    # step_type 2 is TF-Agents' StepType.LAST, i.e. the episode has ended.
    episode_over = state.step_type.numpy()[0] == 2
    if episode_over:
        break

15
Trajectory(step_type=<tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>, observation=(<tf.Tensor: shape=(1, 2, 2), dtype=float32, numpy=
array([[[255., 255.],
        [255., 255.]]], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=int64, numpy=array([15])>), action=<tf.Tensor: shape=(1,), dtype=int64, numpy=array([0])>, policy_info=(), next_step_type=<tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>, reward=<tf.Tensor: shape=(1,), dtype=float32, numpy=array([-1.], dtype=float32)>, discount=<tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>)
14
Trajectory(step_type=<tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>, observation=(<tf.Tensor: shape=(1, 2, 2), dtype=float32, numpy=
array([[[255., 255.],
        [255., 255.]]], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=int64, numpy=array([14])>), action=<tf.Tensor: shape=(1,), dtype=int64, numpy=array([0])>, policy_info=(), next_step_type=<tf.Tensor: shape=(1,)

In [21]:
# Advance the test driver by one more run (it is configured with num_steps=1)
# and print the resulting time step.
state,_ = test_driver.run(state)
print(state)

TimeStep(step_type=<tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>, reward=<tf.Tensor: shape=(1,), dtype=float32, numpy=array([-1.], dtype=float32)>, discount=<tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>, observation=(<tf.Tensor: shape=(1, 2, 2), dtype=float32, numpy=
array([[[255., 255.],
        [255., 255.]]], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=int64, numpy=array([12])>))


In [22]:
# Display the agent's training-step counter variable.
agent.train_step_counter

<tf.Variable 'Variable:0' shape=() dtype=int32, numpy=10000>

In [23]:
# Reset the environment and display the time step it returns.
tf_env.reset()

TimeStep(step_type=<tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>, reward=<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>, discount=<tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>, observation=(<tf.Tensor: shape=(1, 2, 2), dtype=float32, numpy=
array([[[255., 255.],
        [255., 255.]]], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=int64, numpy=array([15])>))

In [24]:
# Manually apply action 2 and display the resulting time step.
tf_env.step(2)

TimeStep(step_type=<tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>, reward=<tf.Tensor: shape=(1,), dtype=float32, numpy=array([-1.], dtype=float32)>, discount=<tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>, observation=(<tf.Tensor: shape=(1, 2, 2), dtype=float32, numpy=
array([[[255., 255.],
        [255., 255.]]], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=int64, numpy=array([15])>))