* This document was written with reference to the following tf_agents tutorials:
    * [tutorials](https://github.com/tensorflow/agents/tree/master/docs/tutorials)

In [None]:
import tensorflow as tf
import numpy as np

from tf_agents.environments import py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts

# 1. Define an environment class that inherits from PyEnvironment
Instances of this class represent first-order delay (FOD) systems.

In [None]:
class MyEnv(py_environment.PyEnvironment):
    '''
    First-order delay (FOD) system exposed as a tf_agents PyEnvironment.

    Transfer function and continuous-time model:

        Y(s) = K/(1+T*s) * U(s)

        T * dy(t)/dt = - y(t) + K * u(t), t > 0,
        y(0) = y_init.

    Discrete-time model (unit time step):

        y(t+1) = (1-1/T) * y(t) + K / T * u(t), t = 0,1,2, ...
        y(0) = y_init.

    State-space form actually simulated by `_step`:

        x(t+1) = (1-1/T) * x(t) + K / T * u(t), t = 0,1,2, ...
        x(0) = x_init,
        y(t) = x(t).

    The reward is the negative absolute deviation of the output from the
    set value 1.0, so a larger return means better tracking.

    Args:
        nStepSimulation: number of state updates per episode; the step after
            the last update returns a termination time step.
        T: time constant of the system, in simulation steps.
        K: static gain of the system.
        discount: discount handed to every non-terminal transition.
    '''

    def __init__(self, nStepSimulation = 100, T = 10, K = 1.0, discount = 0.9):
        # Let the base class set up its own bookkeeping (e.g. the cached
        # current time step) before anything else touches the instance.
        super().__init__()

        self._action_spec = array_spec.BoundedArraySpec(
            shape=(1,), dtype=np.float32, minimum=(-1,), maximum=(1,), name='action')

        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(1,), dtype=np.float32, minimum=(-1,), maximum=(1,), name='observation')

        self.nStepSimulation = nStepSimulation
        self.T = T
        self.K = K
        self.discount = discount

        self.time = 0
        self._episode_ended = False
        self._state = self.getInitialState()

    def getInitialState(self):
        '''Return the state x(0) every episode starts from.'''
        return 0.

    def getObservation(self):
        '''Return the current output y(t) = x(t) as a float32 array of shape (1,).'''
        return np.array((self._state,), np.float32) # (1,)

    def getReward(self):
        '''Return the scalar reward -|sv - y(t)| for the current state.'''
        sv = 1.0  # set value the output is supposed to track
        err = sv - self.getObservation()[0]
        # Negated so that maximising the return minimises the tracking error;
        # a positive |err| would reward moving away from the set value.
        return -np.abs(err) # (,)

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def _reset(self):
        '''Rewind the simulation to t = 0 and return the first time step.'''
        self.time = 0
        self._state = self.getInitialState()
        self._episode_ended = False

        return ts.restart(self.getObservation())

    def _step(self, action):
        '''
        Advance the simulation by one step.

        Args:
            action: array of shape (1,) holding the input u(t).

        Returns:
            A transition time step while fewer than `nStepSimulation` updates
            have been made, a termination time step on the following call,
            and a restart time step on the call after that.
        '''
        if self._episode_ended:
            # The last action ended the episode. Ignore the current action and start
            # a new episode.
            return self.reset()

        if self.time < self.nStepSimulation:
            decay = 1 - 1/self.T
            gain = self.K/self.T
            self._state = decay * self._state + gain * action[0]

            self.time += 1
            return ts.transition(self.getObservation(), reward = self.getReward(), discount = self.discount)

        # The simulation horizon is used up: report the terminal time step
        # without updating the state.
        self._episode_ended = True
        return ts.termination(self.getObservation(), reward = self.getReward())

In [None]:
def aSimpleUnitTest():
    """Check that MyEnv is a PyEnvironment and passes the tf_agents validator."""
    environment = MyEnv()
    assert isinstance(environment, py_environment.PyEnvironment)

    # Runs 5 episodes through the tf_agents environment validator.
    utils.validate_py_environment(environment, episodes=5)

def anotherSimpleUnitTest():
    """
    Drive MyEnv with one constant random action until the episode ends.

    The action is drawn from a standard normal distribution and clipped into
    the bounds of the environment's action spec, so the test never feeds the
    environment an input its spec does not allow.

    Returns:
        The exponentially smoothed reward (smoothing factor 1/10) at the end
        of the episode.
    """
    env = MyEnv()
    assert isinstance(env, py_environment.PyEnvironment)

    spec = env.action_spec()
    u = np.clip(np.random.randn(*spec.shape), spec.minimum, spec.maximum)
    u = u.astype(spec.dtype) # (1,)

    time_step = env.reset()
    rewardAvg = time_step.reward
    while not time_step.is_last():
        time_step = env.step(u)
        rewardAvg = (1-1/10) * rewardAvg + 1/10 * time_step.reward

    # Previously the average was computed and silently dropped; make sure the
    # simulation at least produced a usable number.
    assert np.isfinite(rewardAvg)
    return rewardAvg

In [None]:
# Run both smoke tests in their original order.
for smoke_test in (aSimpleUnitTest, anotherSimpleUnitTest):
    smoke_test()

# 2. Represent P-controllers by deterministic or stochastic policy networks

MyActionNetDeterministic and MyActionNetDistributional are implementations of P-controllers with saturated/bounded outputs.

In [None]:
import tensorflow as tf
import tensorflow_probability as tfp

from tf_agents.specs import tensor_spec
from tf_agents.networks import network
from tf_agents.policies import actor_policy

from tf_agents.trajectories import time_step as ts

In [None]:
class MyActionNetDeterministic(network.Network):
    """
    Deterministic action network: one Dense layer with a tanh activation.

    The tanh keeps every output component inside (-1, 1).

    Args:
        input_tensor_spec: spec of the observations fed to the network.
        output_tensor_spec: spec of the actions; its shape fixes the number
            of Dense units and the shape of the returned actions.
    """

    def __init__(self, input_tensor_spec, output_tensor_spec):
        super().__init__(
            input_tensor_spec=input_tensor_spec,
            state_spec=(),
            name='ActionNet')
        self._output_tensor_spec = output_tensor_spec
        # Size the layer from the spec handed to the constructor rather than
        # from a module-level `action_spec`, which need not exist (or match)
        # at the time the network is instantiated.
        self._layer = tf.keras.layers.Dense(
            output_tensor_spec.shape.num_elements(), activation=tf.nn.tanh)

    def call(self, observations, step_type, network_state):
        """
        Map a batch of observations to a batch of actions.

        Args:
            observations: observations; cast to float32 before use.
            step_type: unused.
            network_state: passed through unchanged.

        Returns:
            A tuple (actions, network_state), actions being reshaped to
            [-1] + the shape of the output spec.
        """
        del step_type

        _observations = tf.cast(observations, dtype=tf.float32)
        _actions = self._layer(_observations)
        _actions = tf.reshape(_actions, [-1] + self._output_tensor_spec.shape.as_list()) # (*, nMv)

        return _actions, network_state

In [None]:
class MyActionNetDistributional(network.Network):
    """
    Stochastic action network: a tanh Dense layer gives the mean of a
    diagonal normal distribution whose standard deviation is one trainable
    scalar shared by all action components.

    An instance used as a stochastic policy represents a P-controller with a
    random value generator.

    >> create an instance of the network:
    net = MyActionNetDistributional(input_tensor_spec, output_tensor_spec)

    Args:
        input_tensor_spec: spec of the observations fed to the network.
        output_tensor_spec: spec of the actions; its shape fixes the number
            of Dense units and the event shape of the distribution.
    """

    def __init__(self, input_tensor_spec, output_tensor_spec):
        super().__init__(
            input_tensor_spec=input_tensor_spec,
            state_spec=(),
            name='ActionNet')
        self._output_tensor_spec = output_tensor_spec
        # Size the layer from the spec handed to the constructor rather than
        # from a module-level `action_spec`, which need not exist (or match)
        # at the time the network is instantiated.
        self._layer = tf.keras.layers.Dense(
            output_tensor_spec.shape.num_elements(), activation=tf.nn.tanh)
        # Log of the standard deviation; starts at 0, i.e. a std of 1.
        self._log_action_std = tf.Variable(tf.zeros(shape=())) # (,)

    def call(self, observations, step_type, network_state):
        """
        Map a batch of observations to a batch of action distributions.

        Args:
            observations: observations; cast to float32 before use.
            step_type: unused.
            network_state: passed through unchanged.

        Returns:
            A tuple (distribution, network_state); the distribution is a
            MultivariateNormalDiag with the layer output as its mean.
        """
        del step_type

        _observations = tf.cast(observations, dtype=tf.float32)
        _actions = self._layer(_observations)
        _actions = tf.reshape(_actions, [-1] + self._output_tensor_spec.shape.as_list()) # (*, nMv)
        # Broadcast the scalar std to the shape of the mean.
        _action_std = tf.ones_like(_actions) * tf.math.exp(self._log_action_std) # (*, nMv)

        return tfp.distributions.MultivariateNormalDiag(_actions, _action_std), network_state

In [None]:
def createAnInstanceOfDeterministicPolicy(input_tensor_spec, action_spec):
    """Return an ActorPolicy backed by a fresh MyActionNetDeterministic.

    Args:
        input_tensor_spec: spec of the observations.
        action_spec: spec of the actions.
    """
    step_spec = ts.time_step_spec(input_tensor_spec)
    actor_network = MyActionNetDeterministic(input_tensor_spec, action_spec)

    return actor_policy.ActorPolicy(
        time_step_spec=step_spec,
        action_spec=action_spec,
        actor_network=actor_network,
    )

In [None]:
def createAnInstanceOfDistributionalPolicy(input_tensor_spec, action_spec):
    """Return an ActorPolicy backed by a fresh MyActionNetDistributional.

    Args:
        input_tensor_spec: spec of the observations.
        action_spec: spec of the actions.
    """
    step_spec = ts.time_step_spec(input_tensor_spec)
    actor_network = MyActionNetDistributional(input_tensor_spec, action_spec)

    return actor_policy.ActorPolicy(
        time_step_spec=step_spec,
        action_spec=action_spec,
        actor_network=actor_network,
    )

In [None]:
# Observation width, action width and the batch size used for the check.
nPv, nMv = 1, 1
batch_size = 2**5

input_tensor_spec = tensor_spec.TensorSpec(shape=(nPv,), dtype=tf.float32)

action_spec = tensor_spec.BoundedTensorSpec(
    shape=(nMv,), dtype=tf.float32, minimum=-1, maximum=1)

# Both policies are built up front, deterministic first, then exercised
# one after the other on a random batch of observations.
policy_factories = (createAnInstanceOfDeterministicPolicy,
                    createAnInstanceOfDistributionalPolicy)
policies = tuple(make_policy(input_tensor_spec, action_spec)
                 for make_policy in policy_factories)

for my_actor_policy in policies:
    observations = tf.random.normal(shape=(batch_size, nPv))

    # A restart time step, i.e. time_step.is_first() holds.
    time_step = ts.restart(observations, batch_size=batch_size)

    # action_step.action: (*, nMv)
    action_step = my_actor_policy.action(time_step)

    distribution_step = my_actor_policy.distribution(time_step)

    assert isinstance(distribution_step.action, tfp.distributions.Distribution)

# 3. Implement data collectors aided by replay buffers

See [this tutorial](https://github.com/tensorflow/agents/blob/master/docs/tutorials/5_replay_buffers_tutorial.ipynb)

In [None]:
import tensorflow as tf
from tf_agents.replay_buffers import tf_uniform_replay_buffer

define parameters:

In [None]:
# Widths of one action and one observation.
nMv, nPv = 1, 1

# Number of items written to the buffer per `add_batch` call.
batch_size = 32
# Number of trajectories drawn per sample when reading the buffer back.
sample_batch_size = 4
# Buffer capacity passed as `max_length`.
max_length = 1024
# Consecutive steps per sampled trajectory.
num_steps = 1 + 4

create an instance of replay buffer:

In [None]:
# One buffer item is an (action, observation, reward) triple.
action_item_spec = tf.TensorSpec([nMv,], tf.float32, 'action')
observation_item_spec = tf.TensorSpec([nPv,], tf.float32, 'observation')
reward_item_spec = tf.TensorSpec([], tf.float32, 'reward')

data_spec = (action_item_spec, observation_item_spec, reward_item_spec)

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=data_spec,
    batch_size=batch_size,
    max_length=max_length,
)

add batches of items in the replay buffer:

In [None]:
def collectData(num_batches = 10):
    """
    Generate random batches of items, streaming each one out as soon as it
    has been collected.

    Args:
        num_batches: how many batches to yield (10 by default).

    Yields:
        Tuples (actionBatch, observationBatch, rewardBatch) of standard
        normal samples with shapes (batch_size, nMv), (batch_size, nPv) and
        (batch_size,), read from the module-level `batch_size`, `nMv`, `nPv`.
    """
    for _ in range(num_batches):
        actionBatch = tf.random.normal([batch_size, nMv])
        observationBatch = tf.random.normal([batch_size, nPv])
        rewardBatch = tf.random.normal([batch_size,])
        yield (actionBatch, observationBatch, rewardBatch)
        
# Start from an empty buffer, then store every collected batch as it arrives.
replay_buffer.clear()
for collected in collectData():
    actionBatch, observationBatch, rewardBatch = collected
    replay_buffer.add_batch(collected)

read items from the buffer:

In [None]:
# Expose the buffer as a dataset of `sample_batch_size` trajectories, each
# `num_steps` long, and pull a single sample from it.
dataset = replay_buffer.as_dataset(
    sample_batch_size=sample_batch_size,
    num_steps=num_steps,
)
# The second element of a sample is not needed here.
trajectories, _ = next(iter(dataset))

In [None]:
# (message, position in `trajectories`) pairs, printed in order.
shape_reports = (
    ("The first item of trajectories represents a trajectory of actions, with shape(batch_size, num_steps, nMv)=", 0),
    (", the second, observations, with shape(batch_size, num_steps, nPv)=", 1),
    ("and the last, rewards, with shape(batch_size, num_steps)=", 2),
)
for message, position in shape_reports:
    print(message, trajectories[position].shape)

# 4. Apply an algorithm of RL to design controllers for FOD systems

See [this tutorial](https://github.com/tensorflow/agents/blob/master/docs/tutorials/6_reinforce_tutorial.ipynb)

In [None]:
from tf_agents.environments import tf_py_environment
from tf_agents.agents.reinforce import reinforce_agent
from tf_agents.networks import actor_distribution_network
from tf_agents.trajectories import trajectory

define parameters:

In [None]:
# --- optimisation ---
learning_rate = 1e-3 # @param {type:"number"}
num_iterations = 250 # @param {type:"integer"}

# --- data collection ---
collect_episodes_per_iteration = 2 # @param {type:"integer"}
replay_buffer_capacity = 2000 # @param {type:"integer"}

# --- logging and evaluation ---
log_interval = 25 # @param {type:"integer"}
eval_interval = 50 # @param {type:"integer"}
num_eval_episodes = 10 # @param {type:"integer"}

## 4.1 create environment instances

In [None]:
def createAnEnvironmentInstance():
    """Return a new MyEnv built with its default parameters."""
    environment = MyEnv()
    return environment

create an instance of MyEnv and check its `time_step_spec` and `action_spec`:

In [None]:
env = createAnEnvironmentInstance()
env.reset()

# Show the specs of the plain Python environment, title first.
for title, spec in (
        ('Observation Spec:', env.time_step_spec().observation),
        ('Action Spec:', env.action_spec()),
):
    print(title)
    print(spec)

wrap the Python environments in TFPyEnvironment and check that all the variable specifications are expressed as `TensorSpec`:

In [None]:
# Separate Python environments for training and for evaluation ...
train_py_env, eval_py_env = (createAnEnvironmentInstance() for _ in range(2))

# ... each wrapped in a TFPyEnvironment.
train_env, eval_env = (tf_py_environment.TFPyEnvironment(py_env)
                       for py_env in (train_py_env, eval_py_env))

# Show the specs of the wrapped training environment, title first.
for title, spec in (
        ('Observation Spec:', train_env.time_step_spec().observation),
        ('Action Spec:', train_env.action_spec()),
):
    print(title)
    print(spec)

## 4.2 create an agent instance and generate the data-collect policy and the evaluation one:

In [None]:
def createActionNetworkInstance(env):
    """Return a MyActionNetDistributional sized from the specs of `env`.

    Args:
        env: object providing `observation_spec()` and `action_spec()`.
    """
    observation_spec = env.observation_spec()
    output_spec = env.action_spec()
    return MyActionNetDistributional(
        input_tensor_spec=observation_spec,
        output_tensor_spec=output_spec,
    )

create an instance of stochastic action network:

In [None]:
actor_net = createActionNetworkInstance(train_env)

# Show the specs the network was built with, title first.
for title, spec in (
        ("Input spec.:", actor_net._input_tensor_spec),
        ("Output spec.:", actor_net._output_tensor_spec),
):
    print(title)
    print(spec)

create an agent:

In [None]:
# Counter handed to the agent as its train-step counter.
train_step_counter = tf.compat.v2.Variable(0)

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# REINFORCE agent built on the training environment's specs and the
# stochastic action network created above.
tf_agent = reinforce_agent.ReinforceAgent(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
    actor_network=actor_net,
    optimizer=optimizer,
    normalize_returns=True,
    train_step_counter=train_step_counter,
)
tf_agent.initialize()

create two policies: one to be deployed (and evaluated), the other to be used for collecting data:

In [None]:
# The agent's main policy is used for evaluation, its collect policy for
# gathering training data.
eval_policy, collect_policy = tf_agent.policy, tf_agent.collect_policy

## 4.3 create an instance of ReplayBuffer and define a process to collect trajectories

In [None]:
def createAnInstanceOfReplayBuffer(data_spec, batch_size=1, max_length=2**10):
    """Return a TFUniformReplayBuffer for items matching `data_spec`.

    Args:
        data_spec: spec of one item stored in the buffer.
        batch_size: forwarded as the buffer's `batch_size` (default 1).
        max_length: forwarded as the buffer's `max_length` (default 2**10).
    """
    buffer_kwargs = {
        'data_spec': data_spec,
        'batch_size': batch_size,
        'max_length': max_length,
    }
    return tf_uniform_replay_buffer.TFUniformReplayBuffer(**buffer_kwargs)

In [None]:
def collectTrajectories(environment, policy, replay_buffer, policy_state = (), num_episodes = None):
    """
    Empty the replay buffer and refill it with whole episodes.

    The environment is reset and then stepped with `policy` until
    `num_episodes` episode boundaries have been recorded; every transition,
    including each boundary one, is written to `replay_buffer`. The loop only
    ends once the environment has produced that many boundaries.

    Args:
        environment: TF environment to collect from.
        policy: policy that selects the actions.
        replay_buffer: buffer that receives the trajectories; it is cleared
            first.
        policy_state: initial state handed to the policy.
        num_episodes: number of episodes to collect; defaults to the
            module-level `collect_episodes_per_iteration`.
    """
    if num_episodes is None:
        num_episodes = collect_episodes_per_iteration

    replay_buffer.clear()
    time_step = environment.reset()

    episodes_done = 0
    while episodes_done < num_episodes:
        action_step = policy.action(time_step, policy_state)
        # Step with the action itself, not with the whole PolicyStep.
        next_time_step = environment.step(action_step.action)
        traj = trajectory.from_transition(time_step, action_step, next_time_step)

        replay_buffer.add_batch(traj)

        # A boundary trajectory closes an episode; summing counts one per
        # batch entry that just finished.
        episodes_done += int(tf.reduce_sum(tf.cast(traj.is_boundary(), tf.int32)))

        time_step = next_time_step
        policy_state = action_step.state

In [None]:
# The buffer stores exactly what the agent declares it collects.
collect_data_spec = tf_agent.collect_data_spec
replay_buffer = createAnInstanceOfReplayBuffer(data_spec=collect_data_spec)

do a test run of the data-collection step:

In [None]:
# Trial run: gather data from the training environment with the collect policy.
collect_kwargs = {
    'environment': train_env,
    'policy': collect_policy,
    'replay_buffer': replay_buffer,
}
collectTrajectories(**collect_kwargs)