* This document is based on the following tf_agents tutorials:
    * [tutorials](https://github.com/tensorflow/agents/tree/master/docs/tutorials)

In [None]:
import tensorflow as tf
import numpy as np

from tf_agents.environments import py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step

# 1. Define an environment class that inherits from PyEnvironment
Instances of this class represent first-order delay (lag) systems.

In [None]:
class MyEnv(py_environment.PyEnvironment):
    '''First-order delay (lag) system wrapped as a tf_agents PyEnvironment.

    Continuous-time model (transfer function and the equivalent ODE):

        Y(s) = K / (1 + T*s) * U(s)

        T * dy(t)/dt = -y(t) + K * u(t),  t > 0,
        y(0) = y_init.

    Discrete-time recursion that `_step` actually simulates (the ODE above
    discretized with a unit time step):

        x(t+1) = (1 - 1/T) * x(t) + K/T * u(t),  t = 0, 1, 2, ...
        x(0) = x_init,
        y(t) = x(t).

    The reward is the negative absolute tracking error with respect to a
    fixed set value of 1.0, so a smaller error yields a larger reward.

    Args:
        nStepSimulation: number of state updates per episode. The `_step`
            call issued after that many updates returns the terminal
            time step.
        T: time constant of the system; must be positive.
        K: static gain of the system.
        discount: discount attached to every non-terminal transition.

    Raises:
        ValueError: if `T` is not positive.
    '''

    def __init__(self, nStepSimulation = 100, T = 10, K = 1.0, discount = 0.9):
        # Let the base class set up its own bookkeeping before we add ours.
        super().__init__()

        if T <= 0:
            # 1/T appears in the update rule; fail early with a clear message
            # instead of a ZeroDivisionError (or a diverging state) in _step.
            raise ValueError('T must be positive, got {}'.format(T))

        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.float32, minimum=-1, maximum=1, name='action')

        # NOTE(review): the [-1, 1] bound presumably relies on |K| <= 1 and
        # T >= 1 with actions inside the action spec -- confirm before using
        # other parameter values.
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.float32, minimum=-1, maximum=1, name='observation')

        self._state = self.getInitialState()
        self._episode_ended = False
        self.time = 0
        self.nStepSimulation = nStepSimulation
        self.T = T
        self.K = K
        self.discount = discount

    def getInitialState(self):
        '''Return the state x(0) every episode starts from.'''
        return 0.

    def getObservation(self):
        '''Return the current state as a float32 scalar array, i.e. y(t) = x(t).'''
        return np.array(self._state, np.float32) # (,)

    def getReward(self):
        '''Return the negative absolute error between the set value and y(t).'''
        sv = 1.0  # set value (target) the output should track
        err = sv - self.getObservation()
        # A reward has to grow as the error shrinks, hence the minus sign.
        return -np.abs(err)

    def action_spec(self):
        '''Return the spec of the scalar action u(t).'''
        return self._action_spec

    def observation_spec(self):
        '''Return the spec of the scalar observation y(t).'''
        return self._observation_spec

    def _reset(self):
        '''Rewind time and state, and return the first time step of an episode.'''
        self.time = 0
        self._state = self.getInitialState()
        self._episode_ended = False

        return time_step.restart(self.getObservation())

    def _step(self, action):
        '''Advance the system by one step under `action` and return the time step.'''
        if self._episode_ended:
            # The last action ended the episode. Ignore the current action and start
            # a new episode.
            return self.reset()

        if self.time < self.nStepSimulation:
            # x(t+1) = (1 - 1/T) * x(t) + K/T * u(t)
            self._state = (1-1/self.T) * self._state + self.K/self.T * action
            self.time += 1
            return time_step.transition(
                self.getObservation(),
                reward = self.getReward(),
                discount = self.discount)

        # All nStepSimulation updates are done: this call only closes the
        # episode, so the state is not advanced and `action` is not applied.
        self._episode_ended = True
        return time_step.termination(self.getObservation(), reward = self.getReward())

In [None]:
def aSimpleUnitTest():
    '''Check that MyEnv is a PyEnvironment and passes the tf_agents validator.

    The validator is run for five episodes.
    '''
    environment = MyEnv()
    assert isinstance(environment, py_environment.PyEnvironment)

    n_episodes = 5
    utils.validate_py_environment(environment, episodes=n_episodes)

def anotherSimpleUnitTest():
    '''Run one full MyEnv episode under a constant random action.

    While stepping, an exponential moving average of the reward is
    maintained; the test passes if the episode terminates and that average
    is a finite number.
    '''
    env = MyEnv()
    assert isinstance(env, py_environment.PyEnvironment)

    # Draw a random constant action, clipped so that it respects the bounds
    # declared by the environment's action spec.
    spec = env.action_spec()
    u = np.array(
        np.clip(np.random.randn(), spec.minimum, spec.maximum), np.float32) # (,)

    smoothing = 1/10  # weight of the newest reward in the moving average

    # Named `current_step` (not `time_step`) so the imported `time_step`
    # module is not shadowed inside this function.
    current_step = env.reset()
    rewardAvg = current_step.reward
    while not current_step.is_last():
        current_step = env.step(u)
        rewardAvg = (1 - smoothing) * rewardAvg + smoothing * current_step.reward

    assert np.isfinite(rewardAvg)

In [None]:
# Run both environment checks, in the order they were defined.
for run_check in (aSimpleUnitTest, anotherSimpleUnitTest):
    run_check()

## 2. Represent P-controllers by deterministic or stochastic policy networks

MyActionNetDeterministic and MyActionNetDistributional are implementations of a P-controller with saturated (bounded) outputs.

In [None]:
import tensorflow as tf
import tensorflow_probability as tfp

from tf_agents.specs import tensor_spec
from tf_agents.networks import network
from tf_agents.policies import actor_policy

from tf_agents.trajectories import time_step as ts

In [None]:
class MyActionNetDeterministic(network.Network):
    '''Deterministic P-controller with a saturated output.

    A single Dense layer followed by tanh maps an observation to an action,
    so every action component lies in (-1, 1).

    Args:
        input_tensor_spec: spec of the observations fed to the network.
        output_tensor_spec: spec of the actions to produce; its shape fixes
            the number of units of the Dense layer and the trailing
            dimensions of the returned tensor.
    '''

    def __init__(self, input_tensor_spec, output_tensor_spec):
        super().__init__(
            input_tensor_spec=input_tensor_spec,
            state_spec=(),
            name='ActionNet')
        self._output_tensor_spec = output_tensor_spec
        # Size the layer from the spec handed to this constructor rather than
        # from a module-level `action_spec`, so the class works wherever it is
        # instantiated.
        # NOTE(review): tanh saturates at +/-1; this presumably matches an
        # action spec bounded by [-1, 1] -- confirm for other bounds.
        self._layer = tf.keras.layers.Dense(
            output_tensor_spec.shape.num_elements(), activation=tf.nn.tanh)

    def call(self, observations, step_type=(), network_state=()):
        '''Map a batch of observations to a batch of actions.

        Args:
            observations: observations; cast to float32 before use.
            step_type: unused.
            network_state: passed through unchanged (the network is stateless).

        Returns:
            A tuple `(actions, network_state)`, where `actions` is reshaped
            to `[-1] + output_tensor_spec.shape`.
        '''
        del step_type  # not needed by a memoryless controller

        _observations = tf.cast(observations, dtype=tf.float32)
        _actions = self._layer(_observations)
        _actions = tf.reshape(
            _actions, [-1] + self._output_tensor_spec.shape.as_list()) # (batch, nMv)

        return _actions, network_state

In [None]:
class MyActionNetDistributional(network.Network):
    '''Stochastic P-controller: a Gaussian around a saturated mean action.

    The mean is produced by a single Dense layer followed by tanh, so it lies
    in (-1, 1). The standard deviation is `exp(log_std)`, where `log_std` is
    one scalar variable shared by all action components and initialised to 0.

    Args:
        input_tensor_spec: spec of the observations fed to the network.
        output_tensor_spec: spec of the actions to produce; its shape fixes
            the number of units of the Dense layer and the trailing
            dimensions of the distribution's mean.
    '''

    def __init__(self, input_tensor_spec, output_tensor_spec):
        super().__init__(
            input_tensor_spec=input_tensor_spec,
            state_spec=(),
            name='ActionNet')
        self._output_tensor_spec = output_tensor_spec
        # Size the layer from the spec handed to this constructor rather than
        # from a module-level `action_spec`, so the class works wherever it is
        # instantiated.
        self._layer = tf.keras.layers.Dense(
            output_tensor_spec.shape.num_elements(), activation=tf.nn.tanh)
        self._log_action_std = tf.Variable(tf.zeros(shape=())) # (,)

    def call(self, observations, step_type=(), network_state=()):
        '''Map a batch of observations to a batch of action distributions.

        Args:
            observations: observations; cast to float32 before use.
            step_type: unused.
            network_state: passed through unchanged (the network is stateless).

        Returns:
            A tuple `(distribution, network_state)`, where `distribution` is
            a `MultivariateNormalDiag` whose mean is reshaped to
            `[-1] + output_tensor_spec.shape`.
        '''
        del step_type  # not needed by a memoryless controller

        _observations = tf.cast(observations, dtype=tf.float32)
        _actions = self._layer(_observations)
        _actions = tf.reshape(
            _actions, [-1] + self._output_tensor_spec.shape.as_list()) # (batch, nMv)
        # Broadcast the shared scalar std to the shape of the mean.
        _action_std = tf.ones_like(_actions) * tf.math.exp(self._log_action_std) # (batch, nMv)

        return tfp.distributions.MultivariateNormalDiag(
            loc=_actions, scale_diag=_action_std), network_state

In [None]:
def createAnInstanceOfDeterministicPolicy(input_tensor_spec, action_spec):
    '''Build an ActorPolicy driven by a MyActionNetDeterministic network.

    Args:
        input_tensor_spec: spec of the observations.
        action_spec: spec of the actions.

    Returns:
        An `actor_policy.ActorPolicy` whose time-step spec is derived from
        `input_tensor_spec`.
    '''
    policy_kwargs = dict(
        time_step_spec=ts.time_step_spec(input_tensor_spec),
        action_spec=action_spec,
        actor_network=MyActionNetDeterministic(input_tensor_spec, action_spec),
    )
    return actor_policy.ActorPolicy(**policy_kwargs)

In [None]:
def createAnInstanceOfDistributionalPolicy(input_tensor_spec, action_spec):
    '''Build an ActorPolicy driven by a MyActionNetDistributional network.

    Args:
        input_tensor_spec: spec of the observations.
        action_spec: spec of the actions.

    Returns:
        An `actor_policy.ActorPolicy` whose time-step spec is derived from
        `input_tensor_spec`.
    '''
    policy_kwargs = dict(
        time_step_spec=ts.time_step_spec(input_tensor_spec),
        action_spec=action_spec,
        actor_network=MyActionNetDistributional(input_tensor_spec, action_spec),
    )
    return actor_policy.ActorPolicy(**policy_kwargs)

In [None]:
# Smoke test: build both policies and query each one on a random batch.
nPv = 1             # number of process variables (observation size)
nMv = 1             # number of manipulated variables (action size)
batch_size = 2**5

input_tensor_spec = tensor_spec.TensorSpec((nPv,), tf.float32)

action_spec = tensor_spec.BoundedTensorSpec((nMv,),
                                            tf.float32,
                                            minimum=-1,
                                            maximum=1)

for my_actor_policy in (
        createAnInstanceOfDeterministicPolicy(input_tensor_spec, action_spec),
        createAnInstanceOfDistributionalPolicy(input_tensor_spec, action_spec)):

    observations = tf.random.normal(shape=(batch_size, nPv))

    # Deliberately NOT named `time_step`: that module-level name is the
    # tf_agents `time_step` module used by MyEnv, and rebinding it here would
    # break every later use of the environment.
    first_time_step = ts.restart(observations, batch_size) # is_first() is True

    action_step = my_actor_policy.action(first_time_step)
    assert tuple(action_step.action.shape) == (batch_size, nMv)

    distribution_step = my_actor_policy.distribution(first_time_step)

    assert isinstance(distribution_step.action, tfp.distributions.Distribution)