# Debugging A_R_Actor, P_Q_Actor, A_R_P_Q_Critic

- Use Keras Version 2.2.5. If you use tf.keras it causes some error. (In the tf.keras.backend.function)


### A_R_Actor

In [1]:
'''
Sites and pages that are helpful:
DDPG: https://spinningup.openai.com/en/latest/algorithms/ddpg.html
DDPG Code: https://github.com/germain-hug/Deep-RL-Keras/tree/master/DDPG
DDPG Code Updated: https://github.com/samhiatt/ddpg_agent/blob/master/ddpg_agent/agents/agent.py
Keras.Gradient: https://www.tensorflow.org/api_docs/python/tf/gradients
Keras.Function: https://www.tensorflow.org/api_docs/python/tf/keras/backend/function
Zip: https://www.geeksforgeeks.org/zip-in-python/
TensorFlow version used: less than 2.0.0 as then tf.gradients does work
'''

import tensorflow as tf
from keras import layers, models, optimizers
from keras import backend as K
import collections
import random
import numpy as np
import math
import copy


class A_R_Actor:
    '''
        Actor Class for Accept Reject Network
    '''

    def __init__(self, inp_dim, lr, gaussian_std, out_dim):

        self.inp_dim = inp_dim
        self.out_dim = out_dim
        self.lr = lr
        self.gaussian_std = gaussian_std
        self.network()

    def network(self):

        inputs = layers.Input(shape=(self.inp_dim,))
        #
        x = layers.Dense(32, activation='relu')(inputs)
        x = layers.BatchNormalization()(x)
        x = layers.GaussianNoise(self.gaussian_std)(x)
        #
        x = layers.BatchNormalization()(x)
        x = layers.GaussianNoise(self.gaussian_std)(x)
        #
        outputs = layers.Dense(self.out_dim, activation='sigmoid')(x)
        #
        self.model = models.Model(input=inputs, output=outputs)

        # Define loss function using action value (Q value) gradients
        action_gradients = layers.Input(shape=(self.out_dim,))
        self.test = -action_gradients * outputs
        self.loss = K.mean(-action_gradients * outputs)
        loss = K.mean(-action_gradients * outputs)

        # Define optimizer and training function
        optimizer = optimizers.Adam()
        updates_op = optimizer.get_updates(params=self.model.trainable_weights, loss=loss)
        self.train_fn = K.function(
            inputs=[self.model.input, action_gradients, K.learning_phase()],
            outputs=[loss],
            updates=updates_op)

    def summary(self):

        print(self.model.summary())

    def action(self, state):

        return self.model.predict(state)

    def train(self, states, action_gradients):

        #Grads will be supplied by the overall critic
        return self.train_fn([states, action_gradients, 1])

    def save(self, path):

        self.model.save_weights(path + '_A_R_Actor.h5')

    def load_weights(self, path):

        self.model.load_weights(path)



Using TensorFlow backend.


### P_Q_Actor

In [2]:
'''
Sites and pages that are helpful:
DDPG: https://spinningup.openai.com/en/latest/algorithms/ddpg.html
DDPG Code: https://github.com/germain-hug/Deep-RL-Keras/tree/master/DDPG
DDPG Code Updated: https://github.com/samhiatt/ddpg_agent/blob/master/ddpg_agent/agents/agent.py
Keras.Gradient: https://www.tensorflow.org/api_docs/python/tf/gradients
Keras.Function: https://www.tensorflow.org/api_docs/python/tf/keras/backend/function
Zip: https://www.geeksforgeeks.org/zip-in-python/
TensorFlow version used: less than 2.0.0 as then tf.gradients does work
'''


import tensorflow as tf
from keras import layers, models, optimizers
from keras import backend as K
import collections
import random
import numpy as np
import math
import copy



class P_Q_Actor:
    '''
        Actor Class for Accept Reject Network
    '''

    def __init__(self, inp_dim, lr, gaussian_std, out_dim):
        self.inp_dim = inp_dim
        self.out_dim = out_dim
        self.lr = lr
        self.gaussian_std = gaussian_std
        self.network()

    def network(self):
        inputs = layers.Input(shape=(self.inp_dim,))
        #
        x = layers.Dense(32, activation='relu')(inputs)
        x = layers.BatchNormalization()(x)
        x = layers.GaussianNoise(self.gaussian_std)(x)
        #
        x = layers.BatchNormalization()(x)
        x = layers.GaussianNoise(self.gaussian_std)(x)
        #
        output_q_traded = layers.Dense(1, activation='tanh')(x)
        output_q = layers.Dense(self.out_dim, activation='softmax')(x)
        output_p = layers.Dense(self.out_dim, activation='tanh')(x)
        #
        self.model = models.Model(input=inputs, output=[output_p, output_q, output_q_traded])
        '''
            output_q_traded: is a portion of the total energy demanded or supplied by the D_S_Net
            output_q: is the softmax (distribution) of energy amongst buyers and sellers
            output_p: is the price at which the trade will occur
        '''

        p_act_grad = layers.Input(shape=(self.out_dim,))
        q_act_grad = layers.Input(shape=(self.out_dim,))
        q_traded_grad = layers.Input(shape=(1,))

        self.loss_1 = p_act_grad * output_p
        self.loss_2 = q_act_grad * output_q
        self.loss_3 = q_traded_grad * output_q_traded
        loss = K.mean(- p_act_grad * output_p) + K.mean(-q_act_grad * output_q) + K.mean(-q_traded_grad * output_q_traded)

        # Define optimizer and training function
        optimizer = optimizers.Adam()
        updates_op = optimizer.get_updates(params=self.model.trainable_weights, loss=loss)
        self.train_fn = K.function(
            inputs=[self.model.input, p_act_grad, q_act_grad, q_traded_grad, K.learning_phase()],
            outputs=[loss],
            updates=updates_op)

    def summary(self):
        print(self.model.summary())

    def action(self, state):
        return self.model.predict(state)

    def train(self, states, p_grads, q_grads,  q_traded_grads):
        # Grads will be supplied by the overall critic
        return self.train_fn([states, p_grads, q_grads, q_traded_grads, 1])

    def save(self, path):
        self.model.save_weights(path + '_P_Q_Actor.h5')

    def load_weights(self, path):
        self.model.load_weights(path)

### A_R_P_Q_Critic

In [13]:
'''
Sites and pages that are helpful:
DDPG: https://spinningup.openai.com/en/latest/algorithms/ddpg.html
DDPG Code: https://github.com/germain-hug/Deep-RL-Keras/tree/master/DDPG
DDPG Code Updated: https://github.com/samhiatt/ddpg_agent/blob/master/ddpg_agent/agents/agent.py
Keras.Gradient: https://www.tensorflow.org/api_docs/python/tf/gradients
Keras.Function: https://www.tensorflow.org/api_docs/python/tf/keras/backend/function
Zip: https://www.geeksforgeeks.org/zip-in-python/
TensorFlow version used: less than 2.0.0 as then tf.gradients does work
'''

import tensorflow as tf
from keras import layers, models, optimizers
from keras import backend as K
import collections
import random
import numpy as np
import math
import copy


class A_R_P_Q_Critic:

    def __init__(self, a_r_inp_dim, p_q_inp_dim, a_r_act_dim, p_act_dim, q_act_dim, q_traded_act_dim=1):
        self.a_r_inp_dim = a_r_inp_dim
        self.p_q_inp_dim = p_q_inp_dim
        self.a_r_act_dim = a_r_act_dim
        self.p_act_dim = p_act_dim
        self.q_act_dim = q_act_dim
        self.q_traded_act_dim = q_traded_act_dim
        self.network()

    def network(self):

        a_r_inp = layers.Input(shape=(self.a_r_inp_dim,))
        p_q_inp = layers.Input(shape=(self.p_q_inp_dim,))
        a_r_act_inp = layers.Input(shape=(self.a_r_act_dim,))
        p_act_inp = layers.Input(shape=(self.p_act_dim,))
        q_act_inp = layers.Input(shape=(self.q_act_dim,))
        q_traded_act_inp = layers.Input(shape=(self.q_traded_act_dim,))
        #
        x = layers.concatenate([a_r_inp, p_q_inp, a_r_act_inp, p_act_inp, q_act_inp, q_traded_act_inp], axis=-1)
        x = layers.Dense(32, activation='relu')(x)
        x = layers.BatchNormalization()(x)
        #
        x = layers.Dense(32, activation='relu')(x)
        x = layers.BatchNormalization()(x)
        #
        output = layers.Dense(1, activation='linear')(x)

        self.model = models.Model(inputs=[a_r_inp, p_q_inp, a_r_act_inp, p_act_inp, q_act_inp, q_traded_act_inp], outputs=output)

        optimizer = optimizers.Adam()
        self.model.compile(optimizer=optimizer, loss='mse')
        
        self.get_action_gradients = K.function( inputs= [self.model.input[0], self.model.input[1], self.model.input[2],
                                                       self.model.input[3], self.model.input[4], self.model.input[5], K.learning_phase()],
                                                      outputs = K.gradients(self.model.output,
                                                                                 [self.model.input[2],
                                                                                  self.model.input[3],
                                                                                  self.model.input[4],
                                                                                  self.model.input[5]]))


    def gradients(self, a_r_inp, p_q_inp, a_r_act_inp, p_act_inp, q_act_inp, q_traded_act_inp):

        return self.get_action_gradients([a_r_inp, p_q_inp, a_r_act_inp, p_act_inp, q_act_inp, q_traded_act_inp, 0])

    def reward_value(self, a_r_inp, p_q_inp, a_r_act_inp, p_act_inp, q_act_inp, q_traded_act_inp):

        return self.model.predict([a_r_inp, p_q_inp, a_r_act_inp, p_act_inp, q_act_inp, q_traded_act_inp])

    def train(self, a_r_inp, p_q_inp, a_r_act_inp, p_act_inp, q_act_inp, q_traded_act_inp,
              a_r_inp_next, p_q_inp_next, a_r_act_inp_next, p_act_inp_next, q_act_inp_next, q_traded_act_inp_next,
              total_rewards):
      
        y = self.model.predict([a_r_inp_next, p_q_inp_next, a_r_act_inp_next, p_act_inp_next, q_act_inp_next, q_traded_act_inp_next])

        for i in range(len(total_rewards)):
            if total_rewards[i] == [0.0]:
                y[i] = total_rewards[i]

        loss = self.model.train_on_batch(x=[a_r_inp, p_q_inp, a_r_act_inp, p_act_inp, q_act_inp, q_traded_act_inp], y=y)

        return loss

    def save(self, path):

        self.model.save_weights(path + '_A_R_P_Q_Critic.h5')

    def load_weights(self, path):

        self.model.load_weights(path)

### Debugging the A_R_P_Q_Critic

In [4]:
A_R_P_Q_Critic_ = A_R_P_Q_Critic(5, 5, 6, 7, 7)








In [5]:
# Data
a_r_inp = np.array([[1,2,3,4,5],[1,2,3,4,5]]).astype(np.float32)
p_q_inp = np.array([[1,2,3,4,5],[1,2,3,4,5]]).astype(np.float32)
a_r_act = np.array([[1,2,3,4,5,6],[1,2,3,4,5,6]]).astype(np.float32).reshape(-1, 6)
p_act = np.array([[1,2,3,4,5,6,7],[1,2,3,4,5,6,7]]).astype(np.float32).reshape(-1, 7)
q_act = np.array([[1,2,3,4,5,6,7], [1,2,3,4,5,6,7]]).astype(np.float32).reshape(-1, 7)
q_traded = np.array([[1],[2]]).reshape(-1,1)
reward_1 = np.array([[1], [1]]).reshape(-1,1)
reward_0 = np.array([[0], [0]]).reshape(-1,1)

In [6]:
# Gradients
A_R_P_Q_Critic_.gradients(a_r_inp, p_q_inp, a_r_act, p_act, q_act, q_traded)









[array([[-0.09060257,  0.05156034,  0.04906118, -0.19119526, -0.15446454,
         -0.04544139],
        [-0.09060257,  0.05156034,  0.04906118, -0.19119526, -0.15446454,
         -0.04544139]], dtype=float32),
 array([[ 0.14027981, -0.20565191,  0.12181181,  0.06962101,  0.08803711,
         -0.07280227,  0.1196762 ],
        [ 0.14027981, -0.20565191,  0.12181181,  0.06962101,  0.08803711,
         -0.07280227,  0.1196762 ]], dtype=float32),
 array([[-0.04523172, -0.04033513, -0.23001516, -0.0611654 , -0.2211808 ,
          0.18053871, -0.09471793],
        [-0.04523172, -0.04033513, -0.23001516, -0.0611654 , -0.2211808 ,
          0.18053871, -0.09471793]], dtype=float32),
 array([[-0.05013109],
        [-0.05013109]], dtype=float32)]

In [7]:
# Reward Value
A_R_P_Q_Critic_.reward_value(a_r_inp, p_q_inp, a_r_act, p_act, q_act, q_traded)

array([[-1.623348 ],
       [-1.6734791]], dtype=float32)

In [10]:
# Training
A_R_P_Q_Critic_.train(a_r_inp, p_q_inp, a_r_act, p_act, q_act, q_traded,a_r_inp, p_q_inp, a_r_act, p_act, q_act, q_traded, reward_0)





0.39946878

In [11]:
# Training
A_R_P_Q_Critic_.train(a_r_inp, p_q_inp, a_r_act, p_act, q_act, q_traded,a_r_inp, p_q_inp, a_r_act, p_act, q_act, q_traded,reward_1)

116.73743

In [12]:
# Getting Individual Gradients
[a_r_grad, p_grad, q_grad, q_traded_grad] = A_R_P_Q_Critic_.gradients(a_r_inp, p_q_inp, a_r_act, p_act, q_act, q_traded)

### Debugging the A_R_Actor

In [14]:
A_R_Actor_ = A_R_Actor(5, 0.1, 0.1, 6)






In [15]:
# Action
A_R_Actor_.action(a_r_inp)

array([[0.3539759 , 0.04806849, 0.6060275 , 0.52506167, 0.20672342,
        0.75820374],
       [0.3539759 , 0.04806849, 0.6060275 , 0.52506167, 0.20672342,
        0.75820374]], dtype=float32)

In [16]:
# Gradients
A_R_Actor_.train(a_r_inp, a_r_grad)

[5.6140823]

In [17]:
### To understand whats happening internally
A_R_Actor_.test

<tf.Tensor 'mul:0' shape=(?, 6) dtype=float32>

In [18]:
### To understand whats happening internally
A_R_Actor_.loss

<tf.Tensor 'Mean:0' shape=() dtype=float32>

In [19]:
### To understand whats happening internally
np.array([1, 2]) * np.array([3,4])

array([3, 8])

### Debugging the P_Q_Actor

In [20]:
P_Q_Actor_ = P_Q_Actor(5, 0.1, 0.01, 7)



In [21]:
# Action
P_Q_Actor_.action(a_r_inp)

[array([[ 0.12405473,  0.37628353, -0.07276691,  0.69627243,  0.9998546 ,
          0.06685907, -0.6633774 ],
        [ 0.12405473,  0.37628353, -0.07276691,  0.69627243,  0.9998546 ,
          0.06685907, -0.6633774 ]], dtype=float32),
 array([[0.02626349, 0.01357553, 0.00766519, 0.8655479 , 0.07685392,
         0.0026146 , 0.00747942],
        [0.02626349, 0.01357553, 0.00766519, 0.8655479 , 0.07685392,
         0.0026146 , 0.00747942]], dtype=float32),
 array([[-0.79066193],
        [-0.79066193]], dtype=float32)]

In [22]:
# Getting Individual Actions
[p_output, q_output, q_traded_output] = P_Q_Actor_.action(a_r_inp)

In [23]:
# Training
P_Q_Actor_.train(a_r_inp, p_grad, q_grad, q_traded_grad)

[1.8229717]

### Checking if the training does cause an improvement

In [24]:
# Action
P_Q_Actor_.action(a_r_inp)

[array([[ 0.12606578,  0.38212767, -0.06446828,  0.6932459 ,  0.99985516,
          0.06563277, -0.6660674 ],
        [ 0.12606578,  0.38212767, -0.06446828,  0.6932459 ,  0.99985516,
          0.06563277, -0.6660674 ]], dtype=float32),
 array([[0.02639188, 0.01365938, 0.00771181, 0.8647045 , 0.07742614,
         0.00264839, 0.00745786],
        [0.02639188, 0.01365938, 0.00771181, 0.8647045 , 0.07742614,
         0.00264839, 0.00745786]], dtype=float32),
 array([[-0.7834161],
        [-0.7834161]], dtype=float32)]

In [25]:
# Training
P_Q_Actor_.train(a_r_inp, p_grad, q_grad, q_traded_grad)

[1.2958459]

In [26]:
# Action
P_Q_Actor_.action(a_r_inp)

[array([[ 0.12033914,  0.38290933, -0.05938209,  0.69127613,  0.9998548 ,
          0.06264965, -0.66897184],
        [ 0.12033914,  0.38290933, -0.05938209,  0.69127613,  0.9998548 ,
          0.06264965, -0.66897184]], dtype=float32),
 array([[0.02657991, 0.01369545, 0.00770971, 0.8645353 , 0.07735843,
         0.00266778, 0.00745341],
        [0.02657991, 0.01369545, 0.00770971, 0.8645353 , 0.07735842,
         0.00266778, 0.00745341]], dtype=float32),
 array([[-0.7777392],
        [-0.7777392]], dtype=float32)]

In [27]:
### To understand whats happening internally
P_Q_Actor_.loss_1

<tf.Tensor 'mul_44:0' shape=(?, 7) dtype=float32>

In [28]:
### To understand whats happening internally
P_Q_Actor_.loss_2

<tf.Tensor 'mul_45:0' shape=(?, 7) dtype=float32>

In [29]:
### To understand whats happening internally
P_Q_Actor_.loss_3

<tf.Tensor 'mul_46:0' shape=(?, 1) dtype=float32>