# LSTM-DQN

In [1]:
import tensorflow.compat.v1 as tf
import numpy as np
import gym
import time
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
#####################  hyper parameters  ####################

MAX_EPISODES = 200      # number of training episodes
MAX_EP_STEPS = 200      # environment steps per episode
LR_A = 0.001    # learning rate for actor
LR_C = 0.002    # learning rate for critic
GAMMA = 0.9     # reward discount
TAU = 0.01      # soft replacement
MEMORY_CAPACITY = 10000     # number of transitions kept in the replay buffer
BATCH_SIZE = 32             # transitions sampled per learning step
SEED = 1        # shared seed for the environment, NumPy and TensorFlow

RENDER = False
ENV_NAME = 'Pendulum-v1'

# Seed every source of randomness, not just the environment: NumPy drives the
# exploration noise and replay sampling, TensorFlow drives weight initialisation.
# The TF seed is graph-level, so it must be set before the agent's graph is built.
np.random.seed(SEED)
tf.set_random_seed(SEED)

env = gym.make(ENV_NAME)
# Use the raw environment; episode length is bounded by MAX_EP_STEPS in the training loop.
env = env.unwrapped
env.seed(SEED)

s_dim = env.observation_space.shape[0]   # number of state components
a_dim = env.action_space.shape[0]        # number of action components
a_bound = env.action_space.high          # per-component upper bound of the action

print(s_dim, a_dim, np.shape(a_bound))

3 1 (1,)


In [3]:
###############################  LSTM-DQN  ####################################

class LSTM_DQN(object):
    """Actor-critic agent whose actor feeds the state through one LSTM-style cell.

    Despite the name, the update rule is a deterministic actor-critic scheme:
    an online and a target actor, an online and a target critic, soft target
    updates and a replay buffer of (s, a, r, s_) rows.

    Known limitations (kept from the original design):
      * The LSTM gate weights are shared by the online and the target actor;
        only the dense heads are separate.
      * The initial cell state/output are zero variables that are never
        assigned, so every ``session.run`` starts from zeros. The graph for
        ``S_`` is chained after the graph for ``S``, so evaluating the target
        action requires feeding ``S`` as well, with the same batch size.
    """

    def __init__(self, a_dim, s_dim, a_bound,):
        """Build the replay buffer, the TF1 graph and the session.

        Args:
            a_dim: number of action components.
            s_dim: number of state components.
            a_bound: scale applied to the actor's tanh output.
        """
        num_nodes = 64  # width of the LSTM cell

        # One row per transition, laid out as [s, a, r, s_].
        self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
        self.pointer = 0  # total number of transitions stored so far
        self.sess = tf.Session()

        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound
        self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
        self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')

        def make_gate(scope_name):
            # NOTE: the initializer arguments are (mean, stddev), i.e. a truncated
            # normal centred on -0.1, not a [-0.1, 0.1] range. Values kept as they were.
            gate_init = tf.truncated_normal_initializer(mean=-0.1, stddev=0.1)
            with tf.variable_scope(scope_name, initializer=gate_init):
                return self._generate_w_b(
                    x_weights_size=[s_dim, num_nodes],
                    # Recurrent weights multiply the previous cell output,
                    # which has num_nodes columns.
                    m_weights_size=[num_nodes, num_nodes],
                    biases_size=[1, num_nodes])

        self.ix, self.im, self.ib = make_gate("input")
        self.cx, self.cm, self.cb = make_gate("memory")
        self.fx, self.fm, self.fb = make_gate("forget")
        self.ox, self.om, self.ob = make_gate("output")

        # Initial cell output/state; both are num_nodes wide and broadcast over the batch.
        self.saved_output = tf.Variable(tf.zeros([1, num_nodes]), trainable=False)
        self.saved_state = tf.Variable(tf.zeros([1, num_nodes]), trainable=False)

        with tf.variable_scope('Actor'):
            self.a = self._run(self.S, scope='eval', trainable=True)
            a_ = self._run(self.S_, scope='target', trainable=False)

        with tf.variable_scope('Critic'):
            self.q = self._build_c(self.S, self.a, scope='eval', trainable=True)
            q_ = self._build_c(self.S_, a_, scope='target', trainable=False)

        # networks parameters
        # get_collection filters with re.match, i.e. by prefix; the trailing '/'
        # keeps one scope from matching another scope that merely starts the same.
        gate_params = [self.ix, self.im, self.ib,
                       self.cx, self.cm, self.cb,
                       self.fx, self.fm, self.fb,
                       self.ox, self.om, self.ob]
        ae_head_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval/')
        # The gate variables live outside 'Actor/eval', so they are added
        # explicitly; otherwise the actor optimizer would never update them.
        self.a_params = gate_params + ae_head_params
        self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target/')
        self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval/')
        self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target/')

        # Soft-update both target heads (actor and critic) towards their online twins.
        self.soft_replace = [tf.assign(t, (1 - TAU) * t + TAU * e)
                             for t, e in zip(self.at_params + self.ct_params,
                                             ae_head_params + self.ce_params)]

        q_target = self.R + GAMMA * q_

        td_error = tf.losses.mean_squared_error(labels=q_target, predictions=self.q)
        self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=self.ce_params)

        a_loss = -tf.reduce_mean(self.q)    # maximize the q
        self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=self.a_params)

        self.sess.run(tf.global_variables_initializer())

    def _generate_w_b(self, x_weights_size, m_weights_size, biases_size):
        """Create the three variables of one LSTM gate in the current variable scope.

        Args:
            x_weights_size: shape of the input-to-gate weights.
            m_weights_size: shape of the previous-output-to-gate weights.
            biases_size: shape of the gate bias (initialised to zero).

        Returns:
            Tuple ``(x_weights, m_weights, biases)``.
        """
        x_w = tf.get_variable("x_weights", x_weights_size, trainable=True)
        # "m_weigths" (sic): the variable name is left as it was so graph names do not change.
        m_w = tf.get_variable("m_weigths", m_weights_size, trainable=True)
        # Shape comes from the argument, not from the notebook-level `a_dim`.
        b = tf.get_variable("biases", biases_size,
                            initializer=tf.constant_initializer(0.0), trainable=True)
        return x_w, m_w, b

    def _run(self, input, scope, trainable):
        """Apply one LSTM cell step plus a dense head and return the scaled action.

        Args:
            input: float tensor with ``s_dim`` columns.
            scope: variable scope for the dense head ('eval' or 'target').
            trainable: whether the dense head's variables are trainable.

        Returns:
            Tensor with ``a_dim`` columns: tanh output multiplied by ``a_bound``.

        Side effect: ``self.saved_state`` / ``self.saved_output`` are rebound to
        this step's state/output tensors, so the next call continues from them.
        """
        with tf.variable_scope(scope):
            prev_state = self.saved_state
            prev_output = self.saved_output
            forget_gate = tf.sigmoid(
                tf.matmul(input, self.fx) + tf.matmul(prev_output, self.fm) + self.fb)
            input_gate = tf.sigmoid(
                tf.matmul(input, self.ix) + tf.matmul(prev_output, self.im) + self.ib)
            update = tf.matmul(input, self.cx) + tf.matmul(prev_output, self.cm) + self.cb
            state = prev_state * forget_gate + tf.tanh(update) * input_gate
            output_gate = tf.sigmoid(
                tf.matmul(input, self.ox) + tf.matmul(prev_output, self.om) + self.ob)
            res = output_gate * tf.tanh(state)
            self.saved_state = state
            # Carry the NEW cell output forward (the old code re-stored the previous one).
            self.saved_output = res

            net = tf.layers.dense(res, 32, activation=tf.nn.relu, name='l1', trainable=trainable)
            a = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable)

            return tf.multiply(a, self.a_bound, name='scaled_a')

    def choose_action(self, s):
        """Return the online actor's action for a single state vector ``s``."""
        return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0]

    def learn(self):
        """Run one soft target update, one actor step and one critic step.

        A batch of ``BATCH_SIZE`` transitions is drawn (with replacement) from the
        rows that have actually been written. Does nothing if the buffer is empty.
        """
        capacity = self.memory.shape[0]
        n_stored = min(self.pointer, capacity)
        if n_stored == 0:
            return  # nothing to learn from yet

        self.sess.run(self.soft_replace)

        # Sample only filled rows so never-written zero rows are not trained on.
        indices = np.random.choice(n_stored, size=BATCH_SIZE)
        bt = self.memory[indices, :]

        bs = bt[:, :self.s_dim]
        ba = bt[:, self.s_dim: self.s_dim + self.a_dim]
        br = bt[:, -self.s_dim - 1: -self.s_dim]
        bs_ = bt[:, -self.s_dim:]

        self.sess.run(self.atrain, {self.S: bs})
        # Feeding self.a replaces the actor output with the stored actions for the critic update.
        self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_})

    def store_transition(self, s, a, r, s_):
        """Write one (s, a, r, s_) transition into the ring buffer, overwriting the oldest row."""
        transition = np.hstack((s, a, [r], s_))
        index = self.pointer % self.memory.shape[0]  # replace the old memory with new memory
        self.memory[index, :] = transition
        self.pointer += 1

    def _build_c(self, s, a, scope, trainable):
        """Build a critic: one hidden ReLU layer over state and action, then a linear Q(s, a).

        Args:
            s: state tensor with ``s_dim`` columns.
            a: action tensor with ``a_dim`` columns.
            scope: variable scope name ('eval' or 'target').
            trainable: whether the created variables are trainable.

        Returns:
            Tensor with one column holding Q(s, a).
        """
        with tf.variable_scope(scope):
            n_l1 = 30
            w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable)
            w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable)
            b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
            net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
            return tf.layers.dense(net, 1, trainable=trainable)  # Q(s,a)
        
# Instantiate the agent for this environment's action/state sizes and action scale.
ldqn = LSTM_DQN(a_dim=a_dim, s_dim=s_dim, a_bound=a_bound)



In [4]:
# Standard deviation of the Gaussian exploration noise added to the actor's action;
# it decays once learning starts.
var = 2
for episode in range(MAX_EPISODES):
    s = env.reset()
    ep_reward = 0
    for j in range(MAX_EP_STEPS):
        if RENDER:
            env.render()
        a = ldqn.choose_action(s)
        # Explore around the actor's action, clipped to the environment's own
        # action bound rather than a hard-coded +/-2.
        a = np.clip(np.random.normal(a, var), -a_bound, a_bound)
        s_, r, done, info = env.step(a)
        # The reward is scaled down by 10 before it is stored for learning.
        ldqn.store_transition(s, a, r / 10, s_)
        # Start learning (and decaying the noise) only once the replay buffer has been filled.
        if ldqn.pointer > MEMORY_CAPACITY:
            var *= .9995
            ldqn.learn()
        s = s_
        ep_reward += r

    # Episode summary, reported on the unscaled reward.
    print('Episode:', episode, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var)

Episode: 0  Reward: -1152 Explore: 2.00
Episode: 1  Reward: -1230 Explore: 2.00
Episode: 2  Reward: -1012 Explore: 2.00
Episode: 3  Reward: -1638 Explore: 2.00
Episode: 4  Reward: -1068 Explore: 2.00
Episode: 5  Reward: -1544 Explore: 2.00
Episode: 6  Reward: -1240 Explore: 2.00
Episode: 7  Reward: -1549 Explore: 2.00
Episode: 8  Reward: -942 Explore: 2.00
Episode: 9  Reward: -1163 Explore: 2.00
Episode: 10  Reward: -985 Explore: 2.00
Episode: 11  Reward: -1288 Explore: 2.00
Episode: 12  Reward: -1677 Explore: 2.00
Episode: 13  Reward: -1286 Explore: 2.00
Episode: 14  Reward: -973 Explore: 2.00
Episode: 15  Reward: -1762 Explore: 2.00
Episode: 16  Reward: -1330 Explore: 2.00
Episode: 17  Reward: -1117 Explore: 2.00
Episode: 18  Reward: -969 Explore: 2.00
Episode: 19  Reward: -1164 Explore: 2.00
Episode: 20  Reward: -1703 Explore: 2.00
Episode: 21  Reward: -1230 Explore: 2.00
Episode: 22  Reward: -769 Explore: 2.00
Episode: 23  Reward: -1376 Explore: 2.00
Episode: 24  Reward: -944 Explo

KeyboardInterrupt: 