# Actor Critic

In [1]:
# Set relative path to parent directory
# Prepending the absolute path of '..' to sys.path lets the following cells
# import packages that live one directory above this notebook (for example
# the `environments` package imported in the next cell).
import sys, os
sys.path.insert(0, os.path.abspath('..'))

In [2]:
# Import environment
from environments.pendulum import pendulum

# Build the project's pendulum environment with unit mass, unit length and
# gravity 9.81.
# NOTE(review): the training cells further down rebind `env` to gym's
# 'Pendulum-v0', so this instance is not the one used for training there.
env = pendulum(mass=1, length=1, gravity=9.81)
# presumably the simulation time step of the environment -- confirm against
# environments/pendulum.py
env.step_size = 0.05

In [3]:
import tensorflow.contrib.slim as slim
import tensorflow as tf
import numpy as np
import gym

## Actor class
The following code sets up the neural network for the actor. The actor has two methods: `choose_action` and `learn`.

### The Network
The Actor network consists of some hidden layers which learn an abstraction of the input state. This then feeds into two layers which learn the standard deviation $\sigma$ and the mean $\mu$ of the normal distribution (see the image below), which decides which action to take. 
<img src="https://upload.wikimedia.org/wikipedia/commons/7/74/Normal_Distribution_PDF.svg" alt="Drawing" style="width: 300px;"/>

### Choosing an action
The action to take is chosen by feeding a state into the network, and predicting the resulting standard deviation $\sigma$ and mean $\mu$. A random action based on the distribution is then chosen.

### Learning
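The learning algorithm works by increasing the log-probability of the action that was taken, weighted by the temporal difference error supplied by the critic, plus a small entropy bonus that encourages exploration.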

In [4]:
class Actor(object):
    """Gaussian policy network for a one-dimensional continuous action.

    The network maps a state to the mean and standard deviation of a normal
    distribution. Actions are sampled from that distribution and clipped to
    ``action_bound``. Training maximises ``log_prob(action) * td_error`` plus
    a small entropy bonus.
    """

    def __init__(self, sess, n_features, action_bound, hidden_layer_shape=None, lr=0.0001):
        """Build the actor graph in the current default TensorFlow graph.

        Args:
            sess: ``tf.Session`` used to run the graph.
            n_features: size of the state vector.
            action_bound: pair ``[low, high]`` used to clip sampled actions.
            hidden_layer_shape: sizes of the hidden layers; ``[32]`` when
                omitted.
            lr: learning rate of the Adam optimizer.
        """
        # A None sentinel avoids sharing one mutable default list between instances.
        if hidden_layer_shape is None:
            hidden_layer_shape = [32]
        self.sess = sess

        # Placeholders for action and td_error for learning, and state for learning and action selection
        self.state = tf.placeholder(tf.float32, [None, n_features], "state")
        self.action_holder = tf.placeholder(tf.float32, None, name="action")
        self.td_error = tf.placeholder(tf.float32, None, name="td_error")  # TD_error

        # Actor hidden layers
        hidden = slim.stack(inputs = self.state,
                            layer = slim.fully_connected,
                            stack_args = list(hidden_layer_shape),
                            activation_fn = tf.nn.relu,
                            weights_initializer = tf.random_normal_initializer(0., .1),  # weights
                            biases_initializer = tf.constant_initializer(0.1),  # biases
                            scope='hidden')

        # Predicted mean (tanh keeps the raw output in [-1, 1])
        mu = tf.layers.dense(
            inputs=hidden,
            units=1,
            activation=tf.nn.tanh,
            kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
            bias_initializer=tf.constant_initializer(0.1),  # biases
            name='mu'
        )

        # Predicted standard deviation (softplus keeps the raw output positive)
        sigma = tf.layers.dense(
            inputs=hidden,
            units=1,
            activation=tf.nn.softplus,
            kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
            bias_initializer=tf.constant_initializer(1.),  # biases
            name='sigma'
        )
        # Counter incremented by the optimizer on every training step
        global_step = tf.Variable(0, trainable=False)
        # The mean is scaled to [-2, 2] and the standard deviation is kept
        # at or above 0.1.
        # NOTE(review): the factor 2 is hard-coded rather than derived from
        # action_bound -- presumably chosen for the pendulum task; confirm.
        self.mu, self.sigma = mu*2, sigma + 0.1
        self.normal_dist = tf.distributions.Normal(self.mu, self.sigma)

        # Draw one sample per state and clip it to the allowed action range
        self.action = tf.clip_by_value(self.normal_dist.sample(1), action_bound[0], action_bound[1])[0]

        with tf.name_scope('exp_v'):
            log_prob = self.normal_dist.log_prob(self.action_holder)  # loss without advantage
            self.exp_v = log_prob * self.td_error  # advantage (TD_error) guided loss
            # Add an entropy bonus to encourage exploration
            self.exp_v += 0.01*self.normal_dist.entropy()

        with tf.name_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v, global_step)    # min(v) = max(-v)

    def learn(self, state, action, td_error, batch = False):
        """Run one policy-gradient step and return the fetched ``exp_v``.

        Args:
            state: a single state vector, or a batch of states when
                ``batch`` is true.
            action: the action(s) that were taken in ``state``.
            td_error: TD error(s) used to weight the log-probability.
            batch: when false, a leading batch axis is added to ``state``.
        """
        if not batch:
            state = state[np.newaxis, :]
        feed_dict = {self.state: state, self.action_holder: action, self.td_error: td_error}
        _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict)
        return exp_v

    def choose_action(self, state, batch = False):
        """Sample a clipped action from the policy for ``state``.

        Args:
            state: a single state vector, or a batch of states when
                ``batch`` is true.
            batch: when false, a leading batch axis is added to ``state``.

        Returns:
            The sampled action(s), one row per input state.
        """
        if not batch:
            state = state[np.newaxis, :]
        return self.sess.run(self.action, {self.state: state})  # sampled, clipped action
    
    

## Critic class
The critic is responsible for estimating the value function of the policy the actor is following. It does so by using a neural network to parameterize the value function $V(s)$. To find the best parameterization of the value function, the critic learns from the temporal difference error $r + \gamma V(s') - V(s)$, with the squared temporal difference error as the loss function.

In [5]:
class Critic(object):
    """State-value network V(s) trained on the squared TD error."""

    def __init__(self, sess, n_features, hidden_layer_shape=None, lr=0.01, gamma=None):
        """Build the critic graph in the current default TensorFlow graph.

        Args:
            sess: ``tf.Session`` used to run the graph.
            n_features: size of the state vector.
            hidden_layer_shape: sizes of the hidden layers; ``[32, 64]`` when
                omitted.
            lr: learning rate of the Adam optimizer.
            gamma: discount factor. When omitted, the module-level ``GAMMA``
                is read at construction time, as the original code did.
        """
        # A None sentinel avoids sharing one mutable default list between instances.
        if hidden_layer_shape is None:
            hidden_layer_shape = [32, 64]
        if gamma is None:
            gamma = GAMMA
        self.sess = sess
        with tf.name_scope('inputs'):
            self.state = tf.placeholder(tf.float32, [None, n_features], "state")
            self.v_next = tf.placeholder(tf.float32, [None, 1], name="v_next")
            self.reward = tf.placeholder(tf.float32, None, name='reward')

        with tf.variable_scope('Critic'):
            hidden = slim.stack(inputs = self.state,
                                layer = slim.fully_connected,
                                stack_args = list(hidden_layer_shape),
                                activation_fn = tf.nn.relu,
                                weights_initializer = tf.random_normal_initializer(0., .1),  # weights
                                biases_initializer = tf.constant_initializer(0.1),  # biases
                                scope='hidden'
            )

            self.v = tf.layers.dense(
                inputs=hidden,
                units=1,
                activation=None,
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='value_function'
            )

        with tf.variable_scope('squared_TD_error'):
            # TD_error = (r + gamma * V_next) - V_eval, one value per sample
            self.td_error = self.reward + gamma * self.v_next - self.v
            # Mean of the squared errors. Squaring the mean instead would let
            # positive and negative errors within a batch cancel each other.
            self.loss = tf.reduce_mean(tf.square(self.td_error))
        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)

    def learn(self, state, reward, state_next, batch = False):
        """Run one value-function update and return the TD error(s).

        Args:
            state: a single state vector, or a batch of states when
                ``batch`` is true.
            reward: reward(s) received for the transition(s).
            state_next: the state(s) reached after the transition(s).
            batch: when false, a leading batch axis is added to ``state``
                and ``state_next``.
        """
        if not batch:
            state = state[np.newaxis, :]
            state_next = state_next[np.newaxis, :]

        # Get value of next state; it is fed back in through a placeholder,
        # so no gradient flows through the bootstrap target.
        v_next = self.sess.run(self.v, {self.state: state_next})

        # Gradient descent using td_error
        train_dict = {self.state: state, self.v_next: v_next, self.reward: reward}
        td_error, _ = self.sess.run([self.td_error, self.train_op], train_dict)
        return td_error

## Experience replay buffer
The idea behind the experience replay buffer is that by storing an agent’s experiences, and then randomly drawing batches of them to train the network, we can more robustly learn to perform well in the task. By keeping the experiences we draw random, we prevent the network from only learning about what it is immediately doing in the environment, and allow it to learn from a more varied array of past experiences. The experience is stored as a tuple $[s, a, r, s']$ where $s$ is the state we are in, $a$ is the action we take, $r$ is the reward we get, and $s'$ is the state we end up in. 

In [6]:
import random
class experience_buffer():
    """Bounded FIFO store of transitions with uniform random sampling.

    ``buffer`` is a plain list of rows. The oldest rows are discarded once
    more than ``buffer_size`` rows would be held.
    """

    def __init__(self, buffer_size = 1000000):
        """Create an empty buffer that keeps at most ``buffer_size`` rows."""
        self.buffer = []
        self.buffer_size = buffer_size

    def add(self, experience):
        """Append the rows of ``experience``, evicting the oldest rows first.

        Args:
            experience: a sized iterable of rows (for example an array of
                shape ``[k, 5]``); each row becomes one buffer entry.
        """
        overflow = len(self.buffer) + len(experience) - self.buffer_size
        if overflow > 0:
            # Make room by dropping the oldest entries.
            del self.buffer[:overflow]
        self.buffer.extend(experience)
        # A single batch larger than the capacity would otherwise leave the
        # buffer over-full; keep only the newest `buffer_size` rows.
        excess = len(self.buffer) - self.buffer_size
        if excess > 0:
            del self.buffer[:excess]

    def sample(self, size):
        """Return ``size`` distinct rows drawn at random as a ``[size, 5]`` array.

        Raises:
            ValueError: from ``random.sample`` when ``size`` exceeds the number
                of stored rows.
        """
        return np.reshape(np.array(random.sample(self.buffer, size)), [size, 5])

In [None]:
# Reset graph first: the TensorFlow seed is stored on the default graph, so
# seeding before the reset would put it on a graph that is then discarded.
tf.reset_default_graph()

np.random.seed(2)
tf.set_random_seed(2)

# Training parameters
MAX_EPISODE = 10000
MAX_EP_STEPS = 200
DISPLAY_REWARD_THRESHOLD = -40  # start rendering once the running reward exceeds this
RENDER = False
BATCH_SIZE = 32
GAMMA = 0.9
LR_A = 0.001    # learning rate for actor
LR_C = 0.01     # learning rate for critic

experience = experience_buffer()

env = gym.make('Pendulum-v0')
env = env.unwrapped

N_S = env.observation_space.shape[0]
A_BOUND = env.action_space.high

sess = tf.Session()

# Set up actor and critic
actor = Actor(sess, n_features=N_S, lr=LR_A, action_bound=[-A_BOUND, A_BOUND])
critic = Critic(sess, n_features=N_S, lr=LR_C)
sess.run(tf.global_variables_initializer())

# Exponential moving average of the episode return. It is initialised here
# rather than looked up in globals(), so a re-run of this cell (or a value
# left behind by another cell) cannot leak into this run.
running_reward = None

for i_episode in range(MAX_EPISODE):
    s = env.reset()
    t = 1
    ep_rs = []
    while True:
        if RENDER:
            env.render()
        # Choose action (Actor)
        a = actor.choose_action(s)[0]

        # Perform action and observe reward and next state
        s_, r, done, info = env.step(a)
        r /= 10

        # Save experience as one row [s, a, r, s_, done]. The fields have
        # different shapes, so the row must be an object array; newer NumPy
        # versions refuse to build a ragged array without dtype=object.
        experience.add(np.reshape(np.array([s, a, np.array(r), s_, np.array(done)], dtype=object), [1, 5]))

        # Train once every BATCH_SIZE environment steps
        if t % BATCH_SIZE == 0:
            batch = experience.sample(BATCH_SIZE)
            b_s = np.vstack(batch[:, 0])
            b_a = np.vstack(batch[:, 1])
            b_r = np.vstack(batch[:, 2])
            b_s_ = np.vstack(batch[:, 3])

            td_error = critic.learn(b_s, b_r, b_s_, batch=True)  # gradient = grad[r + gamma * V(s_) - V(s)]
            actor.learn(b_s, b_a, td_error, batch=True)  # true_gradient = grad[logPi(s,a) * td_error]

            # Drop the stored transitions so the next update only draws from
            # transitions collected after this one.
            experience.buffer = []

        s = s_
        t += 1
        ep_rs.append(r)
        # Episodes are cut off by step count; `done` is not consulted.
        if t > MAX_EP_STEPS:
            ep_rs_sum = sum(ep_rs)
            if running_reward is None:
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.9 + ep_rs_sum * 0.1
            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
            print("episode:", i_episode, "  reward:", int(running_reward))
            break

In [None]:
tf.reset_default_graph()

OUTPUT_GRAPH = True
MAX_EPISODE = 10000
MAX_EP_STEPS = 200
DISPLAY_REWARD_THRESHOLD = -60  # renders environment if total episode reward is greater than this threshold
RENDER = False  # rendering wastes time
GAMMA = 0.9
LR_A = 0.001    # learning rate for actor
LR_C = 0.01     # learning rate for critic

env = gym.make('Pendulum-v0')
env.seed(1)  # reproducible
env = env.unwrapped

N_S = env.observation_space.shape[0]
A_BOUND = env.action_space.high

sess = tf.Session()

actor = Actor(sess, n_features=N_S, lr=LR_A, action_bound=[-A_BOUND, A_BOUND])
critic = Critic(sess, n_features=N_S, lr=LR_C)

sess.run(tf.global_variables_initializer())

if OUTPUT_GRAPH:
    # Keep a reference and flush so the graph definition is written to
    # "logs/" right away instead of waiting for the writer's periodic flush.
    graph_writer = tf.summary.FileWriter("logs/", sess.graph)
    graph_writer.flush()

# Exponential moving average of the episode return. It is initialised here
# rather than looked up in globals(), so a value left behind by an earlier
# cell or an earlier run of this cell cannot leak into this run.
running_reward = None

for i_episode in range(MAX_EPISODE):
    s = env.reset()
    t = 0
    ep_rs = []
    while True:
        if RENDER:
            env.render()
        # Sample an action from the current policy
        a = actor.choose_action(s)[0]

        s_, r, done, info = env.step(a)
        r /= 10

        # Online update: critic first, then the actor with the critic's TD error
        td_error = critic.learn(s, r, s_)  # gradient = grad[r + gamma * V(s_) - V(s)]
        actor.learn(s, a, td_error)  # true_gradient = grad[logPi(s,a) * td_error]

        s = s_
        t += 1
        ep_rs.append(r)
        # Episodes are cut off by step count; `done` is not consulted.
        if t > MAX_EP_STEPS:
            ep_rs_sum = sum(ep_rs)
            if running_reward is None:
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.9 + ep_rs_sum * 0.1
            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
            print("episode:", i_episode, "  reward:", int(running_reward))
            break