In [3]:
import random
from collections import deque

import gym
import numpy as np
import tensorflow as tf
import tflearn as tl



In [None]:
class ReplayBuffer(object):
    """Fixed-capacity FIFO store of (s, a, r, t, s2) experience tuples.

    Once `buffer_size` experiences are held, every new one evicts the oldest.

    Args:
        buffer_size: maximum number of experiences kept at any time.
        random_seed: optional seed. When given, sampling draws from a private
            `random.Random(random_seed)` so batches are reproducible; when
            omitted, the module-level `random` generator is used.
    """

    def __init__(self, buffer_size, random_seed=None):
        self.buffer_size = buffer_size
        self.count = 0
        self.buffer = deque()
        # The `random` module exposes the same `sample` API as a Random
        # instance, so either can serve as the sampling source.
        self._rng = random if random_seed is None else random.Random(random_seed)

    def add(self, s, a, r, t, s2):
        """Append one experience, dropping the oldest one when full."""
        experience = (s, a, r, t, s2)
        if self.count < self.buffer_size:
            self.buffer.append(experience)
            self.count += 1
        else:
            self.buffer.popleft()
            self.buffer.append(experience)

    def size(self):
        """Return the number of experiences currently stored."""
        return self.count

    def sample_batch(self, batch_size):
        '''
        Draw up to batch_size experiences without replacement.

        If the buffer holds fewer than batch_size experiences, all of them
        are returned (in random order). Generally, you'll want to wait until
        the buffer has at least batch_size elements before sampling from it.

        Returns:
            Five numpy arrays (s, a, r, t, s2), each stacking the matching
            field of the sampled experiences along the first axis.
        '''
        batch = self._rng.sample(self.buffer, min(self.count, batch_size))

        s_batch = np.array([experience[0] for experience in batch])
        a_batch = np.array([experience[1] for experience in batch])
        r_batch = np.array([experience[2] for experience in batch])
        t_batch = np.array([experience[3] for experience in batch])
        s2_batch = np.array([experience[4] for experience in batch])

        return s_batch, a_batch, r_batch, t_batch, s2_batch

    def clear(self):
        """Remove every stored experience and reset the counter."""
        # The container attribute is `buffer`; there is no `self.deque`.
        self.buffer.clear()
        self.count = 0

In [None]:
class ActorPair(object):
    """A main actor network together with its slowly-tracking target copy.

    Both networks map a state batch of shape [None, state_dim] through two
    ReLU layers (400 and 300 units) to a tanh output that is multiplied by
    `action_bound`.

    Args:
        sess: TensorFlow session used to run every op.
        state_dim: width of the state vector.
        action_dim: width of the action vector.
        action_bound: factor applied to the tanh output of the network.
        learning_rate: Adam learning rate for the main network.
        tau: interpolation weight of the soft target update,
            target <- tau * main + (1 - tau) * target.
    """

    def __init__(self, sess, state_dim, action_dim, action_bound, learning_rate, tau):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.action_bound = action_bound
        self.learning_rate = learning_rate
        self.tau = tau

        # Trainable variables already in the graph (e.g. left over from a
        # previous run of this cell) must not be mistaken for ours.
        first_var = len(tf.trainable_variables())

        # Create the main network
        self.state, self.unscaled_action, self.action = self.create_actor_network()
        # Create a reference to the params
        self.network_params = tf.trainable_variables()[first_var:]
        # Create the target network
        self.target_state, self.target_unscaled_action, self.target_action = \
            self.create_actor_network()
        # Reference the target params
        self.target_network_params = \
            tf.trainable_variables()[first_var + len(self.network_params):]

        # dQ/da is provided by critic
        self.action_gradient = tf.placeholder(tf.float32, [None, self.a_dim])
        # dQ/dtheta = dQ/da da/dtheta (negated: the optimizer minimizes)
        self.actor_gradients = tf.gradients(self.action, self.network_params,
                                            -self.action_gradient)
        # Create the optimization op
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).\
            apply_gradients(zip(self.actor_gradients, self.network_params))

        # Create an interpolation op for the target network.
        # `*` is used instead of `tf.mul`, which was removed in TensorFlow 1.0;
        # the operator form works on old and new releases alike.
        self.update_target_network_params = [
            target_var.assign(main_var * self.tau + target_var * (1. - self.tau))
            for main_var, target_var in zip(self.network_params,
                                            self.target_network_params)
        ]

    def get_action(self, state):
        """Return the main network's action for a single state.

        `state` is reshaped to a batch of one; the first (only) row of the
        prediction is returned.
        """
        return self.predict(np.reshape(state, (1, self.s_dim)))[0]

    def get_num_trainable_vars(self):
        """Return how many trainable variables the main and target nets own."""
        return len(self.network_params) + len(self.target_network_params)

    def create_actor_network(self):
        """Build one actor network.

        Returns:
            (state placeholder, tanh output, tanh output * action_bound).
        """
        # state placeholders (these could be sequences for recurrent model)
        state = tl.input_data(shape=[None, self.s_dim])

        # feedforward / recurrent model to action
        l1 = tl.fully_connected(state, 400, activation='relu')
        l2 = tl.fully_connected(l1, 300, activation='relu')
        # Final layer weights are init to Uniform[-3e-3, 3e-3]
        w_init = tl.initializations.uniform(minval=-0.003, maxval=0.003)
        unscaled_action = tl.fully_connected(l2, self.a_dim, activation='tanh',
                                             weights_init=w_init)
        # Scale output to -action_bound to action_bound
        action = unscaled_action * self.action_bound
        return state, unscaled_action, action

    def train(self, state, a_gradient):
        """Apply one optimizer step using the critic-supplied dQ/da."""
        self.sess.run(self.optimize,
                      feed_dict={self.state: state,
                                 self.action_gradient: a_gradient})

    def predict(self, state):
        """Return the main network's actions for a batch of states."""
        return self.sess.run(self.action,
                             feed_dict={self.state: state})

    def predict_target(self, state):
        """Return the target network's actions for a batch of states."""
        return self.sess.run(self.target_action,
                             feed_dict={self.target_state: state})

    def update_target_network(self):
        """Move the target parameters a fraction `tau` toward the main ones."""
        self.sess.run(self.update_target_network_params)

In [None]:
class CriticPair(object):
    """A main critic (Q-value) network together with its target copy.

    Each network takes a state batch [None, state_dim] and an action batch
    [None, action_dim] and emits one value per row.

    Args:
        sess: TensorFlow session used to run every op.
        state_dim: width of the state vector.
        action_dim: width of the action vector.
        learning_rate: Adam learning rate for the main network.
        tau: interpolation weight of the soft target update,
            target <- tau * main + (1 - tau) * target.
        num_actor_vars: number of trainable variables created by the actor.
            Accepted for backward compatibility; the critic locates its own
            variables by counting what exists before it builds anything, so
            the value is not needed.
    """

    def __init__(self, sess, state_dim, action_dim, learning_rate, tau, num_actor_vars):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.learning_rate = learning_rate
        self.tau = tau

        # Everything already in the graph (the actor pair, typically) belongs
        # to someone else. Taking all trainable variables here would pair
        # critic target variables with actor variables in the update op below.
        first_var = len(tf.trainable_variables())

        # Create the main network
        self.state, self.action, self.value = self.create_critic_network()
        # Create a reference to the params
        self.network_params = tf.trainable_variables()[first_var:]
        # Create the target network
        self.target_state, self.target_action, self.target_value = \
            self.create_critic_network()
        # Reference the target params
        self.target_network_params = \
            tf.trainable_variables()[first_var + len(self.network_params):]

        # Obtained from the target networks
        self.hindsight_q_value = tf.placeholder(tf.float32, [None, 1])
        # Define loss and optimization Op
        self.loss = tl.mean_square(self.hindsight_q_value, self.value)
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
        # Get the gradient of the net w.r.t. the action
        self.action_grads = tf.gradients(self.value, self.action)

        # Create an interpolation op for the target.
        # `*` is used instead of `tf.mul`, which was removed in TensorFlow 1.0;
        # the operator form works on old and new releases alike.
        self.update_target_network_params = [
            target_var.assign(main_var * self.tau + target_var * (1. - self.tau))
            for main_var, target_var in zip(self.network_params,
                                            self.target_network_params)
        ]

    def get_value(self, state, action):
        """Return the main network's value for the given state/action batch."""
        return self.predict(state, action)

    def create_critic_network(self):
        """Build one critic network.

        Returns:
            (state placeholder, action placeholder, value output).
        """
        # state placeholders (these could be sequences for recurrent model)
        state = tl.input_data(shape=[None, self.s_dim])
        # action placeholder
        action = tl.input_data(shape=[None, self.a_dim])

        # feedforward / recurrent model to value
        l1 = tl.fully_connected(state, 400, activation='relu')
        # Add the action tensor in the 2nd hidden layer
        # Use two temp layers to get the corresponding weights and biases
        t1 = tl.fully_connected(l1, 300)
        t2 = tl.fully_connected(action, 300)
        l2 = tl.activation(tf.matmul(l1, t1.W) + tf.matmul(action, t2.W) + t2.b,
                           activation='relu')
        # linear layer connected to 1 output representing Q(s,a)
        # Weights are init to Uniform[-3e-3, 3e-3]
        w_init = tl.initializations.uniform(minval=-0.003, maxval=0.003)
        value = tl.fully_connected(l2, 1, weights_init=w_init)
        return state, action, value

    def train(self, state, action, hindsight_q_value):
        """Run one optimizer step toward `hindsight_q_value`.

        Returns:
            The list produced by the session for [value, optimize op]; the
            first element is the main network's value for the fed batch.
        """
        return self.sess.run([self.value, self.optimize],
                             feed_dict={self.state: state,
                                        self.action: action,
                                        self.hindsight_q_value: hindsight_q_value})

    def predict(self, state, action):
        """Return the main network's values for a state/action batch."""
        return self.sess.run(self.value,
                             feed_dict={self.state: state,
                                        self.action: action})

    def predict_target(self, state, action):
        """Return the target network's values for a state/action batch."""
        return self.sess.run(self.target_value,
                             feed_dict={self.target_state: state,
                                        self.target_action: action})

    def action_gradients(self, state, actions):
        """Return the result of tf.gradients(value, action) for the batch."""
        return self.sess.run(self.action_grads,
                             feed_dict={self.state: state,
                                        self.action: actions})

    def update_target_network(self):
        """Move the target parameters a fraction `tau` toward the main ones."""
        self.sess.run(self.update_target_network_params)
        

In [None]:

# --------------------------------------------------------------------------
# Training hyperparameters
# --------------------------------------------------------------------------
MAX_EPISODES = 50000           # number of training episodes
MAX_EP_STEPS = 1000            # upper limit on steps within one episode
ACTOR_LEARNING_RATE = 1e-4     # base learning rate of the actor network
CRITIC_LEARNING_RATE = 1e-3    # base learning rate of the critic network
GAMMA = 0.99                   # discount factor
TAU = 1e-3                     # soft target-update coefficient

# --------------------------------------------------------------------------
# Environment, output and replay settings
# --------------------------------------------------------------------------
RENDER_ENV = True                    # render the gym env while training
GYM_MONITOR_EN = True                # use the gym monitor
ENV_NAME = 'Pendulum-v0'             # gym environment id
MONITOR_DIR = './results/gym_ddpg'   # where gym results are stored
SUMMARY_DIR = './results/tf_ddpg'    # where tensorboard summaries are stored
RANDOM_SEED = 1337
BUFFER_SIZE = 10000                  # capacity of the replay buffer
MINIBATCH_SIZE = 64

In [None]:
def train(sess, env, actor, critic):
    """Run the actor-critic training loop.

    For each of MAX_EPISODES episodes (at most MAX_EP_STEPS steps each) the
    actor picks an action, the transition is stored in a replay buffer, and,
    once the buffer holds more than MINIBATCH_SIZE transitions, a sampled
    minibatch is used to update the critic, the actor and both targets.

    Args:
        sess: TensorFlow session the actor and critic were built with.
        env: gym-style environment exposing reset(), step() and render().
        actor: ActorPair instance.
        critic: CriticPair instance.
    """
    # Initialize our Tensorflow variables
    sess.run(tf.initialize_all_variables())

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory. The buffer samples with the `random` module,
    # so seeding it here makes minibatch selection reproducible.
    random.seed(RANDOM_SEED)
    replay_buffer = ReplayBuffer(BUFFER_SIZE)

    for i in range(MAX_EPISODES):
        s = env.reset()
        for j in range(MAX_EP_STEPS):

            if RENDER_ENV:
                env.render()

            # Action from the actor plus an offset that shrinks as the
            # episode and step indices grow.
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + (1. / (1. + i + j))
            # get new state and reward
            s2, r, is_done, info = env.step(a[0])

            # add step to replay buffer
            replay_buffer.add(np.reshape(s, (actor.s_dim,)),
                              np.reshape(a, (actor.a_dim,)), r,
                              is_done, np.reshape(s2, (actor.s_dim,)))

            # keep adding steps until there are enough to do a training update
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, is_done_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # calculate targets
                target_q = critic.predict_target(s2_batch,
                                                 actor.predict_target(s2_batch))
                # The critic emits one column per row. Flatten it (and the
                # rewards) so the arithmetic below stays element-wise instead
                # of broadcasting (N,) against (N, 1) into an (N, N) matrix.
                target_q = np.reshape(target_q, (-1,))
                rewards = np.reshape(r_batch, (-1,))
                not_done = 1 - is_done_batch.astype(float)

                # if the game has ended target_q is not added
                hindsight_q_vec = rewards + not_done * GAMMA * target_q

                # critic training
                critic.train(s_batch, a_batch,
                             np.reshape(hindsight_q_vec, (-1, 1)))

                # actor training
                a_values = actor.predict(s_batch)
                # could repeat more than once, or even less than once
                dQda_list = critic.action_gradients(s_batch, a_values)
                actor.train(s_batch, dQda_list[0])

                # updates targets
                actor.update_target_network()
                critic.update_target_network()

            # Advance to the new state; without this the actor would keep
            # seeing the state returned by env.reset().
            s = s2

            if is_done:
                break
            
            
            
            
            

In [None]:
# defining environment

# NOTE(review): the session is closed when this `with` block exits, so any
# use of `actor` / `critic` (e.g. a call to train) has to happen inside it.
with tf.Session() as sess:

    env = gym.make(ENV_NAME)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    # make sure action bound is symmetric (can change in future,
    # but need to remember to scale actor output appropriately).
    # np.all keeps the check valid when the bounds have more than one entry.
    assert np.all(env.action_space.high == -env.action_space.low)

    action_bound = env.action_space.high

    # start up actor and critic pair

    actor = ActorPair(sess, state_dim, action_dim, action_bound,
                      ACTOR_LEARNING_RATE, TAU)

    # Main plus target variables owned by the actor.
    num_actor_vars = len(actor.network_params) + len(actor.target_network_params)

    critic = CriticPair(sess, state_dim, action_dim,
                        CRITIC_LEARNING_RATE, TAU, num_actor_vars)

In [None]:
# Number of episodes run by the `for i in range(N_EPISODES)` rollout loop.
N_EPISODES = 10

In [None]:
# initialise the env

# initialise the actor and critic


In [None]:
# NOTE(review): `env` and `actor` come from the setup cell, whose
# `with tf.Session()` block has already exited by the time this runs -
# confirm the session is still usable here.
for i in range(N_EPISODES):
    # Keep the current observation under one name so the actor is always
    # queried with the latest state.
    state = env.reset()
    for t in range(100):
        env.render()
        action = actor.get_action(state)

        state, r, done, info = env.step(action)

        # store in replay memory

        # use target network to calculate hindsight val

        # Change critic learning steps per env step by: modulo on t OR for loop

            # call critic update method

        # Change actor learning steps per env step

            # call actor update method

        if done:
            print("Episode finished after {} timesteps".format(t+1))
            break