In [3]:
import tensorflow as tf
import tflearn as tl
from collections import deque
import random
import numpy as np


In [None]:
class ReplayBuffer(object):
    """Bounded FIFO store of experience tuples with uniform random sampling.

    Once `buffer_size` experiences are held, adding a new one evicts the
    oldest one.
    """

    def __init__(self, buffer_size):
        """Create an empty buffer that holds at most `buffer_size` experiences."""
        self.buffer_size = buffer_size
        self.count = 0
        self.buffer = deque()

    def add(self, s, a, r, t, s2):
        """Append one experience, evicting the oldest when the buffer is full.

        The five values are stored together as the tuple (s, a, r, t, s2).
        NOTE(review): presumably state, action, reward, terminal flag and
        next state -- confirm against the caller.
        """
        experience = (s, a, r, t, s2)
        if self.count < self.buffer_size:
            self.count += 1
        else:
            # Full: drop the oldest entry so the size stays at buffer_size.
            self.buffer.popleft()
        self.buffer.append(experience)

    def size(self):
        """Return the number of experiences currently stored."""
        return self.count

    def sample_batch(self, batch_size):
        '''
        batch_size specifies the number of experiences to add
        to the batch. If the replay buffer has less than batch_size
        elements, simply return all of the elements within the buffer.
        Generally, you'll want to wait until the buffer has at least
        batch_size elements before beginning to sample from it.

        Returns five numpy arrays (s, a, r, t, s2), one per field of the
        stored tuples, all in the same sampled order.
        '''
        # Sampling is without replacement, so never ask for more than we hold.
        batch = random.sample(self.buffer, min(batch_size, self.count))

        s_batch = np.array([exp[0] for exp in batch])
        a_batch = np.array([exp[1] for exp in batch])
        r_batch = np.array([exp[2] for exp in batch])
        t_batch = np.array([exp[3] for exp in batch])
        s2_batch = np.array([exp[4] for exp in batch])

        return s_batch, a_batch, r_batch, t_batch, s2_batch

    def clear(self):
        """Remove every stored experience and reset the element count."""
        # The storage attribute is `buffer`; the previous `self.deque`
        # did not exist and raised AttributeError.
        self.buffer.clear()
        self.count = 0

In [None]:
class ActorPair(object):
    """Actor (policy) network plus a target copy of the same architecture.

    Building an instance adds to the default TensorFlow graph: two actor
    networks, a placeholder for dQ/da (supplied by the critic), an Adam
    update op for the main network, and assign ops that move each target
    variable towards its main counterpart by a factor `tau`.
    """

    def __init__(self, sess=None, s_dim=None, a_dim=None, action_bound=1.0,
                 learning_rate=1e-4, tau=1e-3):
        """Build the main and target actor graphs.

        Args:
            sess: session used by `get_action` to evaluate the policy.
            s_dim: size of the state vector (required).
            a_dim: size of the action vector (required).
            action_bound: factor applied to the tanh output, so actions lie
                in [-action_bound, action_bound].
            learning_rate: Adam learning rate for the main network.
            tau: interpolation factor of the target update.
            NOTE(review): the learning_rate/tau defaults are the values
            commonly quoted for DDPG -- tune for your task.

        Raises:
            ValueError: if `s_dim` or `a_dim` is not given.
        """
        if s_dim is None or a_dim is None:
            raise ValueError("ActorPair requires s_dim and a_dim")
        self.sess = sess
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.action_bound = action_bound
        self.learning_rate = learning_rate
        self.tau = tau
        self.weights = []

        # Variables created before this actor must not be counted as ours,
        # so remember how many already exist and slice from there.
        n_existing = len(tf.trainable_variables())

        # Create the main network
        self.state, self.unscaled_action, self.action = self.create_actor_network()
        # Create a reference to the params
        self.network_params = tf.trainable_variables()[n_existing:]
        # Create the target network
        self.target_state, self.target_unscaled_action, self.target_action = \
            self.create_actor_network()
        # Reference the target params
        self.target_network_params = \
            tf.trainable_variables()[n_existing + len(self.network_params):]

        # Tensor evaluated by get_action.
        self.action_var = self.action

        # dQ/da is provided by critic
        self.action_gradient = tf.placeholder(tf.float32, [None, self.a_dim])
        # dQ/dtheta = dQ/da da/dtheta; negated because the optimizer
        # minimises while the actor should ascend Q.
        self.actor_gradients = tf.gradients(
            self.action, self.network_params, -self.action_gradient)
        # Create the optimization op
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).\
            apply_gradients(list(zip(self.actor_gradients, self.network_params)))

        # Create an interpolation op for the target network.
        # Plain `*` is used instead of tf.mul, which newer TensorFlow removed.
        self.update_target_network_params = [
            target.assign(main * self.tau + target * (1. - self.tau))
            for main, target in zip(self.network_params, self.target_network_params)]

    def get_action(self, state):
        """Evaluate the main actor on `state` using `self.sess`.

        Args:
            state: either one state of shape (s_dim,) or a batch of shape
                (batch, s_dim). A 1-D input is always treated as one state.

        Returns:
            The scaled action(s): shape (a_dim,) for a single state,
            otherwise whatever the session returns for the batch.
        """
        state = np.asarray(state)
        single = state.ndim == 1
        # The state placeholder is batch-major, so wrap a lone state.
        batch = state[np.newaxis, :] if single else state
        actions = self.sess.run(self.action_var, feed_dict={self.state: batch})
        return actions[0] if single else actions

    def create_actor_network(self):
        """Build one actor network.

        Returns:
            (state placeholder, tanh output, output scaled by action_bound).
        """
        # state placeholders (these could be sequences for recurrent model)
        state = tl.input_data(shape=[None, self.s_dim])

        # feedforward / recurrent model to action
        l1 = tl.fully_connected(state, 400, activation='relu')
        l2 = tl.fully_connected(l1, 300, activation='relu')
        # Final layer weights are init to Uniform[-3e-3, 3e-3]
        w_init = tl.initializations.uniform(minval=-0.003, maxval=0.003)
        unscaled_action = tl.fully_connected(
            l2, self.a_dim, activation='tanh', weights_init=w_init)
        # Scale output to -action_bound to action_bound
        action = unscaled_action * self.action_bound
        return state, unscaled_action, action
    
    
        
class CriticPair(object):
    """Critic (Q-value) network plus a target copy of the same architecture.

    Building an instance adds to the default TensorFlow graph: two critic
    networks, a placeholder for the regression target, a mean-square loss
    with an Adam minimise op, the gradient of the value with respect to the
    action input, and assign ops that move each target variable towards its
    main counterpart by a factor `tau`.
    """

    def __init__(self, sess=None, s_dim=None, a_dim=None,
                 learning_rate=1e-3, tau=1e-3):
        """Build the main and target critic graphs.

        Args:
            sess: session used by `get_value` to evaluate the critic.
            s_dim: size of the state vector (required).
            a_dim: size of the action vector (required).
            learning_rate: Adam learning rate for the main network.
            tau: interpolation factor of the target update.
            NOTE(review): the learning_rate/tau defaults are the values
            commonly quoted for DDPG -- tune for your task.

        Raises:
            ValueError: if `s_dim` or `a_dim` is not given.
        """
        if s_dim is None or a_dim is None:
            raise ValueError("CriticPair requires s_dim and a_dim")
        self.sess = sess
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.learning_rate = learning_rate
        self.tau = tau
        self.weights = []

        # Variables created earlier (e.g. by an actor built first) are not
        # ours; without this offset the main/target pairs below would be
        # misaligned.
        n_existing = len(tf.trainable_variables())

        # Create the main network
        self.state, self.action, self.value = self.create_critic_network()
        # Create a reference to the params
        self.network_params = tf.trainable_variables()[n_existing:]
        # Create the target network
        self.target_state, self.target_action, self.target_value = \
            self.create_critic_network()
        # Reference the target params
        self.target_network_params = \
            tf.trainable_variables()[n_existing + len(self.network_params):]

        # Obtained from the target networks
        self.hindsight_q_value = tf.placeholder(tf.float32, [None, 1])
        # Define loss and optimization Op
        self.loss = tl.mean_square(self.hindsight_q_value, self.value)
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
        # Get the gradient of the net w.r.t. the action
        self.action_grads = tf.gradients(self.value, self.action)

        # Create an interpolation op for the target.
        # Plain `*` is used instead of tf.mul, which newer TensorFlow removed.
        self.update_target_network_params = [
            target.assign(main * self.tau + target * (1. - self.tau))
            for main, target in zip(self.network_params, self.target_network_params)]

    def get_value(self, state, action):
        """Evaluate the main critic for a batch of state/action pairs.

        Args:
            state: batch fed to the state placeholder, shape (batch, s_dim).
            action: batch fed to the action placeholder, shape (batch, a_dim).

        Returns:
            Whatever `self.sess.run` returns for the value tensor.
        """
        return self.sess.run(
            self.value, feed_dict={self.state: state, self.action: action})

    def create_critic_network(self):
        """Build one critic network.

        Returns:
            (state placeholder, action placeholder, value output).
        """
        # state placeholders (these could be sequences for recurrent model)
        state = tl.input_data(shape=[None, self.s_dim])
        # action placeholder
        action = tl.input_data(shape=[None, self.a_dim])

        # feedforward / recurrent model to value
        l1 = tl.fully_connected(state, 400, activation='relu')
        # Add the action tensor in the 2nd hidden layer
        # Use two temp layers to get the corresponding weights and biases
        t1 = tl.fully_connected(l1, 300)
        t2 = tl.fully_connected(action, 300)
        l2 = tl.activation(
            tf.matmul(l1, t1.W) + tf.matmul(action, t2.W) + t2.b, activation='relu')
        # linear layer connected to 1 output representing Q(s,a)
        # Weights are init to Uniform[-3e-3, 3e-3]
        w_init = tl.initializations.uniform(minval=-0.003, maxval=0.003)
        value = tl.fully_connected(l2, 1, weights_init=w_init)
        return state, action, value
    
    
        

In [None]:
# Number of episodes the training loop runs (one env.reset() per episode).
N_EPISODES = 10

In [None]:
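# TODO: initialise the env (the training loop below expects it as `env`)

# TODO: initialise the actor and critic (the training loop below expects the actor as `actor`)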


In [None]:
# Training loop: run N_EPISODES episodes of at most 100 steps each, querying
# the actor for an action at every step. Expects `env` and `actor` to have
# been created in an earlier cell.
for i in range(N_EPISODES):
    # Keep the current observation under one name, `state`; the loop used to
    # store it as `s0` and then query the actor with an undefined `state`.
    state = env.reset()
    for t in range(100):
        env.render()
        action = actor.get_action(state)

        next_state, r, done, info = env.step(action)

        # TODO: store in replay memory

        # TODO: use target network to calculate hindsight val

        # TODO: change critic learning steps per env step by: modulo on t OR for loop

            # call critic update method

        # TODO: change actor learning steps per env step

            # call actor update method

        # Carry the new observation forward so the next action is chosen
        # from where the environment actually is.
        state = next_state

        if done:
            print("Episode finished after {} timesteps".format(t+1))
            break