In [3]:
%matplotlib inline

import gym
import numpy as np
from matplotlib import pyplot as plt

import gym
from gym.wrappers import Monitor
import itertools
import numpy as np
import os
import random
import sys
import psutil
import tensorflow as tf

if "../" not in sys.path:
  sys.path.append("../")

from lib import plotting
from collections import deque, namedtuple

In [4]:
env = gym.make('Pendulum-v0')

[2017-08-22 21:04:27,382] Making new env: Pendulum-v0


In [5]:
env.observation_space.shape[0]

3

In [36]:
class ReplayMemory():
    
    def __init__(self, init_size, max_size, batch_size):
        self.replay_memory = []
        self.batch_size = batch_size
        
    def append(self, transition):
        self.replay_memory.append(transition)
        
    def pop(self):
        return self.replay_memory.pop(0)
    
    def sample(self):
        return random.sample(self.replay_memory, self.batch_size)
    
    def get_size(self):
        return len(self.replay_memory)

In [37]:
class UpdateTargetNetwork():
    
    def __init__(self, tau, estimator, target_estimator, scope):
        self.scope = scope
        self.tau = tau
        with tf.variable_scope(scope):
            e1_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator.scope)]
            e1_params = sorted(e1_params, key=lambda v: v.name)

            e2_params = [t for t in tf.trainable_variables() if t.name.startswith(target_estimator.scope)]
            e2_params = sorted(e2_params, key=lambda v: v.name)
        
            self.update_ops = []
            for e1_v, e2_v in zip(e1_params, e2_params):
                op = e2_v.assign((1-tau)*e2_v + tau*e1_v)
                self.update_ops.append(op)
                
    def update(self, sess):
        return sess.run(self.update_ops)

In [42]:
class Actor():
    
    def __init__(self, tau, learning_rate, scope="actor"):
        self.scope = scope
        self.learning_rate = learning_rate
        self.tau = tau
        self.action_bound = env.action_space.high
        self.s_dim = env.observation_space.shape[0]
        self.a_dim = env.action_space.shape[0]
        with tf.variable_scope(scope):
            self._build_model()
            
        
    def _build_model(self):
        self.X_pl = tf.placeholder(dtype=tf.float32, shape=[None, self.s_dim], name="X")
        self.y_pl = tf.placeholder(dtype=tf.float32, shape=[None], name="y")
        self.actions_pl = tf.placeholder(dtype=tf.int32, shape=[None, self.a_dim], name="actions")
        self.action_gradient = tf.placeholder(dtype=tf.float32, shape=[None, self.a_dim], name="action_gradients")
        batch_size = tf.shape(self.X_pl)[0]
        
        #3 Fully Connected Layers
        fc1 = tf.contrib.layers.fully_connected(self.X_pl, num_outputs=400, activation_fn=tf.nn.relu)
        fc2 = tf.contrib.layers.fully_connected(fc1, num_outputs=300, activation_fn=tf.nn.relu)
        self.predictions = tf.contrib.layers.fully_connected(fc2,
                                                             num_outputs=env.action_space.shape[0],
                                                             activation_fn=tf.nn.tanh)

        
        self.scaled_predictions = tf.multiply(self.predictions, self.action_bound)
        gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.actions_pl
        self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices)
        
        self.losses = -tf.log(self.action_predictions) * self.y_pl
        self.loss = tf.reduce_mean(self.losses)
        
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step())
        
        self.network_params = tf.trainable_variables()
        self.actor_gradients = tf.gradients(
            self.scaled_predictions, self.network_params, -self.action_gradient)
    
    def predict(self, sess, state):
        return sess.run(self.scaled_predictions, feed_dict={ self.X_pl: state })
    
    def update_gradient(self, sess, s, a_grads):
        
        feed_dict = { self.X_pl:s, self.action_gradient:a_grads }
        return sess.run(self.actor_gradients, feed_dict=feed_dict)
        
        

In [43]:
class Critic():
    
    def __init__(self, tau, learning_rate, scope="critic"):
        self.scope = scope
        self.learning_rate = learning_rate
        self.tau = tau
        self.s_dim = env.observation_space.shape[0]
        self.a_dim = env.action_space.shape[0]
        with tf.variable_scope(scope):
            self._build_model()
            
        
    def _build_model(self):
        self.X_pl = tf.placeholder(dtype=tf.float32, shape=[None, self.s_dim], name="X")
        self.actions_pl = tf.placeholder(dtype=tf.float32, shape=[None, self.a_dim], name="actions")
        self.y_pl = tf.placeholder(dtype=tf.float32, shape=[None], name="y")
        
        #2 Fully Connected Layers with temp layers
        fc1 = tf.contrib.layers.fully_connected(self.X_pl, 400, activation_fn=tf.nn.relu)
        fc2_1 = tf.contrib.layers.fully_connected(fc1, 300)
        fc2_2 = tf.contrib.layers.fully_connected(self.actions_pl, 300)
        
        fc2_1_w = [t for t in tf.trainable_variables() if t.name == 'critic/fully_connected_1/weights:0'][0]
        fc2_2_w = [t for t in tf.trainable_variables() if t.name == 'critic/fully_connected_2/weights:0'][0]
        fc2_2_b = [t for t in tf.trainable_variables() if t.name == 'critic/fully_connected_2/biases:0'][0]
        
        out = tf.matmul(fc1, fc2_1_w) + tf.matmul(self.actions_pl, fc2_2_w) + fc2_2_b
        self.action_value_predictions = tf.contrib.layers.fully_connected(out, 1, activation_fn=tf.nn.relu)
        
        self.losses = tf.squared_difference(self.action_value_predictions, self.y_pl)
        self.loss = tf.reduce_mean(self.losses)
        
        self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate)
        self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step())
        
        self.action_gradients = tf.gradients(self.action_value_predictions, self.actions_pl)
    
    def predict(self, sess, s, a):
        return sess.run(self.action_value_predictions, feed_dict={ self.X_pl: s, self.actions_pl:a })
    
    def update(self, sess, s, a, y):
        
        feed_dict = { self.X_pl:s, self.y_pl:y, self.actions_pl:a }
        _, loss = sess.run([self.train_op, self.loss], feed_dict=feed_dict)
        return loss
    
    def get_action_gradients(self, sess, s, a):
        return sess.run(self.action_gradients, feed_dict={ self.X_pl:s, self.actions_pl:a})

In [56]:
def deep_policy_gradients(sess,
                    env,
                    actor,
                    critic,
                    actor_target,
                    critic_target,
                    num_episodes,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50):
    
    episode_loss = []
    Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])
        
    #Initilaize Epsilons
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)
    
    #Initialize Replay Memory
    replay_memory = ReplayMemory(replay_memory_init_size, replay_memory_size, batch_size)
    print("populating replay memory")
    state = env.reset()
    for i in range(replay_memory_init_size):
        action = actor.predict(sess, np.reshape(state, (1,3))) + 1./1. + i
        next_state, reward, done, _ = env.step(action[0])
        replay_memory.append(Transition(np.reshape(state, (actor.s_dim,)), np.reshape(action, (actor.a_dim,)),
                                        reward, np.reshape(next_state, (actor.s_dim,)), done))
        if done:
            state = env.reset()
        else:
            state = next_state
    
    
    #Update target actor-critic
    actor_target_updater = UpdateTargetNetwork(tau, actor, actor_target, scope="update_actor_target")
    actor_target_updater.update(sess)
    
    critic_target_updater = UpdateTargetNetwork(tau, critic, critic_target, scope="update_critic_target")
    critic_target_updater.update(sess)
    
    total_reward_gained = 0
    
    #Start learning
    state = env.reset()
    for i_episode in range(num_episodes):
        
        #Add exploration noise
        print("Get next state and rewards")
        action = actor.predict(sess, np.reshape(state, (1,3))) + 1./1. + i_episode
        next_state, reward, done, _ = env.step(action[0])
        
        #Add transition to memory
        print("Add to replay memory")
        replay_memory.append(Transition(np.reshape(state, (actor.s_dim,)), np.reshape(action, (actor.a_dim,)),
                                        reward, np.reshape(next_state, (actor.s_dim,)), done))
        
        #Pop the first transition if the replay memory gets full
        if replay_memory.get_size() >= replay_memory_size:
            replay_memory.pop()
            
        #Randomly sample from replay memory
        print("Sample from the memory")
        samples = replay_memory.sample()
        states_batch, actions_batch, rewards_batch, next_states_batch, done_batch = map(np.array, zip(*samples))
        
        #Update the critic with the targets
        print("Update critic with target")
        q_values_next = critic_target.predict(sess, next_states_batch, actor_target.predict(sess, next_states_batch))
        targets_batch = rewards_batch + np.invert(done_batch).astype(np.float32) * discount_factor * np.amax(q_values_next, axis=1)
        loss = critic.update(sess, states_batch, actions_batch, targets_batch)
        
        #Update the actor with the gradients
        print("Update actor gradient")
        predicted_actions_batch = actor.predict(sess, states_batch)
        action_gradients_batch = critic.get_action_gradients(sess, states_batch, predicted_actions_batch)
        actor.update_gradient(sess, states_batch, action_gradients_batch[0])
    
        #Update target networks
        print("Update target networks")
        actor_target_updater.update(sess)
        critic_target_updater.update(sess)
        
        if done:
            break
            
        state = next_state
        total_reward_gained += reward
        
        episode_loss.append(loss)
        print loss
        s

In [57]:
tf.reset_default_graph()

# Create a glboal step variable
global_step = tf.Variable(0, name='global_step', trainable=False)

tau=0.001
actor_learning_rate=0.0001
critic_learning_rate=0.001


#Initilaize actor-critic
actor = Actor(tau, actor_learning_rate, scope="actor")
critic = Critic(tau, critic_learning_rate, scope="critic")

#Initialize target actor-critics
actor_target = Actor(tau, actor_learning_rate, scope="actor_target")
critic_target = Critic(tau, critic_learning_rate, scope="critic_target")
    
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    deep_policy_gradients(sess,
                        env,
                        actor,
                        critic,
                        actor_target,
                        critic_target,
                        num_episodes=10000,
                        replay_memory_size=500000,
                        replay_memory_init_size=50000,
                        update_target_estimator_every=10000,
                        epsilon_start=1.0,
                        epsilon_end=0.1,
                        epsilon_decay_steps=500000,
                        discount_factor=0.99,
                        batch_size=32)

populating replay memory
Get next state and rewards
Add to replay memory
Sample from the memory
Update critic with target
Update actor gradient
[[ 0.13966396]
 [-0.43929243]
 [-0.54813761]
 [ 0.12418531]
 [-0.4349741 ]
 [-0.54281658]
 [-0.25805441]
 [-0.35066798]
 [ 0.04961627]
 [-0.50596958]
 [-0.14419   ]
 [-0.36073563]
 [-0.70022857]
 [-0.22213225]
 [-0.69198722]
 [ 0.19086984]
 [-0.47850454]
 [ 0.05552344]
 [-0.08996425]
 [-0.69432509]
 [ 0.02476518]
 [-0.58344424]
 [-0.39837417]
 [-0.61992568]
 [-0.20832171]
 [-0.36708239]
 [-0.36909261]
 [-0.65425164]
 [-0.36145258]
 [-0.20287432]
 [-0.71347344]
 [-0.44395462]]
#################################
[array([[ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [-0.12101294],
       [ 0.        ],
       [ 0.        ],
       [-0.12101294],
       [ 0.        ],
       [ 0.        ],
       [-0.12101294],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [ 0

NameError: global name 's' is not defined