In [0]:
import tensorflow as tf
import gym
import numpy as np
import random
import os

In [0]:
# PARAMETERS

# --- environment -----------------------------------------------------------
env_name = "Pendulum-v0"
env = gym.make(env_name)
env.reset()

# shapes and bounds are read straight from the environment's spaces
state_size = env.observation_space.shape
action_size = env.action_space.shape
action_low = env.action_space.low
action_high = env.action_space.high

# --- replay buffer ---------------------------------------------------------
buffer_size = 10 ** 6
batch_size = 64

# --- optimisation ----------------------------------------------------------
discount_rate = 0.99
actor_learning_rate = 0.0001
critic_learning_rate = 0.001
momentum = 0.9        # batch-normalization momentum passed to Actor / Critic
noise_scale = 0.1     # exploration noise as a fraction of the action range
tau = 0.001           # soft-update rate for the target networks

# --- schedule --------------------------------------------------------------
random_steps = 50000              # random-action steps used to pre-fill the buffer
total_episodes = 10 ** 6
steps_per_episode = 1000

# --- bookkeeping -----------------------------------------------------------
run_num = 1
run_dir = '/run_{}'.format(run_num)
save_model_step = 10              # save a checkpoint every N episodes
print_progress_step = 10          # print the mean reward every N episodes
model_dir = 'results{}'.format(run_dir)
tensorboard_dir = 'summaries{}'.format(run_dir)

In [0]:
class Actor:
    """Deterministic policy network (DDPG actor) built with the TF1 graph API.

    The graph is: batch-norm(state) -> dense(400) -> batch-norm -> relu ->
    dense(300) -> batch-norm -> relu -> dense(prod(action_size)) -> tanh, and
    the tanh output is rescaled affinely from [-1, 1] to
    [action_low, action_high].

    Args:
        state: tensor fed to the network (the state placeholder).
        state_size: tuple, shape of a single state.
        action_size: tuple, shape of a single action.
        action_low: lower bound(s) of the action, used for output rescaling.
        action_high: upper bound(s) of the action, used for output rescaling.
        training: boolean tensor forwarded to the batch-normalization layers.
        momentum: batch-normalization momentum.
        learning_rate: learning rate of the Adam optimizer created in `train`.
        batch_size: divisor applied to the summed policy gradient in `train`.
        scope: variable-scope name under which every variable is created.

    Attributes:
        output: action tensor scaled to [action_low, action_high].
        network_params: trainable variables that live under `scope`.
    """

    def __init__(self, state, state_size, action_size, action_low, action_high, training, momentum, learning_rate, batch_size, scope):
        self.state = state
        self.state_size = state_size
        self.action_size = action_size
        self.action_low = action_low
        self.action_high = action_high
        self.training = training
        self.momentum = momentum
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.scope = scope

        dense1_units = 400
        dense2_units = 300

        with tf.variable_scope(self.scope):
            # input
            self.input_norm = tf.layers.batch_normalization(self.state, momentum=momentum, training=training,
                                                            name='input_norm')

            # first layer: uniform init in [-1/sqrt(fan_in), 1/sqrt(fan_in)].
            # The bounds are plain floats (no deprecated tf.to_float, no extra graph ops).
            maxval1 = 1.0 / np.sqrt(np.prod(self.state_size))
            minval1 = -maxval1

            self.dense1_init = tf.layers.dense(self.input_norm, units=dense1_units, name='dense1',
                                               kernel_initializer=tf.random_uniform_initializer(minval1, maxval1),
                                               bias_initializer=tf.random_uniform_initializer(minval1, maxval1))

            # NOTE: the batch-norm layer deliberately keeps the original name
            # ('dense1', same as the dense layer) so variable names and
            # therefore existing checkpoints stay unchanged.
            self.dense1_norm = tf.layers.batch_normalization(self.dense1_init, momentum=momentum, training=training,
                                                             name='dense1')

            self.dense1 = tf.nn.relu(self.dense1_norm, name='dense1')

            # second layer
            maxval2 = 1.0 / np.sqrt(dense1_units)
            minval2 = -maxval2

            self.dense2_init = tf.layers.dense(self.dense1, units=dense2_units, name='dense2',
                                               kernel_initializer=tf.random_uniform_initializer(minval2, maxval2),
                                               bias_initializer=tf.random_uniform_initializer(minval2, maxval2))

            self.dense2_norm = tf.layers.batch_normalization(self.dense2_init, momentum=momentum, training=training,
                                                             name='dense2')

            self.dense2 = tf.nn.relu(self.dense2_norm, name='dense2')

            # output: small uniform init so the initial tanh stays near zero
            minval3 = -0.003
            maxval3 = 0.003

            self.output_init = tf.layers.dense(self.dense2, units=int(np.prod(self.action_size)), name='output',
                                               kernel_initializer=tf.random_uniform_initializer(minval3, maxval3),
                                               bias_initializer=tf.random_uniform_initializer(minval3, maxval3))

            self.output_tanh = tf.nn.tanh(self.output_init, name='output')

            # affine map of tanh's [-1, 1] onto [action_low, action_high]
            self.output = tf.multiply(0.5, tf.multiply(self.output_tanh, (self.action_high-self.action_low)) + (self.action_high+self.action_low))

            # The collection filter is a regex *prefix* match on the name, so
            # the trailing '/' is required: without it scope 'actor' would
            # also match variables of a scope such as 'actor_target'.
            self.network_params = tf.trainable_variables(scope=self.scope + '/')

    def train(self, critic_output):
        """Build and return the op that applies one policy-gradient step.

        Args:
            critic_output: tensor holding dQ/da for each sample of the batch;
                it is negated and used as the upstream gradient of
                `self.output`, so the optimizer ascends Q.

        Returns:
            The `apply_gradients` op. Running it also runs the
            batch-normalization moving-average updates of this network only.
        """
        with tf.variable_scope(self.scope):
            with tf.variable_scope('train'):
                self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
                # tf.gradients sums over the batch; divide to obtain the mean.
                self.gradients = tf.gradients(self.output, self.network_params, -critic_output)
                self.gradients_mean = list(map(lambda x: tf.divide(x, self.batch_size), self.gradients))
                # Trailing '/' keeps update ops of other scopes sharing the
                # same name prefix (e.g. '<scope>_target') out of this step.
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope=self.scope + '/')
                with tf.control_dependencies(update_ops):
                    train_step = self.optimizer.apply_gradients(zip(self.gradients_mean, self.network_params))

                return train_step

In [0]:
class Critic:
    """Q-value network (DDPG critic) built with the TF1 graph API.

    The state goes through batch-norm -> dense(400) -> batch-norm -> relu ->
    dense(300); the action is projected by its own dense(300) layer; both
    projections are summed, passed through relu and reduced to a single
    Q-value by a final dense(1) layer.

    Args:
        state: tensor fed as the state input (the state placeholder).
        action: tensor fed as the action input (the action placeholder).
        state_size: tuple, shape of a single state.
        action_size: tuple, shape of a single action.
        action_low: stored on the instance; not used by the graph.
        action_high: stored on the instance; not used by the graph.
        training: boolean tensor forwarded to the batch-normalization layers.
        momentum: batch-normalization momentum.
        learning_rate: learning rate of the Adam optimizer created in `train`.
        batch_size: stored on the instance; not used by the graph.
        scope: variable-scope name under which every variable is created.

    Attributes:
        output: Q-value tensor with one unit per sample.
        network_params: trainable variables that live under `scope`.
        critic_grads: result of `tf.gradients(output, action)` (a list).
    """

    def __init__(self, state, action, state_size, action_size, action_low, action_high, training, momentum, learning_rate, batch_size, scope):
        self.state = state
        self.action = action
        self.state_size = state_size
        self.action_size = action_size
        self.action_low = action_low
        self.action_high = action_high
        self.training = training
        self.momentum = momentum
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.scope = scope

        dense1_units = 400
        dense2_units = 300

        with tf.variable_scope(self.scope):
            # input
            self.input_norm = tf.layers.batch_normalization(self.state, momentum=momentum, training=training,
                                                            name='input_norm')

            # first layer: uniform init in [-1/sqrt(fan_in), 1/sqrt(fan_in)].
            # The bounds are plain floats (no deprecated tf.to_float, no extra graph ops).
            maxval1 = 1.0 / np.sqrt(np.prod(self.state_size))
            minval1 = -maxval1

            self.dense1_init = tf.layers.dense(self.input_norm, units=dense1_units, name='dense1',
                                               kernel_initializer=tf.random_uniform_initializer(minval1, maxval1),
                                               bias_initializer=tf.random_uniform_initializer(minval1, maxval1))

            # NOTE: the batch-norm layer deliberately keeps the original name
            # ('dense1', same as the dense layer) so variable names and
            # therefore existing checkpoints stay unchanged.
            self.dense1_norm = tf.layers.batch_normalization(self.dense1_init, momentum=momentum, training=training,
                                                             name='dense1')

            self.dense1 = tf.nn.relu(self.dense1_norm, name='dense1')

            # second layer (action appears): fan-in counts both inputs
            maxval2 = 1.0 / np.sqrt(dense1_units + np.prod(self.action_size))
            minval2 = -maxval2

            self.dense2_init = tf.layers.dense(self.dense1, units=dense2_units, name='dense2_init',
                                               kernel_initializer=tf.random_uniform_initializer(minval2, maxval2),
                                               bias_initializer=tf.random_uniform_initializer(minval2, maxval2))

            self.dense2_action = tf.layers.dense(self.action, units=dense2_units, name='dense2_action',
                                                 kernel_initializer=tf.random_uniform_initializer(minval2, maxval2),
                                                 bias_initializer=tf.random_uniform_initializer(minval2, maxval2))

            self.dense2 = tf.nn.relu(self.dense2_init + self.dense2_action, name='dense2')

            # output: small uniform init so initial Q-values stay near zero
            minval3 = -0.003
            maxval3 = 0.003

            self.output = tf.layers.dense(self.dense2, units=1, name='output',
                                          kernel_initializer=tf.random_uniform_initializer(minval3, maxval3),
                                          bias_initializer=tf.random_uniform_initializer(minval3, maxval3))

            # The collection filter is a regex *prefix* match on the name, so
            # the trailing '/' is required: without it scope 'critic' would
            # also match variables of a scope such as 'critic_target'.
            self.network_params = tf.trainable_variables(scope=self.scope + '/')

            # dQ/da, consumed by the actor's policy-gradient step
            self.critic_grads = tf.gradients(self.output, self.action)

    def train(self, target_q):
        """Build and return the op that minimizes the squared TD error.

        Args:
            target_q: tensor of regression targets for `self.output`.

        Returns:
            The Adam `minimize` op. Running it also runs the
            batch-normalization moving-average updates of this network only.
        """
        with tf.variable_scope(self.scope):
            with tf.variable_scope('train'):
                self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
                self.loss = tf.losses.mean_squared_error(target_q, self.output)
                # Trailing '/' keeps update ops of other scopes sharing the
                # same name prefix (e.g. '<scope>_target') out of this step.
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope=self.scope + '/')
                with tf.control_dependencies(update_ops):
                    train_step = self.optimizer.minimize(self.loss, var_list=self.network_params)

                return train_step

In [0]:
class OrnsteinUhlenbeckNoise:
    """Temporally correlated noise following a discretized Ornstein-Uhlenbeck process.

    Each call advances the process by one step of size `dt`:
        x <- x + theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)

    Args:
        mu: array the process reverts to; its shape is the shape of every sample.
        sigma: scale of the random term.
        theta: strength of the pull towards `mu`.
        dt: step size.
        x0: optional starting value; zeros shaped like `mu` when None.
    """

    def __init__(self, mu, sigma=0.3, theta=0.15, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        """Advance the process by one step and return the new value."""
        previous = self.x_prev
        drift = self.theta * (self.mu - previous) * self.dt
        diffusion = self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = previous + drift + diffusion
        return self.x_prev

    def reset(self):
        """Put the process back to `x0`, or to zeros when no `x0` was given."""
        if self.x0 is None:
            self.x_prev = np.zeros_like(self.mu)
        else:
            self.x_prev = self.x0

    def __repr__(self):
        return 'OrnsteinUhlenbeckNoise(mu={}, sigma={})'.format(self.mu, self.sigma)

In [0]:
class ReplayBuffer:
    """Fixed-size circular buffer of environment steps.

    Slot ``i`` stores what one call to ``add`` received: the state passed in,
    the action, the reward and the terminal flag. A sampled transition is
    assembled from two consecutive slots: ``states[i - 1]`` is used as the
    state and ``states[i]`` as the next state, together with ``actions[i]``,
    ``rewards[i]`` and ``terminals[i]``.

    Args:
        state_size: tuple, shape of a single state.
        action_size: tuple, shape of a single action.
        buffer_size: number of slots.
        batch_size: number of transitions returned by ``get_batch``.
    """

    def __init__(self, state_size, action_size, buffer_size, batch_size):
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.now_size = 0   # number of filled slots
        self.now_pos = 0    # slot the next add() writes to

        self.actions = np.empty((self.buffer_size,) + self.action_size, dtype=np.float32)
        self.rewards = np.empty(self.buffer_size, dtype=np.float32)
        # np.bool_ instead of the np.bool alias, which NumPy 1.24 removed
        self.terminals = np.empty(self.buffer_size, dtype=np.bool_)
        self.states = np.empty((self.buffer_size,) + self.state_size, dtype=np.float32)

        # Pre-allocated output arrays; they are overwritten by every get_batch().
        self.batch_states = np.empty((self.batch_size,) + self.state_size, dtype=np.float32)
        self.batch_next_state = np.empty((self.batch_size,) + self.state_size, dtype=np.float32)

        self.indices = np.empty(self.batch_size, dtype=np.int32)

    def add(self, state, action, reward, terminal):
        """Write one step into the current slot and advance, wrapping at the end."""
        self.states[self.now_pos, ...] = state
        self.actions[self.now_pos, ...] = action
        self.rewards[self.now_pos] = reward
        self.terminals[self.now_pos] = terminal
        self.now_size = max(self.now_size, self.now_pos + 1)
        self.now_pos = (self.now_pos + 1) % self.buffer_size

    def get_indicies(self):
        """Fill ``self.indices`` with ``batch_size`` valid slot indices.

        Uses rejection sampling, so it keeps drawing until a valid slot is
        found; the buffer must therefore contain at least one valid pair.

        Raises:
            ValueError: if fewer than two steps are stored, since a transition
                needs a slot and its predecessor.
        """
        if self.now_size < 2:
            raise ValueError('ReplayBuffer needs at least 2 stored steps to sample a transition')

        for i in range(self.batch_size):
            while True:
                index = np.random.randint(1, self.now_size)
                # slot now_pos - 1 is the newest step and slot now_pos the
                # oldest one, so the pair (now_pos - 1, now_pos) is not consecutive
                if index == self.now_pos:
                    continue
                # the previous slot ended an episode, so it cannot be the
                # state that precedes this slot
                if self.terminals[index-1]:
                    continue
                break

            self.indices[i] = index

    def get_batch(self):
        """Sample a batch of transitions.

        Returns:
            Tuple ``(states, actions, rewards, terminals, next_states)``.
            ``states`` and ``next_states`` are the instance's pre-allocated
            arrays and are overwritten by the next call.

        Raises:
            ValueError: if fewer than two steps are stored.
        """
        self.get_indicies()
        # vectorized gather into the pre-allocated batch arrays
        self.batch_states[...] = self.states[self.indices - 1]
        self.batch_next_state[...] = self.states[self.indices]

        return self.batch_states, self.actions[self.indices], self.rewards[self.indices], self.terminals[self.indices], self.batch_next_state

In [0]:
def update_target_network(network_params, target_network_params, tau):
    """Build the ops that blend the source variables into the target variables.

    For every pair taken in order from the two lists, the op assigns
        target <- tau * source + (1 - tau) * target
    so ``tau = 1`` performs a plain copy.

    Args:
        network_params: variables of the source network.
        target_network_params: variables of the target network, in the same order.
        tau: blending factor.

    Returns:
        List of assign ops, one per variable pair.
    """
    return [
        target_var.assign(tf.multiply(source_var, tau) + tf.multiply(target_var, 1. - tau))
        for source_var, target_var in zip(network_params, target_network_params)
    ]

In [0]:
# INITIALIZATION
tf.reset_default_graph()

# placeholders
state_ph = tf.placeholder(dtype=tf.float32, shape=((None,) + state_size))
action_ph = tf.placeholder(dtype=tf.float32, shape=((None,) + action_size))
target_q_ph = tf.placeholder(dtype=tf.float32, shape=(None, 1))
critic_grads_ph = tf.placeholder(dtype=tf.float32, shape=((None,) + action_size))
# batch-norm mode switch; defaults to training mode when not fed
training_ph = tf.placeholder_with_default(input=True, shape=None)

# init actor networks
actor = Actor(state_ph, state_size, action_size, action_low, action_high, training_ph, momentum, actor_learning_rate, batch_size, 'actor')
actor_target = Actor(state_ph, state_size, action_size, action_low, action_high, training_ph, momentum, actor_learning_rate, batch_size, 'actor_target')

# init critic networks
critic = Critic(state_ph, action_ph, state_size, action_size, action_low, action_high, training_ph, momentum, critic_learning_rate, batch_size, 'critic')
critic_target = Critic(state_ph, action_ph, state_size, action_size, action_low, action_high, training_ph, momentum, critic_learning_rate, batch_size, 'critic_target')

# train operations
critic_train = critic.train(target_q_ph)
actor_train = actor.train(critic_grads_ph)

# update network parameters operations (soft update with rate tau)
update_critic_target = update_target_network(critic.network_params, critic_target.network_params, tau)
update_actor_target = update_target_network(actor.network_params, actor_target.network_params, tau)

# buffer and noise
replay_buffer = ReplayBuffer(state_size, action_size, buffer_size, batch_size)
noise = OrnsteinUhlenbeckNoise(mu=np.zeros(action_size))
noise_scaling = noise_scale * (action_high - action_low)

# make directories
os.makedirs(model_dir, exist_ok=True)
os.makedirs(tensorboard_dir, exist_ok=True)

# init session, saver and writer
sess = tf.Session()
saver = tf.train.Saver(max_to_keep=10)
sess.run(tf.global_variables_initializer())
writer = tf.summary.FileWriter(tensorboard_dir, graph=sess.graph)

# copy networks to target networks (tau = 1 turns the soft update into a plain copy)
sess.run(update_target_network(critic.network_params, critic_target.network_params, 1))
sess.run(update_target_network(actor.network_params, actor_target.network_params, 1))

# episode reward for tensorboard scalar.
# A scalar placeholder replaces the former tf.Variable, which was created after
# the initializer had already run and therefore only worked because it was
# always fed. The old name is kept as an alias because the training loop feeds
# `episode_reward_var`.
episode_reward_ph = tf.placeholder(dtype=tf.float32, shape=(), name='episode_reward')
episode_reward_var = episode_reward_ph
tf.summary.scalar('Episode_reward', episode_reward_ph)
summary_op = tf.summary.merge_all()





# TRAINING

# fill replay buffer with random actions
for step in range(random_steps):
    action = env.action_space.sample()
    state, reward, terminal, _ = env.step(action)
    replay_buffer.add(state, action, reward, terminal)
    if terminal:
        env.reset()

episode_rewards = []
losses = []

# checkpoint files go *inside* model_dir; passing model_dir itself as the save
# path would use its last component as a filename prefix and leave the
# directory created for the run empty
checkpoint_prefix = os.path.join(model_dir, 'model')

for episode in range(total_episodes):
    state = env.reset()
    noise.reset()
    episode_reward = 0
    episode_terminal = False
    count_steps = 0

    while not episode_terminal:
        # act with the online actor in inference mode (batch-norm moving averages)
        state_expand = np.expand_dims(state, 0)
        action = sess.run(actor.output, feed_dict={state_ph : state_expand, training_ph : False})[0]
        # exploration noise can push the action out of range: clip it so the
        # action stored in the buffer is within the environment's bounds
        action = np.clip(action + noise() * noise_scaling, action_low, action_high)
        state, reward, terminal, info = env.step(action)
        episode_reward += reward
        replay_buffer.add(state, action, reward, terminal)
        count_steps += 1

        states, actions, rewards, terminals, next_states = replay_buffer.get_batch()

        # target_q = reward + discount_rate * Q'(next_state, M'(next_state))
        next_actions = sess.run(actor_target.output, feed_dict={state_ph : next_states})

        predicted_q = sess.run(critic_target.output, feed_dict={state_ph : next_states, action_ph : next_actions})[:, 0]

        # no bootstrapping from states that ended an episode
        predicted_q[terminals] = 0

        target_q = rewards + discount_rate * predicted_q

        target_q_expand = np.expand_dims(target_q, 1)

        # train critic
        sess.run(critic_train, feed_dict={state_ph : states, action_ph : actions, target_q_ph : target_q_expand})

        # train actor: dQ/da is evaluated at the actor's own actions and fed
        # back as the upstream gradient of the actor output
        actor_output = sess.run(actor.output, feed_dict={state_ph : states})
        critic_grads = sess.run(critic.critic_grads, feed_dict={state_ph : states, action_ph : actor_output})
        sess.run(actor_train, feed_dict={state_ph : states, critic_grads_ph : critic_grads[0]})

        # update target networks
        sess.run(update_critic_target)
        sess.run(update_actor_target)

        # NOTE(review): an episode cut off by steps_per_episode is not flagged
        # in the buffer, so a sampled pair could straddle two episodes in that
        # case - confirm whether the environment always reports `terminal` first.
        if terminal or count_steps == steps_per_episode:
            episode_terminal = True
            episode_rewards.append(episode_reward)
            # update summary
            summ = sess.run(summary_op, feed_dict={episode_reward_var : episode_reward})
            writer.add_summary(summ, episode)

    if episode % print_progress_step == 0:
        # print progress
        print('Episode =', episode, 'Mean reward =', np.mean(episode_rewards[-10:]))

    if episode % save_model_step == 0:
        saver.save(sess, checkpoint_prefix, global_step=episode)

Instructions for updating:
Use keras.layers.batch_normalization instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Use tf.cast instead.
Episode = 0 Mean reward = -1325.9862349526536
Episode = 10 Mean reward = -1379.0853186895224
Episode = 20 Mean reward = -1431.2582922275988
Episode = 30 Mean reward = -1234.6649399889452
Episode = 40 Mean reward = -1002.7581047726198
Episode = 50 Mean reward = -653.8453193033907
Episode = 60 Mean reward = -339.57815173665847
Episode = 70 Mean reward = -203.5316214812877
Episode = 80 Mean reward = -216.60701220466876
Episode = 90 Mean reward = -172.6179050007132
Episode = 100 Mean reward = -185.67593696918084
Instructions for updating:
Use standard file APIs to delete files with this prefix.
Episode = 110 Mean reward = -161.10139528118188
Episode = 120 Mean reward = -143.974057201851
Episode 