# Exercise 9: Gaussian Advantage Actor-Critic (A2C)

In [None]:
import numpy as np
import os
import tensorflow as tf
from tensorflow.python.framework import ops

from unityagents import UnityEnvironment

### Hyperparameters

In [None]:
y = 0.9 # Discount rate applied to future rewards.
total_episodes = 10000 # Set total number of episodes to train agent on.
update_frequency = 20 # How many episodes of accumulated gradients before updating model.
learning_rate = 3e-4 # Agent learning rate.
hidden_units = 128 # Number of units in each hidden layer.
summary_freq = 50 # How often (in episodes) to display information about training.
model_path = "./models/gac" # The path to save our model to.
summary_path = "./summaries/gac" # The path to write TensorBoard summaries to.
load_model = False # Whether to load a saved model.
train_model = True # Whether to train the model.

### Load the environment

In [None]:
# Create the Unity environment wrapper for the 3DBall build.
# NOTE(review): worker_id presumably selects the communication port offset — confirm against unityagents.
env = UnityEnvironment("./envs/3DBall", worker_id=7)
# Use the first brain reported by the environment for everything below.
default_brain = env.brain_names[0]

### Examine observation space

In [None]:
# Reset the environment and keep the info object belonging to the default brain.
env_info = env.reset()[default_brain]

# Examine the state space for the default brain by printing the first
# entry of its vector observations.
print("Agent state looks like: \n{}".format(env_info.vector_observations[0]))

### The Actor Critic Agent

In [None]:
def discount_rewards(r, gamma):
    """Compute the discounted return for every step of a reward sequence.

    Entry ``t`` of the result is ``r[t] + gamma * r[t+1] + gamma**2 * r[t+2] + ...``.

    Args:
        r: 1D sequence of per-step rewards (ndarray, list, or object array of
            numbers).
        gamma: Discount factor applied once per step.

    Returns:
        1D floating-point ndarray with the same length as ``r``. Floating-point
        inputs keep their dtype; every other dtype is promoted to float64.
    """
    rewards = np.asarray(r)
    # Accumulate in floating point: with an integer input, zeros_like would
    # produce an integer buffer and silently truncate every discounted value.
    if np.issubdtype(rewards.dtype, np.floating):
        out_dtype = rewards.dtype
    else:
        out_dtype = np.float64
    rewards = rewards.astype(out_dtype)

    discounted_r = np.zeros_like(rewards)
    running_add = 0.0
    # Walk backwards so each step reuses the discounted sum of the steps after it.
    for t in reversed(range(rewards.size)):
        running_add = running_add * gamma + rewards[t]
        discounted_r[t] = running_add
    return discounted_r

In [None]:
class ACAgent():
    """Actor-critic agent with a Gaussian policy, built as TensorFlow graph ops.

    Constructing an instance adds to the default graph: input placeholders, two
    tanh hidden layers, a scalar value head, a Gaussian policy head (a mean
    layer plus a learned log-variance variable), the combined loss, and ops for
    computing gradients and applying externally supplied gradients.
    """
    def __init__(self, lr, s_size, a_size, h_size):
        """Build the network, the loss and the update ops.

        Args:
            lr: Learning rate passed to the Adam optimizer.
            s_size: Size of the second dimension of the observation placeholder.
            a_size: Number of outputs of the mean layer and length of the
                log-variance variable.
            h_size: Number of units in each of the two hidden layers.
        """
        #These lines established the feed-forward part of the network. The agent takes a state and produces an action.
        self.observation_in =tf.placeholder(shape=[None, s_size],dtype=tf.float32)
        # Not referenced by any other op in this constructor; callers still feed it.
        self.batch_size = tf.placeholder(shape=None, dtype=tf.int32)
        
        # Two bias-free tanh layers shared by the value and policy heads.
        hidden_1 = tf.layers.dense(tf.layers.flatten(self.observation_in),h_size,use_bias=False,activation=tf.nn.tanh)
        hidden_2 = tf.layers.dense(tf.layers.flatten(hidden_1),h_size,use_bias=False,activation=tf.nn.tanh)
        
        # Value head: one linear output per input row.
        self.value = tf.layers.dense(hidden_2, 1, activation=None, use_bias=False)

        # Compute the mean and log standard deviation.
        # The mean layer uses a variance-scaling initializer with factor 0.01.
        self.mu = tf.layers.dense(hidden_2, a_size, activation=None, use_bias=False,
                                  kernel_initializer=tf.contrib.layers.variance_scaling_initializer(factor=0.01))

        # The log-variance is a free variable (not a function of the observation),
        # initialized to zero, i.e. a variance of exp(0) = 1.
        self.log_sigma_sq = tf.get_variable("log_sigma_squared", [a_size], dtype=tf.float32,
                                            initializer=tf.zeros_initializer())

        # Sample an action.
        # action = mu + sigma * epsilon, with epsilon drawn from a standard normal.
        self.sigma_sq = tf.exp(self.log_sigma_sq)
        self.epsilon = tf.random_normal(tf.shape(self.mu), dtype=tf.float32)
        self.action = self.mu + tf.sqrt(self.sigma_sq) * self.epsilon
        
        # Calculate the probability for the action, given the distribution.
        # a is the exponential term and b the normalization constant of the Gaussian
        # density; stop_gradient keeps gradients from flowing through the action itself.
        a = tf.exp(-1 * tf.pow(tf.stop_gradient(self.action) - self.mu, 2) / (2 * self.sigma_sq))
        b = 1 / tf.sqrt(2 * self.sigma_sq * np.pi)
        self.probs = tf.multiply(a, b, name="action_probs")
        
        # Calculate randomness in policy.
        # Mean over action dimensions of 0.5 * log(2 * pi * e * sigma^2).
        self.entropy = tf.reduce_mean(0.5 * tf.log(2 * np.pi * np.e * self.sigma_sq))
        
        # The following lines establish the training procedure. The (discounted) rewards are fed
        # into the network to compute the loss, which is used to update the network.
        self.reward_holder = tf.placeholder(shape=[None],dtype=tf.float32)
        
        # Advantage = fed reward minus the value estimate; stop_gradient keeps the
        # policy term from back-propagating into the value head.
        advantage = self.reward_holder - tf.stop_gradient(tf.reduce_sum(self.value, axis=1))
        # Normalize the advantage to zero mean and unit variance over the batch
        # (the small constant guards against division by zero).
        mean_adv, var_adv = tf.nn.moments(advantage, axes=0)
        normalized_advantage = (advantage - mean_adv) / (tf.sqrt(var_adv) + 1e-10)
        
        # Policy-gradient term, squared-error value term, and an entropy bonus.
        self.policy_loss = -tf.reduce_mean(tf.log(self.probs + 1e-10) * tf.reshape(normalized_advantage, [-1, 1]))
        self.value_loss = tf.reduce_mean(tf.squared_difference(self.reward_holder, tf.reduce_sum(self.value,axis=1)))
        self.loss = self.policy_loss + 0.5 * self.value_loss - 0.01 * self.entropy
        
        # One placeholder per trainable variable, so that gradients accumulated
        # outside the graph can be fed back in and applied in a single update.
        tvars = tf.trainable_variables()
        self.gradient_holders = []
        for idx,var in enumerate(tvars):
            placeholder = tf.placeholder(tf.float32,name=str(idx)+'_holder')
            self.gradient_holders.append(placeholder)
        
        # self.gradients computes the gradients of the loss; self.update_batch applies
        # whatever is fed into the gradient placeholders (in the same variable order).
        self.gradients = tf.gradients(self.loss, tvars)
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update_batch = optimizer.apply_gradients(zip(self.gradient_holders,tvars))

### Training the Agent

In [None]:
# Make sure the output directories exist. exist_ok avoids the separate
# existence check and the race between checking and creating.
os.makedirs(model_path, exist_ok=True)
os.makedirs(summary_path, exist_ok=True)

# Load the agent.
# Start from an empty default graph so re-running this cell does not collide
# with variables created by a previous run.
tf.reset_default_graph()
state_size = env.brains[default_brain].vector_observation_space_size * env.brains[default_brain].num_stacked_vector_observations
action_size = env.brains[default_brain].vector_action_space_size
myAgent = ACAgent(lr=learning_rate, s_size=state_size, a_size=action_size, h_size=hidden_units)

# Start an interactive TensorFlow session.
sess = tf.InteractiveSession()
init = tf.global_variables_initializer()
sess.run(init)
saver = tf.train.Saver()

# Optionally load the model.
if load_model:
    print('Loading Model...')
    ckpt = tf.train.get_checkpoint_state(model_path)
    # get_checkpoint_state returns None when the directory holds no checkpoint;
    # fail with a clear message instead of an AttributeError on None.
    if ckpt is None or not ckpt.model_checkpoint_path:
        raise FileNotFoundError("No checkpoint found in {}".format(model_path))
    saver.restore(sess, ckpt.model_checkpoint_path)

# Create the summary writer exactly once; a second FileWriter would orphan the
# first one without closing it.
summary_writer = tf.summary.FileWriter(summary_path)

# Bookkeeping that persists across the whole training run: a global step
# counter plus one list per statistic that is later averaged for reporting.
total_steps = 0
total_reward, total_length = [], []
total_entropy, total_value = [], []
v_losses, p_losses = [], []

# Gradient accumulation buffer: fetch the current value of every trainable
# variable only to obtain arrays of the right shape, then zero them out.
gradBuffer = [grad * 0 for grad in sess.run(tf.trainable_variables())]

# Start training loop.
# Each iteration plays one full episode with the current policy. When training
# is enabled, the episode's gradients are added to gradBuffer and applied to the
# network every `update_frequency` episodes. Statistics are written every
# `summary_freq` episodes and a checkpoint is saved every 1000 episodes.
for i in range(total_episodes):
    done = False
    brain_info = env.reset()[default_brain]
    state = brain_info.vector_observations[0]
    running_reward = 0
    episode_steps = 0
    ep_history = []
    entropy = []
    while not done:
        episode_steps += 1
        total_steps += 1
        # Probabilistically pick an action given our network outputs.
        a_dist, action, ent, value = sess.run(
            [myAgent.probs, myAgent.action, myAgent.entropy, myAgent.value],
            feed_dict={myAgent.observation_in: [state], myAgent.batch_size: 1})
        action = action[0]
        brain_info = env.step(action)[default_brain]
        state_1 = brain_info.vector_observations[0]
        reward = brain_info.rewards[0]
        done = brain_info.local_done[0]
        # Columns: state, action, reward, next state, value estimate, action probability.
        ep_history.append([state, action, reward, state_1, value, a_dist[0][0]])
        state = state_1
        running_reward += reward
        entropy.append(ent)
        if done and train_model:
            # Update the network.
            ep_history = np.array(ep_history)
            # Replace the raw rewards with discounted returns.
            ep_history[:, 2] = discount_rewards(ep_history[:, 2], y)
            # Feeding myAgent.action overrides the sampled action with the actions
            # actually taken, so the loss is evaluated on the recorded episode.
            feed_dict = {myAgent.reward_holder: ep_history[:, 2],
                         myAgent.action: np.vstack(ep_history[:, 1]),
                         myAgent.observation_in: np.vstack(ep_history[:, 0]),
                         myAgent.batch_size: len(ep_history)}
            v_loss, p_loss, grads = sess.run([myAgent.value_loss,
                                              myAgent.policy_loss,
                                              myAgent.gradients],
                                             feed_dict=feed_dict)
            v_losses.append(v_loss)
            p_losses.append(p_loss)
            # Accumulate this episode's gradients.
            for idx, grad in enumerate(grads):
                gradBuffer[idx] += grad

            if i % update_frequency == 0 and i != 0:
                # Apply the accumulated gradients, then reset the buffer.
                feed_dict = dict(zip(myAgent.gradient_holders, gradBuffer))
                sess.run(myAgent.update_batch, feed_dict=feed_dict)
                for ix, grad in enumerate(gradBuffer):
                    gradBuffer[ix] = grad * 0

    total_reward.append(running_reward)
    total_length.append(episode_steps)
    total_entropy.append(np.mean(entropy))
    # ep_history is still a plain list when train_model is False, so pick the
    # value column row by row instead of using 2D array indexing (which would
    # raise a TypeError on a list).
    total_value.append(np.mean([step[4] for step in ep_history]))

    # Write training statistics to tensorboard.
    if i % summary_freq == 0 and i != 0:
        summary = tf.Summary()
        summary.value.add(tag='Info/Reward', simple_value=float(np.mean(total_reward[-summary_freq:])))
        summary.value.add(tag='Info/Episode Length', simple_value=float(np.mean(total_length[-summary_freq:])))
        summary.value.add(tag='Info/Value Estimate', simple_value=float(np.mean(total_value[-summary_freq:])))
        summary.value.add(tag='Info/Value Loss', simple_value=float(np.mean(v_losses[-summary_freq:])))
        summary.value.add(tag='Info/Policy Loss', simple_value=float(np.mean(p_losses[-summary_freq:])))
        summary.value.add(tag='Info/Entropy', simple_value=float(np.mean(total_entropy[-summary_freq:])))
        summary_writer.add_summary(summary, i)
        summary_writer.flush()
        print("Step: {}, Episode: {}, Mean Reward: {}".format(str(total_steps), str(i),
                                                              str(round(np.mean(total_reward[-summary_freq:]), 3))))
    # Save agent's model
    if i % 1000 == 0 and i != 0:
        saver.save(sess, model_path+'/model-'+str(i)+'.cptk')
        print("Saved Model")
env.close()

In [None]:
# Close the environment. The training cell above already calls env.close() when
# it runs to completion; this separate cell allows closing it manually, e.g.
# when the training cell did not finish.
env.close()