# Exercise 6: Vanilla Policy Gradient (VPG)

In [None]:
import numpy as np
import os
import tensorflow as tf

from unityagents import UnityEnvironment

### Hyperparameters

In [None]:
# --- Learning settings ---
# Discount rate applied to future rewards.
y = 0.99
# Agent learning rate.
learning_rate = 1e-3
# Number of units in hidden layer.
hidden_units = 32

# --- Training schedule ---
# Total number of episodes to train the agent on.
total_episodes = 20000
# How many episodes to collect before updating the model.
update_frequency = 20
# How often (in episodes) to display summary statistics.
summary_freq = 100

# --- Persistence ---
# Where to save summary statistics.
summary_path = './summaries/vpg'
# Where to save model checkpoints.
model_path = './models/vpg'

# --- Run mode ---
# Whether to load a saved model.
load_model = False
# Whether to train the model.
train_model = True

### Load the Unity environment

In [None]:
# Launch the Unity environment from the build at ./envs/2DBall.
# NOTE(review): worker_id presumably offsets the communication port so that
# several environments can run side by side — confirm against unityagents.
env = UnityEnvironment("./envs/2DBall", worker_id=5)
# Use the first brain listed by the environment for all later lookups.
default_brain = env.brain_names[0]

### Examine state space

In [None]:
# Reset the environment, pick out the info for the default brain and
# print its vector observations to inspect what a state looks like.
brains = env.reset()
agent_brain = brains[default_brain]
print(agent_brain.vector_observations)

State *(s)* is a vector whose values correspond to the rotation of the platform and the position and velocity of the ball.

### The Vanilla Policy Gradient Agent

In [None]:
class VPGAgent():
    """Policy network and training ops for a vanilla policy gradient agent.

    Builds a TensorFlow graph with one hidden layer that maps a state to a
    softmax distribution over discrete actions, plus the ops needed to sample
    an action and to update the network from (state, action, return) batches.

    Attributes:
        state_in: float32 placeholder of shape [None, s_size] for states.
        policy: softmax action probabilities, shape [None, a_size].
        sampled_action: action indices sampled from `policy`, shape [None, 1].
        entropy: per-sample entropy of `policy`, shape [None].
        reward_holder: float32 placeholder of shape [None] for returns.
        action_holder: int32 placeholder of shape [None] for taken actions.
        policy_loss: policy gradient loss without the entropy bonus.
        loss: `policy_loss` minus 0.01 times the mean entropy.
        update_model: op that applies one Adam step minimizing `loss`.
    """

    def __init__(self, lr, s_size,a_size,h_size):
        """Build the network and training graph.

        Args:
            lr: learning rate for the Adam optimizer.
            s_size: length of the state vector.
            a_size: number of discrete actions.
            h_size: number of units in the hidden layer.
        """
        # Small constant that keeps every tf.log argument strictly positive.
        eps = 1e-10

        # These lines establish the feed-forward part of the network.
        # The agent takes a state and produces action probabilities.
        self.state_in = tf.placeholder(shape=[None, s_size], dtype=tf.float32)
        hidden = tf.layers.dense(self.state_in, h_size, activation=tf.nn.elu)
        self.policy = tf.layers.dense(hidden, a_size, activation=tf.nn.softmax)

        # tf.multinomial interprets its input as unnormalized log-probabilities,
        # so the softmax output must be passed through a log. Feeding the raw
        # probabilities would sample from softmax(policy), a much flatter
        # distribution than the one the loss below assumes.
        self.sampled_action = tf.multinomial(tf.log(self.policy + eps), 1)

        # The randomness in the action probabilities.
        self.entropy = -tf.reduce_sum(self.policy * tf.log(self.policy + eps), axis=1)

        # The remaining lines establish the training procedure.
        # We feed the discounted returns and chosen actions into the network
        # to compute the loss, and use it to update the network.
        self.reward_holder = tf.placeholder(shape=[None], dtype=tf.float32)

        # We normalize the discounted returns to zero mean and unit variance
        # across the batch.
        mean_rew, var_rew = tf.nn.moments(self.reward_holder, axes=0)
        normalized_rewards = (self.reward_holder - mean_rew) / (tf.sqrt(var_rew) + eps)

        self.action_holder = tf.placeholder(shape=[None], dtype=tf.int32)
        self.actions = tf.one_hot(self.action_holder, a_size)

        # Probability the policy assigned to the action that was actually taken.
        self.responsible_outputs = tf.reduce_sum(self.policy * self.actions, axis=1)
        # eps guards against log(0) producing -inf / NaN gradients, matching
        # the guard already used in the entropy term.
        self.policy_loss = -tf.reduce_mean(
            tf.log(self.responsible_outputs + eps) * normalized_rewards)
        # Subtracting the entropy term favours less deterministic policies.
        self.loss = self.policy_loss - 0.01 * tf.reduce_mean(self.entropy)

        # Here we apply the gradients to update the network.
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update_model = optimizer.minimize(self.loss)

### Training the Agent

In [None]:
# Function to take list of rewards and discount factor
# and produce discounted sum of future rewards.
def discount_rewards(r, gamma):
    """Return the discounted sum of future rewards for every time step.

    Entry t of the result equals
    r[t] + gamma * r[t+1] + gamma**2 * r[t+2] + ... up to the last reward.

    Args:
        r: 1-D sequence of per-step rewards (list or array, any numeric dtype).
        gamma: discount factor applied once per step into the future.

    Returns:
        A float64 array with the same length as `r`.
    """
    # Work in floating point: with an integer input, np.zeros_like(r) would
    # create an integer output and silently truncate the discounted values.
    rewards = np.asarray(r, dtype=np.float64)
    discounted_r = np.zeros_like(rewards)
    running_add = 0.0
    # Walk backwards so each step reuses the already-discounted tail.
    for t in reversed(range(rewards.size)):
        running_add = running_add * gamma + rewards[t]
        discounted_r[t] = running_add
    return discounted_r

In [None]:
# Training script: builds the agent, runs episodes in the Unity environment,
# updates the policy every `update_frequency` episodes when `train_model` is
# set, and periodically writes TensorBoard summaries and model checkpoints.

# Make sure the output directories for summaries and checkpoints exist.
for out_dir in (summary_path, model_path):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

# Create our tensorflow agent
tf.reset_default_graph()
brain_params = env.brains[default_brain]
space_size = brain_params.vector_observation_space_size * brain_params.num_stacked_vector_observations
action_size = brain_params.vector_action_space_size
myAgent = VPGAgent(lr=learning_rate, s_size=space_size,
                   a_size=action_size, h_size=hidden_units)

# Start an interactive TensorFlow session.
sess = tf.InteractiveSession()
init = tf.global_variables_initializer()
sess.run(init)
summary_writer = tf.summary.FileWriter(summary_path)
saver = tf.train.Saver()

# Optionally load the model.
if load_model:
    print('Loading Model...')
    ckpt = tf.train.get_checkpoint_state(model_path)
    # get_checkpoint_state yields None when the directory holds no checkpoint;
    # fail with a clear message instead of an AttributeError on None.
    if ckpt is None or not ckpt.model_checkpoint_path:
        raise IOError('No checkpoint found in ' + model_path)
    saver.restore(sess, ckpt.model_checkpoint_path)

# Create variables which will be used throughout training.
total_reward = []
total_length = []
total_entropy = []
global_buffer = np.zeros([0, 4])
# Most recent policy loss; stays None until the first model update so the
# summary code never touches an undefined name.
p_loss = None

# Start training loop
brain = env.reset()[default_brain]
state = brain.vector_observations[0]

for i in range(total_episodes):
    episode_steps = 0
    running_reward = 0
    done = False
    ep_history = []
    while not done:
        episode_steps += 1

        # Probabilistically pick an action given our network outputs.
        action, entropy = sess.run([myAgent.sampled_action, myAgent.entropy],
                                   feed_dict={myAgent.state_in: [state]})

        # Take action in environment.
        brain_info = env.step(action)[default_brain]

        # Collect resulting reward and new observation.
        state_1 = brain_info.vector_observations[0]
        reward = brain_info.rewards[0]
        done = brain_info.local_done[0]
        # Store the action and entropy as scalars (the session returns them
        # with a batch dimension of one) so the columns convert cleanly to
        # numeric arrays later on.
        ep_history.append([state, action[0, 0], reward, entropy[0]])

        state = state_1
        running_reward += reward

    # Columns: state vector, action, reward, entropy. The rows are ragged
    # (the state is a vector), hence the explicit object dtype. Converting
    # here, regardless of train_model, keeps the column indexing below valid
    # in evaluation-only runs as well.
    ep_history = np.array(ep_history, dtype=object)

    if train_model:
        # Replace per-step rewards with discounted returns and queue the
        # episode for the next update.
        ep_history[:, 2] = discount_rewards(ep_history[:, 2], y)
        global_buffer = np.concatenate([global_buffer, ep_history])

        if i % update_frequency == 0 and i != 0:
            feed_dict = {myAgent.reward_holder: global_buffer[:, 2].astype(np.float32),
                         myAgent.action_holder: global_buffer[:, 1].astype(np.int32),
                         myAgent.state_in: np.vstack(global_buffer[:, 0])}
            p_loss, _ = sess.run([myAgent.policy_loss, myAgent.update_model], feed_dict=feed_dict)

            # Start collecting a fresh batch.
            global_buffer = np.zeros([0, 4])

    total_entropy.append(np.mean(ep_history[:, 3].astype(np.float64)))
    total_reward.append(running_reward)
    total_length.append(episode_steps)

    # Update our running tally of scores and save information to Tensorboard.
    if i % summary_freq == 0 and i != 0:
        summary = tf.Summary()
        summary.value.add(tag='Info/Mean Reward', simple_value=float(np.mean(total_reward[-summary_freq:])))
        summary.value.add(tag='Info/Episode Length', simple_value=float(np.mean(total_length[-summary_freq:])))
        summary.value.add(tag='Info/Entropy', simple_value=float(np.mean(total_entropy[-summary_freq:])))
        # No loss exists until the first update (or at all when not training).
        if p_loss is not None:
            summary.value.add(tag='Info/Policy Loss', simple_value=float(p_loss))
        summary_writer.add_summary(summary, i)
        summary_writer.flush()
        print("Episode: {}, Mean Reward: {}".format(str(i), str(round(np.mean(total_reward[-summary_freq:]), 3))))
    # Save agent's model
    if i % 1000 == 0 and i != 0:
        saver.save(sess, model_path+'/model-'+str(i)+'.cptk')
        print("Saved Model")
env.close()

In [None]:
# Shut down the Unity environment.
# NOTE(review): the training cell already calls env.close() when it finishes,
# so this cell is presumably meant for cleaning up after an interrupted run —
# confirm that closing an already-closed environment is safe.
env.close()