## HW6 by Denis Osipychev
# Proximal Policy Optimization Algorithms

### Hyper parameters and service functions

In [1]:
# import env and modules
import gym
import numpy as np
import random
import tensorflow as tf
import tensorflow.contrib.slim as slim

In [2]:
# create an instance of env
env = gym.make('Pendulum-v0')
env.reset()

array([0.00756341, 0.9999714 , 0.73552468])

In [23]:
class PPOagent():
    def __init__(self, lr, s_size,a_size,h_size):
        
        self.state_in= tf.placeholder(shape=[None,s_size],dtype=tf.float32)
        
        #### Value network
        hidden1 = slim.fully_connected(self.state_in,h_size,biases_initializer=None,activation_fn=tf.nn.relu)
        self.value = slim.fully_connected(hidden1,1,biases_initializer=None)
        
        # loss for V network
        self.v_target = tf.placeholder(shape=[None],dtype=tf.float32)
        v_loss = tf.reduce_mean(tf.square(self.value - self.v_target))
        
        #### Policy network
        hidden2 = slim.fully_connected(self.state_in,h_size,biases_initializer=None,activation_fn=tf.nn.relu)
        mu = slim.fully_connected(hidden2,a_size,biases_initializer=None,activation_fn=tf.nn.tanh)
        sigma = slim.fully_connected(hidden2,a_size,biases_initializer=None,activation_fn=tf.nn.relu)
        self.action = tf.random_normal([1], mean=mu, stddev=sigma)
        
        # loss for Policy network (A value)
        self.old_p_mean = tf.placeholder(shape=[None],dtype=tf.float32)
        self.a_target = tf.placeholder(shape=[None],dtype=tf.float32)
        ratio = tf.exp(tf.log(mu) - tf.log(self.old_p_mean)) # pnew / pold
        surr_loss = ratio * self.a_target # surrogate from conservative policy iteration
        surr_loss_clip = tf.clip_by_value(ratio, 1.0 - epsilon, 1.0 + epsilon) * self.a_target #
        p_loss = - tf.reduce_mean(tf.minimum(surr_loss, surr_loss_clip))
        
        # total loss function
        total_loss = p_loss + v_loss
        
        tvars = tf.trainable_variables()
        self.gradient_holders = []
        for idx,var in enumerate(tvars):
            placeholder = tf.placeholder(tf.float32,name=str(idx)+'_holder')
            self.gradient_holders.append(placeholder)
        
        self.gradients_v = tf.gradients(v_loss,tvars)
        self.gradients_p = tf.gradients(p_loss,tvars)
        
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update_batch = optimizer.apply_gradients(zip(self.gradient_holders,tvars))

In [34]:
def traj2v_a(s_arr, a_arr, r_arr, s_new_arr, v_arr):
    
    traj_len = len(r_arr)
    advantage = np.empty(traj_len, 'float32')
    
    lastgaelam = 0
    
    for t in reversed(range(traj_len-1)):
        delta = r_arr[t] + gamma * v_arr[t+1] - v_arr[t]
        gae = delta + gamma * lastgaelam
        lastgaelam = gae
        advantage[t] = gae
    tdlamret = advantage + v_arr
    return tdlamret, advantage

In [35]:
class ReplayMemory():
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, arg):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = arg
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        if len(self.memory) < batch_size:
            batch_size = len(self.memory)
        return random.sample(self.memory, batch_size)
    
    def size(self):
        return len(self.memory)

In [36]:
# initialize hyperparams for PG
n_episodes = 5000
episode_length = 200
gamma = 0.9
alpha = 0.001
lambd = 1
epsilon = 0.2
update_freq = 100
buffer_size = 1000
batch_size = 100

In [37]:
# TF variables reset
tf.reset_default_graph()
myAgent = PPOagent(lr=alpha,s_size=3,a_size=1,h_size=20)
init = tf.global_variables_initializer()

In [48]:
with tf.Session() as sess:
    
    sess.run(init)
    
    gradBuffer = sess.run(tf.trainable_variables())
    for ix,grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0
    
    stats = []
    buffer = ReplayMemory(buffer_size)
    i_episode = 0
    
    while i_episode < n_episodes:
        
        s = env.reset()
        s = np.array(s).reshape(1, 3)
        running_reward = 0
        trajectory = []
        i_episode += 1
        
        for t in range(episode_length):
            
            action, value = sess.run([myAgent.action,myAgent.value],
                                     feed_dict={myAgent.state_in:s})

            s_new,r,d,_ = env.step(action)
            
            #populate the trajectory
            trajectory.append([s,action,r,s_new,value])
            
            prev_action = action
            s = np.array(s_new).reshape(1, 3)
            running_reward += r
            
            if d == True:
                buffer.push(trajectory)
                break
                
        # sample the trajectories
        tra_batch = buffer.sample(batch_size)
        
        for i in range(len(tra_batch)):
            s_arr, a_arr, r_arr, s_new_arr, v_arr = zip(*tra_batch[i])
            v_target_arr, a_target_arr = traj2v_a(s_arr, a_arr, r_arr, s_new_arr, v_arr)
            feed_dict={myAgent.state_in:s_arr,
                        myAgent.a_target:a_target_arr,
                        myAgent.v_target:v_target_arr}
            grads_p = sess.run(myAgent.gradients_p, feed_dict=feed_dict)
            grads_v = sess.run(myAgent.gradients_v, feed_dict=feed_dict)
            
            # add gradients to grad buffer
            for idx,grad in enumerate(grads):
                gradBuffer[idx] += grad_p + grad_v

        if i_episode % update_freq == 0 and i_episode != 0:
            feed_dict= dict(zip(myAgent.gradient_holders, gradBuffer))
            _ = sess.run(myAgent.update_batch, feed_dict=feed_dict)
            for ix,grad in enumerate(gradBuffer):
                gradBuffer[ix] = grad * 0

        
        # Update statistics
        stats.append([i_episode,running_reward])
        
        if i_episode % 100 == 0:
            print(' Epoch:',i_episode,',Average R:',np.mean(np.asarray(stats)[-100:,1]))
    

ValueError: Cannot feed value of shape (200, 1, 200) for Tensor 'Placeholder_1:0', which has shape '(?,)'

In [None]:
#plot the results (taken from https://gist.github.com/vksah32/8ba7ee7489bd8ec50c995f6216adcb7d)
import pandas as pd
import matplotlib.pyplot as plt 

i,r = zip(*stats)
r_np = np.asarray(r).flatten()
i_np = np.asarray(i).flatten()

mean_x1 = pd.Series(r_np).rolling(window=100).mean()

plt.xlabel("epochs")
plt.ylabel("score per step")
plt.plot(i_np, mean_x1, 'r-')
plt.legend(['ppo'])
plt.title('Simple pend')
plt.show()

In [None]:
r_np