## HW6 by Denis Osipychev
# Proximal Policy Optimization Algorithms

### Hyper parameters and service functions

In [1]:
# import env and modules
import gym
import numpy as np
import random
import tensorflow as tf
import tensorflow.contrib.slim as slim

In [2]:
# create an instance of env
env = gym.make('Pendulum-v0')
env.reset()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


array([-0.67323767, -0.73942615, -0.11118893])

In [3]:
class PPOagent():
    def __init__(self, lr, s_size,a_size,h_size):
        
        self.state_in= tf.placeholder(shape=[None,s_size],dtype=tf.float32)
        
        # Value network
        hidden1 = slim.fully_connected(self.state_in,h_size,biases_initializer=None,activation_fn=tf.nn.tanh)
        self.value = slim.fully_connected(hidden1,1,biases_initializer=None)
        
        # Policy network
        hidden2 = slim.fully_connected(self.state_in,h_size,biases_initializer=None,activation_fn=tf.nn.tanh)
        self.p_mean = slim.fully_connected(hidden2,a_size,activation_fn=tf.nn.tanh,biases_initializer=None)
        self.action = self.p_mean
        #self.action = tf.random_normal(1, mean=self.outputP,stddev=1.0)
        
        # loss for V network
        self.v_target = tf.placeholder(shape=[None],dtype=tf.float32)
        v_loss = tf.reduce_mean(tf.square(self.value - self.v_target))
        
        # loss for Policy network (A value)
        self.old_p_mean = tf.placeholder(shape=[None],dtype=tf.float32)
        self.a_target = tf.placeholder(shape=[None],dtype=tf.float32)
        ratio = tf.exp(tf.log(self.p_mean) - tf.log(self.old_p_mean)) # pnew / pold
        surr_loss = ratio * self.a_target # surrogate from conservative policy iteration
        surr_loss_clip = tf.clip_by_value(ratio, 1.0 - 0.2, 1.0 + 0.2) * self.a_target #
        p_loss = - tf.reduce_mean(tf.minimum(surr_loss, surr_loss_clip))
        
        # total loss function
        total_loss = p_loss + v_loss
        
        tvars = tf.trainable_variables()
        self.gradient_holders = []
        for idx,var in enumerate(tvars):
            placeholder = tf.placeholder(tf.float32,name=str(idx)+'_holder')
            self.gradient_holders.append(placeholder)
        
        self.gradients = tf.gradients(total_loss,tvars)
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update_batch = optimizer.apply_gradients(zip(self.gradient_holders,tvars))

In [4]:
def v_and_a(s_arr, a_arr, r_arr, s_new_arr, v_arr, gamma, lambd):
    
    new_state = np.append(s_new_arr, 0) # last element is only used for last vtarg, but we already zeroed it if last new = 1
    v_predicted = np.append(v_arr, r_arr[-1])
    T = len(r_arr)
    advantage = np.empty(T, 'float32')
    gaelam = np.empty(T, 'float32')
    lastgaelam = 0
    for t in reversed(range(T)):
        nonterminal = 1 - new_state[t+1]
        delta = r_arr[t] + gamma * v_predicted[t+1] * nonterminal - v_predicted[t]
        gaelam[t] = delta + gamma * lam * nonterminal * lastgaelam
        lastgaelam = gaelam[t]
        advantage[t] = gaelam[t]
    tdlamret = advantage + v_predicted
    return tdlamret, advantage

In [6]:
class ReplayMemory():
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, arg):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = arg
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        if len(self.memory) < batch_size:
            batch_size = len(self.memory)
        return random.sample(self.memory, batch_size)
    
    def size(self):
        return len(self.memory)

In [7]:
# initialize hyperparams for PG
n_episodes = 5000
episode_length = 100
gamma = 0.9
alpha = 0.001
update_freq = 100
buffer_size = 1000
batch_size = 100

In [8]:
# TF variables reset
tf.reset_default_graph()
myAgent = PPOagent(lr=alpha,s_size=3,a_size=1,h_size=20)
init = tf.global_variables_initializer()

buffer = ReplayMemory(buffer_size)

In [9]:
with tf.Session() as sess:
    sess.run(init)
    i_episode = 0
    
    gradBuffer = sess.run(tf.trainable_variables())
    for ix,grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0
    stats = []
        
    while i_episode < n_episodes:
        s = env.reset()
        running_reward = 0
        trajectory = []
        targ_history = []
        i_episode += 1
        
        for j in range(episode_length):
            s = np.array(s).reshape(1, 3)
            action, value = sess.run([myAgent.action,myAgent.value],feed_dict={myAgent.state_in:s})

            s1,r,d,_ = env.step(action) #Get our reward for taking an action given a bandit.
            trajectory.append([s,action,r,s1,value])
            buffer.push(trajectory)
            prev_action = action
            
            s = s1
            running_reward += r
            if d == True:
                #Update the network.
                tra = buffer.sample(batch_size)
                
                for k in range(batch_size):
                    s_arr, a_arr, r_arr, s_new_arr, v_arr = zip(*tra[k])
                    v_target_arr, a_target_arr = v_and_a(s_arr, a_arr, r_arr, s_new_arr, v_arr, gamma, lambd)
                    feed_dict={myAgent.state_in:s_arr,
                            myAgent.a_target:a_target_arr,
                           myAgent.v_target:v_target_arr}
                    grads = sess.run(myAgent.gradients, feed_dict=feed_dict)
                    for idx,grad in enumerate(grads):
                        gradBuffer[idx] += grad

                if i_episode % update_freq == 0 and i_episode != 0:
                    feed_dict = dictionary = dict(zip(myAgent.gradient_holders, gradBuffer))
                    _ = sess.run(myAgent.update_batch, feed_dict=feed_dict)
                    for ix,grad in enumerate(gradBuffer):
                        gradBuffer[ix] = grad * 0
                break

        
        # Update statistics
        stats.append([i_episode,running_reward])
        
        if i_episode % 100 == 0:
            print(' Epoch:',i_episode,',Average R:',np.mean(np.asarray(stats)[-100:,1]))
    

 Epoch: 100 ,Average R: -636.4497714233398
 Epoch: 200 ,Average R: -645.0633282470703
 Epoch: 300 ,Average R: -640.9600531005859
 Epoch: 400 ,Average R: -637.7628854370117
 Epoch: 500 ,Average R: -637.8624819946289
 Epoch: 600 ,Average R: -641.6584436035156
 Epoch: 700 ,Average R: -628.2841445922852
 Epoch: 800 ,Average R: -638.8971627807617
 Epoch: 900 ,Average R: -655.2780255126953
 Epoch: 1000 ,Average R: -643.877102355957


KeyboardInterrupt: 