In [0]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import gym

In [0]:
class PPO(object):

    def __init__(self):
        self.sess = tf.Session()
        self.tfs = tf.placeholder(tf.float32, [None, s_dim], 'state')
        with tf.variable_scope('critic'):
            l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu)
            self.v = tf.layers.dense(l1, 1)
            self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
            self.advantage = self.tfdc_r - self.v
            self.closs = tf.reduce_mean(tf.square(self.advantage))
            self.ctrain_op = tf.train.AdamOptimizer(c).minimize(self.closs)
        probs, probs_params = self.actor('probs', trainable=True)
        oldprobs, oldprobs_params = self.actor('oldprobs', trainable=False)
        with tf.variable_scope('sample_action'):
            self.sample_op = tf.squeeze(probs.sample(1), axis=0)
        with tf.variable_scope('update_oldpi'):
            self.update_oldprobs_op = [oldp.assign(p) for p, oldp in zip(probs_params, oldprobs_params)]
        self.tfa = tf.placeholder(tf.float32, [None, a_dim], 'action')
        self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')
        with tf.variable_scope('loss'):
            with tf.variable_scope('surrogate'):
                ratio = probs.prob(self.tfa) / oldprobs.prob(self.tfa)
                surr = ratio * self.tfadv
            self.tflam = tf.placeholder(tf.float32, None, 'lambda')
            kl = tf.distributions.kl_divergence(oldprobs, probs)
            self.kl_mean = tf.reduce_mean(kl)
            self.aloss = -(tf.reduce_mean(surr - self.tflam * kl))
        with tf.variable_scope('atrain'):
            self.atrain_op = tf.train.AdamOptimizer(a).minimize(self.aloss)
        tf.summary.FileWriter("log/", self.sess.graph)
        self.sess.run(tf.global_variables_initializer())

    def update(self, s, a, r):
        self.sess.run(self.update_oldprobs_op)
        adv = self.sess.run(self.advantage, {self.tfs: s, self.tfdc_r: r})
        for _ in range(10):
            _, kl = self.sess.run(
                [self.atrain_op, self.kl_mean],
                {self.tfs: s, self.tfa: a, self.tfadv: adv, self.tflam: lam})
            if kl > 4*kl_target:
                break
        if kl < kl_target / 1.5:
            lam /= 2
        elif kl > kl_target * 1.5:
            lam *= 2
        lam = np.clip(lam, 1e-4, 10)
        [self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r}) for _ in range(10)]

    def actor(self, name, trainable):
        with tf.variable_scope(name):
            l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu, trainable=trainable)
            mu = 2 * tf.layers.dense(l1, a_dim, tf.nn.tanh, trainable=trainable)
            sigma = tf.layers.dense(l1, a_dim, tf.nn.softplus, trainable=trainable)
            norm_dist = tf.distributions.Normal(loc=mu, scale=sigma)
        params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
        return norm_dist, params

    def choose_action(self, s):
        s = s[np.newaxis, :]
        a = self.sess.run(self.sample_op, {self.tfs: s})[0]
        return np.clip(a, -2, 2)

    def get_v(self, s):
        if s.ndim < 2: s = s[np.newaxis, :]
        return self.sess.run(self.v, {self.tfs: s})[0, 0]

In [4]:
episodes = 1000
length = 200
gamma = 0.9
a = 0.0001
c = 0.0002
batch = 32
s_dim, a_dim = 3, 1
kl_target = 0.01
lam = 0.5
env = gym.make('Pendulum-v0').unwrapped
ppo = PPO()
all_ep_r = []

Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
Use tf.cast instead.


In [5]:
for ep in range(episodes):
    s = env.reset()
    buffer_s, buffer_a, buffer_r = [], [], []
    ep_r = 0
    for t in range(length):
        env.render()
        a = ppo.choose_action(s)
        s_, r, done, _ = env.step(a)
        buffer_s.append(s)
        buffer_a.append(a)
        buffer_r.append((r+8)/8)
        s = s_
        ep_r += r
        if (t+1) % batch == 0 or t == length-1:
            v_s_ = ppo.get_v(s_)
            discounted_r = []
            for r in buffer_r[::-1]:
                v_s_ = r + GAMMA * v_s_
                discounted_r.append(v_s_)
            discounted_r.reverse()
            bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
            buffer_s, buffer_a, buffer_r = [], [], []
            ppo.update(bs, ba, br)
    if ep == 0: 
      all_ep_r.append(ep_r)
    else: 
      all_ep_r.append(all_ep_r[-1]*0.9 + ep_r*0.1)
    print(
        'Ep: %i' % ep,
        "|Ep_r: %i" % ep_r,
        "|Lam: %.4f" % lam,
    )

NoSuchDisplayException: ignored

In [0]:
plt.plot(np.arange(len(all_ep_r)), all_ep_r)
plt.xlabel('Episode');
plt.ylabel('Moving averaged episode reward');
plt.show()