## Imports

In [2]:
import gym
import numpy      as np
import tensorflow as tf
from tqdm import tnrange

## Params & Constants

In [3]:
env     = gym.make('CartPole-v0')
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.n

seed = 42
tf.set_random_seed(seed)
np.random.seed(seed)

pi_lr           = 3e-4
v_lr            = 1e-3
clip_ratio      = 0.2
max_kl          = .01
epochs          = 10
steps_per_epoch = 1000
render          = False
train_pi_iters  = 80
train_v_iters   = 8
hidden_sizes    = [64,64]

## Helper functions

In [4]:
def discount_cumsum(rewards, discount):
    '''
    Method generates a discounted cumulative sum from a reward trajectory
    [1,2,3] -> [3*d + 2*d + 1*d, 2*d + 1*d, 1*d]
    '''
    result = []
    for i in range(len(rewards)):
        d_cummulative_sum = sum([discount * x for x in rewards[i:]])
        result.append(d_cummulative_sum)

    return result

def mlp(x, hidden_sizes, activation=tf.nn.relu, output_activation=None):
    '''
    Cleanest dynamic generation of dense layered NN I've ever seen
    '''
    for h in hidden_sizes[:-1]:
        x = tf.layers.dense(x, units=h, activation=activation)
    return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation)

## Buffer

In [5]:
class PPOBuffer:
    
    def __init__(self, obs_dim, gamma=0.99, lam=0.97):
        
        self.obs_dim   = obs_dim

        self.obs_buf   = np.empty((0, self.obs_dim))
        self.act_buf   = np.array([])
        self.rew_buf   = np.array([])
        self.val_buf   = np.array([])
        self.adv_buf   = np.array([])
        self.ret_buf   = np.array([])
        self.logp_buf  = np.array([])

        self.gamma     = gamma
        self.lam       = lam
        self.idx_s     = 0
        self.idx_e     = 0
    
    
    def store(self, obs, act, rew, val, logp):
        
        self.obs_buf  = np.append(self.obs_buf,  obs, axis=0)
        self.act_buf  = np.append(self.act_buf,  act)
        self.rew_buf  = np.append(self.rew_buf,  rew)
        self.val_buf  = np.append(self.val_buf,  val)
        self.logp_buf = np.append(self.logp_buf, logp)

        self.idx_e += 1
    
    
    def finish_path(self, last_val=0):

        path_slice = slice(self.idx_s, self.idx_e)
        rews = np.append(self.rew_buf[path_slice], last_val)
        vals = np.append(self.val_buf[path_slice], last_val)

        # the next two lines implement GAE-Lambda advantage calculation
        deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
        self.adv_buf = np.append(self.adv_buf, discount_cumsum(deltas, self.gamma * self.lam))

        # the next line computes rewards-to-go, to be targets for the value function
        self.ret_buf = np.append(self.ret_buf, discount_cumsum(rews, self.gamma)[:-1])

        self.idx_s = self.idx_e
        
    def get(self):
        # Normalize the advantages
        self.adv_buf = (self.adv_buf - np.mean(self.adv_buf)) / np.std(self.adv_buf)

        return [self.obs_buf, self.act_buf, self.adv_buf, self.ret_buf, self.logp_buf]
    
    def reset(self):
        
        self.obs_buf  = np.empty((0, self.obs_dim))
        self.act_buf  = np.array([])
        self.rew_buf  = np.array([])
        self.val_buf  = np.array([])
        self.adv_buf  = np.array([])
        self.ret_buf  = np.array([])
        self.logp_buf = np.array([])
        self.idx_e    = 0
        self.idx_s    = 0

## Place Holders

In [6]:
obs_ph      = tf.placeholder(tf.float32, shape=(None, obs_dim))
act_ph      = tf.placeholder(tf.int32,   shape=(None,))
rew_ph      = tf.placeholder(tf.float32, shape=(None,))
adv_ph      = tf.placeholder(tf.float32, shape=(None,))
ret_ph      = tf.placeholder(tf.float32, shape=(None,))
logp_old_ph = tf.placeholder(tf.float32, shape=(None,))

## Model

In [7]:
# Actor
with tf.variable_scope('pi'):
    logits   = mlp(obs_ph, hidden_sizes=hidden_sizes+[act_dim], activation=tf.nn.relu, output_activation=None)
    pi       = tf.squeeze(tf.multinomial(logits, num_samples=1), axis=1) # Our Action
    logp     = tf.reduce_sum(tf.one_hot(act_ph, depth=act_dim) * tf.nn.log_softmax(logits), axis=1) # Previous Actions
    logp_pi  = tf.reduce_sum(tf.one_hot(pi,     depth=act_dim) * tf.nn.log_softmax(logits), axis=1) # Hypothetical Actions

    # PPO objectives
    ratio       = tf.exp(logp - logp_old_ph)
    clipped_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph)
    pi_loss     = -tf.reduce_mean(tf.minimum(ratio * adv_ph, clipped_adv))
    train_pi    = tf.train.AdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)

    # PPO Used for early stopping during training
    approx_kl = tf.abs(tf.reduce_mean(logp_old_ph - logp))

# Critic
with tf.variable_scope('v'):
    v = tf.squeeze(mlp(obs_ph, hidden_sizes=hidden_sizes+[1], activation=tf.tanh, output_activation=None), axis=1)
    v_loss  = tf.reduce_mean((ret_ph - v) ** 2)
    train_v = tf.train.AdamOptimizer(learning_rate=v_lr).minimize(v_loss)  

## Update Function

In [8]:
def update(feed_ph, sess, max_kl=0.01, train_pi_iters=80, train_v_iters=80):
    inputs = {k:v for k,v in zip(feed_ph, buf.get())}
    pls, vls = [], []
    
    # Policy gradient step
    for i in range(train_pi_iters):
        _, pl, kl = sess.run([train_pi, pi_loss, approx_kl], feed_dict=inputs)
        pls.append(pl)
        
        # Early stopping
        if kl > 1.5 * max_kl:
            print('Early stopping at step %d due to reaching max kl.'%i)
            break

    # Value function learning
    for _ in range(train_v_iters):
        _,vl = sess.run([train_v, v_loss], feed_dict=inputs)
        vls.append(vl)
    
    return [pls, vls]

## Episodes

In [9]:
s = tf.InteractiveSession()
s.run(tf.global_variables_initializer())

tb = tf.summary.FileWriter( './logs/1/train ', s.graph)

buf = PPOBuffer(obs_dim)
bar = tnrange(epochs)
for epoch in bar:

    obs, rew, done = env.reset(), 0, False
    total_episode_reward, epoch_rews = 0, []
    finished_rendering_this_epoch    = False

    # -------- Start Batch -------- #
    for step in range(steps_per_epoch):
        
        if render and not finished_rendering_this_epoch:
            env.render()

        act, v_t, logp_t = s.run([pi, v, logp_pi], feed_dict={obs_ph: obs[None,:]})
        buf.store([obs], act, rew, v_t, logp_t)

        obs, rew, done, _ = env.step(act[0])
        total_episode_reward += rew

        if done or (step==steps_per_epoch-1):
            last_val = rew if done else s.run(v, feed_dict={obs_ph: obs[None,:]})
            buf.finish_path(last_val)

            epoch_rews.append(total_episode_reward)
            obs, rew, done       = env.reset(), 0, False
            total_episode_reward =0

            finished_rendering_this_epoch = True

    pls, vls = update([obs_ph, act_ph, adv_ph, ret_ph, logp_old_ph], s, max_kl=max_kl, train_pi_iters=train_pi_iters, train_v_iters=train_v_iters)
    bar.write(f'Epoch {epoch+1}, rewards: {(sum(epoch_rews)/len(epoch_rews)):.2f}, \
              vl mean: {np.mean(vls):.4f}, pl mean: {np.mean(pls):.4f}')
    buf.reset()

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

Epoch 1, rewards: 25.00,               vl mean: 430.1942, pl mean: -0.0111
Epoch 2, rewards: 33.33,               vl mean: 829.2031, pl mean: -0.0167
Early stopping at step 12 due to reaching max kl.
Epoch 3, rewards: 52.63,               vl mean: 2387.7151, pl mean: -0.0061
Epoch 4, rewards: 76.92,               vl mean: 3155.3511, pl mean: -0.0126
Epoch 5, rewards: 90.91,               vl mean: 6817.0640, pl mean: -0.0090
Epoch 6, rewards: 142.86,               vl mean: 8485.3613, pl mean: -0.0094
Epoch 7, rewards: 111.11,               vl mean: 7352.4951, pl mean: -0.0109
Epoch 8, rewards: 111.11,               vl mean: 7090.7715, pl mean: -0.0107
Epoch 9, rewards: 142.86,               vl mean: 9058.7588, pl mean: -0.0094
Epoch 10, rewards: 90.91,               vl mean: 7792.9321, pl mean: -0.0104



## Scratch paper

In [20]:
for v in tf.trainable_variables():
    n = np.array(v.eval())
    print(v)
    print(n.shape)
    print('----')

<tf.Variable 'pi/dense/kernel:0' shape=(4, 64) dtype=float32_ref>
(4, 64)
----
<tf.Variable 'pi/dense/bias:0' shape=(64,) dtype=float32_ref>
(64,)
----
<tf.Variable 'pi/dense_1/kernel:0' shape=(64, 64) dtype=float32_ref>
(64, 64)
----
<tf.Variable 'pi/dense_1/bias:0' shape=(64,) dtype=float32_ref>
(64,)
----
<tf.Variable 'pi/dense_2/kernel:0' shape=(64, 2) dtype=float32_ref>
(64, 2)
----
<tf.Variable 'pi/dense_2/bias:0' shape=(2,) dtype=float32_ref>
(2,)
----
<tf.Variable 'v/dense/kernel:0' shape=(4, 64) dtype=float32_ref>
(4, 64)
----
<tf.Variable 'v/dense/bias:0' shape=(64,) dtype=float32_ref>
(64,)
----
<tf.Variable 'v/dense_1/kernel:0' shape=(64, 64) dtype=float32_ref>
(64, 64)
----
<tf.Variable 'v/dense_1/bias:0' shape=(64,) dtype=float32_ref>
(64,)
----
<tf.Variable 'v/dense_2/kernel:0' shape=(64, 1) dtype=float32_ref>
(64, 1)
----
<tf.Variable 'v/dense_2/bias:0' shape=(1,) dtype=float32_ref>
(1,)
----
