In [None]:
import math
import random

import gym
import numpy as np

import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions

from multiprocessing_env import SubprocVecEnv

from IPython.display import clear_output
from IPython import display
import matplotlib.pyplot as plt
%matplotlib inline

import time

In [None]:
# Number of environments we'll be running in parallel
num_envs = 16
# Gym environment id; used both by make_env() for the parallel workers and for
# the single evaluation environment created further down in this cell.
env_name = "Pendulum-v0"

def make_env():
    """Return a zero-argument factory that builds one `env_name` environment.

    Nothing is constructed here: the call to `gym.make` is deferred until the
    returned callable is invoked, so the factory itself can be handed to
    whatever component is responsible for creating the environment.
    """
    def _build():
        return gym.make(env_name)

    return _build

# Build one environment factory per worker, then hand the list of factories to
# SubprocVecEnv (presumably one subprocess per factory -- see multiprocessing_env).
envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)

# Separate, non-vectorised environment used for evaluation (test_env) and for
# recording expert trajectories.
env = gym.make(env_name)

In [None]:
# Actor-critic network for continuous-action tasks
class ActorCritic(tf.keras.Model):
    """Policy and value networks sharing nothing but their input.

    Both heads are one-hidden-layer ReLU MLPs. The critic outputs a single
    state value; the actor outputs the mean of a diagonal Gaussian over
    actions. The spread of that Gaussian comes from a state-independent
    variable `log_std` of shape [1, num_outputs].

    Args:
        num_inputs: size of one observation vector.
        num_outputs: size of one action vector.
        hidden_size: number of units in each head's hidden layer.
        std: initial value written into every entry of `log_std`. Note that
            this is the *log* of the standard deviation, so the default 0.0
            gives an initial standard deviation of exp(0) = 1.
    """

    def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):
        super().__init__()

        def _head(out_units):
            # Single hidden ReLU layer followed by a linear output layer.
            return tf.keras.Sequential([
                tf.keras.layers.Dense(hidden_size, input_shape=(num_inputs,), activation='relu'),
                tf.keras.layers.Dense(out_units)
            ])

        self.critic = _head(1)
        self.actor = _head(num_outputs)

        # Only needed for continuous action spaces: learnable log-std,
        # broadcast across the batch when the distribution is built.
        self.log_std = tf.Variable(std * np.ones([1, num_outputs]))

    def call(self, x):
        """Return `(dist, value)` for a batch of observations `x`.

        `dist` is a `MultivariateNormalDiag` whose mean is the actor output and
        whose diagonal scale is exp(log_std); `value` is the critic output.
        """
        state_value = self.critic(x)
        mean = self.actor(x)
        scale = tf.math.exp(self.log_std)

        # Diagonal Gaussian for continuous actions (a Categorical would be the
        # discrete-action counterpart). Both parameters are cast to float32.
        policy = tfd.MultivariateNormalDiag(
            loc=tf.cast(mean, tf.float32),
            scale_diag=tf.cast(scale, tf.float32),
        )
        return policy, state_value

In [None]:
def plot(frame_idx, rewards):
    """Redraw the evaluation-reward curve, replacing the previous cell output.

    Args:
        frame_idx: current training step, shown in the title.
        rewards: sequence of evaluation rewards so far; the last entry is
            shown in the title, so it must be non-empty.
    """
    clear_output(True)
    fig = plt.figure(figsize=(20,5))
    # Left-most panel of a 1x3 grid.
    axis = fig.add_subplot(1, 3, 1)
    axis.set_title('frame %s. reward: %s' % (frame_idx, rewards[-1]))
    axis.plot(rewards)
    plt.show()
    
def test_env(vis=False):
    """Run one episode in the global `env` with the global `model`.

    Actions are sampled from the policy distribution (not its mean).

    Args:
        vis: when true, render the environment after reset and after each step.

    Returns:
        The sum of rewards collected over the episode.
    """
    observation = env.reset()
    if vis:
        env.render()
    episode_return = 0
    finished = False
    while not finished:
        # The model expects a leading batch axis.
        batched = tf.expand_dims(tf.convert_to_tensor(observation), 0)
        policy, _ = model(batched)
        sampled_action = policy.sample().numpy()[0]
        observation, reward, finished, _ = env.step(sampled_action)
        if vis:
            env.render()
        episode_return += reward
    return episode_return

In [None]:
# GAE (Generalized Advantage Estimation)
def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    """Compute GAE-based return targets for a rollout.

    Args:
        next_value: value estimate for the state that follows the last step.
        rewards: per-step rewards, oldest first.
        masks: per-step continuation masks (0 where the episode ended at that
            step, 1 otherwise); a 0 cuts both the bootstrap and the running
            advantage at that step.
        values: per-step value estimates, same length as `rewards`.
        gamma: discount factor for returns.
        tau: smoothing factor of the GAE estimator.

    Returns:
        A list, oldest step first, of `advantage + value` for every step.
        Empty when `rewards` is empty.
    """
    # Copy into a fresh list so the caller's sequence is untouched and any
    # sequence type (list or tuple) is accepted.
    values = list(values) + [next_value]
    gae = 0
    newest_first = []
    for step in reversed(range(len(rewards))):
        # TD residual: one-step bootstrapped target minus the current estimate.
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        # Exponentially weighted sum of residuals, decayed by gamma * tau.
        gae = delta + gamma * tau * masks[step] * gae
        # Append + single reverse is O(n); inserting at index 0 was O(n^2).
        newest_first.append(gae + values[step])
    newest_first.reverse()
    return newest_first

In [None]:
def ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantage):
    """Yield random minibatches drawn from one rollout.

    Produces `len(states) // mini_batch_size` minibatches. For each one,
    `mini_batch_size` row indices are drawn independently with
    `np.random.randint`, so within and across minibatches a row may appear
    more than once or not at all. The same indices are applied to all five
    inputs.

    Yields:
        A 5-tuple `(states, actions, log_probs, returns, advantage)` holding
        the gathered rows.
    """
    total = len(states)
    rollout = (states, actions, log_probs, returns, advantage)
    for _ in range(total // mini_batch_size):
        picks = np.random.randint(0, total, mini_batch_size)
        yield tuple(tf.gather(field, picks) for field in rollout)
        

def ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantages, clip_param=0.2):
    """Run the clipped-surrogate PPO update on one rollout.

    Uses the global `model` and `optimizer`. For each of `ppo_epochs` passes,
    minibatches are drawn with `ppo_iter` and one gradient step is applied per
    minibatch.

    Args:
        ppo_epochs: number of passes over the rollout.
        mini_batch_size: rows per minibatch.
        states, actions: rollout observations and the actions taken.
        log_probs: log-probabilities of `actions` under the policy that
            collected the rollout.
        returns: value targets (e.g. from `compute_gae`).
        advantages: advantage estimates, same leading dimension as `returns`.
        clip_param: half-width of the probability-ratio clipping interval.
    """
    for _ in range(ppo_epochs):
        for state, action, old_log_probs, return_, advantage in ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantages):
            with tf.GradientTape() as tape:
                # Latest action distribution and state value for this minibatch.
                dist, value = model(state)
                # Mean policy entropy, used as an exploration bonus.
                entropy = tf.reduce_mean(dist.entropy())
                # MultivariateNormalDiag.log_prob reduces over the action
                # dimension and returns shape [batch], whereas the stored
                # log-probs carry a trailing unit axis. Without aligning the
                # shapes the subtraction below broadcasts to [batch, batch]
                # and pairs every sample with every other sample.
                new_log_probs = tf.reshape(dist.log_prob(action), tf.shape(old_log_probs))

                ratio = tf.math.exp(new_log_probs - old_log_probs)
                surr1 = ratio * advantage
                surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage

                # Clipped surrogate objective (negated: we minimise).
                actor_loss  = -tf.reduce_mean(tf.math.minimum(surr1, surr2))
                # MSE between the return targets and the predicted state value.
                critic_loss = tf.reduce_mean(tf.square(return_ - value))
                # Half-weighted critic loss plus actor loss minus scaled entropy.
                loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy
            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))


In [None]:
# Observation and action vector sizes, read from the vectorised environments.
num_inputs  = envs.observation_space.shape[0]
num_outputs = envs.action_space.shape[0]

#Hyper params:
hidden_size      = 256   # units in the hidden layer of each network head
lr               = 3e-4  # learning rate passed to the Adam optimizer
num_steps        = 1008   # vectorised env steps collected before each PPO update (each step advances all num_envs envs)
mini_batch_size  = 16     # rows drawn at random per minibatch in ppo_iter
ppo_epochs       = 8     # number of outer passes ppo_update makes over the collected rollout
# NOTE(review): env_name is "Pendulum-v0", whose rewards are presumably never
# positive, so a threshold of 90 looks unreachable and training would always
# run to max_frames -- confirm the intended environment / threshold.
threshold_reward = 90    # training stops early once the mean evaluation reward exceeds this

model = ActorCritic(num_inputs, num_outputs, hidden_size)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

In [None]:
# Main PPO training loop: alternate between collecting a rollout of num_steps
# vectorised steps and running ppo_update on it.
# frame_idx is incremented once per envs.step call, i.e. it counts vectorised
# steps rather than individual environment frames.
max_frames = 50000
frame_idx  = 0
test_rewards = []

state = envs.reset()
early_stop = False

# early_stop is only checked here, so a rollout in progress always runs to
# completion (and is followed by one more update) after the flag is set.
while frame_idx < max_frames and not early_stop:

    # Per-rollout buffers: one entry per step, each entry covering all envs.
    log_probs = []
    values    = []
    states    = []
    actions   = []
    rewards   = []
    masks     = []
    # NOTE(review): accumulated below but not read anywhere in this cell.
    entropy = 0
    
    # Each step produces action, reward, done and next_state for every env.
    for _ in range(num_steps):
        dist, value = model(state) # state through the network to get the action distribution and estimated V(s)
        action = dist.sample()
        # next_state, reward and done each hold one result per env
        #env.render()
        next_state, reward, done, _ = envs.step(action.numpy())

        log_prob = dist.log_prob(action)
        entropy += tf.reduce_mean(dist.entropy())
        
        # Store log_probs, values, rewards, done masks, states and actions.
        # log_prob, reward and mask are reshaped to (num_envs, 1) and cast to float32.
        log_probs.append(tf.cast(tf.reshape(log_prob, (num_envs, 1)), tf.float32))
        values.append(value)
        rewards.append(tf.cast(tf.reshape(reward, (num_envs, 1)), tf.float32))
        # mask is 0 where the env reported done on this step, 1 otherwise
        masks.append(tf.cast(tf.reshape(1 - done, (num_envs, 1)), tf.float32))

        states.append(tf.cast(state, tf.float32))
        actions.append(tf.cast(action, tf.float32))
        
        state = next_state
        frame_idx += 1
        
        # Every 1000 steps: average 10 evaluation episodes, plot, and check the stop threshold.
        if frame_idx % 1000 == 0:
            test_reward = np.mean([test_env() for _ in range(10)])
            test_rewards.append(test_reward)
            plot(frame_idx, test_rewards)
            if test_reward > threshold_reward: early_stop = True
            
    # To bootstrap the returns, run the final next_state through the network to get its value.
    _, next_value = model(next_state)
    # Run GAE; compute_gae walks the rollout backwards from the most recent step.
    returns = compute_gae(next_value, rewards, masks, values)
    
    # Concatenate each per-step list into a single TensorFlow tensor along axis 0,
    # so a list of num_steps entries of num_envs rows becomes num_steps*num_envs rows.
    # stop_gradient marks returns, log_probs and values as constants for the update.
    returns = tf.stop_gradient(tf.concat(returns, axis=0))
    log_probs = tf.stop_gradient(tf.concat(log_probs, axis=0))
    values = tf.stop_gradient(tf.concat(values, axis=0))
    states = tf.concat(states, axis=0)
    actions = tf.concat(actions, axis=0)
    advantage = returns - values
    
    ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantage)



In [None]:
# Save expert (state, action) trajectories for GAIL
from itertools import count

max_expert_num = 10000   # stop after the episode in which this many steps have been recorded
# Dedicated counter: reusing the name `num_steps` here would overwrite the
# rollout-length hyper-parameter and break a re-run of the training cell.
num_expert_steps = 0
expert_traj = []

for i_episode in count():
    state = env.reset()
    done = False
    total_reward = 0

    while not done:
        # The model expects a leading batch axis (same handling as test_env).
        batched_state = tf.expand_dims(tf.convert_to_tensor(state), 0)
        dist, _ = model(batched_state)
        action = dist.sample().numpy()[0]

        next_state, reward, done, _ = env.step(action)

        # Record the state the action was chosen in, i.e. (s_t, a_t), before
        # advancing; appending after the update would store (s_{t+1}, a_t).
        expert_traj.append(np.hstack([state, action]))

        state = next_state
        total_reward += reward
        num_expert_steps += 1

    print("episode:", i_episode, "reward:", total_reward)

    # Checked only between episodes, so the final episode is always complete.
    if num_expert_steps >= max_expert_num:
        break

# Render ten evaluation episodes with a pause between them.
for _ in range(10):
    test_env(True)
    time.sleep(2)

expert_traj = np.stack(expert_traj)
print()
print(expert_traj.shape)
print()
# NOTE(review): the file name mentions "mntcarcont" while env_name is
# "Pendulum-v0" -- confirm which environment these trajectories belong to.
np.save("expert_traj_mntcarcont16.npy", expert_traj)



def show_state(env, step=0, info=""):
    """Draw the current frame of `env` inline, replacing the previous output.

    Args:
        env: environment supporting `render(mode='rgb_array')` and exposing
            `_spec.id`.
        step: step number shown in the title.
        info: extra text appended to the title.
    """
    # Reuse figure number 3 and wipe whatever it showed before.
    fig = plt.figure(3)
    fig.clf()
    frame = env.render(mode='rgb_array')
    plt.imshow(frame)
    caption = "%s | Step: %d %s" % (env._spec.id,step, info)
    plt.title(caption)
    plt.axis('off')

    display.clear_output(wait=True)
    display.display(fig)