In [2]:
import math
import random

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
from torch.distributions import Categorical
from multiprocessing_env import SubprocVecEnv

from IPython.display import clear_output
from IPython import display
import matplotlib.pyplot as plt
%matplotlib inline

import time

#CUDA
use_cuda = torch.cuda.is_available()
device   = torch.device("cuda" if use_cuda else "cpu")
use_cuda,device

(False, device(type='cpu'))

In [3]:
#Number on envs we'll be running in parallel
# So we run the same policy 16 times
num_envs = 3
env_name = "Pendulum-v0"

def make_env():
    def _thunk():
        env = gym.make(env_name)
        return env

    return _thunk

envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)

env = gym.make(env_name)



In [4]:
envs.reset()

array([[-0.27713974,  0.96082962,  0.27576441],
       [-0.14506325, -0.98942238, -0.67756921],
       [-0.95750966, -0.28840121, -0.17336665]])

In [5]:
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0., std=0.1)
        nn.init.constant_(m.bias, 0.1)

In [6]:
#Neural Net
#ActorCritic, for continuous action tasks

class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):
        super(ActorCritic, self).__init__()
        
        self.critic = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1)
        )
        
        self.actor = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_outputs),
        )
        #self.log_std only for continuous action spaces. won't be here for discrete
        self.log_std = nn.Parameter(torch.ones(1, num_outputs) * std)
        
        self.apply(init_weights)
        
    def forward(self, x):
        value = self.critic(x)
        mu    = self.actor(x)
        std   = self.log_std.exp().expand_as(mu)
        
        #only difference is we're using Normal dist to sample actions from
        #Gaussian distribution. Categorical() for discrete action space.
        dist  = Normal(mu, std) 
        return dist, value

In [7]:
def plot(frame_idx, rewards):
    clear_output(True)
    plt.figure(figsize=(20,5))
    plt.subplot(131)
    plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1]))
    plt.plot(rewards)
    plt.show()
    
def test_env(vis=False):
    state = env.reset()
    if vis: env.render()
    done = False
    total_reward = 0
    while not done:
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist, _ = model(state)
        next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0])
        state = next_state
        if vis: env.render()
        total_reward += reward
    return total_reward

In [8]:
def ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantage):
    batch_size = states.size(0)
    for _ in range(batch_size // mini_batch_size):
        rand_ids = np.random.randint(0, batch_size, mini_batch_size)
        yield states[rand_ids, :], actions[rand_ids, :], log_probs[rand_ids, :], returns[rand_ids, :], advantage[rand_ids, :]
        

def ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantages, clip_param=0.2):
    for _ in range(ppo_epochs):
        for state, action, old_log_probs, return_, advantage in ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantages):
            dist, value = model(state)   #pass state into network to get latest distribution and state value.
            entropy = dist.entropy().mean() #for inciting exploration
            new_log_probs = dist.log_prob(action) #new log_probs of originally selected actions. done through sampling
            
            # subtract b/c it's log
            ratio = (new_log_probs - old_log_probs).exp() # exponentiate it b/c we're working with logs and its simpler
            # surr = surrogate...it's not our main objective
            # breaking the loss into multiple functions for simplicity
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage
            
            #CLIP LOSS - L_clip(theta) from our notes
            # Calculates the ratios and makes sure the change in probabilities isn't too great
            actor_loss  = - torch.min(surr1, surr2).mean() # since it's taking info for 16 env, we just take the mean
            
            #MSE LOSS between GAE returns and estimated value of the state
            # similar to the advantage but we are squaring it
            critic_loss = (return_ - value).pow(2).mean()
            #discounted critic loss plus CLIP LOSS minus scaled entroy
            loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy
            
            optimizer.zero_grad()
            loss.backward() # back propagate
            optimizer.step() # update


In [14]:
#GAE
#genearalist advantage estimation --> Advantage
#gamma is discount factor for returns, tau is smoothing factor of GAE algo.
#rewards that happen soon are more valuable than then ones that happen that happen in the future
# increased tau helps smooth
def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
    print("NEXT_VALUE ", next_value)
    print("======")
    print("REWARDS ", rewards)
    print("======")
    print("MASKS ", masks)
    print("======")
    print("VALUES ", values)
    
    values = values + [next_value]
    print("======")
    print("VALUES + NEXT ", values)
    print("======")
    print("======")
    print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

    gae = 0
    returns = []  # list of advantages
    for step in reversed(range(len(rewards))):
        #delta is Bellman equation minus value of the state
        # masks (is the episode done or not done)
        # values[step + 1] the value that we're getting in the future
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        #moving average of advantages discounted by gamma * tau
        gae = delta + gamma * tau * masks[step] * gae
        returns.insert(0, gae + values[step])
#     print("GAE", returns)
    return returns

In [15]:
num_inputs  = envs.observation_space.shape[0]
num_outputs = envs.action_space.shape[0]

# Get some of these from the Gym documentation
#Hyper params:
hidden_size      = 256   #neurons in hidden layer
lr               = 3e-4  #passed to Adam optimizer    
num_steps        = 1008   #num of transitions we sample for each training iter
mini_batch_size  = 16     #num of samples randomly selected from stored data
ppo_epochs       = 8     #num passes over entire training data
threshold_reward = 90    #we'll stop training when we reach this reward in evaluation

model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr) 

In [16]:
max_frames = 1  # every thousand frames we test how much better our agent is
frame_idx  = 0
test_rewards = []

state = envs.reset()
early_stop = False

while frame_idx < max_frames and not early_stop:

    log_probs = []
    values    = []
    states    = []
    actions   = []
    rewards   = []
    masks     = []
    entropy = 0
    
    #each step generate state, action, reward, next_state from each env.
    num_steps = 2
    for _ in range(num_steps):
        state = torch.FloatTensor(state).to(device) 
        dist, value = model(state) #state through netwwork to get prob dist and estimated V(s)

        action = dist.sample()  # from this distribution, do an action
        
        # take a setp with this action
        #state, reward, done is list of results, 1 per env
        #env.render()
#         print("ACTION: ", action.cpu().numpy())
        next_state, reward, done, _ = envs.step(action.cpu().numpy())  

        log_prob = dist.log_prob(action)
        entropy += dist.entropy().mean()
        
        #Store log_probs, values, rewards, done_masks, states, actions. Each list num_steps long, each step num_envs wide.
        # Each list has 1000 steps * 16 different environments
        log_probs.append(log_prob) #
        values.append(value)

        rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
        masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

        states.append(state)
        actions.append(action)
        print("ACTIONS: ", actions)

        print("LOG_PROBS: ", log_probs)
        print("VALUES: ", values)
        print("REWARDS: ", rewards)

        state = next_state
        frame_idx += 1
        
        # Every 1000 frames it will run a test environment...what was my reward?
        if frame_idx % 1000 == 0:
            test_reward = np.mean([test_env() for _ in range(10)])
            test_rewards.append(test_reward)
            plot(frame_idx, test_rewards)
            if test_reward > threshold_reward: early_stop = True
            
    #to calc returns correctly, run final next_state through network to get value
    next_state = torch.FloatTensor(next_state).to(device)
    _, next_value = model(next_state)
    #run GAE. Loop backwards from recent experience.
    returns = compute_gae(next_value, rewards, masks, values)
    
    #concatanate each list inside a torch tensor.
    #list that was num_steps long, num_envs wide becomes num_steps*num_envs long
#     print("RET", returns)
    returns   = torch.cat(returns).detach()
    log_probs = torch.cat(log_probs).detach()
    values    = torch.cat(values).detach()
    states    = torch.cat(states)
    actions   = torch.cat(actions)
    advantage = returns - values
    print("RET CAT", returns)
    print("VALUE CAT", values)

    # update policy
    ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantage)
    
    
#     def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):


ACTIONS:  [tensor([[ 2.4261],
        [-0.9555],
        [-0.4908]])]
LOG_PROBS:  [tensor([[-3.6272],
        [-1.4039],
        [-0.9869]], grad_fn=<SubBackward0>)]
VALUES:  [tensor([[0.0733],
        [0.4036],
        [0.4035]], grad_fn=<AddmmBackward>)]
REWARDS:  [tensor([[-1.0152],
        [-0.1086],
        [-3.2166]])]
ACTIONS:  [tensor([[ 2.4261],
        [-0.9555],
        [-0.4908]]), tensor([[ 0.5025],
        [-0.6464],
        [-3.2020]])]
LOG_PROBS:  [tensor([[-3.6272],
        [-1.4039],
        [-0.9869]], grad_fn=<SubBackward0>), tensor([[-1.0055],
        [-1.1286],
        [-5.6844]], grad_fn=<SubBackward0>)]
VALUES:  [tensor([[0.0733],
        [0.4036],
        [0.4035]], grad_fn=<AddmmBackward>), tensor([[-0.0116],
        [ 0.3996],
        [ 0.4103]], grad_fn=<AddmmBackward>)]
REWARDS:  [tensor([[-1.0152],
        [-0.1086],
        [-3.2166]]), tensor([[-1.1529],
        [-0.0632],
        [-3.6812]])]
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
NEXT_VALUE 

In [22]:
# It's trained already so now we're just running it

#Save trajectories for GAIL
from itertools import count

max_expert_num = 10000
num_steps = 0
expert_traj = []

for i_episode in count():
    state = env.reset()
    done = False
    total_reward = 0
    
    while not done:
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist, _ = model(state)
        action = dist.sample().cpu().numpy()[0]
        #Show
        
        #Take action
        print("ACTION: ", action)
        next_state, reward, done, _ = env.step(action)
        #show_state(env.env, step=0, info="")
        state = next_state
        total_reward += reward
        expert_traj.append(np.hstack([state, action]))
        num_steps += 1
    
    print("episode:", i_episode, "reward:", total_reward)
    
    if num_steps >= max_expert_num:
        break

for _ in range(10):
    test_env(True)  # true for visualization      
    time.sleep(2)

expert_traj = np.stack(expert_traj)
print()
print(expert_traj.shape)
print()
np.save("expert_traj_mntcarcont16.npy", expert_traj)



ACTION:  [0.10859587]
ACTION:  [-0.6178819]
ACTION:  [0.33437526]
ACTION:  [-0.5518475]
ACTION:  [-0.28291312]
ACTION:  [-1.5452234]
ACTION:  [-0.8280457]
ACTION:  [0.9614736]
ACTION:  [1.9143]
ACTION:  [0.34002897]
ACTION:  [-0.8312296]
ACTION:  [-0.07812525]
ACTION:  [-0.08596238]
ACTION:  [-0.4330661]
ACTION:  [0.21516353]
ACTION:  [1.0917648]
ACTION:  [0.01388733]
ACTION:  [0.9507147]
ACTION:  [0.97978926]
ACTION:  [0.29458484]
ACTION:  [1.0974042]
ACTION:  [-1.3421619]
ACTION:  [0.69556874]
ACTION:  [-0.4652428]
ACTION:  [-1.3769544]
ACTION:  [-0.08186988]
ACTION:  [-1.5833106]
ACTION:  [2.3807108]
ACTION:  [-0.05649508]
ACTION:  [0.00865682]
ACTION:  [-0.8512365]
ACTION:  [-0.41110778]
ACTION:  [-1.2422012]
ACTION:  [-0.27108377]
ACTION:  [-0.16202179]
ACTION:  [1.3388011]
ACTION:  [-0.38233006]
ACTION:  [-0.29967642]
ACTION:  [-0.6758616]
ACTION:  [-0.08217808]
ACTION:  [-1.0556824]
ACTION:  [0.40233418]
ACTION:  [0.04412454]
ACTION:  [-0.85031235]
ACTION:  [-0.52934647]
ACTION:

KeyboardInterrupt: 

In [None]:
def show_state(env, step=0, info=""):
    plt.figure(3)
    plt.clf()
    plt.imshow(env.render(mode='rgb_array'))
    plt.title("%s | Step: %d %s" % (env._spec.id,step, info))
    plt.axis('off')

    display.clear_output(wait=True)
    display.display(plt.gcf())