# GAIL

In [2]:
import numpy as np
import gym


def pendulum(observation):
        x, y, angle_velocity = observation
        flip = (y < 0.)
        if flip:
            y *= -1. # now y >= 0
            angle_velocity *= -1.
        angle = np.arcsin(y)
        if x < 0.:
            angle = np.pi - angle
        if (angle < -0.3 * angle_velocity) or \
                (angle > 0.03 * (angle_velocity - 2.5) ** 2. + 1. and \
                angle < 0.15 * (angle_velocity + 3.) ** 2. + 2.):
            force = 2.
        else:
            force = -2.
        if flip:
            force *= -1.
        action = np.array([force,])
        return action

def mountain_car_continuous(observation):
        position, velocity = observation
        if position > -4 * velocity or position < 13 * velocity - 0.6:
            force = 1.
        else:
            force = -1.
        action = np.array([force,])
        return action
    
    
def lunar_lander(observation):
        x, y, v_x, v_y, angle, v_angle, contact_left, contact_right = observation

        if contact_left or contact_right: # legs have contact
            f_y = -10. * v_y - 1.
            f_angle = 0.
        else:
            f_y = 5.5 * np.abs(x) - 10. * y - 10. * v_y - 1.
            f_angle = -np.clip(5. * x + 10. * v_x, -4, 4) + 10. * angle + 20. * v_angle

        if np.abs(f_angle) <= 1 and f_y <= 0:
            action = 0 # do nothing
        elif np.abs(f_angle) < f_y:
            action = 2 # main engine
        elif f_angle < 0.:
            action = 1 # left engine
        else:
            action = 3 # right engine
        return action

def acrobot(observation):
        x0, y0, x1, y1, v0, v1 = observation
        if v1 < -0.3:
            action = 0
        elif v1 > 0.3:
            action = 2
        else:
            y = y1 + x0 * y1 + x1 * y0
            if y > 0.:
                action = 0
            else:
                action = 2
        return action


def lunar_lander_continuous(observation):
        x, y, v_x, v_y, angle, v_angle, contact_left, contact_right = observation

        if contact_left or contact_right:
            f_y = -10. * v_y - 1.
            f_angle = 0.
        else:
            f_y = 5.5 * np.abs(x) - 10. * y - 10. * v_y - 1.
            f_angle = -np.clip(5. * x + 10. * v_x, -4, 4) + 10. * angle + 20. * v_angle

        action = np.array([f_y, f_angle])
        return action
    
    
def cart_pole(observation):
        
        position, velocity, angle, angle_velocity = observation
        action = int(3. * angle + angle_velocity > 0.)
        return action
        
def mountain_car(observation):

    position, velocity = observation
    lb = min(-0.09 * (position + 0.25) ** 2 + 0.03,
            0.3 * (position + 0.9) ** 4 - 0.008)
    ub = -0.07 * (position + 0.38) ** 2 + 0.07
    end = position > 0.2 and velocity > 0.02
    hard_end = position > 0.45
    begin = position < -0.45 and velocity < 0.001 and velocity > -0.001
    if begin or (lb < velocity < ub) or end or hard_end:
        action = 2 
    else:
        action = 0
    return action

class Expert(object):
        
    
    def __init__(self, env_name):
        
        policies = {
            'CartPole-v0': cart_pole,
            'MountainCar-v0': mountain_car,
            'LunarLanderContinuous-v2': lunar_lander_continuous,
            'Acrobot-v1': acrobot,
            'LunarLander-v2': lunar_lander,
            'MountainCarContinuous-v0': mountain_car_continuous,
            'Pendulum-v0': pendulum
        }
        self.env_name = env_name
        self.env = gym.make(self.env_name)
        self.policy = policies[env_name]
        
    def generate_data(self, num_episodes=1):

        trajectories = []
        rewards = []
        splits = []
        counter = 0
        for i in range(num_episodes):


            observation = self.env.reset()
            done = False
            rd = 0
            
            while not done:

                action = self.policy(observation)
                trajectories.append(
                    [
                        feature for feature in observation
                    ] + [action] + [i]
                )
                observation, reward, done, _ = self.env.step(action)
                rd += reward
                counter += 1
            
            rewards.append(rd)
            splits.append(counter)
                
        return np.array(trajectories), np.array(rewards), splits

### Generate data

In [3]:
!pip install tensorflow==1.15.0
!apt-get update && sudo apt-get install cmake libopenmpi-dev python3-dev zlib1g-dev
!pip install stable-baselines[mpi]

Collecting tensorflow==1.15.0
[?25l  Downloading https://files.pythonhosted.org/packages/3f/98/5a99af92fb911d7a88a0005ad55005f35b4c1ba8d75fba02df726cd936e6/tensorflow-1.15.0-cp36-cp36m-manylinux2010_x86_64.whl (412.3MB)
[K     |████████████████████████████████| 412.3MB 45kB/s 
Collecting keras-applications>=1.0.8
[?25l  Downloading https://files.pythonhosted.org/packages/71/e3/19762fdfc62877ae9102edf6342d71b28fbfd9dea3d2f96a882ce099b03f/Keras_Applications-1.0.8-py3-none-any.whl (50kB)
[K     |████████████████████████████████| 51kB 4.7MB/s 
Collecting gast==0.2.2
  Downloading https://files.pythonhosted.org/packages/4e/35/11749bf99b2d4e3cceb4d55ca22590b0d7c2c62b9de38ac4a4a7f4687421/gast-0.2.2.tar.gz
Collecting tensorflow-estimator==1.15.1
[?25l  Downloading https://files.pythonhosted.org/packages/de/62/2ee9cd74c9fa2fa450877847ba560b260f5d0fb70ee0595203082dafcc9d/tensorflow_estimator-1.15.1-py2.py3-none-any.whl (503kB)
[K     |████████████████████████████████| 512kB 45.7MB/s 
Colle

In [4]:
from stable_baselines.gail import generate_expert_traj
from stable_baselines import GAIL
from stable_baselines.gail import ExpertDataset

import gym
car = gym.make('Acrobot-v1')

def random_policy(x):

  return car.action_space.sample()

expert_ = Expert('Acrobot-v1')
expert = expert_.policy
expert_data = generate_expert_traj(expert, n_timesteps=0, n_episodes=100, env=car)
#adversarial_data = generate_expert_traj(random_policy, n_timesteps=0, n_episodes=25, env=car)

#full_data = {**expert_data, **adversarial_data}

data = ExpertDataset(traj_data=expert_data, traj_limitation=100, verbose=1)

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

actions (8806, 1)
obs (8806, 6)
rewards (8806,)
episode_returns (100,)
episode_starts (8806,)
actions (8806, 1)
obs (8806, 6)
rewards (8806,)
episode_returns (100,)
episode_starts (8806,)
Total trajectories: 100
Total transitions: 8806
Average returns: -87.06
Std for returns: 31.339693680698282


In [8]:
model = GAIL('MlpPolicy', 'Acrobot-v1', data, verbose=1)
x = 0
for param in model.get_parameter_list():
  x += np.product([p for p in param.shape])
#model.pretrain(data, n_epochs=5000, learning_rate=0.001)
#model.learn(total_timesteps=400000)
x

Creating environment from the given name, wrapped in a DummyVecEnv.


Dimension(30543)


### Plot expert data

In [40]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [41]:
data, rewards, splits = expert_.generate_data(100)
preds, _= model.predict(np.array([[x[0],x[1]] for x in data]))
colors = np.array(['red', 'green', 'blue'])
palette = colors[np.unique(data[:,2]).astype(int)].tolist()
acc = np.array([np.mean(pred == act[:,2]) for act, pred in zip(np.split(data, splits)[0:-1], np.split(preds, splits)[0:-1])])
fig, ax = plt.subplots(figsize=(10, 7))
_ = sns.scatterplot(x=data[:,0], y=data[:,1], hue=preds, palette=palette)

ValueError: ignored

In [58]:
data, rewards, splits = expert_.generate_data(20)
preds, _= model.predict(np.array([[x[0],x[1],x[2], x[3], x[4], x[5]] for x in data]))
acc_20 = np.array([np.mean(pred == act[:,6]) for act, pred in zip(np.split(data, splits)[0:-1], np.split(preds, splits)[0:-1])])


In [59]:
acc_20

array([1.   , 1.   , 0.99 , 1.   , 1.   , 0.995, 1.   , 1.   , 1.   ,
       0.978, 1.   , 0.988, 1.   , 0.989, 0.987, 1.   , 1.   , 1.   ,
       0.989, 1.   ])

### Preprocess data into sequences

In [60]:
env = gym.make('Acrobot-v1')
rewards_20 = []


for i in range(20):

  #print(i)
  observation = env.reset()
  done = False
  rd = 0
  #traj = torch.tensor([[1000, 1000, 1000, 1000, 1000, 1000, 0, 1] for i in range(200)], dtype=torch.float).to(device).unsqueeze(0)
  while not done:
    
    #traj[0][-rd] = torch.tensor([ob for ob in observation] + [int(rd==0)] + [0]).to(device)
    action, _= model.predict(observation)
    #print(action)
    #print(action, expert.policy(observation), -rd)
    observation, reward, done, _ = env.step(int(action))
    rd += int(reward)

  rewards_20.append(rd)


In [61]:
rewards_20

[-65,
 -86,
 -71,
 -73,
 -259,
 -222,
 -77,
 -81,
 -91,
 -92,
 -72,
 -79,
 -80,
 -78,
 -102,
 -75,
 -72,
 -73,
 -86,
 -86]

In [63]:
import pickle

pickle.dump(acc_20, open('/content/gdrive/MyDrive/CS780/final_project/gail_acrobot_acc.pkl', 'wb'))

In [None]:
torch.save(model, 'bi-directional_lstm')

In [None]:
import os 
dir_path = os.getcwd()

In [None]:
dir_path

'/content'

In [30]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
!ls /content/gdrive/MyDrive/

'2.8 Gerard Jones Response.gdoc'
 444
'4kKpJhOzSbOsyRtAIe4l_NACO TACO TRILLIUM MENU 6.25.20.pdf'
'5 Quotes.gdoc'
'basic concepts 1.pdf'
'basic concepts 2.pdf'
'basic concepts 3.pdf'
'BJs accounting.gsheet'
'BRANDS EXPRESS -- Non Compete and Non Solicitation Agreement.docx'
 Brendan_Crowe_Math_739_Basic_Concepts.pdf
'BrendanCrowe PAWS Fall 2017 Form.pdf'
"Brendan Crowe's Speech.gdoc"
"Brendan Crowe's Time Management .gdoc"
'Brendan Crowe Zoom confirmation.gdoc'
'Colab Notebooks'
'Computer Exercice 6.gdoc'
'Computer Exercise 0.gdoc'
'Computer Exercise 1.gdoc'
'Computer Exercise 3.gdoc'
'Computer Exercise 4.gdoc'
'Computer Exercise 5.gdoc'
'Computer Exercise 7.gdoc'
'Computer Exercise 8.gdoc'
'Computer exerice 13.gdoc'
'Computer Exerice 2.gdoc'
'Copy of 06 ALG_REC_2 - control flow, tail recursion, & backtracking (Student).gdoc'
'Copy of 07 ALG_ANA - algorithm analysis (Student).gdoc'
'Copy of 09 ALG_LIST - Lists & Linked Lists (Student).gdoc'
'Copy of 12 ALG_MAP - Maps (Student).gdoc'
'Co

In [None]:
!ls

bi-directional_lstm  gdrive  sample_data


In [8]:
model_save_name = 'gail_mountain_car'
path = F'/content/gdrive/MyDrive/CS780/final_project/{model_save_name}'
model.save(path, cloudpickle=True)

In [None]:
import pickle

pickle.format_version

'4.0'

In [None]:
import torch
torch.save(model.cpu().state_dict(), '/content/gdrive/MyDrive/CS780/final_project/acrobot.pt')