Modified from https://github.com/higgsfield/RL-Adventure-2 using SB3

References:

https://colab.research.google.com/github/araffin/rl-handson-rlvs21/blob/main/rlvs_hands_on_sb3.ipynb

https://github.com/reinforcement-learning-kr/lets-do-irl

Highly experimental code!!

In [1]:
# Environment setup: one system package, then the RL library with its extras.
# NOTE(review): the pip install is unpinned, while later cells rely on the
# module-level `logger.record` API and on the `policy_base` /
# `create_eval_env` arguments of `BaseAlgorithm` — pin the stable-baselines3
# release that still provides them (TODO confirm the exact version).
!apt install swig
!pip install stable-baselines3[extra]

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  swig3.0
Suggested packages:
  swig-doc swig-examples swig3.0-examples swig3.0-doc
The following NEW packages will be installed:
  swig swig3.0
0 upgraded, 2 newly installed, 0 to remove and 34 not upgraded.
Need to get 1,100 kB of archives.
After this operation, 5,822 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 swig3.0 amd64 3.0.12-1 [1,094 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 swig amd64 3.0.12-1 [6,460 B]
Fetched 1,100 kB in 1s (1,518 kB/s)
Selecting previously unselected package swig3.0.
(Reading database ... 160706 files and directories currently installed.)
Preparing to unpack .../swig3.0_3.0.12-1_amd64.deb ...
Unpa

In [2]:
import math
import random

import gym
import numpy as np

import torch as th
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal

In [3]:
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
from typing import Any, Dict, Optional, Type, Union, List, Callable
import time
from copy import deepcopy

from stable_baselines3.common.on_policy_algorithm import OnPolicyAlgorithm
from stable_baselines3.common.base_class import BaseAlgorithm
from stable_baselines3.common import logger
from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, Schedule
from stable_baselines3.common.policies import ActorCriticPolicy, BasePolicy
from stable_baselines3.common.vec_env import VecNormalize
from stable_baselines3.common.vec_env.base_vec_env import VecEnv, VecEnvStepReturn, VecEnvWrapper
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.env_util import make_vec_env as mk_env
from stable_baselines3.common.utils import configure_logger
from stable_baselines3 import PPO
from gym import spaces

<h2>Use CUDA</h2>

In [5]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
use_cuda = th.cuda.is_available()
if use_cuda:
    device = th.device("cuda")
else:
    device = th.device("cpu")

<h2>Create Environments</h2>

In [6]:
# Settings shared by the expert-training / data-collection cell below.
num_envs = 8
env_id = "CartPole-v1"

# `envs` is the vectorised environment the expert PPO model is trained on;
# `eval_env` is a single plain env that is stepped one transition at a time
# when the expert's (observation, action) pairs are collected.
envs = mk_env(env_id, n_envs=num_envs)
eval_env = gym.make(env_id)

In [7]:
def plot(frame_idx, rewards):
    """Replace the cell's current output with a fresh reward curve.

    :param frame_idx: value shown in the title as the current frame
    :param rewards: sequence of rewards; all of it is plotted and its last
        entry is shown in the title (so it must be non-empty)
    """
    clear_output(True)
    fig = plt.figure(figsize=(20, 5))
    # First panel of a 1x3 grid, as in the notebooks this helper was taken from.
    axis = fig.add_subplot(131)
    axis.set_title('frame %s. reward: %s' % (frame_idx, rewards[-1]))
    axis.plot(rewards)
    plt.show()

<h2>Loading expert trajectories from №3 notebook</h2>

In [8]:
# Load cached expert demonstrations if a usable cache exists; otherwise train
# a PPO expert, roll it out, and cache the (observation, one-hot action) rows.
try:
    expert_traj = np.load("expert_traj.npy")
    expert_traj = th.from_numpy(expert_traj)
except (OSError, ValueError, TypeError):
    # OSError: no cache file yet (or it cannot be read).
    # ValueError / TypeError: the file is not a plain .npy array, e.g. a cache
    # written by an earlier revision of this cell that used th.save.
    print("Train, generate and save expert trajectories in notebook №3")
    expert_model = PPO('MlpPolicy', envs, seed=42)
    expert_model.learn(total_timesteps=25000)
    n_eval = 10000

    expert_traj = []

    obs = eval_env.reset()

    for _ in range(n_eval):
        action, _ = expert_model.predict(obs)
        # One-hot encode the discrete action so it can be concatenated with
        # the observation into a single feature row.
        act = np.zeros(envs.action_space.n)
        act[action] = 1
        expert_traj.append(np.concatenate((obs, act)))
        obs, _, done, _ = eval_env.step(action)
        if done:
            obs = eval_env.reset()

    expert_traj = np.array(expert_traj, dtype=np.float32)
    # Write with np.save (not th.save) so the np.load above can actually read
    # the cache back on the next run instead of silently retraining.
    np.save('expert_traj.npy', expert_traj)
    expert_traj = th.from_numpy(expert_traj)

print(expert_traj.shape)
print(expert_traj[:5])

Train, generate and save expert trajectories in notebook №3
torch.Size([10000, 6])
tensor([[ 0.0099, -0.0051,  0.0384, -0.0310,  1.0000,  0.0000],
        [ 0.0098, -0.2007,  0.0378,  0.2736,  0.0000,  1.0000],
        [ 0.0058, -0.0062,  0.0433, -0.0069,  1.0000,  0.0000],
        [ 0.0056, -0.2019,  0.0431,  0.2991,  1.0000,  0.0000],
        [ 0.0016, -0.3976,  0.0491,  0.6051,  0.0000,  1.0000]])


<h1>Generative Adversarial Imitation Learning</h1>
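<h2><a href="https://arxiv.org/abs/1606.03476">Arxiv (GAIL)</a> · <a href="https://arxiv.org/abs/1810.00821">Arxiv (VAIL: Variational Discriminator Bottleneck, the variant implemented below)</a></h2>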

In [9]:
class VDB(nn.Module):
    """Variational discriminator: a stochastic encoder followed by a classifier.

    The input is encoded into the mean and log-variance of a Gaussian latent,
    one latent sample is drawn with the reparameterisation trick, and a small
    MLP maps that sample to a probability in (0, 1).

    :param num_inputs: size of each input vector
    :param hidden_size: width of the hidden layer of encoder and classifier
    :param latent_size: dimensionality of the latent code
    :param device: device the module's parameters are moved to
    """

    def __init__(self, num_inputs, hidden_size, latent_size, device):
        super().__init__()

        # Encoder: one shared hidden layer, then separate mean / log-variance heads.
        self.vae_encoder_input = nn.Linear(num_inputs, hidden_size)
        self.vae_encoder_mu = nn.Linear(hidden_size, latent_size)
        self.vae_encoder_logvar = nn.Linear(hidden_size, latent_size)

        # Classifier working on the latent sample.
        self.discrim_input = nn.Linear(latent_size, hidden_size)
        self.discrim_output = nn.Linear(hidden_size, 1)

        # Shrink the output layer (weights x0.1, bias zeroed) so the initial
        # logits are small.
        with th.no_grad():
            self.discrim_output.weight.mul_(0.1)
            self.discrim_output.bias.mul_(0.0)

        self.to(device)

    def vae_encoder(self, x):
        """Return ``(mu, logvar)`` of the latent distribution for ``x``."""
        hidden = th.tanh(self.vae_encoder_input(x))
        mu = self.vae_encoder_mu(hidden)
        logvar = self.vae_encoder_logvar(hidden)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        """Draw ``mu + std * eps`` with ``eps ~ N(0, I)`` and ``std = exp(logvar / 2)``."""
        std = th.exp(logvar/2)
        noise = th.randn_like(std)
        return mu + std * noise

    def discriminator(self, z):
        """Map a latent sample ``z`` to a probability via tanh-MLP + sigmoid."""
        hidden = th.tanh(self.discrim_input(z))
        return th.sigmoid(self.discrim_output(hidden))

    def forward(self, x):
        """Return ``(prob, mu, logvar)`` for input ``x``."""
        mu, logvar = self.vae_encoder(x)
        latent = self.reparameterize(mu, logvar)
        return self.discriminator(latent), mu, logvar

In [10]:
class VecCustomReward(VecEnvWrapper):
    """Vectorised-env wrapper that replaces the environment reward with ``-log D(s, a)``.

    ``D`` is the probability returned by the wrapped :class:`VDB` for the
    concatenation of an observation and the one-hot encoding of the discrete
    action taken in it.

    :param venv: vectorised environment to wrap; it must expose the pending
        actions as ``venv.actions`` and have a discrete action space
    :param vdb: discriminator used to score (observation, action) pairs
    """

    def __init__(
        self,
        venv: VecEnv,
        vdb: VDB
    ):
        VecEnvWrapper.__init__(self, venv)

        self.vdb = vdb
        # Observation each sub-environment was in when the pending action was
        # chosen; set by reset() and refreshed after every step.
        self._last_obs = None

    def reset(self):
        """Reset the wrapped envs and remember the initial observations."""
        obs = self.venv.reset()
        self._last_obs = obs
        return obs

    def step_wait(self):
        """Finish the step and substitute the discriminator-based reward."""
        obs, rewards, dones, infos = self.venv.step_wait()

        # Score the observation the action was taken in, not the one that
        # resulted from it: both the expert data and the rollout samples used
        # to train the discriminator are (observation, action-taken-there) pairs.
        prev_obs = obs if self._last_obs is None else self._last_obs
        rewards = self._update_reward(prev_obs, self.venv.actions, self.action_space.n)
        self._last_obs = obs

        return obs, rewards, dones, infos

    def _update_reward(self, states, actions, n_actions):
        """Update reward using discriminator.

        :param states: array of observations, one row per sub-environment
        :param actions: discrete action index for each sub-environment
        :param n_actions: number of discrete actions (width of the one-hot)
        :return: float32 numpy array with one reward per sub-environment
        """
        action_idx = np.asarray(actions, dtype=np.int64).reshape(-1)
        one_hot = np.zeros((action_idx.shape[0], n_actions), dtype=np.float32)
        one_hot[np.arange(action_idx.shape[0]), action_idx] = 1.0
        features = np.concatenate([np.asarray(states, dtype=np.float32), one_hot], 1)

        # Build the input on whatever device the discriminator lives on; a
        # CPU tensor fed to a CUDA module raises a device-mismatch error.
        vdb_device = next(self.vdb.parameters()).device
        state_action = th.as_tensor(features, device=vdb_device)

        # Pure inference: no autograd graph is needed for the reward.
        with th.no_grad():
            prob = self.vdb(state_action)[0].view(-1)
        # Keep the log finite if the sigmoid saturates to exactly zero.
        prob = prob.clamp_min(1e-8)
        return -np.log(prob.cpu().numpy())


In [11]:
def kl_divergence(mu, logvar):
    """KL divergence of N(mu, exp(logvar)) from a standard normal.

    The per-dimension terms are summed over dim 1, giving one value per row.
    """
    per_dim = mu.pow(2) + logvar.exp() - logvar - 1
    return 0.5 * per_dim.sum(dim=1)

In [104]:
class VAIL(BaseAlgorithm):
    """Adversarial imitation learning with a variational discriminator bottleneck.

    The class owns a discriminator (``vdb``) and an inner RL algorithm stored
    in ``self.policy`` (built as ``policy(**policy_kwargs)``, e.g. a ``PPO``
    instance). ``learn`` alternates discriminator updates with further
    training of the inner algorithm.

    :param policy: class of the inner RL algorithm; instantiated with
        ``policy_kwargs``
    :param env: environment handed to ``BaseAlgorithm``
    :param vdb: discriminator to train
    :param expert_traj: expert rows of ``[observation, one-hot action]``
    :param policy_base: forwarded to ``BaseAlgorithm``
    :param learning_rate: learning rate (or schedule) of the discriminator optimiser
    :param beta: initial weight of the bottleneck term in the discriminator loss
    :param alpha_beta: step size used to adapt ``beta``
    :param i_c: target upper bound on the mean KL of the latent code
    :param policy_kwargs: keyword arguments used to build the inner algorithm (required)
    :param tensorboard_log: forwarded to ``BaseAlgorithm``
    :param verbose: forwarded to ``BaseAlgorithm``
    :param support_multi_env: forwarded to ``BaseAlgorithm``
    :param device: forwarded to ``BaseAlgorithm``
    :param create_eval_env: forwarded to ``BaseAlgorithm``
    :param seed: forwarded to ``BaseAlgorithm``
    :param _init_setup_model: whether to build the optimiser/criterion right away
    :raises ValueError: if ``policy_kwargs`` is not given
    """

    def __init__(self,
                 policy,
                 env: Union[GymEnv, str],
                 vdb: VDB,
                 expert_traj: th.Tensor,
                 policy_base: Type[BasePolicy] = None,
                 learning_rate: Union[float, Schedule] = 3e-4,
                 beta: float = 0,
                 alpha_beta: float = 1e-4,
                 i_c: float = 0.5,
                 policy_kwargs: Optional[Dict[str, Any]] = None,
                 tensorboard_log: Optional[str] = None,
                 verbose: int = 0,
                 support_multi_env: bool = True,
                 device: Union[th.device, str] = "auto",
                 create_eval_env: bool = False,
                 seed: Optional[int] = None,
                 _init_setup_model: bool = True,
                 ):
        # The inner algorithm is built from policy_kwargs, so a missing dict
        # is reported up front instead of as an opaque "argument after **
        # must be a mapping" TypeError.
        if policy_kwargs is None:
            raise ValueError(
                "policy_kwargs is required: it holds the constructor arguments "
                "of the inner algorithm (e.g. {'policy': 'MlpPolicy', 'env': env})."
            )

        super(VAIL, self).__init__(policy=policy, env=env, learning_rate=learning_rate,
                                   policy_base=policy_base, support_multi_env=support_multi_env,
                                   policy_kwargs=policy_kwargs, tensorboard_log=tensorboard_log,
                                   verbose=verbose, device=device,
                                   create_eval_env=create_eval_env, seed=seed,
                                   supported_action_spaces=(
                                                    spaces.Box,
                                                    spaces.Discrete,
                                                    spaces.MultiDiscrete,
                                                    spaces.MultiBinary,
                                                )
                                   )
        self.vdb = vdb
        self.expert_traj = expert_traj
        # NOTE: despite the attribute name this is a full algorithm instance
        # (it is used through .learn(), .rollout_buffer, .n_steps, .n_envs).
        self.policy = policy(**policy_kwargs)
        self.beta = beta
        self.alpha_beta = alpha_beta
        self.i_c = i_c

        if _init_setup_model:
            self._setup_model()

    def get_generator_batch(self, num_samples):
        '''shuffle, and get a batch of num_samples of state, action tensor from
           rollout buffer
        '''
        return self.policy.rollout_buffer.sample(num_samples)

    def train_vdb(self, num_samples):
        """Run one optimisation step of the discriminator.

        Policy samples are labelled 1 and expert samples 0; the loss is the
        sum of both binary cross-entropies plus ``beta`` times the bottleneck
        constraint ``mean KL - i_c``.

        :param num_samples: number of expert rows and of policy rows to use
        :return: the (scalar tensor) discriminator loss of this step
        """
        # Everything fed to the discriminator must live on its device.
        vdb_device = next(self.vdb.parameters()).device

        expert_idx = np.random.randint(0, self.expert_traj.shape[0], num_samples)
        expert_state_action = th.as_tensor(self.expert_traj[expert_idx],
                                           dtype=th.float32, device=vdb_device)

        num_samples = expert_state_action.shape[0]

        rollout = self.get_generator_batch(num_samples)

        # One-hot encode the sampled discrete actions (one index per row).
        action_idx = rollout.actions.to(device=vdb_device, dtype=th.int64).reshape(-1, 1)
        acts = th.zeros((num_samples, self.action_space.n), dtype=th.float32, device=vdb_device)
        actions = acts.scatter(1, action_idx, 1)
        observations = rollout.observations.to(device=vdb_device, dtype=th.float32)
        state_action = th.cat([observations, actions], 1)

        fake, l_mu, l_logvar = self.vdb(state_action)
        real, e_mu, e_logvar = self.vdb(expert_state_action)

        l_kld = kl_divergence(l_mu, l_logvar).mean()
        e_kld = kl_divergence(e_mu, e_logvar).mean()

        kld = 0.5 * (l_kld + e_kld)
        bottleneck_loss = kld - self.i_c

        # Projected update of the multiplier. It is stored back on self so it
        # accumulates across calls, and kept as a plain float so that no
        # gradient flows through it into the discriminator.
        self.beta = max(0.0, self.beta + self.alpha_beta * bottleneck_loss.item())

        self.optimizer_vdb.zero_grad()
        vdb_losses = self.discrim_criterion(fake, th.ones_like(fake)) + \
                       self.discrim_criterion(real, th.zeros_like(real)) + \
                       self.beta * bottleneck_loss
        vdb_losses.backward()
        self.optimizer_vdb.step()

        return vdb_losses.detach()

    def _setup_model(self) -> None:
        """Create the learning-rate schedule, seed RNGs and build the discriminator optimiser."""
        self._setup_lr_schedule()
        self.set_random_seed(self.seed)

        # learning_rate may be a float or a schedule callable; the schedule
        # built above accepts both, so take its value at the start of
        # training (progress remaining = 1).
        self.optimizer_vdb = optim.Adam(self.vdb.parameters(), lr=self.lr_schedule(1.0))
        self.discrim_criterion = nn.BCELoss()

    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "VAILAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
        ) -> "VAIL":
        """Alternate discriminator updates with training of the inner algorithm.

        The inner algorithm is trained once up front (which fills its rollout
        buffer); afterwards every loop pass performs one discriminator update
        and every third pass trains the inner algorithm again. Here
        ``self.num_timesteps`` counts loop passes, not environment steps.

        :return: ``self``
        """

        iteration = 1  #since we have one ppo.train() outside the loop

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path,
            reset_num_timesteps, tb_log_name
        )

        callback.on_training_start(locals(), globals())

        # need to fill the rollout buffer
        self.policy.learn(total_timesteps=total_timesteps)

        while self.num_timesteps < total_timesteps:

            iteration += 1
            self.num_timesteps += 1

            self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

            dloss = self.train_vdb(self.policy.n_steps * self.policy.n_envs)

            # Display training infos
            # Guard against a zero elapsed time on very fast first iterations.
            elapsed = max(time.time() - self.start_time, 1e-8)
            fps = int(self.num_timesteps / elapsed)
            logger.record("vail/time/fps", fps)
            logger.record("vail/time/iterations", iteration)
            logger.record("vail/time/time_elapsed", int(elapsed), exclude="tensorboard")
            logger.record("vail/time/number_timesteps", self.num_timesteps)
            logger.record("vail/time/ppo_timesteps", self.policy.num_timesteps)
            logger.record_mean("vail/train/vdb_loss", dloss.item())
            logger.dump(step=self.num_timesteps)

            if iteration % 3 == 0:
                self.policy.learn(total_timesteps=total_timesteps, reset_num_timesteps=False)

        callback.on_training_end()

        return self


In [13]:
# Register the `%tensorboard` magic used by the dashboard cells further down.
%load_ext tensorboard

# Visualization

In [14]:
# System packages for the video cells; Xvfb is the virtual display started in
# the next cell.
!apt-get install ffmpeg freeglut3-dev xvfb  # For visualization

Reading package lists... Done
Building dependency tree       
Reading state information... Done
freeglut3-dev is already the newest version (2.8.1-3).
freeglut3-dev set to manually installed.
ffmpeg is already the newest version (7:3.4.8-0ubuntu0.2).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
The following NEW packages will be installed:
  xvfb
0 upgraded, 1 newly installed, 0 to remove and 34 not upgraded.
Need to get 784 kB of archives.
After this operation, 2,270 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 xvfb amd64 2:1.19.6-1ubuntu4.9 [784 kB]
Fetched 784 kB in 1s (1,226 kB/s)
Selecting previously unselected package xvfb.
(Reading database ... 161497 files and directories currently installed.)
Preparing to unpack .../xvfb_2%3a1.19.6-1ubuntu4.9_amd64.deb ...
Unpacking xvfb (2:1.19.6-1ubuntu4.9) ...
Setting up xvfb (2:1.19.6-1

In [15]:
# Set up fake display; otherwise rendering will fail
import os
# Start an X virtual framebuffer as display :1 with a 1024x768, 24-bit screen;
# the trailing "&" backgrounds it so the call returns immediately.
os.system("Xvfb :1 -screen 0 1024x768x24 &")
# Make this process use that display.
os.environ['DISPLAY'] = ':1'

In [16]:
import base64
from pathlib import Path

from IPython import display as ipythondisplay

def show_videos(video_path='', prefix=''):
  """
  Taken from https://github.com/eleurent/highway-env

  Embed every matching ``.mp4`` file of a folder inline in the notebook
  output, each as a base64 ``<video>`` tag, separated by ``<br>``.

  :param video_path: (str) Path to the folder containing videos
  :param prefix: (str) Filter the videos, showing only those whose file name
      starts with this prefix
  """
  tag_template = '''<video alt="{}" autoplay 
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{}" type="video/mp4" />
                </video>'''
  pattern = "{}*.mp4".format(prefix)
  tags = [
      tag_template.format(clip, base64.b64encode(clip.read_bytes()).decode('ascii'))
      for clip in Path(video_path).glob(pattern)
  ]
  ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(tags)))

In [17]:
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv

def record_video(env_id, model, video_length=500, prefix='', video_folder='videos/'):
  """
  Roll out ``model`` in a freshly created environment and save the frames as
  a video file.

  :param env_id: (str) id passed to ``gym.make`` to create the environment
  :param model: (RL model) object whose ``predict(obs, deterministic=True)``
      returns the action as first element
  :param video_length: (int) number of steps to run and record
  :param prefix: (str) name prefix of the written video file
  :param video_folder: (str) folder the video is written to
  """
  eval_env = DummyVecEnv([lambda: gym.make(env_id)])
  # Start the video at step=0 and record `video_length` steps
  eval_env = VecVideoRecorder(eval_env, video_folder=video_folder,
                              record_video_trigger=lambda step: step == 0, video_length=video_length,
                              name_prefix=prefix)

  try:
    obs = eval_env.reset()
    for _ in range(video_length):
      action, _ = model.predict(obs, deterministic=True)
      obs, _, _, _ = eval_env.step(action)
  finally:
    # Close the video recorder even if the rollout fails, so the recorder and
    # the underlying environment are always released.
    eval_env.close()

# Testing VAIL

In [None]:
# Dashboard for the VAIL run; the VAIL cell below logs to ./results.
%tensorboard --logdir=./results/

In [19]:
# https://github.com/araffin/rl-baselines-zoo/blob/master/hyperparams/ppo2.yml
'''
# Tuned
CartPole-v1:
  n_envs: 8
  n_timesteps: !!float 1e5
  policy: 'MlpPolicy'
  n_steps: 32
  nminibatches: 1
  lam: 0.8
  gamma: 0.98
  noptepochs: 20
  ent_coef: 0.0
  learning_rate: lin_0.001
  cliprange: lin_0.2
'''

"\n# Tuned\nCartPole-v1:\n  n_envs: 8\n  n_timesteps: !!float 1e5\n  policy: 'MlpPolicy'\n  n_steps: 32\n  nminibatches: 1\n  lam: 0.8\n  gamma: 0.98\n  noptepochs: 20\n  ent_coef: 0.0\n  learning_rate: lin_0.001\n  cliprange: lin_0.2\n"

In [None]:
num_envs = 8
env_id = "CartPole-v1"

# Build this experiment's environments first, so the space sizes below are
# read from them rather than from whatever `envs` an earlier cell left behind.
envs = mk_env(env_id, n_envs=num_envs)
eval_env = gym.make(env_id)

num_inputs  = envs.observation_space.shape[0]
num_outputs = envs.action_space.n
hidden_size = 128
latent_size = 4

# The discriminator scores [observation, one-hot action] vectors.
vdb = VDB(num_inputs + num_outputs, hidden_size, latent_size, device)

# From here on the training envs return the discriminator-based reward;
# `eval_env` stays unwrapped and keeps the true CartPole reward.
envs = VecCustomReward(envs, vdb=vdb)

# verbose must be 1 for both VAIL and Policy algorithm to print on console
vail = VAIL(PPO, envs, vdb, expert_traj=expert_traj, tensorboard_log="./results",
            verbose=1,
            policy_kwargs={'policy': 'MlpPolicy', 'env': envs, 'verbose': 1,
                           'n_steps': 32})
vail.learn(total_timesteps=100)

In [None]:
# Evaluate the VAIL-trained policy over 100 episodes on `eval_env`, which is
# not wrapped in VecCustomReward, so the numbers are true environment returns
# rather than discriminator rewards.
mean_reward, std_reward = evaluate_policy(vail, eval_env, n_eval_episodes=100)
print(f"Mean reward: {mean_reward:.2f}, Std Reward: {std_reward:.2f}")



Mean reward: 282.79, Std Reward: 58.15


In [None]:
# Record 100 steps of the VAIL policy, then embed the clip(s) whose file name
# starts with the same prefix.
record_video('CartPole-v1', vail, video_length=100, prefix='vail-cartpole')
show_videos('videos', prefix='vail-cartpole')

# Testing against Baselines

In [None]:
# Dashboard for the PPO baseline; the baseline cell below logs to ./ppo.
%tensorboard --logdir=./ppo/

In [None]:
# Baseline: plain PPO trained on the real environment reward, with the same
# number of environments and the same n_steps as the VAIL run above.
num_envs = 8
env_id = "CartPole-v1"

# Separate environments so the baseline does not share state with the
# reward-wrapped `envs` used by VAIL.
envs_ppo = mk_env(env_id, n_envs=num_envs)
eval_env_ppo = gym.make(env_id)

model_test = PPO('MlpPolicy', envs_ppo, verbose=1, tensorboard_log="./ppo", n_steps=32)
model_test.learn(total_timesteps=10000)

In [None]:
# Evaluate the agent
# NOTE: If you use wrappers with your environment that modify rewards,
#       this will be reflected here. To evaluate with original rewards,
#       wrap environment in a "Monitor" wrapper before other wrappers.
# Same protocol as the VAIL evaluation: 100 episodes on a plain gym env.
mean_reward, std_reward = evaluate_policy(model_test, eval_env_ppo, n_eval_episodes=100)
print(f"Mean reward: {mean_reward:.2f}, Std Reward: {std_reward:.2f}")



Mean reward: 219.38, Std Reward: 68.90


In [None]:
# Record 100 steps of the PPO baseline, then embed the clip(s) whose file name
# starts with the same prefix.
record_video('CartPole-v1', model_test, video_length=100, prefix='ppo-cartpole')
show_videos('videos', prefix='ppo-cartpole')

Saving video to  /content/videos/ppo-cartpole-step-0-to-step-100.mp4
