## Twin Delayed DDPG (TD3)

In [None]:
%%bash

# swig is installed before pybox2d is built from source below (presumably a
# build requirement of its bindings). -y keeps the install non-interactive so
# the cell cannot stall or abort on apt's confirmation prompt.
apt-get install -y swig

# Build and install pybox2d from source.
git clone https://github.com/pybox2d/pybox2d
cd pybox2d
python setup.py build
python setup.py install

# Xvfb backs the hidden display that is started later through pyvirtualdisplay.
apt-get install -y xvfb

# The extras requirement is quoted so the shell never treats [box2d] as a glob.
pip install \
    gym==0.21 \
    "gym[box2d]==0.21" \
    pytorch-lightning==1.6.0 \
    pyglet==1.5.27 \
    pyvirtualdisplay

#### Set up the virtual display

In [None]:
from pyvirtualdisplay import Display
# Start a hidden 1400x900 virtual display. The Display object is not kept,
# so nothing in this notebook stops it explicitly.
Display(visible=False, size=(1400, 900)).start()

#### Import the necessary code libraries

In [None]:
import copy
import gym
import torch
import itertools

import numpy as np
import torch.nn.functional as F

from collections import deque, namedtuple
from IPython.display import HTML
from base64 import b64encode

from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data.dataset import IterableDataset
from torch.optim import AdamW

from pytorch_lightning import LightningModule, Trainer

from gym.wrappers import RecordVideo, RecordEpisodeStatistics


# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# Number of CUDA devices PyTorch can see; passed to the Trainer's `gpus` argument below.
num_gpus = torch.cuda.device_count()

In [None]:
def display_video(episode=0):
  """Return an inline HTML5 player for a recorded episode.

  Args:
    episode: Episode index used to build the file name
      ``/content/videos/rl-video-episode-{episode}.mp4``.

  Returns:
    An ``HTML`` display object embedding the video as a base64 data URL.

  Raises:
    FileNotFoundError: If no recording exists for ``episode``.
  """
  # Read-only binary mode is all that is needed, and the context manager
  # closes the handle instead of leaving that to the garbage collector.
  with open(f'/content/videos/rl-video-episode-{episode}.mp4', "rb") as video:
    video_bytes = video.read()
  video_url = f"data:video/mp4;base64,{b64encode(video_bytes).decode()}"
  return HTML(f"<video width=600 controls><source src='{video_url}'></video>")

#### Create the experience buffer

In [None]:
# One environment transition as stored in the replay buffer. Field order
# matters: the buffer unpacks sampled transitions positionally.
Experience = namedtuple("Experience", ["state", "action", "reward", "done", "new_state"])

In [None]:
class ReplayBuffer:
  """Fixed-capacity FIFO store of transitions with uniform random sampling."""

  def __init__(self, capacity):
    """Create an empty buffer holding at most ``capacity`` transitions."""
    # A bounded deque drops its oldest entry when a new one is appended
    # to a full buffer.
    self.buffer = deque(maxlen=capacity)

  def __len__(self):
    """Return the number of transitions currently stored."""
    return len(self.buffer)

  def append(self, experience):
    """Store one transition.

    ``experience`` must unpack as
    ``(state, action, reward, done, new_state)``.
    """
    self.buffer.append(experience)

  def sample(self, batch_size):
    """Draw ``batch_size`` distinct transitions uniformly at random.

    Returns:
      A tuple ``(states, actions, rewards, dones, next_states)`` of NumPy
      arrays stacked along a new leading axis. States, rewards and next
      states are cast to ``float32``, ``dones`` to ``bool``; actions keep
      the dtype NumPy infers for them.

    Raises:
      ValueError: If ``batch_size`` exceeds the number of stored
        transitions (sampling is done without replacement).
    """
    indices = np.random.choice(len(self.buffer), batch_size, replace=False)
    states, actions, rewards, dones, next_states = zip(
        *(self.buffer[idx] for idx in indices)
    )

    return (
        np.array(states, dtype=np.float32),
        np.array(actions),
        np.array(rewards, dtype=np.float32),
        # The `np.bool` alias was deprecated in NumPy 1.20 and removed in
        # 1.24; the builtin `bool` yields the same dtype on every version.
        np.array(dones, dtype=bool),
        np.array(next_states, dtype=np.float32),
    )

In [None]:
class RLDataset(IterableDataset):
  """Iterable dataset that serves one fresh replay-buffer sample per pass."""

  def __init__(self, buffer, sample_size=200):
    """Keep a reference to ``buffer`` and the number of transitions drawn per pass."""
    self.sample_size = sample_size
    self.buffer = buffer

  def __iter__(self):
    """Sample ``sample_size`` transitions and yield them one at a time.

    Each item is a ``(state, action, reward, done, new_state)`` tuple taken
    row by row from the arrays returned by ``buffer.sample``.
    """
    states, actions, rewards, dones, new_states = self.buffer.sample(self.sample_size)
    # Walk the five arrays in lockstep instead of indexing each one by position.
    yield from zip(states, actions, rewards, dones, new_states)

#### Create the environment

In [None]:
def create_environment(name, video_folder='./videos', record_every=50):
  """Build a Gym environment that records videos and episode statistics.

  Args:
    name: Environment id passed to ``gym.make``.
    video_folder: Directory handed to ``RecordVideo`` for the recordings.
      Defaults to the previously hard-coded ``'./videos'``.
    record_every: Record the episodes whose index is a multiple of this
      value (the previously hard-coded 50). Must be a non-zero integer.

  Returns:
    The environment wrapped in ``RecordVideo`` and then
    ``RecordEpisodeStatistics``.
  """
  env = gym.make(name)
  env = RecordVideo(
      env,
      video_folder=video_folder,
      episode_trigger=lambda episode_id: episode_id % record_every == 0,
  )
  # Outermost wrapper, so that callers can read the statistics it collects
  # directly from the returned environment.
  env = RecordEpisodeStatistics(env)
  return env

#### Update the target network

In [None]:
def polyak_average(net, target_net, tau=0.01):
    """Blend the parameters of ``net`` into ``target_net`` in place.

    Every target parameter becomes ``tau * source + (1 - tau) * target``.
    Parameters are paired positionally, in the order ``parameters()`` yields
    them. The update goes through ``.data``, so autograd does not record it.
    """
    keep = 1 - tau
    pairs = zip(net.parameters(), target_net.parameters())
    for source, target in pairs:
        blended = tau * source.data + keep * target.data
        target.data.copy_(blended)

#### Create the gradient policy

In [None]:
class GradientPolicy(nn.Module):
  """Deterministic actor: an MLP mapping observations to bounded actions."""

  def __init__(self, hidden_size, obs_size, out_dims, min, max):
    """Build the actor network.

    Args:
      hidden_size: Width of the two hidden layers.
      obs_size: Number of observation features.
      out_dims: Number of action dimensions.
      min: NumPy array with the lower bound of each action dimension.
      max: NumPy array with the upper bound of each action dimension.

    The names ``min`` and ``max`` shadow the builtins inside this method;
    they are kept because they are part of the constructor's interface.
    """
    super().__init__()
    # Registering the bounds as buffers makes them follow .to()/.cuda() with
    # the rest of the module instead of being pinned to a module-level
    # `device` global. persistent=False keeps them out of state_dict, so
    # checkpoint keys are unchanged.
    self.register_buffer("min", torch.from_numpy(min), persistent=False)
    self.register_buffer("max", torch.from_numpy(max), persistent=False)
    self.net = nn.Sequential(
        nn.Linear(obs_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, out_dims),
    )

  def mu(self, x):
    """Return the raw network output for ``x`` as a tensor.

    The output is neither clipped nor detached. NumPy inputs are converted
    to tensors on this module's device; tensor inputs are used as given.
    """
    if isinstance(x, np.ndarray):
      x = torch.from_numpy(x).to(self.min.device)
    return self.net(x)

  def forward(self, x, epsilon=0.0, noise_clip=None):
    """Return a noisy action clipped to the action bounds.

    Args:
      x: Observation(s), as a NumPy array or a tensor.
      epsilon: Standard deviation of the zero-mean Gaussian noise added to
        the network output.
      noise_clip: If given, the noise is clamped to
        ``[-noise_clip, noise_clip]`` before it is added.

    Returns:
      A NumPy array detached from the autograd graph, so this method is not
      suitable for computing a policy gradient (use ``mu`` for that).
    """
    mu = self.mu(x)
    noise = torch.normal(0, epsilon, mu.size(), device=mu.device)
    if noise_clip is not None:
      noise = torch.clamp(noise, -noise_clip, noise_clip)
    mu = mu + noise
    # Clip element-wise to [min, max].
    action = torch.max(torch.min(mu, self.max), self.min)
    return action.detach().cpu().numpy()


#### Create the Deep Q-Network

In [None]:
class DQN(nn.Module):
  """Q-value critic: an MLP scoring a (state, action) pair with one scalar."""

  def __init__(self, hidden_size, obs_size, out_dims):
    """Build the critic network.

    Args:
      hidden_size: Width of the two hidden layers.
      obs_size: Number of observation features.
      out_dims: Number of action dimensions; the first layer takes
        ``obs_size + out_dims`` inputs.
    """
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(obs_size + out_dims, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, 1),
    )

  def forward(self, state, action):
    """Return the Q-value for ``state`` and ``action``.

    Either argument may be a NumPy array or a tensor. NumPy inputs are moved
    to the device of this module's own parameters rather than to a
    module-level `device` global, so the critic keeps working wherever it is
    moved with ``.to()``. Tensor inputs are used as given.
    """
    module_device = next(self.parameters()).device
    if isinstance(state, np.ndarray):
      state = torch.from_numpy(state).to(module_device)
    if isinstance(action, np.ndarray):
      action = torch.from_numpy(action).to(module_device)
    in_vector = torch.hstack((state, action))
    return self.net(in_vector)


In [None]:
class TD3(LightningModule):
  """Twin Delayed DDPG agent implemented as a LightningModule.

  The module owns the environment, a replay buffer, an actor (``policy``),
  two critics (``q_net1``, ``q_net2``) and a target copy of each. Every
  training epoch draws ``samples_per_epoch`` transitions from the buffer;
  at the end of the epoch one more episode is played with the current
  policy and the target networks are moved towards the online networks.
  """

  def __init__(self, env_name, capacity=1_000_000, batch_size=256, lr=1e-4, hidden_size=128,
               gamma=0.99, loss_fn=F.smooth_l1_loss, optim=AdamW, eps_start=5.0,
               eps_end=0.2, eps_last_episode=200, samples_per_epoch=10_000, tau=0.01):
    """Create the networks and pre-fill the replay buffer.

    Args:
      env_name: Gym environment id passed to ``create_environment``.
      capacity: Maximum number of transitions kept in the replay buffer.
      batch_size: Batch size of the training dataloader.
      lr: Learning rate used by both optimizers.
      hidden_size: Hidden layer width of the actor and the critics.
      gamma: Discount factor applied to the next-state value.
      loss_fn: Loss applied between critic predictions and TD targets.
      optim: Optimizer class, instantiated once for the critics and once
        for the actor.
      eps_start: Initial standard deviation of the action noise.
      eps_end: Lower bound of the action noise standard deviation.
      eps_last_episode: Divisor of the linear decay; the noise is
        ``max(eps_end, eps_start - current_epoch / eps_last_episode)``.
      samples_per_epoch: Transitions sampled per epoch, and also the
        minimum buffer size collected before the constructor returns.
      tau: Interpolation factor for the target network updates.
    """
    super().__init__()

    self.env = create_environment(env_name)

    obs_size = self.env.observation_space.shape[0]
    action_dims = self.env.action_space.shape[0]
    max_action = self.env.action_space.high
    min_action = self.env.action_space.low

    self.q_net1 = DQN(hidden_size, obs_size, action_dims).to(device)
    self.q_net2 = DQN(hidden_size, obs_size, action_dims).to(device)
    self.policy = GradientPolicy(hidden_size, obs_size, action_dims, min_action, max_action).to(device)

    # The target networks start as exact copies of the online networks.
    self.target_policy = copy.deepcopy(self.policy)
    self.target_q_net1 = copy.deepcopy(self.q_net1)
    self.target_q_net2 = copy.deepcopy(self.q_net2)

    self.buffer = ReplayBuffer(capacity=capacity)

    self.save_hyperparameters()

    # Warm-up: no policy is passed, so these episodes use actions sampled
    # from the action space until one epoch's worth of data is available.
    while len(self.buffer) < self.hparams.samples_per_epoch:
      print(f"{len(self.buffer)} samples in experience buffer. Filling...")
      self.play_episodes(epsilon=self.hparams.eps_start)

  def _current_epsilon(self):
    """Return the action-noise standard deviation for the current epoch."""
    return max(
        self.hparams.eps_end,
        self.hparams.eps_start - self.current_epoch / self.hparams.eps_last_episode,
    )

  @torch.no_grad()
  def play_episodes(self, policy=None, epsilon=0.):
    """Play one episode and append every transition to the replay buffer.

    Args:
      policy: Callable ``policy(obs, epsilon=...)`` returning an action.
        When ``None``, actions are sampled from the environment's action
        space.
      epsilon: Noise level forwarded to ``policy``.
    """
    obs = self.env.reset()
    done = False

    while not done:
      if policy is not None:
        # Use the policy that was passed in; the previous code always fell
        # back to self.policy here regardless of the argument.
        action = policy(obs, epsilon=epsilon)
      else:
        action = self.env.action_space.sample()

      next_obs, reward, done, info = self.env.step(action)
      self.buffer.append(Experience(obs, action, reward, done, next_obs))
      obs = next_obs

  def forward(self, x):
    """Return the policy's action for observation ``x`` without added noise."""
    output = self.policy(x)
    return output

  def configure_optimizers(self):
    """Return ``[critic_optimizer, actor_optimizer]``.

    The order defines the ``optimizer_idx`` values seen in ``training_step``:
    index 0 updates both critics together, index 1 updates the actor.
    """
    q_net_parameters = itertools.chain(self.q_net1.parameters(), self.q_net2.parameters())
    q_net_optimizer = self.hparams.optim(q_net_parameters, lr=self.hparams.lr)
    policy_optimizer = self.hparams.optim(self.policy.parameters(), lr=self.hparams.lr)
    return [q_net_optimizer, policy_optimizer]

  def train_dataloader(self):
    """Return a dataloader over ``samples_per_epoch`` buffered transitions."""
    dataset = RLDataset(self.buffer, self.hparams.samples_per_epoch)
    dataloader = DataLoader(
        dataset=dataset,
        batch_size=self.hparams.batch_size,
    )
    return dataloader

  def training_step(self, batch, batch_idx, optimizer_idx):
    """Compute the critic loss (optimizer 0) or the actor loss (optimizer 1).

    Returns:
      The loss tensor for the active optimizer, or ``None`` when the actor
      update is skipped for this batch.
    """
    states, actions, rewards, dones, next_states = batch
    # Add a trailing axis so rewards and dones line up with the critic output.
    rewards = rewards.unsqueeze(1)
    dones = dones.unsqueeze(1)

    if optimizer_idx == 0:
      epsilon = self._current_epsilon()

      state_action_values1 = self.q_net1(states, actions)
      state_action_values2 = self.q_net2(states, actions)

      # The TD target is treated as a constant. Building it without autograd
      # stops gradients from accumulating in the target networks, which no
      # optimizer ever steps or zeroes.
      with torch.no_grad():
        # Target action with clipped noise.
        target_actions = self.target_policy(next_states, epsilon=epsilon, noise_clip=0.5)

        # Use the smaller of the two target critic estimates.
        next_state_values = torch.min(
            self.target_q_net1(next_states, target_actions),
            self.target_q_net2(next_states, target_actions),
        )
        # Transitions that ended the episode contribute no future value.
        next_state_values[dones] = 0.0

        expected_state_action_values = rewards + self.hparams.gamma * next_state_values

      q_loss1 = self.hparams.loss_fn(state_action_values1, expected_state_action_values)
      q_loss2 = self.hparams.loss_fn(state_action_values2, expected_state_action_values)
      total_loss = q_loss1 + q_loss2
      # The metric key is kept as-is for existing dashboards, although the
      # default loss_fn is smooth L1 rather than MSE.
      self.log("episode/MSE Loss", total_loss)
      return total_loss

    elif optimizer_idx == 1 and batch_idx % 2 == 0:
      # The actor is updated only on even batches, i.e. half as often as the critics.
      policy_loss = - self.q_net1(states, self.policy.mu(states)).mean()
      self.log("episode/Policy Loss", policy_loss)
      return policy_loss

    # Actor step on an odd batch: no loss is produced.
    return None

  def training_epoch_end(self, training_step_outputs):
    """Collect one more episode, update the target networks and log the return."""
    epsilon = self._current_epsilon()

    self.play_episodes(policy=self.policy, epsilon=epsilon)

    polyak_average(self.q_net1, self.target_q_net1, tau=self.hparams.tau)
    polyak_average(self.q_net2, self.target_q_net2, tau=self.hparams.tau)
    polyak_average(self.policy, self.target_policy, tau=self.hparams.tau)

    self.log("episode/Episode return", self.env.return_queue[-1], prog_bar=True)

In [None]:
# Delete the logs and videos left by previous runs, then load the TensorBoard
# extension and point it at the Lightning log directory.
!rm -r /content/lightning_logs/
!rm -r /content/videos/
%load_ext tensorboard
%tensorboard --logdir /content/lightning_logs/

In [None]:
# Build the agent. The constructor fills the replay buffer before returning,
# so this line already runs environment episodes.
algo = TD3('LunarLanderContinuous-v2')

# Train for at most 2,000 epochs on the detected GPUs.
# NOTE(review): track_grad_norm=2 presumably logs the 2-norm of the gradients; confirm against the Lightning 1.6 Trainer docs.
trainer = Trainer(
    gpus=num_gpus, 
    max_epochs=2_000, 
    track_grad_norm=2,
)

trainer.fit(algo)