**Dependencies and setup**

This can take a minute or so...

In [3]:
# the FORK implementation was based on https://github.com/honghaow/FORK
# some parts of the model implementation have been taken from https://github.com/Rafael1s/Deep-Reinforcement-Learning-Algorithms/tree/master/BipedalWalkerHardcore-TD3-FORK

# System package: Xvfb is the virtual X server that pyvirtualdisplay launches
# later in this cell. apt needs root privileges; without them these two
# commands fail with "Permission denied" and Xvfb stays uninstalled.
!apt update
!apt install -y xvfb
# Python packages: the Box2D environments for gym and the Xvfb wrapper.
!pip install 'gym[box2d]'
!pip install pyvirtualdisplay
# NOTE(review): `utils` is imported below but not referenced by any visible
# cell - confirm it is still needed before keeping this install.
!pip install utils

import gym
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import sys
import copy
import utils
from gym.wrappers.record_video import RecordVideo
from pyvirtualdisplay import Display
from IPython import display as disp
from collections import deque, namedtuple
%matplotlib inline

# Define the compute device and the plotting/video cadence first, so these
# names exist for the later cells even if starting the virtual display fails.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

plot_interval = 10 # update the plot every N episodes
video_every = 25 # videos can take a very long time to render so only do it every N episodes

# Headless display used for rendering. start() raises FileNotFoundError when
# the Xvfb binary is not installed (see the apt install at the top of the cell).
display = Display(visible=0,size=(600,600))
display.start()

Reading package lists... Done
[1;31mE: [0mCould not open lock file /var/lib/apt/lists/lock - open (13: Permission denied)[0m
[1;31mE: [0mUnable to lock directory /var/lib/apt/lists/[0m
[1;33mW: [0mProblem unlinking the file /var/cache/apt/pkgcache.bin - RemoveCaches (13: Permission denied)[0m
[1;33mW: [0mProblem unlinking the file /var/cache/apt/srcpkgcache.bin - RemoveCaches (13: Permission denied)[0m
[1;31mE: [0mCould not open lock file /var/lib/dpkg/lock-frontend - open (13: Permission denied)[0m
[1;31mE: [0mUnable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?[0m
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


FileNotFoundError: [Errno 2] No such file or directory: 'Xvfb'

**Reinforcement learning agent**

In [None]:
class ReplayBuffer(object):
  """Fixed-capacity ring buffer of (state, action, next_state, reward, done) transitions.

  Storage is pre-allocated as one NumPy array per field with `max_size` rows.
  Once the buffer is full, new transitions overwrite the oldest ones.
  """

  def __init__(self, state_dim, action_dim, max_size=int(1e6)):
    """Allocate zero-filled storage for `max_size` transitions."""
    self.max_size = max_size
    self.ptr = 0   # row that the next push() writes to
    self.size = 0  # number of rows holding real transitions

    # (attribute name, number of columns) for every stored field.
    layout = (
      ("state", state_dim),
      ("action", action_dim),
      ("next_state", state_dim),
      ("reward", 1),
      ("done", 1),
    )
    for field_name, width in layout:
      setattr(self, field_name, np.zeros((max_size, width)))

  def push(self, state, action, next_state, reward, done):
    """Store one transition at the write pointer and advance it, wrapping at capacity."""
    slot = self.ptr
    self.state[slot] = state
    self.action[slot] = action
    self.next_state[slot] = next_state
    self.reward[slot] = reward
    self.done[slot] = done

    self.ptr = (slot + 1) % self.max_size
    self.size = min(self.size + 1, self.max_size)

  def sample(self, batch_size):
    """Draw `batch_size` stored transitions uniformly at random (with replacement).

    Returns a 5-tuple of float tensors on `device`, ordered
    (state, action, next_state, reward, done).
    """
    ind = np.random.randint(0, int(self.size), size=batch_size)
    fields = (self.state, self.action, self.next_state, self.reward, self.done)
    return tuple(torch.FloatTensor(field[ind]).to(device) for field in fields)

In [None]:
class Actor(nn.Module):
  """Deterministic policy: maps a batch of states to actions in [-max_action, max_action]."""

  def __init__(self, state_dim, action_dim, max_action):
    super().__init__()
    hidden_units = 256
    self.fc1 = nn.Linear(state_dim, hidden_units)
    self.fc2 = nn.Linear(hidden_units, hidden_units)
    self.fc3 = nn.Linear(hidden_units, action_dim)
    self.max_action = max_action

  def forward(self, state):
    """Two ReLU hidden layers, then a tanh output scaled by `max_action`."""
    hidden = state
    for layer in (self.fc1, self.fc2):
      hidden = F.relu(layer(hidden))
    squashed = torch.tanh(self.fc3(hidden))
    return self.max_action * squashed


class Critic(nn.Module):
  """Twin Q-networks that score (state, action) pairs.

  Layers l1-l3 form the first Q estimator and l4-l6 the second; both read the
  same concatenated input and each returns one value per batch row.
  """

  def __init__(self, state_dim, action_dim):
    super().__init__()
    input_units = state_dim + action_dim
    hidden_units = 256
    # First Q estimator.
    self.l1 = nn.Linear(input_units, hidden_units)
    self.l2 = nn.Linear(hidden_units, hidden_units)
    self.l3 = nn.Linear(hidden_units, 1)
    # Second Q estimator.
    self.l4 = nn.Linear(input_units, hidden_units)
    self.l5 = nn.Linear(hidden_units, hidden_units)
    self.l6 = nn.Linear(hidden_units, 1)

  def forward(self, state, action):
    """Return the pair (Q1, Q2) for the given batch of states and actions."""
    joint = torch.cat([state, action], 1)

    def q_value(first, second, head):
      hidden = F.relu(first(joint))
      hidden = F.relu(second(hidden))
      return head(hidden)

    return q_value(self.l1, self.l2, self.l3), q_value(self.l4, self.l5, self.l6)


class Sys(nn.Module):
  """System model: maps a (state, action) batch to a vector of size `state_dim`.

  The TD3 agent trains it to regress the observed next state.
  """

  def __init__(self, state_dim, action_dim):
    super().__init__()
    self.l1 = nn.Linear(state_dim + action_dim, 400)
    self.l2 = nn.Linear(400, 300)
    self.l3 = nn.Linear(300, state_dim)

  def forward(self, state, action):
    """Concatenate state and action, apply two ReLU layers and a linear output layer."""
    features = torch.cat([state, action], 1)
    for layer in (self.l1, self.l2):
      features = F.relu(layer(features))
    return self.l3(features)

NameError: name 'nn' is not defined

In [None]:
class TD3(object):
  """TD3 agent with a FORK-style learned system model.

  The agent owns an actor and a twin critic (each with a target copy) plus
  `Sys`, a network trained to predict the next state from (state, action).
  While the system model's loss is below `sys_threshold`, the actor loss also
  includes the target critic's value of the states that the model predicts one
  and two steps ahead.
  """

  def __init__(
      self,
      env,
      state_dim,
      action_dim,
      max_action,
      policy_noise = 0.1,
      noise_clip = 0.5,
      policy_freq = 2,
      sys_weight = 0.5,
      sys_threshold = 0.02,
      tau = 0.005,
      lr = 0.001,
      gamma = 0.99
      ):
    """Build the networks, their optimisers and store the hyper-parameters.

    Args:
      env: environment; only the first entries of its action/observation
        space bounds are read.
      state_dim: size of an observation vector.
      action_dim: size of an action vector.
      max_action: magnitude bound used by the actor and for clamping target actions.
      policy_noise: scale of the Gaussian noise added to target actions.
      noise_clip: that noise is clamped to [-noise_clip, noise_clip].
      policy_freq: the actor and the target networks are updated on every
        `policy_freq`-th training step.
      sys_weight: weight of the look-ahead terms in the actor loss.
      sys_threshold: the look-ahead terms are used only while the system
        model's loss is below this value.
      tau: interpolation factor of the soft target updates.
      lr: Adam learning rate shared by all three optimisers.
      gamma: discount factor.
    """
    self.env = env

    self.actor = Actor(state_dim, action_dim, max_action).to(device)
    self.actor_target = copy.deepcopy(self.actor)
    self.actor_optimiser = torch.optim.Adam(self.actor.parameters(), lr=lr)

    self.critic = Critic(state_dim, action_dim).to(device)
    self.critic_target = copy.deepcopy(self.critic)
    self.critic_optimiser = torch.optim.Adam(self.critic.parameters(), lr=lr)

    self.sys = Sys(state_dim, action_dim).to(device)
    self.sys_optimiser = torch.optim.Adam(self.sys.parameters(), lr=lr)
    self.sys.apply(self.weights)
    self.sys_loss = 0  # most recent system-model loss, refreshed by train()

    # Scalar bounds taken from the first dimension of each space.
    # NOTE(review): the observation bound of dimension 0 is applied to every
    # dimension when clamping model predictions - confirm this is intended
    # for environments whose observation bounds differ per dimension.
    self.upper = float(self.env.action_space.high[0])
    self.lower = float(self.env.action_space.low[0])
    self.obs_upper = float(self.env.observation_space.high[0])
    self.obs_lower = float(self.env.observation_space.low[0])

    self.max_action = max_action
    self.policy_noise = policy_noise
    self.noise_clip = noise_clip
    self.policy_freq = policy_freq
    self.sys_weight = sys_weight
    self.sys_threshold = sys_threshold
    self.tau = tau
    self.gamma = gamma

  def weights(self, layer):
    """Initialiser for `Module.apply`: Xavier-normal weights and 0.001 biases on linear layers."""
    if isinstance(layer, nn.Linear):
      torch.nn.init.xavier_normal_(layer.weight)
      layer.bias.data.fill_(0.001)

  def select_action(self, state):
    """Return the actor's action for a single NumPy state as a flat NumPy array."""
    state = torch.FloatTensor(state.reshape(1, -1)).to(device)
    with torch.no_grad():
      return self.actor(state).cpu().numpy().flatten()

  def _soft_update(self, online, target):
    """Move each `target` parameter a fraction `tau` of the way towards `online`."""
    for param, target_param in zip(online.parameters(), target.parameters()):
      target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

  def _clamp_obs(self, predicted_state):
    """Clamp a predicted state to the scalar observation bounds."""
    return predicted_state.clamp(self.obs_lower, self.obs_upper)

  def train(self, replay_buffer, batch_size, train_steps):
    """Run `train_steps` updates, each on a fresh batch from `replay_buffer`.

    Every step updates the critic and the system model. Every
    `policy_freq`-th step additionally updates the actor and soft-updates
    both target networks.
    """
    for i in range(train_steps):

      state, action, next_state, reward, done = replay_buffer.sample(batch_size)

      # Critic target: clipped-noise target action, minimum of the twin target Qs.
      with torch.no_grad():
        noise = (
            torch.randn_like(action) * self.policy_noise
        ).clamp(-self.noise_clip, self.noise_clip)
        next_action = (
            self.actor_target(next_state) + noise
        ).clamp(-self.max_action, self.max_action)
        target_Q1, target_Q2 = self.critic_target(next_state, next_action)
        target_Q = torch.min(target_Q1, target_Q2)
        target_Q = reward + (1 - done) * self.gamma * target_Q

      current_Q1, current_Q2 = self.critic(state, action)
      critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
      self.critic_optimiser.zero_grad()
      critic_loss.backward()
      self.critic_optimiser.step()

      # System model: regress the observed next state.
      predict_next_state = self._clamp_obs(self.sys(state, action))
      sys_loss = F.smooth_l1_loss(predict_next_state, next_state.detach())

      self.sys_optimiser.zero_grad()
      sys_loss.backward()
      self.sys_optimiser.step()
      self.sys_loss = sys_loss.item()

      if i % self.policy_freq != 0:
        continue

      # NOTE(review): the actor is scored with the target critic rather than
      # the online critic - confirm this is the intended variant.
      actor_loss = -self.critic_target(state, self.actor(state))[0].mean()

      # Only trust the model-based look-ahead once the model is accurate enough.
      if self.sys_loss < self.sys_threshold:
        # The actor output is already scaled by max_action inside
        # Actor.forward, so it must not be multiplied by the action bound a
        # second time: that would push actions outside the valid range
        # whenever the bound is not 1.
        p_next = self._clamp_obs(self.sys(state, self.actor(state))).detach()
        p_actions = self.actor(p_next)
        value_1 = self.critic_target(p_next, p_actions)[0].mean()

        p_next_2 = self._clamp_obs(self.sys(p_next, p_actions)).detach()
        p_actions_2 = self.actor(p_next_2)
        value_2 = self.critic_target(p_next_2, p_actions_2)[0].mean()

        actor_loss = actor_loss - (self.sys_weight * value_1) - (0.5 * self.sys_weight * value_2)

      # Clear the gradients held by the other two optimisers before the actor step.
      self.critic_optimiser.zero_grad()
      self.sys_optimiser.zero_grad()

      self.actor_optimiser.zero_grad()
      actor_loss.backward()
      self.actor_optimiser.step()

      self._soft_update(self.critic, self.critic_target)
      self._soft_update(self.actor, self.actor_target)

**Prepare the environment and wrap it to capture videos**

In [None]:
%%capture

# Base task. Alternatives:
#   gym.make("Pendulum-v0")                # useful continuous environment for quick experiments
#   gym.make("BipedalWalkerHardcore-v3")   # a more advanced environment
env = gym.make("BipedalWalker-v3")

# Record a video into ./video on every `video_every`-th episode.
env = RecordVideo(
    env,
    "./video",
    episode_trigger=lambda episode_index: episode_index % video_every == 0,
)

# Problem dimensions read from the environment's spaces.
max_action = float(env.action_space.high[0])
action_dim = env.action_space.shape[0]
state_dim = env.observation_space.shape[0]

In [None]:
# Report the problem size and the device chosen in the setup cell.
print(
    'The environment has {} observations and the agent can take {} actions'.format(
        state_dim, action_dim
    )
)
print('The device is: {}'.format(device))

# Advisory only: the device itself is left unchanged.
if device.type != 'cpu':
    print("It's recommended to train on the cpu for this")

In [None]:
# Seed every source of randomness used during training.
seed = 42
torch.manual_seed(seed)
env.reset(seed=seed)
random.seed(seed)
np.random.seed(seed)
env.action_space.seed(seed)

# logging variables
batch_size = 100
start_t = 0 # set to 10000 when testing the model on the hardcore environment
expl_noise = 0.1
total_t = 0      # environment steps taken across all episodes
ep_reward = 0    # return of the episode currently running
reward_list = [] # episode returns since the last plot update
plot_data = []   # [episode, mean return, std of return] per plot interval

# initialise agent
agent = TD3(env, state_dim, action_dim, max_action)
replay_buffer = ReplayBuffer(state_dim, action_dim)
max_episodes = 500
max_timesteps = 2000

# The context manager closes the log file even if training is interrupted.
with open("agent-log.txt","w+") as log_f:
  for episode in range(1, max_episodes + 1):
    state = env.reset()
    done = False
    for t in range(1, max_timesteps + 1):
      # Count each environment step exactly once, so the `start_t` warm-up
      # lasts the configured number of steps.
      total_t += 1
      if total_t < start_t:
        # Warm-up: sample random actions from the action space.
        action = env.action_space.sample()
      else:
        # Policy action plus Gaussian exploration noise, clipped to the valid range.
        action = (
            agent.select_action(np.array(state))
            + np.random.normal(0, max_action * expl_noise, size = action_dim)
        ).clip(-max_action, max_action)

      next_state, reward, done, _ = env.step(action)
      replay_buffer.push(state, action, next_state, reward, done)
      state = next_state

      ep_reward += reward

      if (done or t >= max_timesteps):
        if total_t >= start_t:
          # One training step per environment step taken in this episode.
          agent.train(replay_buffer, batch_size, t)
        reward_list.append(ep_reward)
        break

    log_f.write('episode: {}, reward: {}\n'.format(episode, ep_reward))
    log_f.flush()
    ep_reward = 0

    if episode % plot_interval == 0:
      recent_returns = np.array(reward_list)
      plot_data.append([episode, recent_returns.mean(), recent_returns.std()])
      reward_list = []

      episodes = [row[0] for row in plot_data]
      means = [row[1] for row in plot_data]
      lower_band = [row[1] - row[2] for row in plot_data]
      upper_band = [row[1] + row[2] for row in plot_data]
      # plt.rcParams['figure.dpi'] = 100
      plt.plot(episodes, means, '-', color='tab:grey')
      plt.fill_between(episodes, lower_band, upper_band, alpha=0.2, color='tab:grey')
      plt.xlabel('Episode number')
      plt.ylabel('Episode reward')
      plt.show()
      disp.clear_output(wait=True)