Make sure to run every cell in the file in the correct order.

In [None]:
# Environment setup cell. Every command redirects stdout and stderr to
# /dev/null, so a failed install is silent; drop the redirection when debugging.
# swig is presumably needed to build the Box2D bindings installed below.
!pip3 install swig > /dev/null 2>&1
# Remove any existing box2d-py before installing it again.
!pip3 uninstall box2d-py -y > /dev/null 2>&1
!pip3 install box2d-py > /dev/null 2>&1
!pip3 install box2d box2d-kengz > /dev/null 2>&1
# Virtual X display packages (pyvirtualdisplay's Display is imported below).
!apt install xvfb > /dev/null 2>&1
!pip3 install pyvirtualdisplay > /dev/null 2>&1
# Pinned gym release: the training code below calls env.seed(...) and unpacks
# env.step(...) into four values.
!pip3 install gym==0.25.0 > /dev/null 2>&1

In [None]:
import gym
import random
import numpy as np
import torch
import torch.nn as nn
from torch.distributions import Categorical
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import sys
from pyvirtualdisplay import Display
from IPython import display as disp
import copy
from typing import Tuple

%matplotlib inline

## Question 1: Advantage actor critic (A2C) for ``CartPole``

### Tutorial: How it works?
When the action space of the environment is finite, we can use A2C to learn the optimal policy. A2C maintains two neural networks during the update: the first one acts as the "critic", i.e. it outputs the current value function $V_\phi(s_t)$, and the second one is the "actor", which represents our policy $\pi_\theta(\cdot|s_t): \mathcal{S} → \Delta^{\mathcal{A}}$. The difference between A2C and the original actor critic is that they use different update rules for the actor, and maintain a critic in a different form. Classical actor-critic maintains a Q-function $Q_\phi$, and uses the following update for the actor: $$
\Delta\theta = \eta \sum_{t} \nabla_\theta\log \pi_\theta(a_t|s_t) \cdot Q_\phi(s_t,a_t)
$$
while A2C (in all its variants) usually maintains a V-value function $V_\phi$, and updates the actor as follows: $$
\Delta\theta = \eta \sum_{t} \nabla_\theta\log \pi_\theta(a_t|s_t) \cdot \big(\hat{Q}(s_t,a_t)  - V_\phi(s_t)\big).
$$
Here $\hat{Q}(s_t,a_t)$ is a surrogate of the Q-value function, usually either the Monte Carlo return $\sum_{h\geq t} \gamma^{h-t} r(s_h,a_h)$ or the one-step bootstrap $r(s_t,a_t) + \gamma \cdot V_\phi(s_{t+1})$ constructed using the maintained $V_\phi$.

### Implementation
In this exercise, we will implement A2C on the simple OpenAI Gym environment ``CartPole``. Specifically, you need to:

(1) Construct an Actor / Critic network. The actor network takes the state as an input and outputs a ``torch.distributions.Categorical`` type distribution on $\mathcal{A}$ that you can sample from. The critic network takes the state as an input and outputs a real number that represents the V-function of the current state.

(2) Train the actor and critic network. For simplicity, we recommend you the following loss function for the critic network:
$$
\min_\phi \text{mean}_{t}f\bigg(V_\phi(s_t) - \sum_{h\geq t} \gamma^{h-t} r(s_h,a_h)\bigg),
$$
where $f$ can either be $x^2$ or $\ell_1$-smooth function provided by ```torch.nn.SmoothL1Loss```. For the actor network, we also recommend you to use the MC update for $\hat{Q}$ mentioned above: $$
\Delta\theta = \text{mean}_{t} \nabla_\theta\log \pi_\theta(a_t|s_t) \cdot \bigg(\sum_{h\geq t} \gamma^{h-t} r(s_h,a_h)  - V_\phi(s_t)\bigg).
$$
However, feel free to try out any form of A2C as long as your reward in the training process can reach the threshold of around 450.

In the following you will implement the actor/critic network. You can choose whatever structure you like, but a three-layer narrow neural network with ReLU activation should suffice to tackle a simple task like CartPole.




In [None]:
# [Hint] Implement actor/critic networks.
class Actor(nn.Module):
    """Policy network for a discrete action space.

    A three-layer MLP with ReLU activations that maps a batch of states to
    a categorical distribution over the available actions.

    Args:
        num_inputs: int, state size.
        num_actions: int, number of discrete actions.
        hidden_size: int, width of the two hidden layers.

    The forward pass returns a ``torch.distributions.Categorical`` that
    represents the policy; it can be sampled from and queried for
    log-probabilities.
    """

    def __init__(self, num_inputs: int, num_actions: int, hidden_size: int = 128):
        super(Actor, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_actions),
        )

    def forward(self, x: torch.Tensor) -> Categorical:
        """Return the policy pi(. | x) for a batch of states.

        Args:
            x: float tensor whose last dimension is ``num_inputs``.

        Returns:
            A ``Categorical`` whose batch shape is ``x.shape[:-1]``.
        """
        # Parameterise by logits so the softmax normalisation is done inside
        # Categorical (numerically safer than passing probabilities).
        logits = self.net(x)
        return Categorical(logits=logits)

class Critic(nn.Module):
    """State-value network V(s).

    A three-layer MLP with ReLU activations that maps a batch of states to
    one scalar value estimate per state.

    Args:
        num_inputs: int, state size.
        hidden_size: int, width of the two hidden layers.

    The forward pass returns a ``torch.Tensor`` that represents the V-value.
    """

    def __init__(self, num_inputs: int, hidden_size: int = 128):
        super(Critic, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return V(x) for a batch of states.

        Args:
            x: float tensor whose last dimension is ``num_inputs``.

        Returns:
            Tensor of shape ``x.shape[:-1] + (1,)``.
        """
        return self.net(x)


  and should_run_async(code)


The following utility function should return a list that stores the discounted reward-to-go $\big(\sum_{h \geq t }\gamma^{h-t} r(s_h, a_h)\big)_{t}$ with $\big(r(s_t,a_t)\big)_t$ as the input. You can refer to the policy gradient implementation in HW3.




In [None]:
def compute_returns(rewards: list, gamma: float)-> list:
    """Compute the discounted reward-to-go for every step of a trajectory.

    Entry ``t`` of the result is ``sum_{h >= t} gamma**(h - t) * rewards[h]``.

    Inputs:
    rewards: list, reward signal of a trajectory
    gamma: discount factor
    Outputs:
    a list (same length as ``rewards``) that records the discounted
    cumulative reward from each step onwards; empty input gives ``[]``.
    """
    returns = []
    running = 0.0
    # Walk backwards so each return reuses the one after it:
    # G_t = r_t + gamma * G_{t+1}. This is O(T) instead of O(T^2).
    for reward in reversed(rewards):
        running = reward + gamma * running
        returns.append(running)
    returns.reverse()
    return returns

Now we are ready to train A2C! Specifically, you need to calculate the loss function of the actor and critic in every episode, and optimize them with Adam in every episode.

In [None]:
def train(env_name: str, num_episodes=500, gamma=0.99, lr=0.001) -> list:
    """Train an A2C agent with Monte Carlo returns and report its progress.

    One full episode is collected per iteration. The actor is updated with
    the advantage policy gradient and the critic is regressed onto the
    discounted reward-to-go; both are optimised with Adam once per episode.

    Args:
        env_name: id passed to ``gym.make``; the environment must have a
            discrete action space (``env.action_space.n`` is read).
        num_episodes: number of training episodes.
        gamma: discount factor.
        lr: learning rate shared by the actor and critic optimisers.

    Returns:
        A list with the undiscounted return of every training episode.
    """
    seed = 1234
    env = gym.make(env_name)
    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n

    # set random seeds
    env.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Initialize networks and optimizers
    actor = Actor(num_inputs, num_actions)
    critic = Critic(num_inputs)
    actor_opt = optim.Adam(actor.parameters(), lr=lr)
    critic_opt = optim.Adam(critic.parameters(), lr=lr)

    # record the cumulative reward of every episode
    # (kept in a local that does not shadow the builtin ``eval``)
    episode_returns = []
    for episode in range(num_episodes):
        state = env.reset()
        log_probs = []
        values = []
        rewards = []
        done = False

        while not done:
            # Force float32 so the observation dtype always matches the
            # network weights, whatever dtype the environment emits.
            state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
            policy = actor(state)
            value = critic(state)
            action = policy.sample()
            next_state, reward, done, _ = env.step(action.item())
            # record the V-function estimate, the log_prob of current actor,
            # and the reward for each step
            log_probs.append(policy.log_prob(action))
            values.append(value)
            rewards.append(reward)

            state = next_state

        returns = torch.tensor(compute_returns(rewards, gamma), dtype=torch.float32)
        # Flatten everything to shape (T,). Without this, subtracting (T, 1)
        # values from (T,) returns would silently broadcast to (T, T).
        log_probs = torch.cat(log_probs).reshape(-1)
        values = torch.cat(values).reshape(-1)

        # Advantage = MC return - V(s_t). It is detached so the actor loss
        # does not push gradients into the critic.
        advantage = (returns - values).detach()
        actor_loss = -(log_probs * advantage).mean()
        # Smooth-L1 regression of V(s_t) onto the Monte Carlo return.
        critic_loss = F.smooth_l1_loss(values, returns)
        loss = actor_loss + critic_loss

        actor_opt.zero_grad()
        critic_opt.zero_grad()
        loss.backward()
        actor_opt.step()
        critic_opt.step()

        # record the cumulative reward in this episode
        undiscounted_returns = compute_returns(rewards, 1)
        episode_returns.append(undiscounted_returns[0])
        # print every 10 episodes
        if episode % 10 == 0:
            print(f"Episode {episode}, Loss: {loss.item()}, Return: {undiscounted_returns[0]}")
    return episode_returns

Now test how your implementation works on CartPole! If everything goes well, we would expect a reward of around 500. Plot the reward curve with ``matplotlib.pyplot.plot``. The expected time consumption should be around 5-10 minutes.



In [None]:
# Train A2C on CartPole; the result is the list of per-episode undiscounted
# returns. NOTE: the name `eval` shadows the Python builtin from here on.
eval = train("CartPole-v1")

#  Actor-critic variants for continuous action space: DDPG & TD3
In this section, we will introduce two modern RL algorithms that aim to tackle MDPs with continuous action space: **Deep Deterministic Policy Gradient** (DDPG) and **Twin Delayed DDPG** (TD3).
The workflow for the rest of this assignment is as follows: we will first implement the DDPG algorithm, and then evaluate its performance on the Gym environment ```MountainCarContinuous```. The implementation for TD3 is **optional**, but we encourage you to do it. By studying these two algorithms, you will gain insight into reinforcement learning in continuous action spaces and become familiar with various techniques in RL.

## Question 2. Deep Deterministic Policy Gradient(DDPG)
#### **1. Introduction: DDPG v.s. original actor critic**
Regular actor-critic algorithms work well when we have only finite actions, as it is essentially learning a mapping from $\mathcal{S}$ to $\mathbb{R}^{\mathcal{A}}$, which can be well-approximated by a neural network with an $\mathcal{A}$-dimensional output.  However, when it comes to the case that we have infinite candidate actions, e.g. in the scenario of robotic control, how can we learn the optimal action given the current state?

In the question, we introduce an actor-critic type algorithm called Deep Deterministic Policy Gradient (DDPG). DDPG maintains 4 neural networks: an estimation $Q_\phi$ as the critic, a policy network $\mu_\theta: \mathcal{S} \rightarrow \mathcal{A}$ as the actor, and two corresponding target networks, $Q_{\phi'}$ and $\mu_{\theta'}$. By deterministic, we mean $\mu$ does not include randomness, as it is hard to approximate a probability with a continuous support with a simple neural network.

 There are 3 major differences between DDPG and A2C:

(1) **Actor update.** To adapt to continuous action space and deterministic policy, in DDPG, the actor is updated by the continuous version of policy gradient: $$
\Delta \theta = \eta\cdot \text{mean}_t \big(\nabla_a Q_\phi(s_t, \mu_\theta(s_t)) \cdot \nabla_\theta \mu_\theta (s_t)\big),
$$
also refer to the lecture notes for details.

(2) **Replay buffer & Exploration strategy.** To mitigate the catastrophic forgetting issue in actor-critic (as you might have already seen in previous examples), DDPG "borrows" the concept of replay buffer from DQN, and correspondingly an exploratory strategy when choosing actions. In every step, DDPG samples a batch of transitions $(s_t, a_t, s_{t+1}, r_t)$ from the replay buffer, and updates the critic and the policy with it.

Similar to the $\epsilon$-greedy exploration in DQN, DDPG adds noise to the chosen action given by the actor to improve exploration: $$
a_t = \mu_\theta(s_t) + \epsilon_t,
$$
where $\epsilon_t$ is a Gaussian noise.
Then the agent receives the new next_state with reward, and stores it in the replay buffer.

(3) **Target network & Critic update.** To mitigate inconsistency during temporal difference backups, DDPG borrows the concept of target network from double DQN. Specifically, in every episode, DDPG updates the critic $Q_\phi$ by performing gradient descent with the following improved Bellman loss: $$
L(\phi) = \text{mean}_t f(y_t, Q_\phi(s_t,a_t)),
$$
where $y_t = r(s_t, a_t) + \gamma \cdot Q_{\phi'}(s_{t+1}, \mu_{\theta'}(s_{t+1}))$. $f$ can either be $x^2$ or $\ell_1$-smooth function. Note that $y_t$ is calculated by the target networks $Q_{\phi'}$, and $Q_{\phi'}$ should not be trained here! This would decrease inconsistency during training. However, different from DQN, which update the target network with a slower frequency,  DDPG chooses to update $Q_{\phi'}$ and $\mu_{\theta'}$ in every episode but with a slower rate after updating $\phi$ and $\theta$:
$$
\phi' = \tau \phi + (1 - \tau) \phi', \qquad \theta' = \tau \theta + (1 - \tau) \theta'.
$$
Usually $\tau$ is set to be some small constant, e.g. 0.005, to prevent rapid updates to the target networks.

Now you must be ready to implement DDPG by yourselves!











We provide you with the following utility function that implements the replay buffer for experience replay.

In [None]:
# Replay buffer
class ReplayBuffer(object):
    """Fixed-capacity circular store of transitions for experience replay.

    Transitions live in pre-allocated NumPy arrays. Once ``max_size``
    entries have been written, the oldest entries are overwritten.
    """

    def __init__(self, state_dim, action_dim, max_size=int(1e6)):
        self.max_size = max_size
        self.ptr = 0    # index of the next slot to write
        self.size = 0   # number of valid transitions currently stored

        def table(width):
            # One row per transition, `width` columns.
            return np.zeros((max_size, width))

        self.state = table(state_dim)
        self.action = table(action_dim)
        self.next_state = table(state_dim)
        self.reward = table(1)
        self.not_done = table(1)

        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")

    def add(self, state, action, next_state, reward, done):
        """Write one transition into the current slot and advance the pointer."""
        slot = self.ptr
        self.state[slot] = state
        self.action[slot] = action
        self.next_state[slot] = next_state
        self.reward[slot] = reward
        # Stored as a continuation mask: 1.0 while the episode goes on.
        self.not_done[slot] = 1. - done

        self.ptr = (slot + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        """Draw ``batch_size`` stored transitions uniformly, with replacement.

        Returns a tuple ``(state, action, next_state, reward, not_done)`` of
        float32 tensors placed on ``self.device``.
        """
        ind = np.random.randint(0, self.size, size=batch_size)
        columns = (
            self.state,
            self.action,
            self.next_state,
            self.reward,
            self.not_done,
        )
        return tuple(
            torch.FloatTensor(column[ind]).to(self.device) for column in columns
        )


#### **2. Implementation: Create your own DDPG agent!**

You now need to complete the following implementation of DDPG. First, you want to build your ``Actor_DDPG`` and ``Critic_DDPG`` networks. For our task, two three-layer neural network with ReLU activations suffice. Note that different from A2C you implemented above,  ``Critic_DDPG`` approximates the Q-function, thus takes both the state and action as inputs. Here we assume the action space is a symmetric interval ``[- max_action, max_action]``, so please normalize your output with e.g. ``torch.tanh``.

In [None]:
# Implementation of Deep Deterministic Policy Gradients (DDPG)

# Compute device used by the networks below: CUDA when available, else CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class Actor_DDPG(nn.Module):
    """Deterministic policy network mu(s) for a continuous action space.

    A three-layer MLP with ReLU activations. The output is squashed with
    ``tanh`` and scaled so every action component lies in
    ``[-max_action, max_action]``.

    Inputs:
        state_dim: int, dimension of state
        action_dim: int, dimension of action
        max_action: float, bound of the symmetric action interval
        hidden_size: int, width of the two hidden layers
    Outputs of forward:
        a torch.Tensor that represents the chosen action
    """

    def __init__(self, state_dim: int, action_dim: int, max_action: float, hidden_size: int = 256):
        super(Actor_DDPG, self).__init__()
        self.l1 = nn.Linear(state_dim, hidden_size)
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.l3 = nn.Linear(hidden_size, action_dim)
        self.max_action = max_action

    def forward(self, state):
        """Map a batch of states ``(batch, state_dim)`` to actions ``(batch, action_dim)``."""
        hidden = F.relu(self.l1(state))
        hidden = F.relu(self.l2(hidden))
        # tanh bounds the raw output to [-1, 1]; scaling maps it onto the
        # environment's action range.
        return self.max_action * torch.tanh(self.l3(hidden))

class Critic_DDPG(nn.Module):
    """Action-value network Q(s, a).

    A three-layer MLP with ReLU activations applied to the concatenation of
    state and action.

    Inputs:
        state_dim: int, dimension of state
        action_dim: int, dimension of action
        hidden_size: int, width of the two hidden layers
    Outputs of forward:
        torch.Tensor that represents the Q-values
    """

    def __init__(self, state_dim: int, action_dim: int, hidden_size: int = 256):
        super(Critic_DDPG, self).__init__()
        self.l1 = nn.Linear(state_dim + action_dim, hidden_size)
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.l3 = nn.Linear(hidden_size, 1)

    def forward(self, state: torch.Tensor, action: torch.Tensor)->torch.Tensor:
        """Return Q(state, action) with shape ``(batch, 1)``.

        ``state`` is ``(batch, state_dim)`` and ``action`` is
        ``(batch, action_dim)``; they are concatenated along dimension 1.
        """
        sa = torch.cat([state, action], 1)
        hidden = F.relu(self.l1(sa))
        hidden = F.relu(self.l2(hidden))
        q1 = self.l3(hidden)
        return q1

Now we are ready to construct a DDPG trainer! In the following cell you will need to:

**(1)** Calculate the TD value using target_Q network and update the critic;

**(2)** Calculate the deterministic policy gradient and update the actor;

**(3)** Update the target networks.

Let's do it!

In [None]:
class DDPG(object):
    """Deep Deterministic Policy Gradient agent.

    Holds an actor, a critic and a slowly-tracking target copy of each.

    Args:
        state_dim: dimension of the state vector.
        action_dim: dimension of the action vector.
        max_action: bound of the symmetric action interval.
        discount: discount factor gamma used in the TD target.
        tau: Polyak averaging rate for the target networks.
    """

    def __init__(self, state_dim: int, action_dim: int, max_action: float, discount=0.99, tau=0.001):
        # Initialize the models and the target models
        self.actor = Actor_DDPG(state_dim, action_dim, max_action).to(device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=3e-4)

        self.critic = Critic_DDPG(state_dim, action_dim).to(device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)

        self.discount = discount
        self.tau = tau

    def select_action(self, state):
        """Return the actor's action for one state as a flat NumPy array."""
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        return self.actor(state).cpu().data.numpy().flatten()

    def train(self, replay_buffer, batch_size=64):
        """Run one DDPG update on a batch sampled from ``replay_buffer``.

        The steps are: critic regression onto the TD target, one actor
        step along the deterministic policy gradient, then a soft update
        of both target networks.
        """
        # Sample from replay buffer
        state, action, next_state, reward, not_done = replay_buffer.sample(batch_size)

        # TD target y = r + gamma * not_done * Q'(s', mu'(s')).
        # It is built under no_grad so nothing flows into the target networks.
        with torch.no_grad():
            next_action = self.actor_target(next_state)
            target_Q = reward + not_done * self.discount * self.critic_target(next_state, next_action)

        # Get current Q estimate
        current_Q = self.critic(state, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Compute actor loss: maximise Q(s, mu(s)) by minimising its negation.
        actor_loss = -self.critic(state, self.actor(state)).mean()

        # Optimize the actor. Only the actor optimizer steps here; the
        # gradients this leaves on the critic are cleared by zero_grad()
        # at the start of the next critic update.
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update the target models
        self._soft_update(self.critic, self.critic_target)
        self._soft_update(self.actor, self.actor_target)

    def _soft_update(self, source, target):
        """Polyak-average ``source`` into ``target``: t <- tau * s + (1 - tau) * t."""
        for param, target_param in zip(source.parameters(), target.parameters()):
            new_target_params = self.tau * param.data + (1 - self.tau) * target_param.data
            target_param.data.copy_(new_target_params)

#### **3. See how it works!**
Now we are coming to the most exciting part: trying out DDPG (and optionally, TD3) on the ``MountainCarContinuous`` environment! The hyperparameters are provided in the following cell. Note that they apply to both DDPG and TD3. To make the training process more stable, we will first sample randomly from the action space to fill our replay buffer. So don't panic if you find the reward in the initial episodes is super low! You are welcome to change anything in the training process as long as you find it helpful!



In [None]:
def init_flags():
    """Return the default hyperparameters used by ``main`` for DDPG and TD3."""
    return dict(
        env="MountainCarContinuous",
        seed=0,                  # random seed
        start_timesteps=25e3,    # total steps of the free exploration phase
        max_timesteps=6e4,       # maximum number of time steps in training
        expl_noise=0.1,          # noise strength in exploration
        batch_size=256,
        discount=0.99,
        tau=0.005,               # rate of target update
        policy_noise=0.2,        # policy noise strength when sampling action
        noise_clip=0.5,          # noise clip rate
        policy_freq=2,           # delayed policy update frequency in TD3
    )

def main(policy_name = 'DDPG'):
    """Train a DDPG or TD3 agent and return the per-episode rewards.

    For the first ``start_timesteps`` steps, actions are sampled uniformly
    from the action space and only fill the replay buffer. Afterwards the
    policy acts with clipped Gaussian exploration noise and is trained once
    per environment step.

    Args:
        policy_name: ``"DDPG"`` or ``"TD3"``.

    Returns:
        A list with the total (undiscounted) reward of every finished episode.

    Raises:
        ValueError: if ``policy_name`` is not one of the supported names.
    """
    # Fail fast. Otherwise `policy` stays unbound and the error only shows
    # up as a NameError after the whole random-exploration phase has run.
    if policy_name not in ("DDPG", "TD3"):
        raise ValueError(f"Unknown policy_name {policy_name!r}; expected 'DDPG' or 'TD3'.")

    args = init_flags()
    env = gym.make(args["env"])
    env.seed(args["seed"]+100)
    env.action_space.seed(args["seed"])
    torch.manual_seed(args["seed"])
    np.random.seed(args["seed"])

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": args["discount"],
        "tau": args["tau"],}
    if policy_name == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = args["policy_noise"] * max_action
        kwargs["noise_clip"] = args["noise_clip"] * max_action
        kwargs["policy_freq"] = args["policy_freq"]
        policy = TD3(**kwargs)
    else:
        policy = DDPG(**kwargs)

    replay_buffer = ReplayBuffer(state_dim, action_dim)
    evaluations = []
    state, done = env.reset(), False
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0

    for t in range(int(args["max_timesteps"])):

        episode_timesteps += 1

        # Select action randomly or according to policy
        if t < args["start_timesteps"]:
            action = env.action_space.sample()
        else:
            action = (
                policy.select_action(np.array(state))
                + np.random.normal(0, max_action * args["expl_noise"], size=action_dim)
            ).clip(-max_action, max_action)

        # Perform action
        next_state, reward, done, _ = env.step(action)
        # An episode that only ends because it reached the step limit is
        # stored as "not done", so the TD target still bootstraps from
        # next_state.
        done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0

        # Store data in replay buffer
        replay_buffer.add(state, action, next_state, reward, done_bool)

        state = next_state
        episode_reward += reward

        # Train agent after collecting sufficient data
        if t >= args["start_timesteps"]:
            policy.train(replay_buffer, args["batch_size"])

        if done:
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
            print(f"Total T: {t+1} Episode Num: {episode_num+1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}")
            evaluations.append(episode_reward)
            # Reset environment
            state, done = env.reset(), False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

    return evaluations


Now run the following cell to see how it works! We would expect a reward that converges to around 90. The estimated wall time for running the whole process is around 10-20 minutes, and you should be able to see a large positive reward at around $5\cdot 10^4$ timesteps. If the initialization is unsuccessful, which could result in the reward being stuck at around $0$, try restarting the ``main`` function or debugging your trainer.

In [None]:
# Run the DDPG experiment; the result is the list of per-episode total rewards.
evaluations_ddpg = main(policy_name = 'DDPG')

Plot your reward v.s. training_episode curve. You can use the evaluations above and ``plt.plot()``.




### Question 3. TD3 [Optional]
#### 1. Tackle over-estimation: Twin Delayed DDPG (TD3)
Similar to DDPG, TD3 also maintains a continuous actor update with a deterministic policy gradient and a replay buffer with an exploratory strategy. The difference between TD3 and DDPG lies in the critic update. Unlike Double DQN, due to the slow update of the policy (the $\theta$ in previous examples), the current and target values still remain similar even when using a double Q-update technique. While the implementation of an independent target network allows for less biased value estimation, even an unbiased estimate with high variance can still lead to future overestimations in local regions of state space, which in turn can negatively affect the global policy.

To address this issue, TD3 updates the actor/critic network in the following way:

**(1)** TD3 maintains **6 neural networks**: $Q_{\phi_1}, Q_{\phi_2}$ as two independent estimates of the Q function, $Q_{\phi_1'}, Q_{\phi_2'}$ as their corresponding target networks, and $\mu_\theta, \mu_{\theta'}$ as the actor and its target network. $Q_{\phi_1}$ and $Q_{\phi_2}$ are independently initialized, and all target networks are initialized to be the same as their corresponding counterparts. When updating the Q-functions, in every episode, TD3 updates each critic $Q_{\phi_i}$, $i = 1, 2$, by performing gradient descent with the following improved Bellman loss: $$
L(\phi_i) = \text{mean}_t f(y_t, Q_{\phi_i}(s_t,a_t)),
$$
where $y_t = r(s_t, a_t) + \gamma \cdot \min_{i= 1,2}Q_{\phi_i'}(s_{t+1}, \tilde{a}_{t+1})$ and $\tilde{a}_{t+1} =  \mu_{\theta'}(s_{t+1}) + \text{clip}_{[-c,c]}(\mathcal{N}(0,\tilde{\sigma}))$. This differs from the TD target in DDPG, as it takes the minimum of the two future Q-values and uses a "disturbed" future action. Here we clip the noise added to $\mu_{\theta'}(s_{t+1})$ to prevent $\tilde{a}_{t+1}$ from going too far from the target policy's action. Intuitively, TD3 evaluates the Q-value of $s_{t+1}$ in a more "conservative" way.


**(2)** When updating $\mu_\theta$, TD3 only utilizes $Q_{\phi_1}$:$$
\Delta \theta = \eta\cdot \text{mean}_t \big(\nabla_a Q_{\phi_1}(s_t, \mu_\theta(s_t)) \cdot \nabla_\theta \mu_\theta (s_t)\big).
$$
At the end of every episode, similar to DDPG, all parameters are updated by $$
\phi_i' = \tau \phi_i + (1 - \tau) \phi_i', \qquad \theta' = \tau \theta + (1 - \tau) \theta'.
$$

**(3)** The actor update is delayed: it only updates once every several times the critic updates.


We understand the introduction above might be a little bit confusing due to the technical complexity. In the following, we will guide you end to end to implement a TD3 training algorithm.



#### 2. Implementation: Build your own TD3!

In the following cell, we will implement the actor and critic network for TD3. For the actor and every critic (we need to maintain an additional critic), please make sure it has the same structure as the one in the previous DDPG question so that we can conduct an ablation study.

Our implementation for the ``Critic_TD3`` class is slightly different from the previous critic in DDPG: the class function ``forward`` should return two values $q_1$ and $q_2$ given $(s,a)$, while the class function ``Q1`` should return only $q_1$.

In [None]:
# Implementation of Twin Delayed Deep Deterministic Policy Gradients (TD3)

# Compute device used by the networks below: CUDA when available, else CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class Actor_TD3(nn.Module):
    """Deterministic policy network mu(s) used by TD3.

    A three-layer MLP with ReLU activations whose output is squashed with
    ``tanh`` and scaled to ``[-max_action, max_action]``.

    Inputs:
        state_dim: int, dimension of state
        action_dim: int, dimension of action
        max_action: float, bound of the symmetric action interval
        hidden_size: int, width of the two hidden layers
    Outputs of forward:
        torch.Tensor that represents the chosen action
    """

    def __init__(self, state_dim: int, action_dim: int, max_action: float, hidden_size: int = 256):
        super(Actor_TD3, self).__init__()
        self.l1 = nn.Linear(state_dim, hidden_size)
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.l3 = nn.Linear(hidden_size, action_dim)
        self.max_action = max_action

    def forward(self, state):
        """Map a batch of states ``(batch, state_dim)`` to actions ``(batch, action_dim)``."""
        hidden = F.relu(self.l1(state))
        hidden = F.relu(self.l2(hidden))
        # tanh bounds the raw output to [-1, 1]; scaling maps it onto the
        # environment's action range.
        return self.max_action * torch.tanh(self.l3(hidden))


class Critic_TD3(nn.Module):
    """Twin action-value networks Q1(s, a) and Q2(s, a) for TD3.

    The two heads are separate three-layer ReLU MLPs with the same
    architecture. They are initialised independently and both take the
    concatenated state and action as input.

    Inputs:
        state_dim: int, dimension of state
        action_dim: int, dimension of action
        hidden_size: int, width of the hidden layers of each head
    Outputs of forward:
        two torch.Tensors that represent Q1 and Q2
    """

    def __init__(self, state_dim: int, action_dim: int, hidden_size: int = 256):
        super(Critic_TD3, self).__init__()
        # Q1 architecture
        self.l1 = nn.Linear(state_dim + action_dim, hidden_size)
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.l3 = nn.Linear(hidden_size, 1)

        # Q2 architecture
        self.l4 = nn.Linear(state_dim + action_dim, hidden_size)
        self.l5 = nn.Linear(hidden_size, hidden_size)
        self.l6 = nn.Linear(hidden_size, 1)

    def forward(self, state: torch.Tensor, action: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(Q1(s, a), Q2(s, a))``, each with shape ``(batch, 1)``."""
        sa = torch.cat([state, action], 1)

        q1 = F.relu(self.l1(sa))
        q1 = F.relu(self.l2(q1))
        q1 = self.l3(q1)

        q2 = F.relu(self.l4(sa))
        q2 = F.relu(self.l5(q2))
        q2 = self.l6(q2)
        return q1, q2

    # Returns only Q1; this is what the actor loss uses, and it avoids
    # evaluating the second head needlessly.
    def Q1(self, state: torch.Tensor, action: torch.Tensor) -> torch.Tensor:
        """Return only Q1(s, a) with shape ``(batch, 1)``."""
        sa = torch.cat([state, action], 1)

        q1 = F.relu(self.l1(sa))
        q1 = F.relu(self.l2(q1))
        q1 = self.l3(q1)
        return q1


Now let's implement the TD3 trainer! In the following cell, you will need to implement the following:

**(1)** For the critic update of TD3, when sampled a tuple from the replay buffer, recall that we need to estimate $Q(s_{t+1},\tilde{a}_{t+1})$, where $\tilde{a}_{t+1} = \mu_{\theta'}(s_{t+1})+\text{clip}_{[-c,c]}(\epsilon)$ .

**(2)** Calculate the TD target with the networks $Q_{\phi_i'}, i=1,2,$ and $(s_{t+1}, \tilde{a}_{t+1})$ you obtained in **(1)**. Recall that the TD target = $r(s_t,a_t) + \gamma \cdot \min_{i=1,2}\{Q_{\phi_i'}(s_{t+1}, \tilde{a}_{t+1})\}$.

**(3)** Calculate the actor loss with $Q_{\phi_1}$.

**(4)** Update the parameters $\phi_i'$, $\theta'$.

In [None]:
class TD3(object):
    """Twin Delayed DDPG agent.

    Compared with DDPG it uses (1) the minimum of two target critics in the
    TD target, (2) clipped Gaussian noise on the target action, and (3) an
    actor / target-network update only every ``policy_freq`` critic updates.

    Args:
        state_dim: dimension of the state vector.
        action_dim: dimension of the action vector.
        max_action: bound of the symmetric action interval.
        discount: discount factor gamma used in the TD target.
        tau: Polyak averaging rate for the target networks.
        policy_noise: scale of the Gaussian noise added to the target
            action (used here as its standard deviation).
        noise_clip: the target-action noise is clipped to
            ``[-noise_clip, noise_clip]``.
        policy_freq: number of critic updates per actor update.
    """

    def __init__(
        self,
        state_dim: int,
        action_dim: int,
        max_action: float,
        discount=0.99,
        tau=0.005,
        policy_noise=0.2,
        noise_clip=0.5,
        policy_freq=2
    ):

        self.actor = Actor_TD3(state_dim, action_dim, max_action).to(device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=3e-4)

        self.critic = Critic_TD3(state_dim, action_dim).to(device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)

        self.max_action = max_action
        self.discount = discount
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq

        # Number of train() calls so far; drives the delayed actor update.
        self.total_it = 0

    def select_action(self, state):
        """Return the actor's action for one state as a flat NumPy array."""
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        return self.actor(state).cpu().data.numpy().flatten()

    def train(self, replay_buffer, batch_size=256):
        """Run one TD3 update on a batch sampled from ``replay_buffer``.

        The critics are updated on every call. The actor and all target
        networks are updated only when ``total_it`` is a multiple of
        ``policy_freq``.
        """
        self.total_it += 1

        # Sample replay buffer
        state, action, next_state, reward, not_done = replay_buffer.sample(batch_size)

        with torch.no_grad():
            # 1. Target policy smoothing: perturb the target action with
            #    clipped Gaussian noise, then clamp to the valid range.
            noise = (
                torch.randn_like(action) * self.policy_noise
            ).clamp(-self.noise_clip, self.noise_clip)
            next_action = (
                self.actor_target(next_state) + noise
            ).clamp(-self.max_action, self.max_action)

            # Compute the target Q value
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)

            # 2. Clipped double-Q: bootstrap from the smaller of the two
            #    target estimates to counter over-estimation.
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + not_done * self.discount * target_Q

        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(state, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Delayed policy updates
        if self.total_it % self.policy_freq == 0:

            # Compute actor loss: maximise Q1(s, mu(s)).
            actor_loss = -self.critic.Q1(state, self.actor(state)).mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models using weighted mean
            self._soft_update(self.critic, self.critic_target)
            self._soft_update(self.actor, self.actor_target)

    def _soft_update(self, source, target):
        """Polyak-average ``source`` into ``target``: t <- tau * s + (1 - tau) * t."""
        for param, target_param in zip(source.parameters(), target.parameters()):
            new_target_params = self.tau * param.data + (1 - self.tau) * target_param.data
            target_param.data.copy_(new_target_params)


In [None]:
# Run the TD3 experiment; the result is the list of per-episode total rewards.
evaluation_td3 = main(policy_name = 'TD3')