# Deep Deterministic Policy Gradient

In [1]:
!pip install gymnasium
!pip install gymnasium[mujoco]
!pip install omegaconf

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import copy
import random
import collections

import numpy as np
import gymnasium as gym

import torch
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm
import matplotlib.pyplot as plt
from omegaconf import OmegaConf
from gymnasium.experimental.wrappers import RecordVideoV0 as RecordVideo


# Environment

In [19]:
# HalfCheetah is a continuous-action task; "rgb_array" is the render mode
# requested so that frames can be captured (e.g. by the video wrapper below).
env = gym.make(id='HalfCheetah-v4', render_mode='rgb_array')
# env = RecordVideo(env, "./videos", disable_logger=False)
env

<TimeLimit<OrderEnforcing<PassiveEnvChecker<HalfCheetahEnv<HalfCheetah-v4>>>>>

# Hyperparameters

In [20]:
AC_config = OmegaConf.create({
    # RL parameter
    'gamma': 0.99,  # discount factor used in the TD target

    # replay memory
    'buffer_limit': int(1e5),  # maximum number of stored transitions
    'batch_size': 32,          # transitions sampled per gradient step

    # neural network parameters
    # use the first GPU when one is present, otherwise fall back to the CPU
    # so the notebook still runs on machines without CUDA
    'device': 'cuda:0' if torch.cuda.is_available() else 'cpu',
    'hidden_dim': 64,
    'state_dim': int(env.observation_space.shape[0]),
    # cannot use .n here because the actions are continuous (Box space, not Discrete)
    'action_dim': int(env.action_space.shape[0]),

    # learning parameters
    'lr_actor': 0.0005,
    'lr_critic': 0.001,
    'tau': 0.005,  # mixing ratio for the soft target-network update
})

# Special functions
- Replay Buffer
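- Ornstein_Uhlenbeck_Noise: temporally correlated noise added to the action (= output of the actor network) $\rightarrow$ exploration $\uparrow$
- soft_update: move the target network only a small step (ratio `tau`) toward the current network at each update, which prevents drastic changes in the target network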

In [21]:
# replay buffer
class ReplayBuffer():
    """Fixed-capacity FIFO store of transitions with uniform random sampling.

    Each transition is a 5-tuple ``(state, action, reward, next_state, done)``.
    ``state``, ``action`` and ``next_state`` must provide ``.tolist()``.
    """

    def __init__(self, config):
        # config must expose ``buffer_limit``; once full, the oldest entries are dropped
        self.config = config
        self.buffer = collections.deque(maxlen=config.buffer_limit)

    def put(self, transition):
        """Append one transition to the buffer."""
        self.buffer.append(transition)

    def sample(self, n):
        """Draw ``n`` distinct transitions and stack them into five tensors.

        Returns ``(states, actions, rewards, next_states, done_masks)``.
        Rewards and masks have one column; a mask entry is 0.0 when the stored
        ``done`` flag is truthy and 1.0 otherwise.
        """
        picked = random.sample(self.buffer, n)

        state_rows = [s.tolist() for s, _, _, _, _ in picked]
        action_rows = [a.tolist() for _, a, _, _, _ in picked]
        reward_rows = [[r] for _, _, r, _, _ in picked]
        next_state_rows = [s2.tolist() for _, _, _, s2, _ in picked]
        # the mask multiplies the bootstrap term, hence 0.0 for finished transitions
        mask_rows = [[0.0] if d else [1.0] for _, _, _, _, d in picked]

        columns = (state_rows, action_rows, reward_rows, next_state_rows, mask_rows)
        return tuple(torch.Tensor(col) for col in columns)

    def size(self):
        """Number of transitions currently stored."""
        return len(self.buffer)
    
    
class OrnsteinUhlenbeckNoise:
    """Discretised Ornstein-Uhlenbeck process used as exploration noise.

    Each call advances the internal state by one step:

        x <- x + theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, I)

    Args:
        mu: array-like mean the process reverts to; its shape fixes the noise shape.
        theta: mean-reversion rate (default 0.1).
        dt: step size (default 0.01).
        sigma: scale of the Gaussian increments (default 0.1).
    """

    def __init__(self, mu, theta=0.1, dt=0.01, sigma=0.1):
        self.theta, self.dt, self.sigma = theta, dt, sigma
        # accept lists/tuples as well as arrays; ``.shape`` is needed in __call__
        self.mu = np.asarray(mu, dtype=np.float64)
        self.x_prev = np.zeros_like(self.mu)

    def reset(self):
        """Return the process state to zero (e.g. at the start of an episode)."""
        self.x_prev = np.zeros_like(self.mu)

    def __call__(self):
        """Advance the process by one step and return the new state."""
        drift = self.theta * (self.mu - self.x_prev) * self.dt
        diffusion = self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        x = self.x_prev + drift + diffusion
        # remember the state: successive samples are correlated through x_prev
        self.x_prev = x
        return x
    
# moving average over the neural network parameters
def soft_update(net, net_target, tau):
    """Blend the weights of ``net`` into ``net_target`` in place.

    Every target parameter becomes ``(1 - tau) * target + tau * source``.
    Parameters are paired in the order ``parameters()`` yields them.
    Returns None.
    """
    keep = 1.0 - tau
    for online_p, target_p in zip(net.parameters(), net_target.parameters()):
        # work on .data so the mixing itself is not tracked by autograd
        blended = target_p.data * keep + online_p.data * tau
        target_p.data.copy_(blended)
    

# Main Structure: Actor-Critic
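- with target networks (recall DQN)
- the update method has changed: the critic is regressed onto the TD target $r + \gamma Q'(s', \mu'(s'))$ computed with the target networks, and the actor is updated to maximize $Q(s, \mu(s))$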


In [22]:
class ActorCritic(nn.Module):
    """DDPG agent holding actor, critic, their target copies and a replay buffer.

    ``config`` must provide: ``state_dim``, ``action_dim``, ``hidden_dim``,
    ``device``, ``lr_actor``, ``lr_critic``, ``tau``, ``gamma``, ``batch_size``
    and whatever ``ReplayBuffer`` reads from it.
    """

    def __init__(self, config):
        super().__init__()
        # not used by this class; retained so existing attribute access keeps working
        self.data = []
        self.config = config

        # create replay buffer
        self.memory = ReplayBuffer(self.config)
        # set exploration noise (zero-mean, one component per action dimension)
        self.action_noise = OrnsteinUhlenbeckNoise(mu=np.zeros(self.config.action_dim))

        # actor: policy network mapping a state to an action
        self.actor = nn.Sequential(
            nn.Linear(self.config.state_dim, self.config.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.config.hidden_dim, self.config.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.config.hidden_dim, self.config.action_dim),
            nn.Tanh(),  # continuous action, bound output to [-1, 1]
        )
        # critic: Q(s, a) network; its input is the concatenated (state, action)
        self.critic = nn.Sequential(
            nn.Linear(self.config.state_dim + self.config.action_dim, self.config.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.config.hidden_dim, self.config.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.config.hidden_dim, 1),
        )
        # target networks start as exact copies of the online networks
        self.actor_target, self.critic_target = copy.deepcopy(self.actor), copy.deepcopy(self.critic)

        # move all four networks to the configured device
        self.to(self.config.device)

        # we use different learning rates for actor and critic networks;
        # the target networks have no optimizer, they only change via soft_update
        self.actor_opt = torch.optim.Adam(self.actor.parameters(), lr=self.config.lr_actor)
        self.critic_opt = torch.optim.Adam(self.critic.parameters(), lr=self.config.lr_critic)

        # parameter for soft update
        self.tau = self.config.tau

    # training function
    def update(self):
        """Run one gradient step for critic and actor, then soft-update the targets.

        Samples ``config.batch_size`` transitions from ``self.memory``; the
        buffer must already hold at least that many. Returns None.
        """
        # use the agent's own config rather than a notebook-level global
        device = self.config.device

        # randomly sample from replay buffer; sample() already returns tensors
        batch = self.memory.sample(self.config.batch_size)
        states, actions, rewards, next_states, not_done_mask = (t.to(device) for t in batch)

        # TD target. State and action have shapes (batch, state_dim) and
        # (batch, action_dim); the critic expects (batch, state_dim + action_dim),
        # so the concatenation happens along the last dimension (dim=-1).
        # The target is a constant for the critic loss, so no graph is built.
        with torch.no_grad():
            next_actions = self.actor_target(next_states)
            next_q_values = self.critic_target(torch.cat([next_states, next_actions], dim=-1))
            # the mask is 0.0 for finished transitions, which removes the bootstrap term
            target_q_values = rewards + self.config.gamma * next_q_values * not_done_mask

        # critic step: regress Q(s, a) onto the TD target
        q_values = self.critic(torch.cat([states, actions], dim=-1))
        critic_loss = F.smooth_l1_loss(q_values, target_q_values)
        self.critic_opt.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        # actor step: maximise the critic's value of the actor's own actions.
        # This backward pass also fills critic gradients, but they are cleared
        # by critic_opt.zero_grad() before the next critic step.
        actor_loss = -self.critic(torch.cat([states, self.actor(states)], dim=-1)).mean()
        self.actor_opt.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        # soft update
        soft_update(self.actor, self.actor_target, self.tau)
        soft_update(self.critic, self.critic_target, self.tau)
        

# Learn

In [None]:
num_epis, epi_rews = 5000, []
agent = ActorCritic(AC_config)
device = agent.config.device
# bounds used to keep the noisy action inside the environment's valid range
act_low, act_high = env.action_space.low, env.action_space.high

for n_epi in tqdm(range(num_epis)):
    state, _ = env.reset()
    terminated, truncated = False, False
    epi_rew = 0

    while not (terminated or truncated):
        # get action from actor network; acting needs no autograd graph
        with torch.no_grad():
            state_t = torch.as_tensor(state, dtype=torch.float32, device=device)
            action = agent.actor(state_t).cpu().numpy()
        # add noise for better exploration, then clip back into the action space
        action = np.clip(action + agent.action_noise(), act_low, act_high).astype(np.float32)

        next_state, reward, terminated, truncated, _ = env.step(action)

        # save transition to replay buffer.
        # The action is stored as a plain NumPy array, so the buffer holds no
        # device tensors or autograd graphs. Only `terminated` is stored as the
        # done flag: a time-limit truncation is not a real terminal state, so
        # the critic should still bootstrap from next_state in that case.
        agent.memory.put((state, action, reward, next_state, terminated))

        # state transition
        state = next_state

        # record reward
        epi_rew += reward

    # train only once enough transitions have been collected
    if agent.memory.size() > 5000:
        # a fixed number of gradient steps after every episode
        for i in range(10):
            agent.update()

    epi_rews.append(epi_rew)
env.close()

 10%|▉         | 489/5000 [06:43<1:01:15,  1.23it/s]

In [None]:
# learning curve: undiscounted return of every training episode, in order
fig, ax = plt.subplots(figsize=(20, 10), dpi=300)
ax.plot(epi_rews, label='episode returns')
ax.legend(fontsize=20)
plt.show()
plt.close(fig)

# Check the video!