## Implementation of DDPG 

Reference Paper

[1] Lillicrap, T. P., Hunt, J. J., Pritzel, A., Heess, N., Erez, T., Tassa, Y., Silver, D., & Wierstra, D. (2016). Continuous Control with Deep Reinforcement Learning. International Conference on Learning Representations (ICLR) 2016. 

In [1]:
import torch 
import torch.nn as nn
import numpy as np 
import gymnasium as gym
from tqdm import tqdm
import time 

from copy import deepcopy
from utils.replay import ReplayBuffer
from utils.metrics import RollingAverage

# Choose the compute device once for the whole notebook: use the GPU when
# PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device  # bare expression so the notebook cell echoes the selected device

'cuda'

In [2]:
# Standalone MountainCarContinuous environment handle. It is not referenced by
# the other visible cells; the training cell creates its own environment.
test_env = gym.make("MountainCarContinuous-v0")

In [44]:
# https://github.com/openai/baselines/blob/master/baselines/ddpg/noise.py
class OrnsteinUhlenbeckActionNoise:
    """Stateful, temporally correlated noise from a discretised OU process.

    Every call to :meth:`noise` advances the internal state ``x`` by

        x <- x + theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, I)

    and returns the new state. Samples are drawn from NumPy's global RNG
    (``np.random.normal``), so seeding ``np.random`` makes the sequence
    reproducible.

    Args:
        mu: Mean the process reverts to; converted with ``np.array`` and its
            shape determines the shape of every sample.
        sigma: Scale of the Gaussian perturbation.
        theta: Mean-reversion rate.
        dt: Discretisation time step.
        x0: Optional initial state; when ``None`` the process starts at zeros
            shaped like ``mu``.
    """

    def __init__(self, mu, sigma, theta=.15, dt=1e-2, x0=None):
        self.mu = np.array(mu)
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        # Establish ``x_prev`` so ``noise`` can be called immediately.
        self.reset()

    def noise(self):
        """Advance the process one step and return the new state."""
        pull_to_mean = self.theta * (self.mu - self.x_prev) * self.dt
        random_kick = self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        state = self.x_prev + pull_to_mean + random_kick
        self.x_prev = state
        return state

    def reset(self):
        """Rewind the process to ``x0`` (or to zeros when ``x0`` is ``None``)."""
        if self.x0 is not None:
            self.x_prev = self.x0
        else:
            self.x_prev = np.zeros_like(self.mu)

    def __repr__(self):
        return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma)

In [48]:
class Critic(nn.Module):
    """Action-value network ``Q(obs, action)`` for DDPG.

    The observation is first encoded by a batch-normalised hidden layer; the
    action is then concatenated to that encoding and passed through a second
    hidden layer before the scalar Q output. This follows reference [1], where
    actions are not included until the second hidden layer of Q.

    Feeding the action only into the final linear layer (as an earlier version
    of this class did) makes Q affine in the action with a state-independent
    slope, so the actor would receive the same gradient direction in every
    state.

    Args:
        env: Environment whose ``observation_space.shape[0]`` and
            ``action_space.shape[0]`` size the input layers (1-d spaces only).
        hidden_layers: Widths of the two hidden layers.
        *args, **kwargs: Forwarded to ``nn.Module.__init__``.
    """

    def __init__(
        self,
        env: gym.Env,
        hidden_layers: list | tuple = (400, 300),
        *args,
        **kwargs
    ):
        super(Critic, self).__init__(*args, **kwargs)

        self.env = env
        obs_dim = self.env.observation_space.shape[0]  # note: this assumes obs spaces of shape (N,) ie 1d
        act_dim = self.env.action_space.shape[0]

        # the layers before we add the action
        self.before_action = nn.Sequential(
            nn.Linear(obs_dim, hidden_layers[0]),
            nn.BatchNorm1d(hidden_layers[0]),
            nn.ReLU(),
        )

        # add the action input: a non-linear layer over [encoding, action] so
        # that dQ/da depends on both the state and the action
        self.add_action = nn.Sequential(
            nn.Linear(hidden_layers[0] + act_dim, hidden_layers[1]),
            nn.ReLU(),
            nn.Linear(hidden_layers[1], 1),
        )

    def forward(
        self,
        obs: torch.Tensor,
        action: torch.Tensor
    ):
        """Return the Q-value for each (obs, action) row.

        ``obs`` and ``action`` are concatenated column-wise after the
        observation has been encoded, so they must share the batch dimension.
        The output has one column per row of the batch.
        """
        representations = self.before_action(obs)
        rep_and_act = torch.hstack((representations, action))
        q_val = self.add_action(rep_and_act)
        return q_val


class Actor(nn.Module):
    """Deterministic policy network for DDPG with built-in OU exploration noise.

    The network maps an observation to a ``tanh``-squashed action scaled by
    ``action_max``. The output width and the exploration-noise dimension are
    both taken from ``env.action_space.shape[0]``.

    Args:
        env: Environment whose ``observation_space.shape[0]`` and
            ``action_space.shape[0]`` size the network (1-d spaces only).
        action_max: Magnitude bound applied to the ``tanh`` output and used to
            clip noisy actions.
        hidden_layers: Widths of the two hidden layers.
        theta: Mean-reversion rate of the Ornstein-Uhlenbeck noise.
        sigma: Scale of the Ornstein-Uhlenbeck noise.
        *args, **kwargs: Forwarded to ``nn.Module.__init__``.
    """

    def __init__(
        self,
        env: gym.Env,
        action_max: float | int = 1,
        hidden_layers: list | tuple = (400, 300),
        theta: float = 0.15,
        sigma: float = 0.2,
        *args,
        **kwargs
    ):
        super(Actor, self).__init__(*args, **kwargs)

        self.env = env
        self.action_max = action_max

        obs_dim = self.env.observation_space.shape[0]
        act_dim = self.env.action_space.shape[0]

        self.layers = nn.Sequential(
            nn.Linear(obs_dim, hidden_layers[0]),
            nn.BatchNorm1d(hidden_layers[0]),
            nn.ReLU(),

            nn.Linear(hidden_layers[0], hidden_layers[1]),
            nn.BatchNorm1d(hidden_layers[1]),
            nn.ReLU(),

            # one output per action dimension instead of a hard-coded 1
            nn.Linear(hidden_layers[1], act_dim),
            nn.Tanh(),
        )

        # one independent noise component per action dimension
        self.ounoise = OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(act_dim), theta=theta, sigma=sigma
        )

    def forward(
        self,
        obs: torch.Tensor,
    ):
        """Return the deterministic action for each observation row, in
        ``[-action_max, action_max]``."""
        return self.layers(obs) * self.action_max

    def sample_actions(
        self,
        obs: torch.Tensor,
        noise: float
    ):
        """Return exploratory actions as a NumPy array.

        The deterministic action is computed in eval mode without gradients,
        one OU noise sample scaled by ``noise`` is added, and the result is
        clipped to ``[-action_max, action_max]``. ``noise=1`` applies the raw
        OU sample; ``noise=0`` disables exploration.

        Args:
            obs: Batch of observations, one per row.
            noise: Multiplier applied to the OU noise sample.

        Returns:
            Array with the same shape as the network output for ``obs``.
        """
        was_training = self.training
        # Eval mode makes BatchNorm use its running statistics, which is what
        # we want when acting on a single observation rather than a batch.
        self.eval()
        with torch.no_grad():
            actions = self(obs)
        # Restore whichever mode the caller had instead of forcing train mode.
        self.train(was_training)

        noisy_actions = actions.cpu().numpy() + noise * self.ounoise.noise()
        # Noise can push the action past the tanh bound; keep it in range.
        return np.clip(noisy_actions, -self.action_max, self.action_max)

In [49]:
def _soft_update(source: nn.Module, target: nn.Module, tau: float) -> None:
    """Move ``target``'s parameters a fraction ``tau`` towards ``source``'s, in place."""
    # no_grad: the update is bookkeeping and must not build an autograd graph.
    with torch.no_grad():
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.copy_(tau * param + (1 - tau) * target_param)


def _run_validation(actor, val_env, num_val_eps, metrics, device) -> None:
    """Play ``num_val_eps`` noise-free episodes and push each return into ``metrics``."""
    actor.eval()
    for _ in range(num_val_eps):
        obs_val, _ = val_env.reset()
        done_val = False
        ep_reward = 0
        while not done_val:
            obs_tensor = torch.as_tensor(
                obs_val, dtype=torch.float32, device=device).view(1, -1)
            with torch.no_grad():
                action = actor(obs_tensor).cpu().numpy()
            # Drop the batch dimension so the env receives an action shaped
            # like its action space.
            action = action.reshape(val_env.action_space.shape)

            obs_val, reward_val, terminated, truncated, _ = val_env.step(action)
            ep_reward += reward_val
            done_val = terminated or truncated

        metrics.update(ep_reward)
    actor.train()


def train(
    env: gym.Env,
    action_max: int = 1,
    batch_size: int = 32,
    timesteps: int = 1000000,
    capacity: int = 100000,
    preload: int = 10000,
    gamma: float = 0.99,
    tau: float = 0.001,
    lr_actor: float = 0.0001,
    lr_critic: float = 0.001,
    expl_noise: float = 1,
    window: int = 10,
    wd: float = 0.01,
    val_freq: int = 10000,
    num_val_eps: int = 2,
    device: str = 'cpu',
    save_step: int = 850000,
    save_path: str = 'ddpg_checkpoint.pt',
):
    """Train a DDPG actor/critic pair on ``env``.

    The replay buffer is first filled with ``preload`` uniformly random
    transitions. Each training step then takes one noisy environment step,
    samples a minibatch, updates the critic on the TD target and the actor on
    ``-Q(s, actor(s))``, and soft-updates both target networks. Every
    ``val_freq`` steps the actor is evaluated without noise on a copy of
    ``env``.

    Args:
        env: Training environment (Gymnasium API); copied with ``deepcopy``
            for validation.
        action_max: Action magnitude bound passed to the actors.
        batch_size: Minibatch size requested from the replay buffer.
        timesteps: Number of training steps to run.
        capacity: Replay buffer length.
        preload: Number of random transitions collected before training.
        gamma: Discount factor.
        tau: Soft target-update rate.
        lr_actor: Learning rate of the actor's Adam optimiser.
        lr_critic: Learning rate of the critic's AdamW optimiser.
        expl_noise: Value passed as ``noise`` to ``Actor.sample_actions``.
        window: Window size of the rolling validation-reward average.
        wd: Weight decay of the critic's AdamW optimiser.
        val_freq: Run validation every this many steps.
        num_val_eps: Validation episodes per validation round.
        device: Torch device for networks and sampled batches.
        save_step: Checkpoints are only written after this step.
        save_path: Destination file for the best checkpoint.

    Returns:
        Tuple ``(actor, critic, metrics)``.
    """
    # setup
    metrics = RollingAverage(window_size=window)
    buffer = ReplayBuffer(buffer_len=capacity)

    actor = Actor(env, action_max=action_max).to(device)
    critic = Critic(env).to(device)
    actor_target = Actor(env, action_max=action_max).to(device)
    critic_target = Critic(env).to(device)

    actor_target.load_state_dict(actor.state_dict())
    critic_target.load_state_dict(critic.state_dict())

    actor_optimizer = torch.optim.Adam(actor.parameters(), lr=lr_actor)
    critic_optimizer = torch.optim.AdamW(critic.parameters(), lr=lr_critic, weight_decay=wd)
    mse_loss = nn.MSELoss()

    val_env = deepcopy(env)
    best_reward = float('-inf')

    # preload the buffer with random transitions
    obs, _ = env.reset()
    for _ in tqdm(range(preload)):
        action = env.action_space.sample()
        obs_prime, reward, terminated, truncated, _ = env.step(action)

        # Store only true termination as the done flag: a time-limit
        # truncation does not end the MDP, so the TD target must still
        # bootstrap from the next state.
        buffer.update(obs.squeeze(), action.squeeze(), reward, obs_prime.squeeze(), terminated)

        obs = obs_prime
        if terminated or truncated:
            obs, _ = env.reset()

    obs, _ = env.reset()
    # actual training loop (inclusive upper bound so exactly `timesteps` steps run)
    for step in range(1, timesteps + 1):
        start_time = time.time()
        if not actor.training:
            actor.train()

        obs_tensor = torch.as_tensor(obs, dtype=torch.float32, device=device).view(1, -1)
        sampled_action = actor.sample_actions(obs_tensor, noise=expl_noise)
        # sample_actions returns a batched array; the env expects an action
        # shaped like its action space.
        sampled_action = sampled_action.reshape(env.action_space.shape)
        obs_prime, reward, terminated, truncated, _ = env.step(sampled_action)

        buffer.update(obs.squeeze(), sampled_action.squeeze(), reward, obs_prime.squeeze(), terminated)

        obs = obs_prime
        if terminated or truncated:
            obs, _ = env.reset()
            actor.ounoise.reset()

        batch_obs, batch_actions, batch_rewards, batch_next_obs, batch_dones = buffer.sample(
            batch_size, device=device)

        batch_rewards = batch_rewards.view(-1, 1)
        batch_dones = batch_dones.view(-1, 1)
        # keep one row per transition whatever the action dimensionality
        batch_actions = batch_actions.view(len(batch_actions), -1)

        with torch.no_grad():
            target_actions = actor_target(batch_next_obs)
            q_targets = critic_target(batch_next_obs, target_actions)
            # torch.where needs a boolean condition; .bool() is a no-op when
            # the buffer already returns booleans.
            td_target = torch.where(
                batch_dones.bool(), batch_rewards, batch_rewards + gamma * q_targets)

        # critic update
        q_values = critic(batch_obs, batch_actions)
        loss_critic = mse_loss(q_values, td_target)

        critic_optimizer.zero_grad()
        loss_critic.backward()
        critic_optimizer.step()

        # actor update: ascend the critic's estimate of the actor's own actions
        actor_actions = actor(batch_obs)
        actor_values = critic(batch_obs, actor_actions)
        loss_actor = -(actor_values).mean()

        actor_optimizer.zero_grad()
        loss_actor.backward()
        actor_optimizer.step()

        # soft target update
        _soft_update(critic, critic_target, tau)
        _soft_update(actor, actor_target, tau)

        # val loop
        if step % val_freq == 0:
            _run_validation(actor, val_env, num_val_eps, metrics, device)

            # NOTE(review): get_average is read as an attribute here, exactly
            # as the progress print below does - confirm RollingAverage
            # exposes it as a property.
            avg_reward = metrics.get_average
            # Checkpoint only when the rolling average improves on the best
            # value seen so far, and remember that value.
            if step > save_step and avg_reward > best_reward:
                best_reward = avg_reward
                torch.save({
                    'actor_state_dict' : actor.state_dict(),
                    'critic_state_dict' : critic.state_dict()
                }, save_path)

        time_per_step = time.time() - start_time
        print(f'Step {step} | Average Val Reward: {metrics.get_average} | Time per step {time_per_step:.5f}', end='\r')

    return actor, critic, metrics

In [None]:
# Create the MountainCarContinuous training environment and start DDPG
# training on the notebook-level `device`, validating every 5000 steps.
# All other hyperparameters use the defaults of `train`.
env = gym.make('MountainCarContinuous-v0')
actor, critic, metrics = train(env, device=device, val_freq=5000)

 60%|█████▉    | 5973/10000 [00:00<00:00, 29888.68it/s]