Run the cell below to install the MuJoCo dependencies.

In [1]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# The extras spec is quoted so the shell cannot glob-expand the square brackets.
%pip install -U "gymnasium[mujoco]" mujoco

Collecting mujoco
  Downloading mujoco-3.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting glfw (from mujoco)
  Downloading glfw-2.10.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38.py39.py310.py311.py312.py313.py314-none-manylinux_2_28_x86_64.whl.metadata (5.4 kB)
Downloading mujoco-3.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (7.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading glfw-2.10.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38.py39.py310.py311.py312.py313.py314-none-manylinux_2_28_x86_64.whl (243 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.5/243.

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

# **IMPORTING LIBRARIES**

In [3]:
import torch
import random
import os
import torch.nn as nn
import numpy as np
import time
from collections import deque
import wandb

import gymnasium as gym
# from gymnasium.vector import SyncVectorEnv
from gymnasium.wrappers import RecordVideo

In [4]:
# Set the MUJOCO_GL environment variable to "egl" before any environment is
# created (presumably to select MuJoCo's EGL rendering backend on a headless
# machine — confirm against the MuJoCo documentation).
os.environ["MUJOCO_GL"] = "egl"

In [5]:
# def create_environment(cfgs, eval = False):

#   def _init():
#       env = gym.make( id=cfgs.id , render_mode="rgb_array", max_episode_steps=cfg.max_steps)
#       return env

#   return _init

In [6]:
def create_environment(cfgs, eval = False):
  """Build a single Gymnasium environment from a configuration object.

  Args:
    cfgs: configuration object providing `id` (environment id passed to
      `gym.make`) and `max_steps` (used as `max_episode_steps`).
    eval: accepted for backward compatibility; currently unused.

  Returns:
    The environment created by `gym.make` with `render_mode="rgb_array"`.
  """
  # Read the episode limit from the argument, not from the module-level `cfg`,
  # so the function honours whichever configuration the caller passes in.
  env = gym.make(id=cfgs.id, render_mode="rgb_array", max_episode_steps=cfgs.max_steps)
  return env

# **WANDB RUN**

In [7]:
def wandb_runs(cfg):
  """Authenticate with Weights & Biases and start the run used for logging.

  The API key is never stored in the notebook. It is read from the
  `WANDB_API_KEY` environment variable; when that is unset, `key=None` lets
  `wandb.login` fall back to its own credential lookup.

  Args:
    cfg: configuration object; `vars(cfg)` is attached to the run as its config.

  Returns:
    The run object returned by `wandb.init`.
  """
  # SECURITY: an API key used to be hard-coded on this line. Any key that was
  # ever committed must be revoked and replaced in the W&B account settings.
  api_key = os.environ.get("WANDB_API_KEY")
  wandb.login(key=api_key)

  run = wandb.init(
    entity="ajheshbasnet-kpriet",
    project="ddpg",
    name = "DDPG",
    config=vars(cfg),
  )

  return run

# **CONFIGURATIONS**

In [8]:
from dataclasses import dataclass

@dataclass
class configuration:
  """Hyper-parameters and run settings for the training notebook.

  Every attribute carries a type annotation: `@dataclass` only turns annotated
  attributes into fields. Without annotations `vars(instance)` is empty and
  keyword overrides such as `configuration(batch_size=64)` raise `TypeError`.
  """
  id: str = "Ant-v5"               # environment id passed to gym.make
  n_rollouts: int = 100_000        # number of outer-loop rollouts
  max_steps: int = 1000            # step limit per training rollout
  eval_steps: int = 10_000         # run an evaluation every this many global steps
  global_steps: int = 0            # environment-step counter, incremented during training
  buffer_size: int = 800_000       # maximum number of stored transitions
  eval_loops: int = 3              # episodes averaged per evaluation
  batch_size: int = 512            # transitions sampled per gradient update
  wandb_log_steps: int = 50        # NOTE(review): not referenced by any visible cell
  start_training: int = 50_000     # buffer must hold more than this before updates begin
  training_step: int = 2           # perform an update every this many global steps
  actor_freq: int = 3              # actor is updated when global_steps is a multiple of this
  critic_lr: float = 2.5e-4        # learning rate for both critics
  actor_lr: float = 2.5e-4         # learning rate for the actor
  record_video: int = 500_000      # record an evaluation video every this many global steps
  eval_max_steps: int = 800        # step limit per evaluation episode
  device: str = "cuda" if torch.cuda.is_available() else "cpu"

cfg = configuration()

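**`SyncVectorEnv` would let us run n environments in parallel and make better use of the GPU, because a single environment is very slow. Note: the cell below currently creates a single environment; the vectorized version is left commented out.**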

In [9]:
# Vectorized alternative, currently disabled. It relies on `cfg.n_envs`, which the
# configuration class does not define, and on the factory-style create_environment.
# envs = SyncVectorEnv([create_environment(cfg) for _ in range(cfg.n_envs)])

# Despite the plural name, `envs` is a single (non-vectorized) environment.
envs = create_environment(cfg)

**Checking environment is working or not:)**

# **Actor and Critic Networks**

In [10]:
class Actor(nn.Module):
  """Deterministic policy network mapping an observation to an action.

  Two hidden ReLU layers (512 and 256 units) are followed by a linear output
  layer and a Tanh, so every action component lies in [-1, 1].
  """

  def __init__(self, input_dim, action_dim):
    super().__init__()

    # Layers are created in input-to-output order so parameter names and
    # initialisation order stay the same as a literal nn.Sequential(...) listing.
    layers = []
    in_features = input_dim
    for width in (512, 256):
      layers.append(nn.Linear(in_features, width))
      layers.append(nn.ReLU())
      in_features = width
    layers.append(nn.Linear(in_features, action_dim))
    layers.append(nn.Tanh())

    self.sequential = nn.Sequential(*layers)

  def forward(self, x):
    """Return the action(s) produced for observation tensor `x`."""
    return self.sequential(x)

In [11]:
class Critic(nn.Module):
  """Q-network that scores a (state, action) pair with a single scalar.

  `input_dim` is the width of the concatenated state-action vector. The
  network has two hidden ReLU layers (512 and 256 units) and a linear output.
  """

  def __init__(self, input_dim):
    super().__init__()

    widths = [input_dim, 512, 256]
    modules = []
    # One Linear + ReLU pair per consecutive pair of widths, then the value head.
    for fan_in, fan_out in zip(widths[:-1], widths[1:]):
      modules.extend((nn.Linear(fan_in, fan_out), nn.ReLU()))
    modules.append(nn.Linear(widths[-1], 1))

    self.sequential = nn.Sequential(*modules)

  def forward(self, state, action):
    """Concatenate `state` and `action` along dim 1 and return the Q-value."""
    joint = torch.cat((state, action), dim = 1)
    return self.sequential(joint)

In [12]:
# Show the observation and action spaces; their shapes size the networks below.
print(envs.observation_space,"\t", envs.action_space,)

Box(-inf, inf, (105,), float64) 	 Box(-1.0, 1.0, (8,), float32)


In [13]:
# Read the space dimensions once; the critics consume state and action concatenated.
obs_dim = envs.observation_space.shape[0]  #type: ignore
act_dim = envs.action_space.shape[0]  #type: ignore
critic_in_dim = obs_dim + act_dim

# Online networks: one actor and two critics.
actornet = Actor(obs_dim, act_dim).to(cfg.device)
criticnet1 = Critic(critic_in_dim).to(cfg.device)
criticnet2 = Critic(critic_in_dim).to(cfg.device)

# Target networks with the same architectures.
TargetActor = Actor(obs_dim, act_dim).to(cfg.device)
TargetCritic1 = Critic(critic_in_dim).to(cfg.device)
TargetCritic2 = Critic(critic_in_dim).to(cfg.device)

# Each target starts as an exact copy of its online counterpart.
TargetActor.load_state_dict(actornet.state_dict())
TargetCritic1.load_state_dict(criticnet1.state_dict())
TargetCritic2.load_state_dict(criticnet2.state_dict())

<All keys matched successfully>

In [14]:
# Report the number of parameters of the actor and of each critic, in thousands.
print(f'''Parameters:
=================================================================
actor-network     : {sum(p.numel() for p in actornet.parameters())/1e3} k
critic-network(s) : {sum(p.numel() for p in criticnet1.parameters())/ 1e3} k + {sum(p.numel() for p in criticnet2.parameters())/ 1e3} k
=================================================================
      ''')

Parameters:
actor-network     : 187.656 k
critic-network(s) : 189.953 k + 189.953 k
      


**Evaluation Loop**

In [15]:
def evaluation(actornet, record_video = False):
  """Roll out the actor without exploration noise and report average results.

  A fresh environment is created from `cfg.id` with `cfg.eval_max_steps` as the
  episode limit, and `cfg.eval_loops` episodes are run with the actor's raw
  output as the action.

  Args:
    actornet: policy network called on a float32 observation tensor.
    record_video: when True, wrap the environment in `RecordVideo` and save
      every episode under `videos/<unix-timestamp>`.

  Returns:
    Tuple `(mean episode reward, mean episode length)` over the episodes.
  """
  eval_env = gym.make(id = cfg.id, render_mode = 'rgb_array', max_episode_steps=cfg.eval_max_steps)

  net_reward = 0
  net_step = 0

  # try/finally guarantees the environment (and any video recorder wrapped
  # around it) is closed even if a rollout raises or is interrupted.
  try:
    if record_video:
      video_dir = f"videos/{int(time.time())}"
      eval_env = RecordVideo(eval_env, video_folder=video_dir, episode_trigger=lambda ep: True)

    with torch.no_grad():

      for _ in range(cfg.eval_loops):

        done = False
        episodic_reward = 0
        episodic_step = 0
        state = eval_env.reset()[0]

        while not done:
          stateT = torch.tensor(state, dtype=torch.float32, device=cfg.device)
          action = actornet(stateT).cpu().numpy()
          state, reward, terminated, truncated, _ = eval_env.step(action)
          done = terminated or truncated

          episodic_reward += float(reward)
          episodic_step += 1

        net_reward += episodic_reward
        net_step += episodic_step
  finally:
    eval_env.close()

  net_reward = net_reward / cfg.eval_loops
  net_step = net_step / cfg.eval_loops

  return net_reward, net_step

In [16]:
evaluation(actornet, False)

(793.4922706923782, 800.0)

**To sample the batches**

In [17]:
def get_batches(memory, batch_size, device = None):
    """Sample a random mini-batch of transitions and stack it into tensors.

    Args:
        memory: sequence of `(state, action, reward, next_state, done)` tuples
            whose members are tensors.
        batch_size: number of transitions to draw without replacement;
            `random.sample` raises `ValueError` if it exceeds `len(memory)`.
        device: device the stacked tensors are moved to. Defaults to
            `cfg.device` when omitted, which keeps existing two-argument calls
            working unchanged.

    Returns:
        `(state, action, reward, next_state, done)` as float tensors, with
        `reward` and `done` reshaped to `(batch_size, 1)`.
    """
    if device is None:
        device = cfg.device

    transitions = random.sample(memory, batch_size)
    state, action, reward, next_state, done = zip(*transitions)

    def _stack(items):
        # Everything (including boolean done flags) is cast to float so it can
        # be used directly in the TD-target arithmetic.
        return torch.stack(items).float().to(device)

    return (
        _stack(state),
        _stack(action),
        _stack(reward).view(-1, 1),
        _stack(next_state),
        _stack(done).view(-1, 1),
    )

# **REPLAY MEMORY**

In [18]:
# Replay memory: a bounded deque, so the oldest transitions are dropped once it is full.
replay_buffer = deque(maxlen = cfg.buffer_size)
action_sigma = 0.1   # scale applied to the exploration noise added to the actor's action
tau = 0.001          # interpolation factor of the soft target-network updates
gamma = 0.99         # discount factor in the TD target
noise_clip = 0.5     # clamp bound for the exploration noise and the target-action noise
policy_noise = 0.2   # scale of the Gaussian noise added to the target actor's action
# One-time snapshot of cfg.global_steps taken when this cell runs; the training
# loop increments cfg.global_steps, not this variable.
global_step = cfg.global_steps

In [19]:
# One AdamW optimizer per online network; the target networks have no optimizer
# and are only changed by the soft updates in the training loop.
critic_optimizer1 = torch.optim.AdamW(criticnet1.parameters(), lr = cfg.critic_lr)
critic_optimizer2 = torch.optim.AdamW(criticnet2.parameters(), lr = cfg.critic_lr)
actor_optimizer = torch.optim.AdamW(actornet.parameters(), lr = cfg.actor_lr)

**W&B RUNS TO LOG THE METRICS**

In [20]:
runs = wandb_runs(cfg)

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: [wandb.login()] Using explicit session credentials for https://api.wandb.ai.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33majheshbasnet[0m ([33majheshbasnet-kpriet[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


# **Heart & Core of the notebook: DDPG Algorithm's Training Loop (with TD3-style twin critics, target-policy smoothing and delayed actor updates)**

In [21]:
from tqdm import tqdm

# Training loop. Each outer iteration is one rollout of at most cfg.max_steps
# environment steps. Transitions go into the replay buffer; once the buffer
# holds more than cfg.start_training transitions, the critics are updated every
# cfg.training_step global steps and the actor whenever cfg.global_steps is a
# multiple of cfg.actor_freq.
for _ in tqdm(range(cfg.n_rollouts)):

  # Start a fresh episode.
  states = envs.reset()[0]

  statesT= torch.tensor(states, dtype=torch.float32, device = cfg.device)

  training_rewards = 0

  for _ in range(cfg.max_steps):

    # Actor output plus clipped Gaussian exploration noise, kept inside [-1, 1].
    with torch.no_grad():
      action = actornet(statesT).view(-1)

    action_noise = torch.clamp(torch.randn_like(action), -noise_clip, noise_clip)

    action = (action + action_sigma * action_noise)

    action = torch.clamp(action, -1.0, 1.0).cpu().numpy()

    next_states, rewards, terminated, truncated, _ =  envs.step(action)

    # `done` ends the rollout for either reason. Only `terminated` is stored for
    # learning: a time-limit truncation is not a terminal state, so the TD
    # target must still bootstrap from the next state in that case.
    done = terminated or truncated

    next_statesT = torch.tensor(next_states, dtype=torch.float32, device = cfg.device)

    actionT = torch.tensor(action, dtype=torch.float32, device = cfg.device)

    rewardsT = torch.tensor(rewards, dtype=torch.float32, device=cfg.device)

    training_rewards += float(rewards)

    terminalT = torch.tensor(terminated, dtype=torch.bool, device = cfg.device)

    replay_buffer.append((statesT, actionT, rewardsT, next_statesT, terminalT))

    statesT = next_statesT

    cfg.global_steps += 1

    if cfg.global_steps% cfg.training_step == 0 and len(replay_buffer)>cfg.start_training:
      # Sample batch
      states_b, action_b, reward_b, next_states_b, dones_b = get_batches(replay_buffer, cfg.batch_size)

      # Target Q: smoothed target action, minimum of the two target critics.
      with torch.no_grad():

        next_action_ = TargetActor(next_states_b)
        noise_next_action = torch.clamp(torch.randn_like(next_action_) * policy_noise, -noise_clip, +noise_clip)
        next_action = next_action_ + noise_next_action
        next_action = torch.clamp(next_action, -1.0, 1.0)

        target_next_q1 = TargetCritic1(next_states_b, next_action)
        target_next_q2 = TargetCritic2(next_states_b, next_action)
        target_q = reward_b + gamma * torch.min(target_next_q1, target_next_q2) * (1 - dones_b.float())

      # Current critic Q
      current_q1 = criticnet1(states_b, action_b)
      current_q2 = criticnet2(states_b, action_b)

      # Critic loss
      critic_loss1 = torch.nn.functional.mse_loss(current_q1, target_q)
      critic_loss2 = torch.nn.functional.mse_loss(current_q2, target_q)

      # Optimize critic1
      critic_optimizer1.zero_grad()
      critic_loss1.backward()
      critic_grad_norm1 = torch.nn.utils.clip_grad_norm_(criticnet1.parameters(), max_norm=1.0)
      critic_optimizer1.step()

      # Optimize critic2
      critic_optimizer2.zero_grad()
      critic_loss2.backward()
      critic_grad_norm2 = torch.nn.utils.clip_grad_norm_(criticnet2.parameters(), max_norm=1.0)
      critic_optimizer2.step()

      # Actor loss (use current actor). It is evaluated on every update step so
      # it can be logged, but gradients are applied only on actor-update steps.
      states_b_actor = states_b.clone()
      actor_actions = actornet(states_b_actor)

      min_q = torch.min(criticnet1(states_b_actor, actor_actions), criticnet2(states_b_actor, actor_actions))
      actor_loss = -min_q.mean()

      if cfg.global_steps%cfg.actor_freq==0:
        # Freeze the critics so the actor's backward pass does not accumulate
        # gradients into their parameters.
        for p in criticnet1.parameters():
          p.requires_grad = False

        for p in criticnet2.parameters():
          p.requires_grad = False

        # Optimize actor
        actor_optimizer.zero_grad()
        actor_loss.backward()
        actor_grad_norm = torch.nn.utils.clip_grad_norm_(actornet.parameters(), max_norm=1.0)
        actor_optimizer.step()

        for p in criticnet1.parameters():
          p.requires_grad = True
        for p in criticnet2.parameters():
          p.requires_grad = True

      # Mean TD error of the batch (logged under the name "advantages").
      advantages = (target_q - torch.min(current_q1, current_q2)).detach().mean()

      runs.log(
          {
              "actor-loss": actor_loss.item(),
              "critic-loss1": critic_loss1.item(),
              "critic-loss2": critic_loss2.item(),
              "advantages": advantages.item()
          }
      )
      # Soft update targets
      for target_param, param in zip(TargetCritic1.parameters(), criticnet1.parameters()):
          target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

      for target_param, param in zip(TargetCritic2.parameters(), criticnet2.parameters()):
          target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

      for target_param, param in zip(TargetActor.parameters(), actornet.parameters()):
          target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

      runs.log({"training-reward": training_rewards, "global-steps": cfg.global_steps, "memory": len(replay_buffer)})

      if cfg.global_steps%cfg.eval_steps==0 and cfg.global_steps>1:
          # Use the live counter: the module-level `global_step` is a snapshot
          # that never changes, which made every evaluation record a video.
          rec = cfg.global_steps % cfg.record_video == 0
          eval_reward, eval_steps = evaluation(actornet, rec)
          runs.log(
              {
                  "eval-reward": eval_reward,
              }
          )

    # The episode is over: stop stepping this environment and let the outer
    # loop reset it. Stepping after termination/truncation is not valid.
    if done:
      break

envs.close()
wandb.finish()

  0%|          | 65/100000 [03:14<82:58:49,  2.99s/it] 


KeyboardInterrupt: 

In [None]:
next_action_, noise_next_action

In [None]:
torch.clamp(torch.randn_like(next_action_), -noise_clip, noise_clip)

**Run the below cell to save the video from the latest Actor Network**

In [None]:
evaluation(actornet, True)

# **END**

In [None]:
cfg.global_steps