Run the below cell to download the MuJoCo dependencies

In [1]:
!pip install -U gymnasium[mujoco] mujoco

Collecting mujoco
  Downloading mujoco-3.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m41.0/42.0 kB[0m [31m84.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m725.1 kB/s[0m eta [36m0:00:00[0m
Collecting glfw (from mujoco)
  Downloading glfw-2.10.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38.py39.py310.py311.py312.py313.py314-none-manylinux_2_28_x86_64.whl.metadata (5.4 kB)
Downloading mujoco-3.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (7.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m85.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading glfw-2.10.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38.py39.py310.py311

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

# **IMPORTING LIBRARIES**

In [3]:
import torch
import random
import os
import torch.nn as nn
import numpy as np
import time
from collections import deque
import wandb

import gymnasium as gym
from gymnasium.vector import SyncVectorEnv
from gymnasium.wrappers import RecordVideo

In [4]:
os.environ["MUJOCO_GL"] = "egl"

In [5]:
def create_environment(cfgs, eval = False):

  def _init():
      env = gym.make( id=cfgs.id , render_mode="rgb_array", max_episode_steps=cfg.max_steps)
      return env

  return _init

# **WANDB RUN**

In [6]:
def wandb_runs(cfg):

  wandb.login(key = "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")
  run = wandb.init(
    entity="ajheshbasnet-kpriet",
    project="ddpg",
    name = "DDPG",
    config=vars(cfg),
  )

  return run

# **CONFIGURATIONS**

In [7]:
from dataclasses import dataclass

@dataclass
class configuration:
  id = "HalfCheetah-v5"
  n_envs = 8
  n_rollouts = 100_000
  max_steps = 1000
  eval_steps = 10_000
  global_steps = 0
  buffer_size = 500_000
  eval_loops = 3
  batch_size = 128
  trainng_step = 1
  critic_lr = 2.5e-4
  actor_lr = 2.5e-4
  record_video = 500_000
  device = "cuda" if torch.cuda.is_available() else "cpu"

cfg = configuration()

**SyncVectorEnv so that we can run the n-environments parrallelly and utilize the GPUs because single environment is wayy poor**

In [25]:
envs = SyncVectorEnv([create_environment(cfg) for _ in range(cfg.n_envs)])

**Checking environment is working or not:)**

In [9]:
envs.reset()[0][0]

array([-0.06587524,  0.07457796, -0.09485435, -0.04748046,  0.05739668,
       -0.06521819, -0.08906876,  0.05440453, -0.2159224 , -0.1004027 ,
       -0.03345621, -0.04662377,  0.22998296,  0.03340072,  0.0473454 ,
       -0.05126143,  0.18059978])

# **Actor and Critic Netowrk**

In [10]:
class Actor(nn.Module):

  def __init__(self, input_dim, action_dim):
    super().__init__()
    self.sequential = nn.Sequential(
        nn.Linear(input_dim, 256),
        nn.ReLU(),
        nn.Linear(256, 512),
        nn.ReLU(),
        nn.Linear(512, 512),
        nn.ReLU(),
        nn.Linear(512, 128),
        nn.ReLU(),
        nn.Linear(128, action_dim)
    )

  def forward(self, x):
    x = self.sequential(x)
    return x

In [11]:
class Critic(nn.Module):

  def __init__(self, input_dim):
    super().__init__()

    self.sequential = nn.Sequential(
        nn.Linear(input_dim, 256),
        nn.ReLU(),
        nn.Linear(256, 512),
        nn.ReLU(),
        nn.Linear(512, 512),
        nn.ReLU(),
        nn.Linear(512, 128),
        nn.ReLU(),
        nn.Linear(128, 1)
    )

  def forward(self, state, action):
    x = torch.cat([state, action], dim = 1)
    x = self.sequential(x)
    return x

In [12]:
print(envs.single_observation_space,"\t", envs.single_action_space,)

Box(-inf, inf, (17,), float64) 	 Box(-1.0, 1.0, (6,), float32)


In [14]:
actornet = Actor(17, 6).to(cfg.device)
criticnet = Critic(23).to(cfg.device)

TargetActor = Actor(17, 6).to(cfg.device)
TargetCritic = Critic(23).to(cfg.device)

TargetActor.load_state_dict(actornet.state_dict())
TargetCritic.load_state_dict(criticnet.state_dict())

<All keys matched successfully>

In [15]:
print(f'''Parameters:
===========================
actor-network :  {sum(p.numel() for p in actornet.parameters())/1e3} k
critic-network : {sum(p.numel() for p in criticnet.parameters())/ 1e3} k
===========================
      ''')

Parameters:
actor-network :  465.286 k
critic-network : 466.177 k
      


**Evaluation Loop**

In [16]:
def evaluation(actornet, record_video = False):

  eval_env = gym.make(id = cfg.id, render_mode = 'rgb_array' ,max_episode_steps=cfg.max_steps)
  if record_video:
    video_dir = f"videos/{int(time.time())}"
    eval_env = RecordVideo(eval_env,  video_folder=video_dir, episode_trigger=lambda ep: True)

  net_reward = 0
  net_step = 0

  with torch.no_grad():

    for _ in range(cfg.eval_loops):

      done = False

      episodic_reward = 0
      episodic_step = 0
      state = eval_env.reset()[0]

      while not done:

        stateT = torch.tensor(state, dtype=torch.float32, device=cfg.device)
        action = np.array(actornet(stateT).cpu())
        nxt_state, reward, terminated, truncated, _ = eval_env.step(action)
        done = terminated or truncated
        state = nxt_state

        episodic_reward += float(reward)
        episodic_step += 1

      net_reward += episodic_reward
      net_step  += episodic_step

  net_reward = net_reward / cfg.eval_loops
  net_step = net_step / cfg.eval_loops

  eval_env.close()

  return net_reward, net_step

**To sample the batches**

In [17]:
def get_batches(memory, batch_size):
    batches = random.sample(memory, batch_size)
    state, action, reward, next_state, done = zip(*batches)

    state = torch.stack(state).float().to(cfg.device)
    action = torch.stack(action).float().to(cfg.device)
    reward = torch.stack(reward).float().to(cfg.device)
    next_state = torch.stack(next_state).float().to(cfg.device)
    done = torch.stack(done).float().to(cfg.device)  # float for TD computation

    state = state.reshape(-1, state.size(-1))
    action = action.reshape(-1, action.size(-1))
    reward = reward.reshape(-1,1)
    next_state = next_state.reshape(-1, next_state.size(-1))
    done = done.reshape(-1,1)

    return state, action, reward, next_state, done


In [18]:
replay_buffer = deque(maxlen = cfg.buffer_size)
action_sigma = 0.02
tau = 0.005
gamma = 0.98
global_step = cfg.global_steps

In [19]:
critic_optimizer = torch.optim.AdamW(criticnet.parameters(), lr = cfg.critic_lr)
actor_optimizer = torch.optim.AdamW(actornet.parameters(), lr = cfg.actor_lr)

In [20]:
envs.single_observation_space

Box(-inf, inf, (17,), float64)

In [21]:
runs = wandb_runs(cfg)

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: [wandb.login()] Using explicit session credentials for https://api.wandb.ai.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33majheshbasnet[0m ([33majheshbasnet-kpriet[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


# **Heart & Core of the notebook: DDPG Algorithm's Training Loop**

In [22]:
from tqdm import tqdm

for _ in tqdm(range(cfg.n_rollouts)):

  states = envs.reset()[0]

  statesT= torch.tensor(states, dtype=torch.float32, device = cfg.device)

  training_rewards = torch.zeros((cfg.n_envs,), device = cfg.device)


  for _ in range(cfg.max_steps):

    with torch.no_grad():
      action = actornet(statesT).cpu()

    action = np.array(action) + action_sigma * np.random.rand(cfg.n_envs, 6)

    next_states, rewards, terminated, truncated, _ =  envs.step(action)

    done = terminated | truncated

    next_statesT = torch.tensor(next_states, dtype=torch.float32, device = cfg.device)

    actionT = torch.tensor(action, dtype=torch.float32, device = cfg.device)

    rewardsT = torch.tensor(rewards, dtype=torch.float32, device=cfg.device)

    training_rewards += rewardsT

    doneT = torch.tensor(done, dtype=torch.bool, device = cfg.device)

    replay_buffer.append((statesT, actionT, rewardsT, next_statesT, doneT))

    if (global_step + 1) % cfg.trainng_step == 0 and len(replay_buffer)>80_000:
      # Sample batch
      states_b, action_b, reward_b, next_states_b, dones_b = get_batches(replay_buffer, cfg.batch_size)

      # Target Q
      with torch.no_grad():
        next_action = TargetActor(next_states_b)
        target_next_q = TargetCritic(next_states_b, next_action)
        target_q = reward_b + gamma * target_next_q * (1 - dones_b.float())

      # Current critic Q
      current_q = criticnet(states_b, action_b)

      # Critic loss
      critic_loss = torch.nn.functional.mse_loss(current_q, target_q)

      # Actor loss (use current actor)
      # Clone states_b to create an independent computational graph for actor update
      states_b_actor = states_b.clone()
      actor_actions = actornet(states_b_actor) # Renamed to avoid shadowing action_b from get_batches
      actor_loss = -criticnet(states_b_actor, actor_actions).mean()

      runs.log(
          {
              "actor-loss": actor_loss.item(),
              "critic-loss": critic_loss.item()
          }
      )

      # Optimize actor
      actor_optimizer.zero_grad()
      actor_loss.backward()

      # Optimize critic
      critic_optimizer.zero_grad()
      critic_loss.backward()

      critic_optimizer.step()
      actor_optimizer.step()

      # Soft update targets
      for target_param, param in zip(TargetCritic.parameters(), criticnet.parameters()):
          target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

      for target_param, param in zip(TargetActor.parameters(), actornet.parameters()):
          target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

      if global_step%cfg.eval_steps==0 and global_step>0:

        rec = True if global_step%cfg.record_video==0 else False
        eval_reward, eval_steps = evaluation(actornet, rec)
        runs.log(
            {
                "eval-reward": eval_reward,
            }
        )

    if all(done):
      runs.log({"training-reward": training_rewards.mean().item()})

    statesT = next_statesT
    runs.log({"global-steps": global_step, "memory": len(replay_buffer)})
    global_step += 1

  IMAGEMAGICK_BINARY = r"C:\Program Files\ImageMagick-6.8.8-Q16\magick.exe"
  1%|          | 689/100000 [2:42:52<391:16:35, 14.18s/it]


KeyboardInterrupt: 

# **Run the below cell to save the video from the latest Actor Network**

In [24]:
evaluation(actornet, True)

(5154.715834142035, 1000.0)