


*   This is a minimalistic 1-step Advantage Actor–Critic (A2C) RL algorithm implementation.

*   The agent collects a full episode of transitions and performs a single batch update afterward.


*   The critic uses a 1-step TD target, $r_t + \gamma V(s_{t+1})$, i.e. TD(0)-style bootstrapping.
*   There is no entropy bonus, no GAE, and no multi-step returns — just the core vanilla A2C logic.





# **IMPORTING LIBRARIES**

In [84]:
import os
import torch
import random
import torch.nn as nn
import numpy as np
import time
from collections import deque
import wandb
from tqdm import tqdm
import gymnasium as gym
import ale_py
from gymnasium.wrappers import RecordVideo
from torch.distributions import Categorical

In [None]:
def createEnvironment(cfg):
  """Create the Gymnasium environment named by ``cfg.game_id``.

  The environment is built with ``render_mode="rgb_array"``.
  """
  return gym.make(cfg.game_id, render_mode="rgb_array")

# **WANDB RUN**

In [None]:
def wandb_runs(cfg):
  """Log in to Weights & Biases and start a run configured from ``cfg``.

  Args:
    cfg: configuration object whose public attributes are recorded as the
      run's config.

  Returns:
    The run object returned by ``wandb.init``.
  """
  # `vars(cfg)` alone only sees attributes stored on the instance; values
  # declared as plain class attributes would be missing from the logged
  # config. Start from the class-level values, then let instance values win.
  config = {
    key: value
    for key, value in vars(type(cfg)).items()
    if not key.startswith("_")
    and not callable(value)
    and not isinstance(value, (property, classmethod, staticmethod))
  }
  config.update(vars(cfg))

  # NOTE(review): the API key is an empty string here — confirm how
  # credentials are expected to be supplied in this environment.
  wandb.login(key = "")
  # NOTE(review): project/name say "DDPG" although this notebook describes
  # an A2C implementation — confirm the intended labels before renaming.
  run = wandb.init(
    entity="ajheshbasnet-kpriet",
    project="ddpg",
    name = "DDPG",
    config=config,
  )

  return run

# **CONFIGURATIONS**

In [None]:
from dataclasses import dataclass

@dataclass
class configuration:
  """Hyperparameters and run-time counters for the training run.

  Every attribute is type-annotated so that ``@dataclass`` registers it as a
  field: instances then carry the values in ``__dict__`` (so ``vars(cfg)``
  returns the full configuration) and can be overridden by keyword, e.g.
  ``configuration(eval_loops=5)``.
  """
  # Gymnasium environment id passed to `gym.make`.
  game_id: str = "CartPole-v1"
  # Number of iterations of the outer training loop (one episode each).
  n_rollouts: int = 100_000
  # Running count of environment steps; incremented by the training loop.
  global_steps: int = 0
  # Number of episodes averaged per evaluation call.
  eval_loops: int = 3
  # Learning rates for the critic and actor optimizers.
  critic_lr: float = 2.5e-4
  actor_lr: float = 2.5e-4
  # Evaluate whenever `global_steps` is a multiple of this value.
  eval_steps: int = 10_000
  # Record evaluation video whenever `global_steps` is a multiple of this value.
  record_video: int = 500_000
  # Torch device string, chosen once when the class is defined.
  device: str = "cuda" if torch.cuda.is_available() else "cpu"

cfg = configuration()

In [None]:
# Training environment; the training loop resets and steps this instance.
envs = createEnvironment(cfg)

**Checking that the environment works :)**

In [None]:
# Smoke test: reset the environment and show the first element of what
# `reset()` returns (the initial observation).
envs.reset()[0]

# **Actor and Critic Network**

In [None]:
class Actor(nn.Module):
  """Policy network: an MLP producing a categorical distribution over actions.

  Args:
    input_dim: size of the last dimension of the observation tensor.
    action_dim: number of discrete actions.
  """

  def __init__(self, input_dim, action_dim):
    super().__init__()
    self.sequential = nn.Sequential(
        nn.Linear(input_dim, 200),
        nn.ReLU(),
        nn.Linear(200, 200),
        nn.ReLU(),
        nn.Linear(200, 128),
        nn.ReLU(),
        nn.Linear(128, action_dim)
    )

  def forward(self, x):
    """Return action probabilities (softmax over the last dimension)."""
    x = self.sequential(x)
    return torch.softmax(x, dim = -1)

  def log_probs(self, x):
    """Sample an action from the policy.

    Returns:
      A ``(log_prob, action)`` tuple: the sampled action(s) and the
      log-probability of each under the same distribution that ``forward``
      returns.
    """
    # Use the raw pre-softmax scores as logits. Passing the output of
    # `forward` (already probabilities) as `logits=` would apply a second
    # softmax, flattening the policy and making the log-probs inconsistent
    # with the probabilities returned by `forward`.
    logits = self.sequential(x)
    dist = Categorical(logits=logits)

    action = dist.sample()
    log_prob = dist.log_prob(action)

    return log_prob, action

In [None]:
class Critic(nn.Module):
  """State-value network: an MLP mapping an observation to one scalar.

  Args:
    input_dim: size of the last dimension of the observation tensor.
  """

  def __init__(self, input_dim):
    super().__init__()

    # Hidden stack input_dim -> 200 -> 200 -> 128, each followed by ReLU,
    # then a final linear layer producing a single value.
    widths = (input_dim, 200, 200, 128)
    layers = []
    for fan_in, fan_out in zip(widths[:-1], widths[1:]):
      layers += [nn.Linear(fan_in, fan_out), nn.ReLU()]
    layers.append(nn.Linear(widths[-1], 1))

    self.sequential = nn.Sequential(*layers)

  def forward(self, x):
    """Return the network output, with a trailing dimension of size 1."""
    return self.sequential(x)

In [None]:
# Read the observation size from the environment's declared observation
# space instead of resetting the environment just to inspect a sample.
obs_dim = envs.observation_space.shape[0]
actornet = Actor(obs_dim, envs.action_space.n).to(cfg.device)
criticnet = Critic(obs_dim).to(cfg.device)

In [None]:
# Parameter counts of both networks, expressed in thousands.
actor_params_k = sum(map(torch.Tensor.numel, actornet.parameters()))/1e3
critic_params_k = sum(map(torch.Tensor.numel, criticnet.parameters()))/1e3

print(f'''Parameters:
===========================
actor-network :  {actor_params_k} k
critic-network : {critic_params_k} k
===========================
      ''')

**Evaluation Loop**

In [None]:
import warnings

# Silence SyntaxWarning messages whose originating module matches "moviepy"
# (presumably pulled in by video recording — TODO confirm); other warnings
# are left untouched.
warnings.filterwarnings("ignore", category=SyntaxWarning, module="moviepy")

In [None]:
def evaluation(actornet, record_video = False):
  """Run greedy evaluation episodes and return their averages.

  A fresh environment is created for ``cfg.game_id``; ``cfg.eval_loops``
  episodes are played, always taking the arg-max action of ``actornet``.

  Args:
    actornet: policy network; called on a 1-D float32 observation tensor.
    record_video: when True, wrap the environment in ``RecordVideo`` and
      write every episode under ``videos/<cfg.global_steps>``.

  Returns:
    ``(mean_reward, mean_steps)`` averaged over ``cfg.eval_loops`` episodes.
  """

  eval_env = gym.make(id = cfg.game_id, render_mode = 'rgb_array')

  net_reward = 0
  net_step = 0

  # try/finally guarantees the environment (and the video recorder wrapped
  # around it) is closed even if an episode raises part-way through.
  try:
    if record_video:
      video_dir = f"videos/{cfg.global_steps}"
      eval_env = RecordVideo(eval_env,  video_folder=video_dir, episode_trigger=lambda ep: True)

    with torch.no_grad():

      for _ in range(cfg.eval_loops):

        done = False

        episodic_reward = 0
        episodic_step = 0
        state = eval_env.reset()[0]

        while not done:

          stateT = torch.tensor(state, dtype=torch.float32, device=cfg.device)
          # Greedy action: index of the largest policy output.
          action = actornet(stateT).argmax().item()
          nxt_state, reward, terminated, truncated, _ = eval_env.step(action)
          done = terminated or truncated
          state = nxt_state

          episodic_reward += float(reward)
          episodic_step += 1

        net_reward += episodic_reward
        net_step  += episodic_step
  finally:
    eval_env.close()

  net_reward = net_reward / cfg.eval_loops
  net_step = net_step / cfg.eval_loops

  return net_reward, net_step

In [None]:
# evaluation(actornet, True)

**Optimizers for the actor and the critic**

In [None]:
# One AdamW optimizer per network, each with its own learning rate from cfg,
# so the actor and critic are stepped independently in the training loop.
critic_optimizer = torch.optim.AdamW(criticnet.parameters(), lr = cfg.critic_lr)
actor_optimizer = torch.optim.AdamW(actornet.parameters(), lr = cfg.actor_lr)

In [None]:
# Start the Weights & Biases run; the training loop reports metrics through
# `runs.log(...)`.
runs = wandb_runs(cfg)

# **Heart & Core of the notebook: Vanilla A2C Algorithm's Training Loop**

In [None]:
# Discount factor applied to the bootstrapped next-state value in the TD target.
gamma = 0.99

In [None]:
# Each iteration: play one full episode with the current policy, then do a
# single batched actor update and a single batched critic update on it.
for _ in tqdm(range(cfg.n_rollouts)):

  states = []
  next_states = []
  actions = []
  rewards = []
  dones = []
  terminals = []
  log_probs = []

  state = envs.reset()[0]

  statesT = torch.tensor(state, dtype=torch.float32, device=cfg.device).unsqueeze(0)

  done = False

  training_reward = 0
  training_step = 0

  while not done:

    log_prob, actionT = actornet.log_probs(statesT)
    action = actionT.item()
    next_state, reward, terminated, truncated, _ = envs.step(action)
    next_stateT = torch.tensor(next_state, dtype=torch.float32, device=cfg.device)
    rewardT = torch.tensor(reward, dtype=torch.float32, device=cfg.device)
    done = terminated or truncated
    doneT = torch.tensor(done, dtype=torch.float32, device=cfg.device)
    # Only a true terminal state has zero future value. A time-limit
    # truncation ends the episode but the next state is still worth
    # V(s'), so the bootstrap mask must be built from `terminated` alone.
    terminalT = torch.tensor(float(terminated), dtype=torch.float32, device=cfg.device)

    states.append(statesT.squeeze(0))
    next_states.append(next_stateT)
    actions.append(actionT.view(-1))
    log_probs.append(log_prob.view(-1))
    rewards.append(rewardT)
    dones.append(doneT)
    terminals.append(terminalT)

    training_reward += float(reward)
    training_step += 1
    statesT = next_stateT.unsqueeze(0)

    cfg.global_steps += 1
    runs.log({"global-steps": cfg.global_steps})
    if cfg.global_steps%cfg.eval_steps == 0 and cfg.global_steps>0:
      rec = cfg.global_steps%cfg.record_video == 0
      # Switch to eval mode only around the evaluation call, then restore
      # training mode, so the two calls always come as a pair.
      actornet.eval()
      net_reward, net_step = evaluation(actornet, rec)
      runs.log({"evaluation-reward": net_reward, "eval-steps": net_step})
      actornet.train()

  all_states = torch.stack(states)
  all_next_states = torch.stack(next_states)
  all_actions = torch.stack(actions)
  all_log_probs = torch.stack(log_probs).view(-1)
  all_rewards = torch.stack(rewards)
  all_dones = torch.stack(dones)
  all_terminals = torch.stack(terminals)

  # (A Monte-Carlo return G_t = r + gamma * G_{t+1} could be used as the
  # target instead; this notebook uses the 1-step TD target below.)

  # 1-step TD target: r + gamma * V(s'), with the bootstrap term dropped
  # only for genuinely terminal transitions.
  with torch.no_grad():
    next_target = all_rewards + (1 - all_terminals) * gamma * criticnet(all_next_states).squeeze(-1)

  values = criticnet(all_states).squeeze(-1)

  Advantages = next_target - values

  # Keep the un-normalised mean for logging; after normalisation the mean is
  # zero by construction and carries no information.
  raw_advantage_mean = Advantages.mean().item()

  # The sample standard deviation of a single element is NaN, so a one-step
  # episode is left un-normalised instead of poisoning the actor update.
  if Advantages.numel() > 1:
    Advantages = (Advantages - Advantages.mean()) / (Advantages.std() + 1e-9)

  # Advantages are detached so the actor loss does not push gradients into
  # the critic.
  actorloss = - (all_log_probs * Advantages.detach()).mean()

  criticloss = torch.nn.functional.mse_loss(values, next_target.detach())

  runs.log({"actor-loss": actorloss.item(), "critic-loss": criticloss.item()})

  actor_optimizer.zero_grad()
  actorloss.backward()
  actor_optimizer.step()

  critic_optimizer.zero_grad()
  criticloss.backward()
  critic_optimizer.step()

  runs.log({"training_reward" : training_reward, "advantages": raw_advantage_mean, "training-step": training_step})

# **Save the Model Weights**

In [None]:
# Persist the actor's parameters to "actor.pt"; the critic is not saved here.
torch.save(actornet.state_dict(), "actor.pt")