### Imports

In [1]:
import copy
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import gymnasium as gym
import argparse
import os
from tqdm import tqdm
import csv
import matplotlib.pyplot as plt
import time

import utils
import TD3
import DDPG

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
print(f"PyTorch version: {torch.__version__}")

Using device: cuda
PyTorch version: 2.9.0+cu128


### Utils

In [2]:
class ReplayBuffer(object):
	def __init__(self, state_dim, action_dim, max_size=int(1e6)):
		self.max_size = max_size
		self.ptr = 0
		self.size = 0

		self.state = np.zeros((max_size, state_dim))
		self.action = np.zeros((max_size, action_dim))
		self.next_state = np.zeros((max_size, state_dim))
		self.reward = np.zeros((max_size, 1))
		self.not_done = np.zeros((max_size, 1))

		self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


	def add(self, state, action, next_state, reward, done):
		self.state[self.ptr] = state
		self.action[self.ptr] = action
		self.next_state[self.ptr] = next_state
		self.reward[self.ptr] = reward
		self.not_done[self.ptr] = 1. - done

		self.ptr = (self.ptr + 1) % self.max_size
		self.size = min(self.size + 1, self.max_size)


	def sample(self, batch_size):
		ind = np.random.randint(0, self.size, size=batch_size)

		return (
			torch.FloatTensor(self.state[ind]).to(self.device),
			torch.FloatTensor(self.action[ind]).to(self.device),
			torch.FloatTensor(self.next_state[ind]).to(self.device),
			torch.FloatTensor(self.reward[ind]).to(self.device),
			torch.FloatTensor(self.not_done[ind]).to(self.device)
		)

### Actor Networks

#### Actor DDPG & Actor TD3

In [4]:
class ActorDDPG(nn.Module):
    """Actor network for DDPG (from DDPG.py - 400-300 architecture)."""
    
    def __init__(self, state_dim, action_dim, max_action):
        super(ActorDDPG, self).__init__()

        self.l1 = nn.Linear(state_dim, 400)
        self.l2 = nn.Linear(400, 300)
        self.l3 = nn.Linear(300, action_dim)
        
        self.max_action = max_action

    def forward(self, state):
        a = F.relu(self.l1(state))
        a = F.relu(self.l2(a))
        return self.max_action * torch.tanh(self.l3(a))


class ActorTD3(nn.Module):
    """Actor network for TD3 (from TD3.py - 256-256 architecture)."""
    
    def __init__(self, state_dim, action_dim, max_action):
        super(ActorTD3, self).__init__()

        self.l1 = nn.Linear(state_dim, 256)
        self.l2 = nn.Linear(256, 256)
        self.l3 = nn.Linear(256, action_dim)
        
        self.max_action = max_action

    def forward(self, state):
        a = F.relu(self.l1(state))
        a = F.relu(self.l2(a))
        return self.max_action * torch.tanh(self.l3(a))

### Critic Networks

#### Critic DDPG & Critic TD3

In [6]:
class CriticDDPG(nn.Module):
    """Critic network for DDPG (from DDPG.py)."""
    
    def __init__(self, state_dim, action_dim):
        super(CriticDDPG, self).__init__()

        # DDPG architecture: state first, then concat with action
        self.l1 = nn.Linear(state_dim, 400)
        self.l2 = nn.Linear(400 + action_dim, 300)
        self.l3 = nn.Linear(300, 1)

    def forward(self, state, action):
        # Process state first, then concatenate with action
        q = F.relu(self.l1(state))
        q = F.relu(self.l2(torch.cat([q, action], 1)))
        return self.l3(q)


class CriticTD3(nn.Module):
    """Twin critic networks for TD3 (from TD3.py)."""
    
    def __init__(self, state_dim, action_dim):
        super(CriticTD3, self).__init__()

        # Q1 architecture
        self.l1 = nn.Linear(state_dim + action_dim, 256)
        self.l2 = nn.Linear(256, 256)
        self.l3 = nn.Linear(256, 1)

        # Q2 architecture
        self.l4 = nn.Linear(state_dim + action_dim, 256)
        self.l5 = nn.Linear(256, 256)
        self.l6 = nn.Linear(256, 1)

    def forward(self, state, action):
        """Forward pass through both Q-networks."""
        sa = torch.cat([state, action], 1)

        q1 = F.relu(self.l1(sa))
        q1 = F.relu(self.l2(q1))
        q1 = self.l3(q1)

        q2 = F.relu(self.l4(sa))
        q2 = F.relu(self.l5(q2))
        q2 = self.l6(q2)
        return q1, q2

    def Q1(self, state, action):
        """Get Q1 value only (used for actor update)."""
        sa = torch.cat([state, action], 1)

        q1 = F.relu(self.l1(sa))
        q1 = F.relu(self.l2(q1))
        q1 = self.l3(q1)
        return q1

### DDPG Agent

In [7]:
class DDPG(object):
    """Deep Deterministic Policy Gradient (from DDPG.py - original paper implementation)."""
    
    def __init__(self, state_dim, action_dim, max_action, discount=0.99, tau=0.001):
        # Initialize actor and target actor (using ActorDDPG with 400-300 architecture)
        self.actor = ActorDDPG(state_dim, action_dim, max_action).to(device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-4)  # lr=1e-4

        # Initialize critic and target critic
        self.critic = CriticDDPG(state_dim, action_dim).to(device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), weight_decay=1e-2)  # weight_decay=1e-2

        self.discount = discount
        self.tau = tau  # tau = 0.001

    def select_action(self, state):
        """Select action using current policy (no exploration noise)."""
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        return self.actor(state).cpu().data.numpy().flatten()

    def train(self, replay_buffer, batch_size=64):  # batch_size=64 (not 256)
        """Perform one training step."""
        # Sample replay buffer
        state, action, next_state, reward, not_done = replay_buffer.sample(batch_size)

        # Compute the target Q value
        target_Q = self.critic_target(next_state, self.actor_target(next_state))
        target_Q = reward + (not_done * self.discount * target_Q).detach()

        # Get current Q estimate
        current_Q = self.critic(state, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Compute actor loss
        actor_loss = -self.critic(state, self.actor(state)).mean()
        
        # Optimize the actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update the frozen target models
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

        for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
        
        return critic_loss.item(), actor_loss.item()

    def save(self, filename):
        """Save model parameters."""
        torch.save(self.critic.state_dict(), filename + "_critic")
        torch.save(self.critic_optimizer.state_dict(), filename + "_critic_optimizer")
        torch.save(self.actor.state_dict(), filename + "_actor")
        torch.save(self.actor_optimizer.state_dict(), filename + "_actor_optimizer")

    def load(self, filename):
        """Load model parameters."""
        self.critic.load_state_dict(torch.load(filename + "_critic"))
        self.critic_optimizer.load_state_dict(torch.load(filename + "_critic_optimizer"))
        self.critic_target = copy.deepcopy(self.critic)

        self.actor.load_state_dict(torch.load(filename + "_actor"))
        self.actor_optimizer.load_state_dict(torch.load(filename + "_actor_optimizer"))
        self.actor_target = copy.deepcopy(self.actor)

print("DDPG agent class defined (from DDPG.py)")

DDPG agent class defined (from DDPG.py)


### TD3 Agent

In [8]:
class TD3(object):
    """Twin Delayed Deep Deterministic Policy Gradient (from TD3.py)."""
    
    def __init__(
        self,
        state_dim,
        action_dim,
        max_action,
        discount=0.99,
        tau=0.005,
        policy_noise=0.2,
        noise_clip=0.5,
        policy_freq=2
    ):
        # Initialize actor and target actor (using ActorTD3 with 256-256 architecture)
        self.actor = ActorTD3(state_dim, action_dim, max_action).to(device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=3e-4)

        # Initialize twin critics and target critics
        self.critic = CriticTD3(state_dim, action_dim).to(device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)

        self.max_action = max_action
        self.discount = discount
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq

        self.total_it = 0

    def select_action(self, state):
        """Select action using current policy (no exploration noise)."""
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        return self.actor(state).cpu().data.numpy().flatten()

    def train(self, replay_buffer, batch_size=256):
        """Perform one training step."""
        self.total_it += 1

        # Sample replay buffer
        state, action, next_state, reward, not_done = replay_buffer.sample(batch_size)

        with torch.no_grad():
            # Select action according to policy and add clipped noise (Target Policy Smoothing)
            noise = (
                torch.randn_like(action) * self.policy_noise
            ).clamp(-self.noise_clip, self.noise_clip)
            
            next_action = (
                self.actor_target(next_state) + noise
            ).clamp(-self.max_action, self.max_action)

            # Compute target Q value using minimum of two Q-networks (Clipped Double Q-Learning)
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + not_done * self.discount * target_Q

        # Get current Q estimates from both critics
        current_Q1, current_Q2 = self.critic(state, action)

        # Compute critic loss (MSE for both Q-networks)
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        actor_loss = None
        # Delayed policy updates
        if self.total_it % self.policy_freq == 0:

            # Compute actor loss (maximize Q1)
            actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
            
            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models using soft (Polyak) updates
            for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
        
        return critic_loss.item(), actor_loss.item() if actor_loss is not None else None

    def save(self, filename):
        """Save model parameters."""
        torch.save(self.critic.state_dict(), filename + "_critic")
        torch.save(self.critic_optimizer.state_dict(), filename + "_critic_optimizer")
        torch.save(self.actor.state_dict(), filename + "_actor")
        torch.save(self.actor_optimizer.state_dict(), filename + "_actor_optimizer")

    def load(self, filename):
        """Load model parameters."""
        self.critic.load_state_dict(torch.load(filename + "_critic"))
        self.critic_optimizer.load_state_dict(torch.load(filename + "_critic_optimizer"))
        self.critic_target = copy.deepcopy(self.critic)

        self.actor.load_state_dict(torch.load(filename + "_actor"))
        self.actor_optimizer.load_state_dict(torch.load(filename + "_actor_optimizer"))
        self.actor_target = copy.deepcopy(self.actor)

print("TD3 agent class defined (from TD3.py)")

TD3 agent class defined (from TD3.py)


### Evaluation Function

In [9]:
def eval_policy(policy, env_name, seed, eval_episodes=10):
    """Evaluate the policy over multiple episodes."""
    eval_env = gym.make(env_name)
    avg_reward = 0.
    for _ in tqdm(range(eval_episodes), desc="Evaluating", leave=False):
        state = eval_env.reset(seed=seed + 100)[0]
        done = False
        while not done:
            action = policy.select_action(np.array(state))
            state, reward, terminated, truncated, _ = eval_env.step(action)
            done = terminated or truncated
            avg_reward += reward

    avg_reward /= eval_episodes

    print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
    return avg_reward

print("Evaluation function defined")

Evaluation function defined


In [10]:
# Training hyperparameters
env_name = "Reacher-v5"  # Fastest environment for training
seed = 0

# Reduced timesteps for faster convergence analysis
# Reacher typically converges much faster than 1M steps
max_timesteps = int(2e5)  # Reduced from 1M to 200K for faster experimentation
start_timesteps = int(1e4)  # Reduced from 25K to 10K
eval_freq = int(5e3)  # Evaluate every 5K steps

batch_size = 256
discount = 0.99
tau = 0.005  # Default tau (used by TD3)
# Note: DDPG uses tau=0.001 (different from TD3's tau=0.005)

# TD3-specific parameters
policy_noise = 0.2
noise_clip = 0.5
policy_freq = 2  # Delayed policy update frequency

expl_noise = 0.1  # Exploration noise std
save_model = True  # Whether to save models

print(f"Environment: {env_name}")
print(f"Seed: {seed}")
print(f"Max timesteps: {max_timesteps:,}")
print(f"Start timesteps: {start_timesteps:,}")
print(f"Evaluation frequency: {eval_freq:,}")
print(f"\nNote: DDPG uses tau=0.001, TD3 uses tau=0.005")
print(f"Note: Reduced timesteps for Reacher-v5 (typically converges in <200K steps)")

Environment: Reacher-v5
Seed: 0
Max timesteps: 200,000
Start timesteps: 10,000
Evaluation frequency: 5,000

Note: DDPG uses tau=0.001, TD3 uses tau=0.005
Note: Reduced timesteps for Reacher-v5 (typically converges in <200K steps)


In [11]:
# Create environment
env = gym.make(env_name)

# Set seeds
env.action_space.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

# Get environment dimensions
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

print(f"State dimension: {state_dim}")
print(f"Action dimension: {action_dim}")
print(f"Max action: {max_action}")

State dimension: 10
Action dimension: 2
Max action: 1.0


In [13]:
print("="*70)
print("Training DDPG")
print("="*70)
# Initialize DDPG agent with tau=0.001 (from DDPG.py)
ddpg_policy = DDPG(
    state_dim=state_dim,
    action_dim=action_dim,
    max_action=max_action,
    discount=discount,
    tau=0.001  # DDPG.py uses 0.001, not 0.005
)

# Initialize replay buffer
ddpg_replay_buffer = ReplayBuffer(state_dim, action_dim)

# Initialize TensorBoard writer

print("DDPG agent initialized")
print(f"Architecture: 400-300 hidden layers (from DDPG.py)")
print(f"Replay buffer capacity: 1,000,000 transitions")

# Evaluate untrained policy
print("\nEvaluating untrained DDPG policy...")
ddpg_evaluations = [eval_policy(ddpg_policy, env_name, seed)]
ddpg_writer.add_scalar('Evaluation/Average_Reward', ddpg_evaluations[0], 0)

# Initialize training variables
state = env.reset(seed=seed)[0]
done = False
episode_reward = 0
episode_timesteps = 0
episode_num = 0

ddpg_start_time = time.time()

print(f"\nStarting DDPG training for {max_timesteps:,} timesteps...")
print(f"Random exploration for first {start_timesteps:,} steps")
print(f"Evaluation frequency: every {eval_freq:,} steps")
print(f"Watch TensorBoard above for real-time updates!\n")

# Main training loop
for t in tqdm(range(int(max_timesteps)), desc="Training DDPG"):
    
    episode_timesteps += 1

    # Select action randomly or according to policy with exploration noise
    if t < start_timesteps:
        action = env.action_space.sample()
    else:
        action = (
            ddpg_policy.select_action(np.array(state))
            + np.random.normal(0, max_action * expl_noise, size=action_dim)
        ).clip(-max_action, max_action)

    # Perform action in environment
    next_state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0

    # Store transition in replay buffer
    ddpg_replay_buffer.add(state, action, next_state, reward, done_bool)

    state = next_state
    episode_reward += reward

    # Train agent after collecting sufficient data
    if t >= start_timesteps:
        critic_loss, actor_loss = ddpg_policy.train(ddpg_replay_buffer, batch_size)
        
        # Log losses to TensorBoard
        if t % 1000 == 0:
            ddpg_writer.add_scalar('Loss/Critic', critic_loss, t)
            ddpg_writer.add_scalar('Loss/Actor', actor_loss, t)

    if done:
        # Log episode reward
        ddpg_writer.add_scalar('Training/Episode_Reward', episode_reward, episode_num)
        ddpg_writer.add_scalar('Training/Episode_Length', episode_timesteps, episode_num)
        
        # Reset environment
        state = env.reset(seed=seed)[0]
        done = False
        episode_reward = 0
        episode_timesteps = 0
        episode_num += 1

    # Evaluate episode
    if (t + 1) % eval_freq == 0:
        eval_reward = eval_policy(ddpg_policy, env_name, seed)
        ddpg_evaluations.append(eval_reward)
        ddpg_writer.add_scalar('Evaluation/Average_Reward', eval_reward, t + 1)
        
        if save_model:
            os.makedirs(f"./notebook_models/{env_name}", exist_ok=True)
            ddpg_policy.save(f"./notebook_models/{env_name}/DDPG_seed{seed}")

ddpg_end_time = time.time()
ddpg_duration = ddpg_end_time - ddpg_start_time

print("\nDDPG Training completed!")
hours = int(ddpg_duration // 3600)
minutes = int((ddpg_duration % 3600) // 60)
seconds = int(ddpg_duration % 60)
print(f"Training time: {hours:02d}:{minutes:02d}:{seconds:02d}")
print(f"Final evaluation reward: {ddpg_evaluations[-1]:.3f}")

ddpg_writer.close()

Training DDPG
DDPG agent initialized
Architecture: 400-300 hidden layers (from DDPG.py)
Replay buffer capacity: 1,000,000 transitions

Evaluating untrained DDPG policy...


Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

                                                          

Evaluation over 10 episodes: -19.542


NameError: name 'ddpg_writer' is not defined