In [1]:
import os
import random
import time
from dataclasses import dataclass
import sys

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.normal import Normal

from agents.networks import ConvNet_StackedFrames
from env_wrapper import ProcessedFrame, FrameStack, ActionRemapWrapper

import wandb
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

True

In [2]:
def make_car_racing_env():
    """Build the CarRacing-v3 environment with the project's wrappers applied.

    The continuous-action base environment is wrapped, innermost first, by
    ``ProcessedFrame``, ``ActionRemapWrapper`` and ``FrameStack``; the frame
    stack is configured with ``num_frames=4`` and ``skip_frames=2``.

    Returns:
        The fully wrapped environment.
    """
    base_env = gym.make("CarRacing-v3", continuous=True)
    remapped_env = ActionRemapWrapper(ProcessedFrame(base_env))
    return FrameStack(remapped_env, num_frames=4, skip_frames=2)

In [3]:
class Agent(nn.Module):
    """Actor-critic network: shared conv/FC trunk, Gaussian policy heads, value head.

    The trunk is ``ConvNet_StackedFrames`` followed by two ReLU-activated
    linear layers (4096 -> 512 -> 64). On top of the 64-dim features sit three
    pairs of (mean, log-std) heads -- steering, gas, brake -- and one scalar
    value head.
    """

    def __init__(self, envs):
        """Create and initialise all layers.

        Args:
            envs: Not used by this constructor; kept so existing callers that
                pass the environment keep working.
        """
        super().__init__()
        # Matches CustomPPORLModule architecture
        self.convnet = ConvNet_StackedFrames(num_frames=4)

        # fc1 expects the flattened conv output: 256 channels * 4 * 4 = 4096
        self.fc1 = nn.Linear(256 * 4 * 4, 512)
        self.fc2 = nn.Linear(512, 64)

        # Re-initialise the trunk FC layers (both are created before either is
        # initialised, so the order of random draws is fc1, fc2, init fc1, init fc2).
        for trunk_layer in (self.fc1, self.fc2):
            layer_init(trunk_layer, std=np.sqrt(2))

        def small_head(**init_kwargs):
            # 64 -> 1 head with a small orthogonal gain; extra kwargs go to layer_init.
            return layer_init(nn.Linear(64, 1), std=0.01, **init_kwargs)

        # Steering policy head
        self.steering_mean = small_head()
        self.steering_log_std = small_head()

        # Gas policy head; mean bias starts at -2.0
        self.gas_mean = small_head(bias_const=-2.0)
        self.gas_log_std = small_head()

        # Brake policy head; mean bias starts at -2.0
        self.brake_mean = small_head(bias_const=-2.0)
        self.brake_log_std = small_head()

        # State-value head
        self.vf_head = layer_init(nn.Linear(64, 1), std=1)

        # Bounds applied to every predicted log standard deviation
        self.LOG_STD_MIN = -20
        self.LOG_STD_MAX = 2

    def get_features(self, x):
        """Run the shared trunk and return the 64-dim feature batch."""
        conv_out = self.convnet(x)
        flat = conv_out.reshape(conv_out.size(0), -1)
        hidden = self.fc1(flat).relu()
        return self.fc2(hidden).relu()

    def get_value(self, x):
        """Return the value-head output for observations ``x``."""
        return self.vf_head(self.get_features(x))

    def get_action_and_value(self, x, action=None):
        """Evaluate the policy and value function on ``x``.

        Args:
            x: Batch of observations fed to the trunk.
            action: Optional batch of actions to score. When ``None`` an action
                is sampled from the policy and clamped to ``[-1, 1]``.

        Returns:
            Tuple ``(action, log_prob, entropy, value)`` where ``log_prob`` and
            ``entropy`` are summed over the three action dimensions.
        """
        features = self.get_features(x)
        value = self.vf_head(features)

        mean_heads = (self.steering_mean, self.gas_mean, self.brake_mean)
        log_std_heads = (self.steering_log_std, self.gas_log_std, self.brake_log_std)

        # Column order is (steering, gas, brake); tanh squashes every mean.
        means = torch.cat([head(features) for head in mean_heads], dim=1).tanh()
        log_stds = torch.cat([head(features) for head in log_std_heads], dim=1)
        log_stds = log_stds.clamp(self.LOG_STD_MIN, self.LOG_STD_MAX)

        dist = Normal(means, log_stds.exp())
        if action is None:
            # The log-prob below is evaluated on the clamped sample.
            action = dist.sample().clamp(-1.0, 1.0)

        log_prob = dist.log_prob(action).sum(1)
        entropy = dist.entropy().sum(1)
        return action, log_prob, entropy, value


In [4]:
# Hyperparameters
exp_name = "ppo_car"
wandb_project_name = "rl-training"
wandb_entity = None
capture_video = False

# Reproducibility: seed for Python, NumPy and PyTorch RNGs (applied below).
seed = 1
torch_deterministic = True

env_id = "CarRacing-v3"
total_timesteps = 1000000
learning_rate = 2.5e-4
num_envs = 1
num_steps = 128          # rollout length collected per iteration
anneal_lr = True         # linearly decay the learning rate over iterations
gamma = 0.99             # discount factor
gae_lambda = 0.95        # GAE lambda
num_minibatches = 4
update_epochs = 4        # passes over each rollout batch
norm_adv = True          # normalise advantages per minibatch
clip_coef = 0.2          # clip range for the policy ratio and the value update
clip_vloss = True
ent_coef = 0.01
vf_coef = 0.5
max_grad_norm = 0.5
target_kl = None         # optional early-stop threshold on approx KL

# Derived sizes
batch_size = int(num_envs * num_steps)
minibatch_size = int(batch_size // num_minibatches)
num_iterations = total_timesteps // batch_size

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed every RNG this notebook draws from before any network is built, so that
# weight initialisation, action sampling and minibatch shuffling are repeatable
# across "Restart & Run All". The environment itself is seeded through
# `env.reset(seed=...)`, which is not done here.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = torch_deterministic

# Environment setup
env = make_car_racing_env()

def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Orthogonally initialise ``layer.weight`` and set ``layer.bias`` to a constant.

    Args:
        layer: Module exposing ``weight`` and ``bias`` tensors (e.g. ``nn.Linear``).
        std: Gain handed to the orthogonal initialiser.
        bias_const: Value written into every bias element.

    Returns:
        The same ``layer`` object (modified in place), so the call can be
        used inline when assigning the layer.
    """
    nn.init.orthogonal_(layer.weight, gain=std)
    nn.init.constant_(layer.bias, val=bias_const)
    return layer

  from pkg_resources import resource_stream, resource_exists


In [5]:
# Authenticate with Weights & Biases, then open a run that records the core
# experiment settings.
wandb.login()
wandb.init(
    project=wandb_project_name,
    name=exp_name,
    config=dict(
        env_id=env_id,
        total_timesteps=total_timesteps,
        learning_rate=learning_rate,
        num_envs=num_envs,
        num_steps=num_steps,
    ),
)

[34m[1mwandb[0m: Currently logged in as: [33malienpenguin[0m ([33malienpenguin-inc[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [agents] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


In [6]:
# Build the actor-critic network, move it to the selected device, and attach
# an Adam optimiser over all of its parameters.
agent = Agent(env)
agent = agent.to(device)
optimizer = optim.Adam(params=agent.parameters(), lr=learning_rate, eps=1e-5)

In [7]:
# ALGO Logic: rollout storage
# Every buffer is indexed as [step, env]; observations and actions carry the
# per-sample shape of the corresponding space as trailing dimensions. The env
# axis is kept even when num_envs == 1.
obs_shape = (num_steps, num_envs) + env.observation_space.shape
obs = torch.zeros(obs_shape, device=device)

actions = torch.zeros(
    (num_steps, num_envs) + env.action_space.shape, device=device
)

# Four independent per-step scalar buffers, allocated directly on the device.
logprobs, rewards, dones, values = (
    torch.zeros((num_steps, num_envs), device=device) for _ in range(4)
)

In [None]:
# PPO training loop: each iteration collects `num_steps` transitions from the
# single environment, computes GAE advantages, runs `update_epochs` passes of
# clipped-objective minibatch updates, then logs to wandb and stdout.
global_step = 0
start_time = time.time()
# Return of the most recently *finished* episode; stays 0 until one ends.
last_episode_return = 0
# Reward accumulated so far in the episode currently in progress.
running_return = 0

# Initial reset
next_obs, _ = env.reset()
next_obs = torch.Tensor(next_obs).to(device).unsqueeze(0)  # Shape: (1, 4, 84, 96)
next_done = torch.zeros(num_envs).to(device)

for iteration in range(1, num_iterations + 1):
    # Annealing the rate if instructed to do so.
    # Linear schedule: full learning_rate at iteration 1, decreasing by
    # learning_rate / num_iterations each iteration.
    if anneal_lr:
        frac = 1.0 - (iteration - 1.0) / num_iterations
        lrnow = frac * learning_rate
        optimizer.param_groups[0]["lr"] = lrnow

    # ---- Rollout collection ----
    for step in range(0, num_steps):
        global_step += num_envs
        # Store the observation/done flag the action at this step is based on.
        obs[step] = next_obs 
        dones[step] = next_done

        # ALGO LOGIC: action logic
        with torch.no_grad():
            action, logprob, _, value = agent.get_action_and_value(next_obs)
            
            values[step] = value.flatten()
        
        actions[step] = action
        logprobs[step] = logprob

        # TRY NOT TO MODIFY: execute the game and log data.
        # Only one environment is stepped, so take row 0 of the action batch.
        real_action = action[0].cpu().numpy()
        next_obs, reward, terminated, truncated, info = env.step(real_action)
        # NOTE(review): truncation is treated exactly like termination, so the
        # GAE computation below does not bootstrap across truncated episodes.
        done = terminated or truncated
        
        running_return += reward  # Manual reward tracking
        
        rewards[step] = torch.tensor(reward).to(device).view(-1)
        
        # Handle Reset
        if done:
            next_obs, _ = env.reset()
            next_done = torch.ones(num_envs).to(device)
            last_episode_return = running_return
            running_return = 0
        else:
            next_done = torch.zeros(num_envs).to(device)
            
        next_obs = torch.Tensor(next_obs).to(device).unsqueeze(0) # Ensure (1, C, H, W)

    # ---- Generalised Advantage Estimation ----
    # bootstrap value if not done
    with torch.no_grad():
        next_value = agent.get_value(next_obs).reshape(1, -1)
        advantages = torch.zeros_like(rewards).to(device)
        lastgaelam = 0
        # Walk the rollout backwards, accumulating discounted TD errors.
        for t in reversed(range(num_steps)):
            if t == num_steps - 1:
                # Last stored step: use the state reached after the rollout.
                nextnonterminal = 1.0 - next_done
                nextvalues = next_value
            else:
                nextnonterminal = 1.0 - dones[t + 1]
                nextvalues = values[t + 1]
            # One-step TD error; nextnonterminal zeroes the bootstrap term at
            # episode boundaries.
            delta = rewards[t] + gamma * nextvalues * nextnonterminal - values[t]
            advantages[t] = lastgaelam = delta + gamma * gae_lambda * nextnonterminal * lastgaelam
        # Value-function regression targets.
        returns = advantages + values
    
    # flatten the batch
    b_obs = obs.reshape((-1,) + env.observation_space.shape)
    b_logprobs = logprobs.reshape(-1)
    b_actions = actions.reshape((-1,) + env.action_space.shape)
    b_advantages = advantages.reshape(-1)
    b_returns = returns.reshape(-1)
    b_values = values.reshape(-1)

    # Optimizing the policy and value network
    b_inds = np.arange(batch_size)
    clipfracs = []
    for epoch in range(update_epochs):
        # Fresh random minibatch partition every epoch.
        np.random.shuffle(b_inds)
        for start in range(0, batch_size, minibatch_size):
            end = start + minibatch_size
            mb_inds = b_inds[start:end]

            # Re-evaluate the stored actions under the current policy.
            _, newlogprob, entropy, newvalue = agent.get_action_and_value(b_obs[mb_inds], b_actions[mb_inds])
            logratio = newlogprob - b_logprobs[mb_inds]
            ratio = logratio.exp()

            with torch.no_grad():
                # calculate approx_kl http://joschu.net/blog/kl-approx.html
                old_approx_kl = (-logratio).mean()
                approx_kl = ((ratio - 1) - logratio).mean()
                # Fraction of samples whose ratio left the clip range.
                clipfracs += [((ratio - 1.0).abs() > clip_coef).float().mean().item()]

            mb_advantages = b_advantages[mb_inds]
            if norm_adv:
                # Normalise within the minibatch; 1e-8 guards against zero std.
                mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8)

            # Policy loss
            # Clipped surrogate: element-wise max of the two negated objectives.
            pg_loss1 = -mb_advantages * ratio
            pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - clip_coef, 1 + clip_coef)
            pg_loss = torch.max(pg_loss1, pg_loss2).mean()

            # Value loss
            newvalue = newvalue.view(-1)
            if clip_vloss:
                # Limit how far the new value may move from the rollout-time
                # value (by at most clip_coef) and take the larger squared error.
                v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
                v_clipped = b_values[mb_inds] + torch.clamp(
                    newvalue - b_values[mb_inds],
                    -clip_coef,
                    clip_coef,
                )
                v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
                v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                v_loss = 0.5 * v_loss_max.mean()
            else:
                v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()

            # Total loss: policy term, minus entropy bonus, plus weighted value term.
            entropy_loss = entropy.mean()
            loss = pg_loss - ent_coef * entropy_loss + v_loss * vf_coef

            optimizer.zero_grad()
            loss.backward()
            # Clip the global gradient norm before the optimiser step.
            nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
            optimizer.step()

        # Optional early stop, checked once per epoch against the approx KL of
        # the last minibatch of that epoch.
        if target_kl is not None and approx_kl > target_kl:
            break
    
    # Logging
    # NOTE(review): the loss/KL values are those of the final minibatch only,
    # and "episode_reward" repeats the last finished episode's return until
    # another episode ends (it is 0 before the first one finishes).
    wandb.log({
        "episode_reward": last_episode_return,
        "losses/policy_loss": pg_loss.item(),
        "losses/value_loss": v_loss.item(),
        "losses/kl": approx_kl.item(),
        "charts/learning_rate": optimizer.param_groups[0]["lr"],
        "global_step": global_step
    })
    
    print(f"Iteration {iteration}: Reward={last_episode_return:.2f} Policy Loss={pg_loss.item():.4f} Value Loss={v_loss.item():.4f} KL={approx_kl.item():.4f}")

Iteration 1: Reward=0.00 Policy Loss=-0.0005 Value Loss=1.1716 KL=0.0000
Iteration 2: Reward=0.00 Policy Loss=-0.0019 Value Loss=0.6490 KL=0.0000
Iteration 3: Reward=0.00 Policy Loss=-0.0009 Value Loss=0.7339 KL=0.0002
Iteration 4: Reward=0.00 Policy Loss=-0.0071 Value Loss=0.5903 KL=0.0003
Iteration 5: Reward=0.00 Policy Loss=0.0028 Value Loss=0.5813 KL=0.0002
Iteration 6: Reward=0.00 Policy Loss=0.0025 Value Loss=0.7276 KL=0.0002
Iteration 7: Reward=0.00 Policy Loss=-0.0086 Value Loss=0.5169 KL=0.0009
Iteration 8: Reward=-40.77 Policy Loss=-0.0014 Value Loss=0.5484 KL=0.0005
Iteration 9: Reward=-40.77 Policy Loss=-0.0011 Value Loss=0.4760 KL=0.0006
Iteration 10: Reward=-40.77 Policy Loss=-0.0112 Value Loss=0.5660 KL=0.0017
Iteration 11: Reward=-40.77 Policy Loss=-0.0071 Value Loss=0.3546 KL=0.0021
Iteration 12: Reward=-40.77 Policy Loss=-0.0063 Value Loss=0.5613 KL=0.0013
Iteration 13: Reward=-40.77 Policy Loss=-0.0128 Value Loss=0.4939 KL=0.0021
Iteration 14: Reward=-40.77 Policy Lo

KeyboardInterrupt: 

Error in callback <bound method _WandbInit._post_run_cell_hook of <wandb.sdk.wandb_init._WandbInit object at 0x7b94c1996cc0>> (for post_run_cell), with arguments args (<ExecutionResult object at 7b9397d5c620, execution_count=8 error_before_exec=None error_in_exec= info=<ExecutionInfo object at 7b941c5399a0, raw_cell="global_step = 0
start_time = time.time()
last_epis.." transformed_cell="global_step = 0
start_time = time.time()
last_epis.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a227061726b696e227d/homes/vk545/RL/ppo.ipynb#X50sdnNjb2RlLXJlbW90ZQ%3D%3D> result=None>,),kwargs {}:


ConnectionResetError: Connection lost