# PPO Base Implementation
This will be the baseline implementation for comparing with the other methods.

In [110]:
SEED = 1234
LEARNING_RATE = 1e-4
GAMMA = 0.99
EPOCHS = 20
CLIP_EPSILON = 0.2
BATCH_SIZE = 10

In [111]:
import random
import wandb

import gym
import numpy as np

import torch
from torch.nn import LeakyReLU, Linear, MSELoss, Sequential, Softmax, Conv1d, Flatten, MaxPool1d
from torch.optim import Adam

import logging
logging.basicConfig(level=logging.INFO)

In [112]:
class WandSession:
  def __init__(self, enabled=True):
    self.enabled = enabled
    if enabled:
      wandb.init(
          project="car-racing-base",

          config={
              "learning_rate": LEARNING_RATE,
              "gamma": GAMMA,
              "epochs": EPOCHS,
              "clip_epsilon": CLIP_EPSILON,
              "batch_size": BATCH_SIZE,
              "seed": SEED
          },
      )
      
  def log(self, *args, **kwargs):
    if self.enabled:
      wandb.log(args, kwargs)
      
  def finish(self):
    if self.enabled:
      wandb.finish()

In [113]:
device = torch.device("mps")

def create_env(**kwargs):
  return gym.make('CarRacing-v2', continuous=False, **kwargs)

def reset_env(env: gym.Env):
  state, _ = env.reset()
  state = state.transpose(2, 0, 1)
  state = 0.299 * state[0] + 0.587 * state[1] + 0.114 * state[2]
    
  return state

def step_env(env: gym.Env, action: np.ndarray):
  state, reward, done, *_ = env.step(action)
  state = state.transpose(2, 0, 1)
  state = 0.299 * state[0] + 0.587 * state[1] + 0.114 * state[2]
  
  return state, reward, done

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

wandb_session = WandSession(enabled=False)

## Network Architecture

**PolicyNetwork**:
- Input: State
- Output: Action distribution (0-1)
- 2 Hidden layers with LeakyReLU activation

**ValueNetwork**:
- Input: State
- Output: Value
- 2 Hidden layers with LeakyReLU activation

In [114]:
class PolicyNetwork(torch.nn.Module):
  def __init__(self, hidden_dim):
    super().__init__()
    self.model = Sequential(
      Conv1d(96, 16, 4, stride=4),
      LeakyReLU(),
      Conv1d(16, 32, 4, stride=2), #
      LeakyReLU(),
      MaxPool1d(2),
      Flatten(),
      Linear(160, hidden_dim),
      LeakyReLU(),
      Linear(hidden_dim, 5),
      Softmax(dim=-1)
    )

  def forward(self, state):
    if len(state.shape) == 2:
      state = state.unsqueeze(0)
    
    return self.model(state)
  
  @torch.no_grad()
  def act(self, state: np.array):
    state = torch.from_numpy(state).float().unsqueeze(0).to(device)
    probs = self.model(state)
    m = torch.distributions.Categorical(probs)
    action = m.sample()
    return action.item(), m.log_prob(action)
  
class ValueNetwork(torch.nn.Module):
  def __init__(self, hidden_dim) -> None:
    super().__init__()
    self.model = Sequential(
      Conv1d(96, 16, 4, stride=4),
      LeakyReLU(),
      Conv1d(16, 32, 4, stride=2),
      LeakyReLU(),
      MaxPool1d(2),
      Flatten(),
      Linear(160, hidden_dim),
      LeakyReLU(),
      Linear(hidden_dim, 1),
    )
  
  def forward(self, state):
    if len(state.shape) == 2:
      state = state.unsqueeze(0)
    
    return self.model(state)
  

# Training
- 64 hidden nodes
- Adam optimizer
- MSE loss for value network

In [115]:
policy_net = PolicyNetwork(64).to(device)
value_net  = ValueNetwork(64).to(device)

policy_optimizer = Adam(policy_net.parameters(), lr=LEARNING_RATE)
value_optimizer  = Adam(value_net.parameters(), lr=LEARNING_RATE)

criterion = MSELoss()

In [116]:
def compute_returns(rewards):
  returns = torch.zeros(len(rewards))
  R = 0
  for i in reversed(range(len(rewards))):
    R = rewards[i] + GAMMA * R
    returns[i] = R
  return returns

In [117]:
def ppo_step(env: gym.Env):
    state = reset_env(env)
    
    # capture entire episode
    done, steps = False, 0
    states, actions, log_probs_old, rewards = [], [], [], []
    
    print(f"Running episode: ", end="", flush=True)
    
    while not done and sum(rewards[:7]) >= 0:
        action, log_prob = policy_net.act(state)
        next_state, reward, done = step_env(env, action)

        log_probs_old.append(log_prob)
        states.append(state)
        actions.append(action)
        rewards.append(reward)

        state = next_state
        steps += 1
    
    env.reset()
    
    print(f"{steps} steps")
    # Convert to tensors
    # Be sure to detach() the tensors from the graph as these are "constants"
    states = torch.from_numpy(np.array(states).astype("float32")).detach().to(device)
    actions = torch.tensor(actions).detach().to(device)
    log_probs_old = torch.stack(log_probs_old).detach().to(device)
    
    returns = compute_returns(rewards).detach().to(device)
    
    values = value_net(states)
    advantages = (returns - values.squeeze()).detach().to(device)

    print(f" -- Mean reward: {np.mean(rewards)}")
    for _ in range(EPOCHS):
        for i in range(0, len(states), BATCH_SIZE):
            # Grab a batch of data
            batch_states = states[i:i+BATCH_SIZE]
            batch_actions = actions[i:i+BATCH_SIZE]
            batch_log_probs_old = log_probs_old[i:i+BATCH_SIZE]
            batch_advantages = advantages[i:i+BATCH_SIZE]
            batch_returns = returns[i:i+BATCH_SIZE]

            # Calculate new log probabilities
            new_action_probs = policy_net(batch_states)
            new_log_probs = torch.log(new_action_probs.gather(1, batch_actions.unsqueeze(-1)))

            # rho is the ratio between new and old log probabilities
            ratio = (new_log_probs - batch_log_probs_old).exp()

            # Calculate surrogate loss
            surrogate_loss = ratio * batch_advantages
            clipped_surrogate_loss = torch.clamp(ratio, 1-CLIP_EPSILON, 1+CLIP_EPSILON) * batch_advantages
            policy_loss = -torch.min(surrogate_loss, clipped_surrogate_loss).mean()

            policy_optimizer.zero_grad()
            policy_loss.backward()
            policy_optimizer.step()

            value_loss = criterion(value_net(batch_states),
                                   batch_returns.unsqueeze(-1))

            value_optimizer.zero_grad()
            value_loss.backward()
            value_optimizer.step()
            
            wandb_session.log({
                "policy_loss": policy_loss.item(),
                "value_loss": value_loss.item(),
                "steps": steps,
            })
            
    return (returns.mean(), returns.std(), steps)

In [118]:
gym_env = create_env(render_mode="rgb_array")
gym_env = gym.wrappers.RecordVideo(gym_env, "video", episode_trigger=lambda x: x % 5 == 0)

gym_env.reset()
gym_env.start_video_recorder()

for i in range(300):
  _, _, steps = ppo_step(gym_env)
  if i % 5 == 0:
    print(f"Episode {i}\tSteps: {steps}\tReturn: {steps}")
  
gym_env.close()
wandb_session.finish()

Moviepy - Building video /Users/b0kch01/Documents/Code/QuantizeRL/race_car/video/rl-video-episode-0.mp4.
Moviepy - Writing video /Users/b0kch01/Documents/Code/QuantizeRL/race_car/video/rl-video-episode-0.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /Users/b0kch01/Documents/Code/QuantizeRL/race_car/video/rl-video-episode-0.mp4
Running episode: 



Moviepy - Building video /Users/b0kch01/Documents/Code/QuantizeRL/race_car/video/rl-video-episode-0.mp4.
Moviepy - Writing video /Users/b0kch01/Documents/Code/QuantizeRL/race_car/video/rl-video-episode-0.mp4



                                                                

Moviepy - Done !
Moviepy - video ready /Users/b0kch01/Documents/Code/QuantizeRL/race_car/video/rl-video-episode-0.mp4
411 steps
 -- Mean reward: -6.089872360229492
Episode 0	Steps: 411	Return: 411
Running episode: 403 steps
 -- Mean reward: -2.4592158794403076
Running episode: 1028 steps
 -- Mean reward: -14.090010643005371
Running episode: 1380 steps
 -- Mean reward: -12.673460960388184
Running episode: 870 steps
 -- Mean reward: -16.07280731201172
Running episode: Moviepy - Building video /Users/b0kch01/Documents/Code/QuantizeRL/race_car/video/rl-video-episode-5.mp4.
Moviepy - Writing video /Users/b0kch01/Documents/Code/QuantizeRL/race_car/video/rl-video-episode-5.mp4



                                                                 

Moviepy - Done !
Moviepy - video ready /Users/b0kch01/Documents/Code/QuantizeRL/race_car/video/rl-video-episode-5.mp4
2664 steps
 -- Mean reward: -11.871350288391113
Episode 5	Steps: 2664	Return: 2664
Running episode: 1873 steps
 -- Mean reward: -12.088123321533203
Running episode: 976 steps
 -- Mean reward: -15.386592864990234
Running episode: 639 steps
 -- Mean reward: -8.886055946350098
Running episode: 1099 steps
 -- Mean reward: -7.066318035125732
Running episode: Moviepy - Building video /Users/b0kch01/Documents/Code/QuantizeRL/race_car/video/rl-video-episode-10.mp4.
Moviepy - Writing video /Users/b0kch01/Documents/Code/QuantizeRL/race_car/video/rl-video-episode-10.mp4



                                                                

Moviepy - Done !
Moviepy - video ready /Users/b0kch01/Documents/Code/QuantizeRL/race_car/video/rl-video-episode-10.mp4
931 steps
 -- Mean reward: -15.692913055419922
Episode 10	Steps: 931	Return: 931
Running episode: 

In [None]:
def record_best_effort():
  env = create_env()
  env = gym.wrappers.RecordVideo(env, "tests")

  state, _ = env.reset()
  env.start_video_recorder()

  total_reward = 0
  done, i = False, 0
  
  while not done and i < 10000:
    action, _ = policy_net.deterministic_action(state)
    state, reward, done, *_ = env.step(action)
    total_reward += reward
    i += 1

  env.close()

In [None]:
record_best_effort()