# PPO Base Implementation
This will be the baseline implementation for comparing with the other methods.

In [154]:
import random
import pandas as pd
import time
from datetime import datetime
import os

import gym
import numpy as np

import torch
from torch.nn import LeakyReLU, ReLU, Linear, MSELoss, Sequential, Softmax, Dropout
from torch.optim import Adam
from torch.quantization import quantize_dynamic
from torch.nn.utils import clip_grad_norm_

import logging
logging.basicConfig(level=logging.INFO)

In [155]:
SEED = 1234
LEARNING_RATE = 1e-4
GAMMA = 0.99
EPOCHS = 1
CLIP_EPSILON = 0.2
BATCH_SIZE = 1

DEVICE = torch.device("cuda")
TYPE = torch.float16

In [156]:
torch.set_default_dtype(TYPE)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# torch.backends.cpu.deterministic = True
torch.backends.cudnn.deterministic = True

class Logger:
    def __init__(self, extras: dict):
        self.data = []
        self.base_time = time.time()
        self.extras = extras
        
    def add(self, p_loss, v_loss, steps):
        if len(self.data) == 0:
            self.base_time = time.time()
        
        time_elapsed = round(time.time() - self.base_time, 5)
        self.data.append((time_elapsed, p_loss, v_loss, steps))
    
    def df(self):
        return pd.DataFrame(self.data, columns=["time", "p_loss", "v_loss", "steps"])
    
    def save(self):
        path = "logs/" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        os.mkdir(path)
        pd.DataFrame([self.extras]).to_csv(path + "/info.csv")
        pd.DataFrame(
            self.data, 
            columns=["time", "p_loss", "v_loss", "steps"]
        ).to_csv(path + "/data.csv")

logger = Logger({
    "seed": SEED,
    "learning_rate": LEARNING_RATE,
    "gamma": GAMMA,
    "epochs": EPOCHS,
    "clip_epsilon": CLIP_EPSILON,
    "batch_size": BATCH_SIZE,
    "device": DEVICE,
    "type": TYPE
})

In [157]:
class CartPoleQuantized:
  def __init__(self, dtype: torch.dtype, device: torch.device, record=True):
    self.dtype = dtype
    self.device = device
    
    env = gym.make('CartPole-v1', render_mode='rgb_array')
    
    if record:
        env = gym.wrappers.RecordVideo(env, f"training/{dtype}/", episode_trigger=lambda x: x % 100 == 0 and x >= 30)
        env.start_video_recorder()
    
    env.reset()
    
    self.env = env
  
  def reset(self):
    return torch.tensor(self.env.reset()[0], dtype=self.dtype, device=self.device)
  
  def step(self, action):
    state, reward, done, truncated, _ = self.env.step(action)
    return torch.tensor(state, dtype=self.dtype, device=self.device), reward, done or truncated
  
  def close(self):
    self.env.close()
  
  @staticmethod
  def env():
    return gym.make('CartPole-v1', render_mode='rgb_array')

## Network Architecture

**PolicyNetwork**:
- Input: State
- Output: Action distribution (0-1)
- 2 Hidden layers with LeakyReLU activation

**ValueNetwork**:
- Input: State
- Output: Value
- 2 Hidden layers with LeakyReLU activation

In [158]:

class PolicyNetwork(torch.nn.Module):
  def __init__(self, input_dim, hidden_dim):
    super().__init__()
    self.model = Sequential(
      Linear(input_dim, hidden_dim),
      ReLU(),
      Linear(hidden_dim, hidden_dim),
      ReLU(),
      Linear(hidden_dim, 2),
      Softmax()
    )

  def forward(self, state): 
    return self.model(state)
 
  def stochastic_action(self, state):
    r"""Returns an action sampled from the policy network."""
    
    probs = self.forward(state).detach()
    # adding floating point error to the maximum probability
    probs[torch.argmax(probs)] += 1 - probs.sum()
    
    probs.squeeze() # quantized tensors have an extra dimension
    
    m = torch.distributions.Categorical(probs)
    action = m.sample()
    return action.item(), m.log_prob(action)
  
  def deterministic_action(self, state):
    r"""Returns an action with the highest probability."""
    
    probs = self.forward(state).detach()
    action = torch.argmax(probs)
    return action.item()

  
class ValueNetwork(torch.nn.Module):
  def __init__(self, input_dim, hidden_dim) -> None:
    super().__init__()
    self.model = Sequential(
      Linear(input_dim, hidden_dim),
      ReLU(),
      Linear(hidden_dim, hidden_dim),
      ReLU(),
      Linear(hidden_dim, 1)
    )
  
  def forward(self, state):
    return self.model(state)
  

# Training
- 64 hidden nodes
- Adam optimizer
- MSE loss for value network

In [159]:
class PPOSession:
    def __init__(self, env: CartPoleQuantized):
        self.env = env
        self.episode = 0
        
        _observation_size = CartPoleQuantized.env().observation_space.shape[0]

        self.policy_net = PolicyNetwork(_observation_size, 64).to(DEVICE)
        self.value_net  = ValueNetwork(_observation_size, 64).to(DEVICE)

        self.policy_optimizer = Adam(self.policy_net.parameters(), lr=LEARNING_RATE, eps=1e-7)
        self.value_optimizer  = Adam(self.value_net.parameters(), lr=LEARNING_RATE, eps=1e-7)
        self.quantized = False
    
    @staticmethod
    def compute_returns(rewards):
        returns = torch.zeros(len(rewards))
        R = 0
        for i in reversed(range(len(rewards))):
            R = rewards[i] + GAMMA * R
            returns[i] = R
        return returns
    
    def run(self, episodes):
        
        def mean(l: list):
            return round(sum(l)/len(l), 3)
            
        self.policy_net.train()
        self.value_net.train()
        
        batch_returns = []
        batch_std = []
        batch_steps = []
        
        for i in range(episodes):
            self.episode += 1
            returns, std, steps = self.ppo_step()

            batch_returns.append(returns.item())
            batch_std.append(std.item())
            batch_steps.append(steps)
            
            if self.episode % 50 == 0:
                print(f"Episode {self.episode} - Returns: {mean(batch_returns)} - Std: {mean(batch_std)} - Steps: {mean(batch_steps)}")
                batch_returns = []
                batch_std = []
                batch_steps = []

        self.policy_net.eval()
        self.value_net.eval()
    
    def ppo_step(self):
        state = self.env.reset()
        
        # capture entire episode
        done, steps = False, 0
        states, actions, log_probs_old, rewards = [], [], [], []
        
        while not done:
            if self.quantized:
                state = state.unsqueeze(0)
            action, log_prob = self.policy_net.stochastic_action(state)
            next_state, reward, done = self.env.step(action)

            log_probs_old.append(log_prob)
            states.append(state.squeeze())
            actions.append(action)
            rewards.append(reward)

            state = next_state
            steps += 1
        
        # Convert to tensors
        # Be sure to detach() the tensors from the graph as these are "constants"
        states = torch.stack(states).detach().to(DEVICE)
        actions = torch.tensor(actions).detach().to(DEVICE)
        log_probs_old = torch.stack(log_probs_old).detach().to(DEVICE)
        
        returns = self.compute_returns(rewards).detach().to(DEVICE)
        values = self.value_net(states).detach().to(DEVICE)
        advantages = returns - values.squeeze()

        for _ in range(EPOCHS):
            for i in range(0, len(states), BATCH_SIZE):
                # Grab a batch of data
                batch_states = states[i:i+BATCH_SIZE]
                batch_actions = actions[i:i+BATCH_SIZE]
                batch_log_probs_old = log_probs_old[i:i+BATCH_SIZE]
                batch_advantages = advantages[i:i+BATCH_SIZE]
                batch_returns = returns[i:i+BATCH_SIZE]

                # Calculate new log probabilities
                new_action_probs = self.policy_net(batch_states)
                new_log_probs = torch.log(new_action_probs.gather(1, batch_actions.unsqueeze(-1)))

                # rho is the ratio between new and old log probabilities
                ratio = (new_log_probs - batch_log_probs_old).exp()

                # Calculate surrogate loss
                surrogate_loss = ratio * batch_advantages
                clipped_surrogate_loss = torch.clamp(ratio, 1-CLIP_EPSILON, 1+CLIP_EPSILON) * batch_advantages
                policy_loss = -torch.min(surrogate_loss, clipped_surrogate_loss).mean()
                
                self.policy_optimizer.zero_grad()
                policy_loss.backward()
                self.policy_optimizer.step()

                # check for nan
                if torch.isnan(policy_loss):
                    print("NaN detected in policy loss")
                    return

                value_loss = torch.pow(self.value_net(
                    batch_states) - batch_returns.unsqueeze(-1), 2).mean()

                self.value_optimizer.zero_grad()
                value_loss.backward()
                self.value_optimizer.step()

                # check for nan
                if torch.isnan(value_loss):
                    print("NaN detected in value loss")
                    return
        
        logger.add(policy_loss.item(), value_loss.item(), steps)
        return (returns.mean(), returns.std(), steps)
    
    def record_best_effort(self):
        env = gym.make('CartPole-v1', render_mode='rgb_array', max_episode_steps=10000)
        env = gym.wrappers.RecordVideo(env, "tests")

        state, _ = env.reset()
        state = torch.tensor(state, dtype=TYPE, device=DEVICE, requires_grad=False)
        env.start_video_recorder()

        total_reward = 0
        done, i = False, 0
        
        while not done and not truncated:
            if self.quantized:
                state = state.unsqueeze(0)
            
            action = self.policy_net.deterministic_action(state)
            state, reward, done, truncated, _ = env.step(action)
            state = torch.tensor(state, dtype=TYPE, device=DEVICE, requires_grad=False).unsqueeze(0)
            total_reward += reward
            i += 1

        env.close()
        return total_reward, i

In [160]:
env = CartPoleQuantized(TYPE, DEVICE, record=False)
session = PPOSession(env)

In [161]:
session.run(20000)

Episode 50 - Returns: 26.469 - Std: 13.281 - Steps: 67.18
Episode 100 - Returns: 32.258 - Std: 15.853 - Steps: 85.6
Episode 150 - Returns: 37.799 - Std: 17.939 - Steps: 108.14
Episode 200 - Returns: 48.887 - Std: 21.665 - Steps: 158.86
Episode 250 - Returns: 54.155 - Std: 23.087 - Steps: 189.5
Episode 300 - Returns: 59.424 - Std: 23.995 - Steps: 230.92
Episode 350 - Returns: 68.847 - Std: 24.988 - Steps: 323.64


KeyboardInterrupt: 

In [18]:
logger.save()

In [19]:
session.record_best_effort()

Moviepy - Building video /home/ubuntu/QuantizeRL/cart_pole/tests/rl-video-episode-0.mp4.
Moviepy - Writing video /home/ubuntu/QuantizeRL/cart_pole/tests/rl-video-episode-0.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /home/ubuntu/QuantizeRL/cart_pole/tests/rl-video-episode-0.mp4




UnboundLocalError: local variable 'truncated' referenced before assignment

# Loading and saving the new checkpoints!

In [250]:
# save weights
torch.save(session.policy_net.state_dict(), "float-16-good-policy.pt")
torch.save(session.value_net.state_dict(), "float-16-good-value.pt")

In [113]:
# Load weights
env = CartPoleQuantized(TYPE, DEVICE)
session = PPOSession(env)
session.policy_net.load_state_dict(torch.load("checkpoints/float-32-good-policy.pt"))
session.value_net.load_state_dict(torch.load("checkpoints/float-32-good-value.pt"))

session.policy_net.eval()
session.value_net.eval()

torch.backends.quantized.engine = "qnnpack"

quantize_dynamic(
    session.policy_net,
    {torch.nn.Linear},
    dtype=torch.qint8,
    inplace=True
)

session.quantized = True
session.run(500)

Moviepy - Building video /Users/b0kch01/Documents/Code/QuantizeRL/cart_pole/training/torch.float32/rl-video-episode-0.mp4.
Moviepy - Writing video /Users/b0kch01/Documents/Code/QuantizeRL/cart_pole/training/torch.float32/rl-video-episode-0.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /Users/b0kch01/Documents/Code/QuantizeRL/cart_pole/training/torch.float32/rl-video-episode-0.mp4


  return self._call_impl(*args, **kwargs)


Moviepy - Building video /Users/b0kch01/Documents/Code/QuantizeRL/cart_pole/training/torch.float32/rl-video-episode-0.mp4.
Moviepy - Writing video /Users/b0kch01/Documents/Code/QuantizeRL/cart_pole/training/torch.float32/rl-video-episode-0.mp4



                                                                

Moviepy - Done !
Moviepy - video ready /Users/b0kch01/Documents/Code/QuantizeRL/cart_pole/training/torch.float32/rl-video-episode-0.mp4


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [None]:
#session.record_best_effort()

Moviepy - Building video /Users/b0kch01/Documents/Code/QuantizeRL/cart_pole/tests/rl-video-episode-0.mp4.
Moviepy - Writing video /Users/b0kch01/Documents/Code/QuantizeRL/cart_pole/tests/rl-video-episode-0.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /Users/b0kch01/Documents/Code/QuantizeRL/cart_pole/tests/rl-video-episode-0.mp4




Moviepy - Building video /Users/b0kch01/Documents/Code/QuantizeRL/cart_pole/tests/rl-video-episode-0.mp4.
Moviepy - Writing video /Users/b0kch01/Documents/Code/QuantizeRL/cart_pole/tests/rl-video-episode-0.mp4



                                                                   

Moviepy - Done !
Moviepy - video ready /Users/b0kch01/Documents/Code/QuantizeRL/cart_pole/tests/rl-video-episode-0.mp4


(15321.0, 15321)