# On-Policy RL

In this assignment, we will implement and test REINFORCE and PPO, which are both on-policy RL algorithms.

## Setup

In [1]:
# Install the third-party packages this notebook imports, in quiet mode (-q):
# Gymnasium with its MuJoCo extra, and imageio (used below to write the MP4 files).
!pip -q install gymnasium[mujoco]
!pip -q install imageio

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.8/211.8 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import gymnasium as gym
import random
import matplotlib
from matplotlib import pyplot as plt
import numpy as np
from collections import namedtuple, deque
import imageio

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
from torch.distributions import MultivariateNormal
from torch.distributions import Categorical

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare last expression so the notebook displays which device was selected.
device

device(type='cpu')

## Explore the environment

We will train a REINFORCE agent on the `CartPole` environment.

In [3]:
from IPython.display import HTML
from base64 import b64encode

def show_video(path):
    """Return an inline HTML5 video player for the MP4 file at ``path``.

    The whole file is read, base64-encoded and embedded as a ``data:`` URL,
    so the returned object does not reference the file after this call.

    Args:
        path: Filesystem path of the MP4 file to embed.

    Returns:
        An ``HTML`` display object wrapping a ``<video>`` element.

    Raises:
        OSError: If ``path`` cannot be opened or read.
    """
    # The context manager closes the handle even if the read fails; the
    # previous bare ``open(...).read()`` left that to the garbage collector.
    with open(path, 'rb') as video_file:
        mp4 = video_file.read()
    data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
    return HTML("""
    <video width=400 controls>
          <source src="%s" type="video/mp4">
    </video>
    """ % data_url)

In [None]:
# Roll out a random policy in CartPole for at most 100 steps and save the
# rendered frames as a video, to get a feel for the environment.
env = gym.make("CartPole-v1", render_mode="rgb_array")
frames = []

env.reset()

for _ in range(100):
    # Capture the frame before stepping so the initial state is included.
    frames.append(env.render())
    # Pick one of the two actions (0 or 1) uniformly at random.
    action = np.random.randint(0, 2)
    obs, reward, terminated, truncated, info = env.step(action)
    # Stop as soon as the episode ends, either by termination or truncation.
    if terminated or truncated:
        break
env.close()
# Write the collected frames to an MP4 and show it inline (last expression).
imageio.mimsave('./CartPole.mp4', frames, fps=10, macro_block_size=1)
show_video('./CartPole.mp4')


## Policy Network

In [None]:
class PolicyNetwork(nn.Module):
    """MLP policy mapping a state to a probability distribution over discrete actions.

    Args:
        input_size: Number of features in the state vector.
        hidden_size: Width of each of the two hidden layers.
        output_size: Number of discrete actions.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(PolicyNetwork, self).__init__()
        self.policy_net = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
            # Normalise over the action (last) axis. For a single 1-D state
            # this equals dim=0, but it also stays correct for a batch of
            # states, where dim=0 would normalise across the batch instead.
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        """Return action probabilities for ``x``.

        Args:
            x: Float tensor whose last dimension has ``input_size`` features;
                any leading dimensions are treated as batch dimensions.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of ``output_size`` that sums to 1.
        """
        action_probs = self.policy_net(x)
        return action_probs

## Agent

The REINFORCE algorithm works by interacting with an environment, taking actions sampled from a policy. As the agent collects rewards from the environment, it records them together with the **log probabilities** of the actions it took. At the end of an episode, the algorithm calculates the total **discounted reward** from each step onward—this is known as the return.

$$ R_t = \sum_{k=t}^{T} \gamma^{k-t} r_k
 $$

These returns are used to weight the log probabilities, so that actions that lead to higher returns are made more probable.


$$ \theta \leftarrow \theta + \alpha \sum_{t=0}^{T-1} \gamma^t R_t \nabla_\theta \log \pi_\theta(a_t|s_t)
 $$


In [None]:
class REINFORCEAgent:
    """Monte-Carlo policy-gradient (REINFORCE) agent for a discrete-action policy.

    During an episode the agent records the log-probability of every sampled
    action (``select_action``) and every reward (``store_reward``).
    ``update_policy`` then turns the rewards into discounted returns, takes
    one optimizer step on ``-sum(log_prob * return)`` and empties both buffers.

    Args:
        policy: Module mapping a state tensor to action probabilities.
        optimizer: Optimizer over ``policy``'s parameters.
        gamma: Discount factor used when computing returns.
    """

    def __init__(self, policy, optimizer, gamma=0.99):
        self.policy = policy
        self.optimizer = optimizer
        self.gamma = gamma
        self.log_probs = []
        self.rewards = []

    def select_action(self, state):
        """Sample an action for ``state`` from the current policy.

        Args:
            state: Observation as a NumPy array, sequence or tensor.

        Returns:
            Tensor holding the sampled action index.
        """
        # Cast to float32 so float64 arrays and plain lists work with the
        # policy's float32 weights (torch.from_numpy kept the source dtype).
        state = torch.as_tensor(state, dtype=torch.float32)
        probs = self.policy(state)
        m = Categorical(probs)
        action = m.sample()
        # Only record log-probs that can be back-propagated. Under
        # torch.no_grad() (evaluation) they would be useless for an update
        # and would misalign with the rewards of the next training episode.
        if torch.is_grad_enabled():
            self.log_probs.append(m.log_prob(action))
        return action

    def update_policy(self):
        """Run one REINFORCE update on the stored episode and clear the buffers.

        Does nothing except clear the buffers when no log-probabilities or no
        rewards have been recorded.
        """
        if not self.log_probs or not self.rewards:
            # Nothing to learn from; torch.stack([]) would raise below.
            del self.rewards[:]
            del self.log_probs[:]
            return

        # Discounted returns R_t = r_t + gamma * R_{t+1}, built back-to-front
        # with append + reverse (linear) rather than insert(0, ...) (quadratic).
        returns = []
        R = 0.0
        for r in reversed(self.rewards):
            R = r + self.gamma * R
            returns.append(R)
        returns.reverse()
        returns = torch.tensor(returns, dtype=torch.float32)

        # Negative sign: the optimizer minimises, we want to maximise return.
        policy_loss = [-log_prob * G for log_prob, G in zip(self.log_probs, returns)]

        self.optimizer.zero_grad()
        policy_loss = torch.stack(policy_loss).sum()
        policy_loss.backward()
        self.optimizer.step()

        del self.rewards[:]
        del self.log_probs[:]

    def store_reward(self, reward):
        """Append the reward received for the most recent action."""
        self.rewards.append(reward)


## Training

In [None]:
# Train a REINFORCE agent on CartPole, one policy update per episode.
env = gym.make('CartPole-v1')
# Hard-coded network sizes.
# NOTE(review): presumably CartPole's observation size and number of actions —
# confirm against env.observation_space / env.action_space.
input_size = 4
output_size = 2
lr = 1e-3

# Policy with 32 hidden units, optimised with Adam; the agent uses its default gamma.
policy = PolicyNetwork(input_size, 32, output_size)
optimizer = optim.Adam(policy.parameters(), lr=lr)
agent = REINFORCEAgent(policy, optimizer)

num_episodes = 1000

for episode in range(num_episodes):
    state, info = env.reset()
    total_reward = 0

    policy.train()
    # Collect one full episode with the current policy.
    while True:
        # select_action returns a tensor; .item() converts it to a Python int for env.step.
        action = agent.select_action(state).item()
        next_state, reward, terminate, truncated, _ = env.step(action)
        agent.store_reward(reward)
        state = next_state
        total_reward += reward
        if terminate or truncated:
          break

    # One gradient step on the finished episode; this also clears the agent's buffers.
    agent.update_policy()
    # Report the reward of every 50th episode (a single episode, not an average).
    if episode % 50 == 0:
        print(f'Episode {episode}: Total Reward = {total_reward}')
env.close()

Episode 0: Total Reward = 30.0
Episode 50: Total Reward = 17.0
Episode 100: Total Reward = 12.0
Episode 150: Total Reward = 55.0
Episode 200: Total Reward = 113.0
Episode 250: Total Reward = 96.0
Episode 300: Total Reward = 33.0
Episode 350: Total Reward = 85.0
Episode 400: Total Reward = 209.0
Episode 450: Total Reward = 94.0
Episode 500: Total Reward = 366.0
Episode 550: Total Reward = 500.0
Episode 600: Total Reward = 500.0
Episode 650: Total Reward = 231.0
Episode 700: Total Reward = 236.0
Episode 750: Total Reward = 500.0
Episode 800: Total Reward = 500.0
Episode 850: Total Reward = 500.0
Episode 900: Total Reward = 500.0
Episode 950: Total Reward = 500.0


## Evaluation

In [None]:
# Run the trained REINFORCE policy for one episode, record it and show the video.
env = gym.make("CartPole-v1", render_mode="rgb_array")
state, _ = env.reset()
frames = [env.render()]

total_reward = 0
policy.eval()
# No gradients are needed for evaluation.
with torch.no_grad():
  while True:
      # Actions are still sampled from the policy distribution (not argmax).
      action = agent.select_action(state).item()
      next_state, reward, terminate, truncated, _ = env.step(action)
      frames.append(env.render())
      state = next_state
      total_reward += reward
      if terminate or truncated:
        break

# select_action() may have recorded a log-probability for every evaluation
# step. Drop them so a later update_policy() call is not fed gradient-free
# entries that would be paired with the wrong returns.
del agent.log_probs[:]

env.close()
print(f'Total Reward: {total_reward}')

imageio.mimsave('./eval_reinforce.mp4', frames, fps=25, macro_block_size=1)
show_video('./eval_reinforce.mp4')

Total Reward: 500.0


# Proximal Policy Optimization

## Setup

### Explore the environment

In [4]:
# Configure MuJoCo to use the EGL rendering backend (requires GPU)
# NOTE(review): the device cell earlier in this notebook reported `cpu`;
# confirm that EGL rendering is actually available in this runtime.
%env MUJOCO_GL=egl

env: MUJOCO_GL=egl


We will train a PPO agent in the `HalfCheetah` environment. This environment features continuous actions and more complex mechanics.

In [5]:
# Roll out a random policy in HalfCheetah for at most 100 steps and save the
# rendered frames as a video, to get a feel for the environment.
env = gym.make("HalfCheetah-v4", render_mode="rgb_array")
env.reset()
frames = []

for _ in range(100):
    # Capture the frame before stepping so the initial state is included.
    frames.append(env.render())
    # Random continuous action: 6 values drawn uniformly from [-1, 1).
    # NOTE(review): size and bounds are hard-coded; presumably they match
    # env.action_space — confirm.
    action = np.random.uniform(low=-1.0, high=1.0, size=6)
    obs, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        break
env.close()
# Write the collected frames to an MP4 and show it inline (last expression).
imageio.mimsave('./HalfCheetah.mp4', frames, fps=20)
show_video('./HalfCheetah.mp4')


## Actor and Critic

Proximal Policy Optimization (PPO) is an advanced reinforcement learning algorithm that uses separate actor and critic networks to optimize policy performance.

The actor network predicts the parameters of a probability distribution over actions given the current state: the probability of each action in the discrete case, or the mean and standard deviation of a Gaussian in the continuous case. The critic network evaluates how good the current state is by predicting the expected return from that state.


In [15]:
class Actor(nn.Module):
    """Gaussian policy network: a shared tanh trunk with two output heads.

    ``forward`` returns the mean and the standard deviation of a per-dimension
    Normal distribution over actions. Both heads end in ``tanh``, so the mean
    lies in ``[-1, 1]`` and the standard deviation (the exponential of the
    second head's output) lies in ``[exp(-1), exp(1)]``.

    Args:
        state_dim: Size of the input state vector.
        hidden_dim: Width of the two trunk layers.
        action_dim: Number of action dimensions.
    """

    def __init__(self, state_dim, hidden_dim, action_dim):
        super().__init__()

        # Shared feature extractor.
        trunk_layers = [
            nn.Linear(state_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
        ]
        self.backbone = nn.Sequential(*trunk_layers)

        # Mean head.
        self.mu = nn.Sequential(nn.Linear(hidden_dim, action_dim), nn.Tanh())

        # Despite the attribute name, this head produces the *log* of the
        # standard deviation; forward() exponentiates it.
        self.std = nn.Sequential(nn.Linear(hidden_dim, action_dim), nn.Tanh())

    def forward(self, state):
        """Return ``(mean, std)`` of the action distribution for ``state``."""
        features = self.backbone(state)
        mean = self.mu(features)
        std = self.std(features).exp()
        return mean, std

class Critic(nn.Module):
    """State-value network: a three-layer MLP producing one scalar per state.

    The first hidden layer uses ``tanh`` and the second uses ``ReLU``.

    Args:
        state_dim: Size of the input state vector.
        hidden_dim: Width of the two hidden layers.
    """

    def __init__(self, state_dim, hidden_dim):
        super().__init__()
        value_layers = [
            nn.Linear(state_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        ]
        self.critic_net = nn.Sequential(*value_layers)

    def forward(self, state):
        """Return the predicted value for ``state`` (last dimension of size 1)."""
        return self.critic_net(state)


## Memory

In [16]:
class Memory:
    """Rollout buffer: five parallel lists filled while interacting with the environment."""

    def __init__(self):
        self.actions, self.states = [], []
        self.logprobs, self.rewards = [], []
        self.state_values = []

    def clear(self):
        """Empty every buffer in place (the list objects themselves are kept)."""
        buffers = (
            self.actions,
            self.states,
            self.logprobs,
            self.rewards,
            self.state_values,
        )
        for buffer in buffers:
            del buffer[:]


## Agent

In PPO, the actor's goal is to maximize the expected return. However, direct maximization can cause large policy updates, risking instability. To prevent this, PPO employs a clipping mechanism, limiting policy changes to a defined range.

$$ L^{CLIP}(\theta) = \hat{\mathbb{E}}_t \left[ \min(r_t(\theta) \hat{A}_t, \text{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon) \hat{A}_t) \right]
 $$

Additionally, it uses a probability ratio between the new and the old policy. This ratio provides a scaling factor for the policy updates, ensuring that changes are made in proportion to the improvement in policy performance.

$$ r_t(\theta) = \frac{\pi_\theta(a_t|s_t)}{\pi_{\theta_{old}}(a_t|s_t)}
 $$

 The critic aims to minimize the error between its predictions and the actual returns.

 $$ L^{VF}(\phi) = \left( V_\phi(s_t) - \hat{R}_t \right)^2
 $$

In [18]:
class PPO(nn.Module):
    """Proximal Policy Optimization agent with separate actor and critic networks.

    One rollout is collected into ``self.memory`` through ``select_action``
    and ``store_reward``; ``update`` then performs ``epochs`` clipped-surrogate
    optimisation passes over that rollout and empties the buffer.

    Args:
        state_dim: Size of the state vector fed to both networks.
        action_dim: Number of action dimensions produced by the actor.
        hidden_size: Hidden width of the actor and critic networks.
        lr: Learning rate used by both Adam optimizers.
        gamma: Discount factor for the Monte-Carlo returns.
        epochs: Number of optimisation passes over each rollout.
        eps_clip: Clipping range ``epsilon`` for the probability ratio.
    """

    def __init__(self, state_dim, action_dim, hidden_size=64, lr=1e-4, gamma=0.99, epochs=4, eps_clip=0.2):
        super(PPO, self).__init__()
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.epochs = epochs

        self.actor = Actor(state_dim, hidden_size, action_dim).to(device)
        self.critic = Critic(state_dim, hidden_size).to(device)

        self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=lr)
        self.optimizer_critic = optim.Adam(self.critic.parameters(), lr=lr)
        self.memory = Memory()

    def select_action(self, state):
        """Sample an action for ``state`` and record the transition in memory.

        The state, the critic's value estimate, the sampled action and its
        log-probability are appended to ``self.memory`` on every call.

        Args:
            state: Observation as a NumPy array, sequence or tensor.

        Returns:
            The sampled action as a NumPy array.
        """
        # Build a float32 tensor directly on the target device instead of
        # going through the legacy torch.FloatTensor constructor.
        state = torch.as_tensor(state, dtype=torch.float32, device=device)

        # The rollout is collected with the current policy; gradients are
        # recomputed in evaluate() during the update.
        with torch.no_grad():
            state_value = self.critic(state)
            mu, std = self.actor(state)

        m = Normal(mu, std)
        action = m.rsample()
        log_prob = m.log_prob(action)

        self.memory.states.append(state.detach())
        self.memory.state_values.append(state_value.detach())
        self.memory.actions.append(action.detach())
        self.memory.logprobs.append(log_prob.detach())

        return action.detach().cpu().numpy()

    def evaluate(self, states, actions):
        """Re-evaluate stored ``actions`` in ``states`` under the current networks.

        Returns:
            Tuple ``(log_prob, state_value, entropy)``; ``log_prob`` and
            ``entropy`` are per action dimension.
        """
        state_value = self.critic(states)
        mu, std = self.actor(states)
        m = Normal(mu, std)
        log_prob = m.log_prob(actions)

        return log_prob, state_value, m.entropy()

    def update(self):
        """Run ``epochs`` PPO updates on the stored rollout and clear the buffer.

        Returns:
            Tuple ``(loss_ac, loss_cri)``: the actor and critic losses summed
            over all epochs. ``(0.0, 0.0)`` is returned when the buffer holds
            no rewards or no states.
        """
        if not self.memory.rewards or not self.memory.states:
            # Nothing to learn from; torch.stack([]) would raise below.
            self.memory.clear()
            return 0.0, 0.0

        # Discounted returns, built back-to-front with append + reverse
        # (linear) rather than insert(0, ...) (quadratic in rollout length).
        returns = []
        discounted_reward = 0.0
        for r in reversed(self.memory.rewards):
            discounted_reward = r + self.gamma * discounted_reward
            returns.append(discounted_reward)
        returns.reverse()
        # Column vector so it lines up with the critic's (T, 1) output.
        rewards = torch.tensor(returns, dtype=torch.float32, device=device).reshape(-1, 1)

        mem_states = torch.stack(self.memory.states).to(device)
        mem_state_values = torch.stack(self.memory.state_values).to(device)
        mem_actions = torch.stack(self.memory.actions).to(device)
        mem_logprobs = torch.stack(self.memory.logprobs).to(device)

        # clear the buffer
        self.memory.clear()

        # Advantage = Monte-Carlo return minus the value predicted at collection time.
        advantages = rewards - mem_state_values

        loss_ac = 0
        loss_cri = 0
        for _ in range(self.epochs):
            # calculate logprobs and state values based on the new policy
            logprobs, state_values, entropy = self.evaluate(mem_states, mem_actions)

            # NOTE(review): log-probs are not summed over action dimensions,
            # so the ratio is formed (and clipped) per dimension rather than
            # on the joint action probability — confirm this is intended.
            prob_ratio = torch.exp(logprobs - mem_logprobs)
            clipped = torch.clamp(prob_ratio, min=1-self.eps_clip, max=1+self.eps_clip)
            # Clipped surrogate objective plus a 0.01-weighted entropy bonus,
            # negated because the optimizer minimises.
            loss_actor = -(torch.min(prob_ratio * advantages, clipped * advantages) + 0.01 * entropy).mean()
            loss_critic = F.mse_loss(state_values, rewards)

            self.optimizer_actor.zero_grad()
            loss_actor.backward()
            loss_ac += loss_actor.item()
            self.optimizer_actor.step()

            self.optimizer_critic.zero_grad()
            loss_critic.backward()
            loss_cri += loss_critic.item()
            self.optimizer_critic.step()

        return loss_ac, loss_cri

    def store_reward(self, reward):
        """Append the reward received for the most recent action."""
        self.memory.rewards.append(reward)

## Training

In [None]:
# Train a PPO agent on HalfCheetah, one update per collected episode.
env = gym.make("HalfCheetah-v4")
# Hard-coded network sizes.
# NOTE(review): presumably HalfCheetah's observation and action sizes —
# confirm against env.observation_space / env.action_space.
state_dim = 17
action_dim = 6
hidden_size = 64
lr = 1e-4

# `agent` and `model` are two names for the same PPO object.
agent = model = PPO(state_dim, action_dim, hidden_size=hidden_size, lr=lr, epochs=4)

num_episodes = 20000

# Per-episode loss history; each entry is the loss summed over the update's epochs.
actor_losses = []
critic_losses = []
# Episode rewards accumulated since the last progress report.
moving_rewards = np.array([])

for episode in range(num_episodes):
    state, _ = env.reset()
    total_reward = 0
    # Collect one full episode; select_action also stores the transition in agent.memory.
    while True:
        action = agent.select_action(state)
        next_state, reward, terminate, truncated, _ = env.step(action)
        agent.store_reward(reward)
        state = next_state
        total_reward += reward
        if terminate or truncated:
          break
    # Optimise on the finished episode; update() clears the agent's memory.
    loss_ac, loss_cri = agent.update()
    actor_losses.append(loss_ac)
    critic_losses.append(loss_cri)
    moving_rewards = np.append(moving_rewards, total_reward)
    # Every 100 episodes report mean/std of the rewards gathered since the
    # previous report (only a single episode at episode 0), then reset the window.
    if episode % 100 == 0:
        print(f'Episode {episode}: Going Reward = {moving_rewards.mean():.1f}: Std = {moving_rewards.std():.1f}')
        print(f"actor loss:\t{loss_ac:.4f}")
        print(f"critic loss:\t{loss_cri:.4f}")
        moving_rewards = np.array([])

env.close()

Episode 0: Going Reward = -805.5: Std = 0.0
actor loss:	291.9532
critic loss:	23856.1528
Episode 100: Going Reward = -669.5: Std = 66.2
actor loss:	199.7883
critic loss:	11487.8989
Episode 200: Going Reward = -672.2: Std = 72.5
actor loss:	223.2991
critic loss:	14029.6355
Episode 300: Going Reward = -677.5: Std = 67.9
actor loss:	126.3478
critic loss:	6385.1067
Episode 400: Going Reward = -662.3: Std = 60.4
actor loss:	36.1700
critic loss:	2233.1116
Episode 500: Going Reward = -621.7: Std = 60.5
actor loss:	7.6181
critic loss:	1570.2376
Episode 600: Going Reward = -584.2: Std = 64.9
actor loss:	-18.3627
critic loss:	1134.8535
Episode 700: Going Reward = -544.9: Std = 68.6
actor loss:	-14.6477
critic loss:	1754.4107
Episode 800: Going Reward = -500.3: Std = 59.1
actor loss:	-2.3683
critic loss:	761.3340
Episode 900: Going Reward = -478.5: Std = 71.2
actor loss:	25.6277
critic loss:	1120.8704
Episode 1000: Going Reward = -426.8: Std = 70.7
actor loss:	-7.9587
critic loss:	424.9332
Episod

## Evaluation

In [None]:
# Run the trained PPO policy for one episode, record it and show the video.
env = gym.make("HalfCheetah-v4", render_mode="rgb_array")
state, _ = env.reset()
frames = [env.render()]

total_reward = 0
# No gradients are needed for evaluation.
with torch.no_grad():
  while True:
      # Actions are still sampled from the policy distribution (not its mean).
      action = agent.select_action(state)
      next_state, reward, terminate, truncated, _ = env.step(action)
      frames.append(env.render())
      state = next_state
      total_reward += reward
      if terminate or truncated:
        break

# select_action() stored every evaluation transition in agent.memory, but no
# rewards were stored alongside them. Drop them so a later agent.update()
# does not see buffers of mismatched length.
agent.memory.clear()

env.close()
print(f'Total Reward: {total_reward}')

imageio.mimsave('./eval_ppo.mp4', frames, fps=25)
show_video('./eval_ppo.mp4')

Total Reward: 2275.383372650927
