# REINFORCE

This notebook implements and demonstrates the REINFORCE algorithm on the Lunar Lander environment (Gymnasium `LunarLander-v3`). We then compare it with REINFORCE with a learned baseline.

REINFORCE is an on-policy algorithm for learning a policy $\pi_\theta$ to maximize the expected return $J(\theta)$:

$$J(\theta) = \mathbb{E}_{\tau \sim \pi_\theta} \left[ \sum_{t=0}^T \gamma^t r(s_t, a_t) \right]$$

where $\tau = (s_0, a_0, s_1, a_1, \ldots)$ is a trajectory, $r(s_t, a_t)$ is the reward at time $t$, and $\gamma$ is the discount factor.


In [None]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

class Policy(nn.Module):
    """Small MLP policy that maps a state vector to action probabilities.

    The network is ``Linear(state_size, 64) -> ReLU -> Linear(64, action_size)``
    followed by a softmax over the action axis.

    Args:
        state_size: Number of features in a single state vector.
        action_size: Number of discrete actions.
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, action_size)

    def forward(self, x):
        """Return action probabilities for ``x``.

        Args:
            x: Float tensor whose last dimension is ``state_size``; may be
                batched ``(N, state_size)`` or a single ``(state_size,)`` vector.

        Returns:
            Tensor of the same leading shape whose last dimension holds the
            action probabilities (summing to 1).
        """
        hidden = torch.relu(self.fc1(x))
        # Normalise over the last (action) axis rather than axis 1, so that an
        # unbatched 1-D state works as well as a batched 2-D one.
        return torch.softmax(self.fc2(hidden), dim=-1)

    def act(self, state):
        """Sample an action for a single state.

        Args:
            state: NumPy array holding one state vector.

        Returns:
            Tuple ``(action, log_prob)`` where ``action`` is a Python int drawn
            from the policy's distribution and ``log_prob`` is a scalar tensor
            with the log-probability of that action (differentiable with
            respect to the policy parameters).
        """
        # Add a batch dimension of 1 before the forward pass.
        state = torch.from_numpy(state).float().unsqueeze(0)
        probs = self(state)
        action = torch.multinomial(probs, 1).item()
        log_prob = torch.log(probs.squeeze(0)[action])

        return action, log_prob

def reinforce(env, policy, optimizer, n_episodes=2000, gamma=0.99, *, plot=True):
    """Train ``policy`` with REINFORCE (Monte-Carlo policy gradient).

    Each iteration samples one complete episode with the current policy,
    computes the discounted return for every time step, and takes one
    optimizer step on the loss ``-sum_t log_prob_t * return_t``.

    Args:
        env: Environment whose ``reset()`` returns ``(observation, info)`` and
            whose ``step(action)`` returns
            ``(observation, reward, terminated, truncated, info)``.
        policy: Object exposing ``act(state) -> (action, log_prob)`` where
            ``log_prob`` is a scalar tensor attached to the autograd graph.
        optimizer: Optimizer over the policy's parameters.
        n_episodes: Number of training episodes.
        gamma: Discount factor applied to future rewards.
        plot: When true (the default), plot the per-episode total reward with
            matplotlib once training finishes.

    Returns:
        List with the undiscounted total reward of every episode.
    """
    all_rewards = []
    for episode in range(n_episodes):
        log_probs = []
        rewards = []
        state, _ = env.reset()

        # Roll out one complete episode with the current policy.
        done = False
        while not done:
            action, log_prob = policy.act(state)
            log_probs.append(log_prob)
            state, reward, terminated, truncated, _ = env.step(action)
            rewards.append(reward)
            done = terminated or truncated

        episode_reward = sum(rewards)
        all_rewards.append(episode_reward)

        # Discounted return G_t = r_t + gamma * G_{t+1}, accumulated backwards.
        # Append-then-reverse keeps this linear in the episode length.
        returns = []
        running_return = 0.0
        for r in reversed(rewards):
            running_return = r + gamma * running_return
            returns.append(running_return)
        returns.reverse()

        # Explicit float32 so the returns match the dtype of the log-probs.
        returns = torch.tensor(returns, dtype=torch.float32)
        # Standardise the returns to reduce gradient variance. The standard
        # deviation of a single element is NaN, which would poison the loss
        # and the parameters, so one-step episodes keep their raw return.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-9)

        policy_loss = -(torch.stack(log_probs) * returns).sum()

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        if episode % 100 == 0:
            print(f"Episode: {episode}, Total Reward: {episode_reward}")

    if plot:
        # Visualize rewards over time. Imported lazily so that matplotlib is
        # only needed when a plot is actually requested.
        import matplotlib.pyplot as plt
        plt.plot(all_rewards)
        plt.xlabel('Episode')
        plt.ylabel('Cumulative Reward')
        plt.title('Policy Gradient Training')
        plt.grid()
        plt.show()

    return all_rewards

# Train a fresh policy on LunarLander with plain REINFORCE.
env = gym.make("LunarLander-v3")
try:
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    policy = Policy(state_size, action_size)
    optimizer = optim.Adam(policy.parameters(), lr=1e-2)
    r_rewards = reinforce(env, policy, optimizer)
finally:
    # Release the training environment even if training is interrupted;
    # later cells create their own environment instances.
    env.close()


In [None]:
env = gym.make("LunarLander-v3", render_mode="human")

# Evaluate the policy on a single episode. Actions are still sampled from the
# policy's distribution (policy.act draws with torch.multinomial).
rewards = []
try:
    state, _ = env.reset()
    # No parameter update happens here, so skip autograd bookkeeping.
    with torch.no_grad():
        while True:
            action, _ = policy.act(state)
            state, reward, terminated, truncated, _ = env.step(action)
            rewards.append(reward)
            if terminated or truncated:
                break
finally:
    # Always release the environment (and its window), even if the rollout
    # raises part-way through.
    env.close()

print("Total reward:", sum(rewards))

In [None]:
class Baseline(nn.Module):
    """Two-layer MLP used as the learned baseline (state-value estimate).

    Architecture: ``Linear(input_size, 64) -> ReLU -> Linear(64, output_size)``
    with no activation on the output.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, output_size)

    def forward(self, x):
        """Return the raw network output for input tensor ``x``."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)

def reinforce_with_baseline(env, policy, baseline, policy_optimizer, baseline_optimizer, num_episodes, gamma=0.99, *, plot=True):
    """Train ``policy`` with REINFORCE using a learned baseline.

    Each iteration samples one complete episode, computes the (standardised)
    discounted returns, and then
    * updates the policy on ``-mean_t log_prob_t * (return_t - baseline_t)``
      with the baseline treated as a constant, and
    * regresses the baseline onto the same returns with a mean-squared error.

    Args:
        env: Environment whose ``reset()`` returns ``(observation, info)`` and
            whose ``step(action)`` returns
            ``(observation, reward, terminated, truncated, info)``.
        policy: Callable mapping a ``(1, state_size)`` float tensor to a
            ``(1, n_actions)`` tensor of action probabilities.
        baseline: Callable mapping a ``(1, state_size)`` float tensor to a
            ``(1, 1)`` value estimate.
        policy_optimizer: Optimizer over the policy's parameters.
        baseline_optimizer: Optimizer over the baseline's parameters.
        num_episodes: Number of training episodes.
        gamma: Discount factor applied to future rewards.
        plot: When true (the default), plot the per-episode total reward with
            matplotlib once training finishes.

    Returns:
        List with the undiscounted total reward of every episode.
    """
    all_rewards = []
    for episode in range(num_episodes):
        log_probs = []
        rewards = []
        baselines = []

        state, _ = env.reset()

        # Roll out one complete episode, recording the log-probability of the
        # chosen action and the baseline's estimate for every visited state.
        done = False
        while not done:
            state_tensor = torch.from_numpy(state).float().unsqueeze(0)
            action_probs = policy(state_tensor)
            action = torch.multinomial(action_probs, 1).item()

            log_probs.append(torch.log(action_probs.squeeze(0)[action]))
            baselines.append(baseline(state_tensor))

            state, reward, terminated, truncated, _ = env.step(action)
            rewards.append(reward)
            done = terminated or truncated

        all_rewards.append(sum(rewards))

        # Discounted return G_t = r_t + gamma * G_{t+1}, accumulated backwards.
        # Append-then-reverse keeps this linear in the episode length.
        returns = []
        running_return = 0.0
        for r in reversed(rewards):
            running_return = r + gamma * running_return
            returns.append(running_return)
        returns.reverse()

        returns = torch.tensor(returns, dtype=torch.float)
        # The standard deviation of a single element is NaN, so one-step
        # episodes keep their raw return instead of being standardised.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-9)

        # Drop only the trailing size-1 axis: a bare squeeze() would collapse
        # a one-step episode to a 0-d tensor and break the shapes below.
        baselines = torch.cat(baselines).squeeze(-1)

        # The baseline is a constant with respect to the policy update.
        advantages = returns - baselines.detach()

        policy_loss = -(torch.stack(log_probs) * advantages).mean()
        policy_optimizer.zero_grad()
        policy_loss.backward()
        policy_optimizer.step()

        baseline_loss = nn.MSELoss()(baselines, returns)
        baseline_optimizer.zero_grad()
        baseline_loss.backward()
        baseline_optimizer.step()

        if episode % 100 == 0:
            print(f"Episode {episode}, Policy Loss: {policy_loss.item()}, Baseline Loss: {baseline_loss.item()}")

    if plot:
        # Visualize rewards over time. Imported lazily so that matplotlib is
        # only needed when a plot is actually requested.
        import matplotlib.pyplot as plt
        plt.plot(all_rewards)
        plt.xlabel('Episode')
        plt.ylabel('Cumulative Reward')
        plt.title('Policy Gradient Training')
        plt.grid()
        plt.show()

    return all_rewards

# Train a fresh policy and baseline network on LunarLander with
# REINFORCE-with-baseline.
env = gym.make("LunarLander-v3")
try:
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    policy = Policy(state_size, action_size)
    baseline = Baseline(state_size, 1)
    policy_optimizer = optim.Adam(policy.parameters(), lr=1e-2)
    baseline_optimizer = optim.Adam(baseline.parameters(), lr=1e-2)

    num_episodes = 2000
    rb_rewards = reinforce_with_baseline(env, policy, baseline, policy_optimizer, baseline_optimizer, num_episodes, 0.99)
finally:
    # Release the training environment even if training is interrupted;
    # later cells create their own environment instances.
    env.close()

In [None]:
# Smooth rewards and plot both reinforce and reinforce with baseline
import matplotlib.pyplot as plt
def smooth_rewards(rewards, window_size=100):
    """Return the trailing moving average of ``rewards``.

    Element ``i`` of the result is the mean of the last ``window_size`` values
    ending at index ``i``; the first ``window_size - 1`` elements average over
    however many values are available so far.

    Args:
        rewards: Sequence of per-episode rewards (supports ``len`` and slicing).
        window_size: Maximum number of trailing values to average; must be >= 1.

    Returns:
        List of means, one per input element (empty for empty input).

    Raises:
        ValueError: If ``window_size`` is smaller than 1, which would otherwise
            average empty slices and silently produce NaNs.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be at least 1, got {window_size}")
    # `end` is the exclusive end of each window; the start is clamped at 0 so
    # the warm-up region and the steady state share one expression.
    return [
        np.mean(rewards[max(0, end - window_size):end])
        for end in range(1, len(rewards) + 1)
    ]

# Moving-average both reward histories, then draw them on one set of axes.
smoothed_r_rewards = smooth_rewards(r_rewards)
smoothed_rb_rewards = smooth_rewards(rb_rewards)

for _curve, _label in (
    (smoothed_r_rewards, 'REINFORCE'),
    (smoothed_rb_rewards, 'REINFORCE with Baseline'),
):
    plt.plot(_curve, label=_label)

plt.title('Comparison of REINFORCE and REINFORCE with Baseline')
plt.xlabel('Episode')
plt.ylabel('Cumulative Reward')
plt.grid()
plt.legend()
plt.show()

In [None]:
env = gym.make("LunarLander-v3", render_mode="human")

# Evaluate the policy on a single episode. Actions are still sampled from the
# policy's distribution (policy.act draws with torch.multinomial).
rewards = []
try:
    state, _ = env.reset()
    # No parameter update happens here, so skip autograd bookkeeping.
    with torch.no_grad():
        while True:
            action, _ = policy.act(state)
            state, reward, terminated, truncated, _ = env.step(action)
            rewards.append(reward)
            if terminated or truncated:
                break
finally:
    # Always release the environment (and its window), even if the rollout
    # raises part-way through.
    env.close()

print("Total reward:", sum(rewards))