# Policy search
Policy search RL algorithm for playing the game of Space Invaders.

In [29]:
# Import the environment
import ale_py
import gymnasium as gym
gym.register_envs(ale_py) # needed to run atari games

In [30]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
import matplotlib.pyplot as plt

In [31]:
def preprocess_frame(frame):
    frame = frame[34:194] 
    frame = frame[::2, ::2, 0] 
    frame[frame == 144] = 0
    frame[frame == 109] = 0
    frame[frame != 0] = 1
    return np.expand_dims(frame.astype(np.float32), axis=0)

In [32]:
class PolicyNetwork(nn.Module):
    def __init__(self, action_space):
        super(PolicyNetwork, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=4, stride=2),
            nn.ReLU(),
        )
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(2048, 256),  # Updated input size after CNN
            nn.ReLU(),
            nn.Linear(256, action_space)
        )

    def forward(self, x):
        x = self.conv(x)
        x = self.fc(x)
        return nn.Softmax(dim=-1)(x)

def compute_discounted_rewards(rewards, gamma):
    discounted_rewards = np.zeros_like(rewards)
    cumulative = 0
    for t in reversed(range(len(rewards))):
        cumulative = rewards[t] + gamma * cumulative
        discounted_rewards[t] = cumulative
    return discounted_rewards

In [None]:
def play_one_step(env, state, model):
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
    action_probs = model(state)
    dist = Categorical(action_probs)
    action = dist.sample()
    log_prob = dist.log_prob(action)

    next_state, reward, done, _, _ = env.step(action.item())
    next_state = preprocess_frame(next_state)
    return next_state, reward, done, log_prob

def play_multiple_episodes(env, n_episodes, n_max_steps, model):
    all_rewards = []
    all_log_probs = []

    for _ in range(n_episodes):
        current_rewards = []
        current_log_probs = []
        state, _ = env.reset()
        state = preprocess_frame(state)

        for _ in range(n_max_steps):
            state, reward, done, log_prob = play_one_step(env, state, model)
            current_rewards.append(reward)
            current_log_probs.append(log_prob)
            if done:
                break

        all_rewards.append(current_rewards)
        all_log_probs.append(current_log_probs)

    return all_rewards, all_log_probs

def discount_rewards(rewards, discount_factor):
    discounted = np.zeros_like(rewards, dtype=np.float32)
    cumulative = 0
    for t in reversed(range(len(rewards))):
        cumulative = rewards[t] + discount_factor * cumulative
        discounted[t] = cumulative
    return discounted

def discount_and_normalize_rewards(all_rewards, discount_factor):
    all_discounted_rewards = [discount_rewards(rewards, discount_factor) for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    return [(discounted_rewards - reward_mean) / (reward_std + 1e-8) for discounted_rewards in all_discounted_rewards]

# Hyperparameters
env_name = 'ALE/SpaceInvaders-v5'
n_iterations = 150
n_episodes_per_update = 10
n_max_steps = 200
discount_factor = 0.95
learning_rate = 1e-3

# Environment and model setup
env = gym.make(env_name, render_mode=None)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

policy = PolicyNetwork(env.action_space.n).to(device)
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

# Training loop
for iteration in range(n_iterations):
    all_rewards, all_log_probs = play_multiple_episodes(
        env, n_episodes_per_update, n_max_steps, policy
    )

    all_final_rewards = discount_and_normalize_rewards(all_rewards, discount_factor)

    all_mean_grads = []
    policy_loss = 0
    for log_probs, rewards in zip(all_log_probs, all_final_rewards):
        log_probs_tensor = torch.stack(log_probs)
        rewards_tensor = torch.tensor(rewards, dtype=torch.float32, device=device)
        policy_loss += -torch.sum(log_probs_tensor * rewards_tensor)

    policy_loss /= n_episodes_per_update

    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()

    mean_reward = np.mean([sum(rewards) for rewards in all_rewards])
    print(f"Iteration {iteration + 1}/{n_iterations}: Mean Reward = {mean_reward:.2f}")

env.close()


Using device: cpu
Iteration 1/150: Mean Reward = 51.50
Iteration 2/150: Mean Reward = 60.50
Iteration 3/150: Mean Reward = 46.50
Iteration 4/150: Mean Reward = 41.50
Iteration 5/150: Mean Reward = 33.50
Iteration 6/150: Mean Reward = 46.00
Iteration 7/150: Mean Reward = 31.00
Iteration 8/150: Mean Reward = 39.50
Iteration 9/150: Mean Reward = 51.00
Iteration 10/150: Mean Reward = 39.50
Iteration 11/150: Mean Reward = 93.00
Iteration 12/150: Mean Reward = 89.50
Iteration 13/150: Mean Reward = 82.50
Iteration 14/150: Mean Reward = 81.50
Iteration 15/150: Mean Reward = 88.00
Iteration 16/150: Mean Reward = 105.00
Iteration 17/150: Mean Reward = 112.50
Iteration 18/150: Mean Reward = 115.00
Iteration 19/150: Mean Reward = 109.50
Iteration 20/150: Mean Reward = 115.00
Iteration 21/150: Mean Reward = 115.00
Iteration 22/150: Mean Reward = 115.00
Iteration 23/150: Mean Reward = 115.00
Iteration 24/150: Mean Reward = 115.00
Iteration 25/150: Mean Reward = 115.00
Iteration 26/150: Mean Reward =