## Playing Pong Using Only Pixel Values ft. Policy Gradient

In [None]:
# Import stuff
import gym
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time, os
from torch.utils.tensorboard import SummaryWriter

# Seed both PyTorch and NumPy: the network weights are initialised by PyTorch,
# while actions are sampled with np.random.choice in Agent.get_action.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

# Directory that receives the TensorBoard event files.
# Make sure to use a new directory for every new run so curves do not overlap.
log_dir = 'logs/pong_pg/pong_pg_01'
writer = SummaryWriter(log_dir)

# Device selection. The run is currently pinned to the CPU; swap in the
# commented-out line below to use a GPU when one is available.
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
print(device)

In [None]:
# Create the environment and seed it with the same seed used for torch/numpy
env = gym.make("Pong-v0")
env.seed(seed)

# Size of the discrete action space reported by the environment.
# NOTE(review): this value is only printed here; Agent is later constructed
# without it and falls back to its own default of 6 actions.
n_actions = env.action_space.n
print(n_actions)

In [None]:
# A function to grab the current game screen and return it as a 3D float numpy array
def get_image(env):
    """Render the current frame of ``env`` as a float32 array.

    The frame is requested with ``env.render(mode='rgb_array')``, converted to
    float32 and divided by 255 (so 8-bit pixel values land in [0, 1]), and its
    last axis is moved to the front via ``transpose(2, 0, 1)`` -- presumably
    turning a (height, width, channel) frame into (channel, height, width).
    """
    frame = env.render(mode='rgb_array')
    scaled = frame.astype(np.float32) / 255.0
    return np.transpose(scaled, (2, 0, 1))

In [None]:
# A class for the policy network. Here, we will use a convolutional neural network
# that will take an entire screen of game state and suggest an action from that.
class PolicyNetwork(nn.Module):
    """Convolutional policy: two conv + max-pool stages followed by two linear layers.

    The forward pass returns, for every item in the batch, a probability
    distribution over ``output_size`` actions (softmax over dimension 1).
    """

    def __init__(self, input_channels=3, input_height=210, input_width=160, output_size=6):
        """Build the layers.

        Args:
            input_channels: number of channels of the input image.
            input_height: height of the input image in pixels.
            input_width: width of the input image in pixels.
            output_size: number of actions the policy chooses between.
        """
        super().__init__()
        self.conv1 = nn.Conv2d(input_channels, 16, kernel_size=8, stride=4)
        self.pool1 = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=4, stride=2)
        self.pool2 = nn.MaxPool2d(2)

        def extent_after_features(size):
            # Follow one spatial dimension through conv1 -> pool1 -> conv2 -> pool2.
            after_first_stage = self.conv2d_size_out(size, 8, 4) // 2
            return self.conv2d_size_out(after_first_stage, 4, 2) // 2

        # 32 output channels times the remaining height and width
        flat_size = 32 * extent_after_features(input_height) * extent_after_features(input_width)
        self.fc1 = nn.Linear(flat_size, 256)
        self.fc2 = nn.Linear(256, output_size)

    def conv2d_size_out(self, size, kernel_size, stride):
        """Return the length of a dimension after an unpadded convolution."""
        return (size - kernel_size) // stride + 1

    def forward(self, x):
        """Map a batch of images to a batch of action-probability vectors."""
        features = self.pool1(F.relu(self.conv1(x)))
        features = self.pool2(F.relu(self.conv2(features)))
        # Collapse everything except the batch dimension
        flat = features.view(features.size(0), -1)
        hidden = F.relu(self.fc1(flat))
        return F.softmax(self.fc2(hidden), dim=1)

In [None]:
# A class for the agent
class Agent:
    """Policy-gradient agent: owns the policy network and its Adam optimizer.

    Actions are sampled from the probabilities produced by the policy, and
    ``update`` performs one gradient step on the sum of
    ``-reward * log_prob`` over a collected trajectory.
    """

    def __init__(self, learning_rate=0.001, n_actions=6):
        """Initialize agent

        Args:
            learning_rate: step size handed to the Adam optimizer.
            n_actions: number of discrete actions; also used as the size of
                the policy network's output layer so that the sampled action
                index and the probability vector always agree.
        """
        self.learning_rate = learning_rate
        # Original (misspelled) attribute name, kept so existing readers still work.
        self.learning_Rate = learning_rate
        self.n_actions = n_actions
        self.policy = PolicyNetwork(output_size=n_actions).to(device)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=learning_rate)

    def _device(self):
        """Return the device the policy's parameters currently live on."""
        return next(self.policy.parameters()).device

    def _state_to_tensor(self, state):
        """Turn a numpy state into a float batch of size 1 on the policy's device."""
        return torch.from_numpy(state).float().unsqueeze(0).to(self._device())

    def get_action(self, state):
        """Sample an action for ``state``.

        Args:
            state: numpy array holding a single (unbatched) observation.

        Returns:
            ``(action, log_prob)`` where ``action`` is the sampled index and
            ``log_prob`` is the log-probability of that action, still attached
            to the autograd graph so it can be used in ``update``.
        """
        probs = self.policy(self._state_to_tensor(state)).squeeze(0)
        # Sampling happens in numpy, so it follows np.random's seed.
        action = np.random.choice(self.n_actions, p=probs.detach().cpu().numpy())
        log_prob = torch.log(probs[action])
        return action, log_prob

    def get_probs(self, state):
        """Return the policy's action probabilities for ``state`` (batch of size 1)."""
        # The state is moved to the policy's device, exactly as in get_action.
        return self.policy(self._state_to_tensor(state))

    def update(self, rewards, log_probs):
        """Run one policy-gradient step.

        Args:
            rewards: one weight per time step (numpy array or sequence),
                e.g. discounted and normalized returns.
            log_probs: list of the log-probability tensors returned by
                ``get_action`` for the same time steps.
        """
        log_probs_tensor = torch.stack(log_probs)
        # Match dtype/device of the log-probs so float64 numpy rewards do not
        # force a mixed-precision multiplication.
        rewards_tensor = torch.as_tensor(
            rewards, dtype=log_probs_tensor.dtype, device=log_probs_tensor.device
        )

        # Reset parameter gradients
        self.optimizer.zero_grad()

        # "Loss" whose gradient is the policy-gradient estimate
        loss = torch.mul(rewards_tensor, -log_probs_tensor).sum()

        # Perform backprop
        loss.backward()

        # Optimize
        self.optimizer.step()

    def save_parameters(self, path):
        """Save policy's parameters, creating the parent directory if needed."""
        if isinstance(path, (str, os.PathLike)):
            parent = os.path.dirname(os.fspath(path))
            if parent:
                os.makedirs(parent, exist_ok=True)
        torch.save(self.policy.state_dict(), path)

    def load_parameters(self, path):
        """Load policy's parameters onto whatever device the policy is on."""
        # map_location lets a checkpoint written on a GPU be read on a CPU-only machine.
        state_dict = torch.load(path, map_location=self._device())
        self.policy.load_state_dict(state_dict)

In [None]:
# A function to discount a sequence of rewards, assume input is a 1D numpy array
def discount_rewards(rewards, discount_rate=0.9):
    """Return the discounted return at every time step.

    Element ``i`` of the result equals
    ``rewards[i] + discount_rate * rewards[i+1] + discount_rate**2 * rewards[i+2] + ...``.

    Args:
        rewards: 1D sequence of per-step rewards (numpy array or list).
        discount_rate: factor applied once per step of look-ahead.

    Returns:
        A new 1D floating-point numpy array of the same length. A floating
        input dtype is preserved; any other dtype is promoted to float64 so
        that integer rewards do not truncate the fractional returns.
    """
    rewards = np.asarray(rewards)
    if np.issubdtype(rewards.dtype, np.floating):
        out_dtype = rewards.dtype
    else:
        out_dtype = np.float64
    discounted_rewards = np.zeros(rewards.shape, dtype=out_dtype)
    total_reward = 0
    # Walk backwards so each step can reuse the return of the step after it
    for i in reversed(range(len(rewards))):
        discounted_rewards[i] = rewards[i] + discount_rate * total_reward
        total_reward = discounted_rewards[i]
    return discounted_rewards

# A function to normalize rewards, assume input is a 1D numpy array
def normalize_rewards(rewards):
    """Shift rewards to zero mean and scale them to unit standard deviation.

    Args:
        rewards: 1D sequence of rewards (numpy array or list).

    Returns:
        ``(rewards - mean) / std`` as a numpy array. When the standard
        deviation is (numerically) zero -- constant rewards or a single
        element -- only the mean is removed, so the result is all zeros
        instead of NaN/inf. An empty input yields an empty float array.
    """
    rewards = np.asarray(rewards)
    if rewards.size == 0:
        return rewards.astype(np.float64)
    centered = rewards - rewards.mean()
    std = rewards.std()
    # Dividing by a zero spread would poison the policy update with NaNs
    if std <= np.finfo(np.float64).eps:
        return centered
    return centered / std

# A function to simulate an episode
def simulate(env, agent, render=False, fps=30, max_steps=10000, detailed=False):
    """Play one episode with the agent's current policy and return its total reward.

    Args:
        env: environment to play in; it is reset at the start and closed once
            the episode reports ``done``.
        agent: object providing ``get_action(state)`` and ``get_probs(state)``.
        render: when True, call ``env.render()`` each step and sleep so that
            roughly ``fps`` frames are shown per second.
        fps: target frame rate used for the per-step sleep while rendering.
        max_steps: upper bound on the number of environment steps.
        detailed: when True, print the action probabilities at every step and
            the total reward at the end.

    Returns:
        The sum of the rewards collected during the episode.
    """
    seconds_per_frame = 1 / fps
    total_reward = 0
    env.reset()
    image = get_image(env)
    prev_image = image
    # The state is the difference between consecutive frames (all zeros at first)
    state = image - prev_image

    for _ in range(max_steps):
        if render:
            env.render()
            time.sleep(seconds_per_frame)

        if detailed:
            probs = agent.get_probs(state)
            print(probs)

        # Evaluation only: the log-probability is discarded, so skip building
        # an autograd graph for this forward pass.
        with torch.no_grad():
            action, _ = agent.get_action(state)
        _, reward, done, _ = env.step(action)
        total_reward += reward

        if done:
            env.close()
            break

        prev_image = image
        image = get_image(env)
        state = image - prev_image

    if detailed:
        print("Simulation complete - total reward:", total_reward)

    return total_reward

In [None]:
# Agent hyperparameters
learning_rate = 0.0001

# Initialize the agent (n_actions is left at the Agent default)
agent = Agent(learning_rate)

# Autosave settings
save_parameters = True   # write the policy parameters to disk during training
save_interval = 10       # save every `save_interval` epochs
save_path = 'models/pong_pg_v1.pth'
load_parameters_before_training = True

# Resume from an existing checkpoint when one is present at save_path
if load_parameters_before_training and os.path.exists(save_path):
    agent.load_parameters(save_path)
    print("Parameters loaded successfully")

In [None]:
# Training hyperparameters
n_epochs = 10000
discount_rate = 0.99

# Option to show the agent in training
show_simulation = False
epoch_per_simulation = 10

# Training loop: one full episode and one policy update per epoch
for epoch in range(n_epochs):
    rewards, log_probs = [], []
    done = False

    # Start a fresh episode; the first state is the (all-zero) frame difference
    env.reset()
    image = get_image(env)
    prev_image = image
    state = image - prev_image

    # Roll out the episode, remembering each reward and the log-probability
    # of the action that produced it
    while not done:
        action, log_prob = agent.get_action(state)
        _, reward, done, _ = env.step(action)
        rewards.append(reward)
        log_probs.append(log_prob)

        # Next state: difference between the new frame and the previous one
        prev_image, image = image, get_image(env)
        state = image - prev_image

    # Episode statistics, taken from the raw rewards before they are transformed
    score = sum(rewards)
    episode_length = len(rewards)

    # Discount, then normalize, then take one policy-gradient step
    rewards = normalize_rewards(
        discount_rewards(np.array(rewards), discount_rate=discount_rate)
    )
    agent.update(rewards, log_probs)

    # Track performance in TensorBoard and on stdout
    writer.add_scalar('score', score, epoch)
    writer.add_scalar('episode_length', episode_length, epoch)
    print("Epoch:", epoch, "\tScore:", score, "\tEpisode Length:", episode_length)

    # Periodic checkpoint
    if save_parameters and epoch % save_interval == 0:
        agent.save_parameters(save_path)

    # Optionally watch the agent play
    if show_simulation and epoch % epoch_per_simulation == 0:
        simulate(env, agent, render=True, fps=120)

In [None]:
# Play one rendered episode with the current agent, printing the action
# probabilities at every step and the total reward at the end
simulate(env, agent, render=True, detailed=True)

In [None]:
# Close the environment and the TensorBoard writer
env.close()
writer.close()