## Playing Pong Using Only Pixel Values ft. Policy Gradient

In [1]:
# Import stuff
import gym
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time

# Fix the seeds of both RNG sources so that runs are reproducible
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cpu


In [2]:
# Create the Pong environment and seed it with the shared `seed` value
env = gym.make("Pong-v0")
env.seed(seed)

# Size of the action space (`action_space.n`); PolicyNetwork.forward draws
# its random action index from range(n_actions)
n_actions = env.action_space.n
print(n_actions)

6


In [3]:
# Policy network. The plan is a convolutional neural network that takes the
# screen of the game and suggests an action from it. The layers are not
# written yet, so for now the forward pass ignores its input and picks an
# action index at random.
class PolicyNetwork(nn.Module):
    """Placeholder policy that returns a random action index."""

    def __init__(self):
        # No layers are defined yet; only run the nn.Module setup
        super(PolicyNetwork, self).__init__()

    def forward(self, X):
        # X is currently unused: draw an integer in [0, n_actions)
        return np.random.randint(0, n_actions)

In [4]:
# Agent that owns a policy network and asks it for actions
class Agent:
    """Thin wrapper that delegates action selection to its policy."""

    def __init__(self):
        # Build the policy this agent will query
        self.policy = PolicyNetwork()

    def get_action(self, state):
        """Return whatever the policy produces for ``state``."""
        chosen_action = self.policy(state)
        return chosen_action

In [5]:
# A function to simulate an episode
def simulate(env, agent, render=False, fps=30, max_steps=10000):
    """Play one episode of ``env`` with ``agent`` and return the total reward.

    Args:
        env: Environment exposing ``reset()``, ``step(action)``, ``render()``
            and ``close()``. ``step`` must return a 4-tuple whose first three
            items are ``(state, reward, done)``.
        agent: Object whose ``get_action(state)`` returns the next action.
        render: When true, draw every frame and pause between frames.
        fps: Target frames per second while rendering. Only used (and only
            required to be non-zero) when ``render`` is true.
        max_steps: Upper bound on the number of environment steps.

    Returns:
        The sum of the rewards collected during the episode.
    """
    # Only needed for rendering, so a non-rendered run never divides by fps
    seconds_per_frame = 1 / fps if render else 0
    total_reward = 0
    state = env.reset()

    try:
        for _step in range(max_steps):
            if render:
                env.render()
                time.sleep(seconds_per_frame)

            action = agent.get_action(state)
            state, reward, done, _info = env.step(action)
            total_reward += reward

            if done:
                break
    finally:
        # Close on every exit path: normal termination, hitting max_steps,
        # or an exception raised while rendering/stepping
        env.close()

    print("Simulation complete - total reward:", total_reward)
    return total_reward

In [6]:
# Create the agent (its policy network is built inside Agent.__init__)
agent = Agent()

In [7]:
# Training hyperparameters
n_epochs = 1  # number of passes of the outer loop
episodes_per_epoch = 1  # episodes played in each epoch
max_steps = 10000  # upper bound on environment steps per episode

# Training loop
# NOTE: this currently only rolls out episodes. `reward` is unpacked but never
# used, and no parameter update is performed anywhere in the loop.
for epoch in range(n_epochs):
    for episode in range(episodes_per_epoch):
        # Start a fresh episode
        state = env.reset()
        for step in range(max_steps):
            # Ask the agent for an action and advance the environment by one step
            action = agent.get_action(state)
            state, reward, done, _ = env.step(action)
            
            # Stop stepping once the environment reports the episode is over
            if done:
                break

In [None]:
# Play one rendered episode with the agent; simulate prints and returns the total reward
simulate(env, agent, render=True)