## Imports

In [1]:
%%capture
!apt install python-opengl
!apt install ffmpeg
!apt install xvfb
!pip install pyvirtualdisplay
!pip install pyglet==1.5.1

In [2]:
# Virtual display
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7b32a3114220>

In [3]:
!pip install -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit4/requirements-unit4.txt

Collecting git+https://github.com/ntasfi/PyGame-Learning-Environment.git (from -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit4/requirements-unit4.txt (line 1))
  Cloning https://github.com/ntasfi/PyGame-Learning-Environment.git to /tmp/pip-req-build-_75i3zap
  Running command git clone --filter=blob:none --quiet https://github.com/ntasfi/PyGame-Learning-Environment.git /tmp/pip-req-build-_75i3zap
  Resolved https://github.com/ntasfi/PyGame-Learning-Environment.git to commit 3dbe79dc0c35559bb441b9359948aabf9bb3d331
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting git+https://github.com/simoninithomas/gym-games (from -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit4/requirements-unit4.txt (line 2))
  Cloning https://github.com/simoninithomas/gym-games to /tmp/pip-req-build-mx4rzg9v
  Running command git clone --filter=blob:none --quiet https://github.com/simoninithomas/gym-games /tmp/pip-req-build-mx4rzg

In [4]:
from ple.games.pong import Pong
from ple import PLE

pygame 2.6.1 (SDL 2.28.4, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html
couldn't import doomish
Couldn't import doom


In [5]:
import numpy as np

from collections import deque

import matplotlib.pyplot as plt
%matplotlib inline

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# Gym
import gym
import gym_pygame

# Hugging Face Hub
from huggingface_hub import notebook_login # To log to our Hugging Face account to be able to upload models to the Hub.
import imageio

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


  and should_run_async(code)


In [7]:
def images_to_video(images, out_directory, fps=30):
    imageio.mimsave(out_directory, [np.array(img) for i, img in enumerate(images)], fps=fps)

In [70]:
!rm -rf /content/training

In [71]:
import os
os.makedirs("/content/training")

## Main Part


In [72]:
hyperparameters = {
    "h_size": 512,
    "n_training_episodes": 2000,
    "n_evaluation_episodes": 10,
    "max_t": 500,
    "gamma": 1e-5,
    "lr": 1e-6,
}

In [None]:
env = Pong()
game = PLE(env, display_screen=False)

In [None]:
game.init()
game.reset_game()
print(game.getActionSet())
img = game.getScreenGrayscale()
print(img.shape)

In [None]:
game.act(119)

In [None]:
img = game.getScreenGrayscale()
img

In [None]:
img.shape

In [58]:
class PolicyNetwork(nn.Module):
    def __init__(self, H, learning_rate, input_shape=(64, 48), num_actions=3):
        super(PolicyNetwork, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.fc1 = nn.Linear(self.feature_size(input_shape), H)
        self.fc2 = nn.Linear(H, num_actions)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)  # Use Adam optimizer


    def feature_size(self, input_shape):
        x = torch.zeros(1, *input_shape)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        return x.view(1, -1).size(1)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = x.view(x.size(0), -1) # flatten
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.softmax(x, dim=1)

    def act(self, x):
        state = torch.from_numpy(np.array(x)).float().to(device)
        # Normalize
        state = (state - state.mean()) / (state.std() + 1e-8)

        state = state.unsqueeze(0)  # Add Channel dimension
        state = state.unsqueeze(0)  # Add batch dimension

        # print(state.size())

        probs = self.forward(state)
        m = Categorical(probs)
        # Sampling an action based on probability
        action = m.sample()
        return action.item(), m.log_prob(action)

In [59]:
action_map = {
    0: 119,
    1: 115,
    2: None
}

In [60]:
# Initialize the model
model = PolicyNetwork(hyperparameters["h_size"], hyperparameters["lr"]).to(device)

In [66]:
def reinforce(policy, optimizer, n_training_episodes, max_t, gamma, print_every):
    # Help us to calculate the score during the training
    scores_deque = deque(maxlen=100)
    scores = []

    # Line 3 of pseudocode
    for i_episode in range(1, n_training_episodes+1):
        saved_log_probs = []
        rewards = []

        images = []

        game.reset_game()
        state_ = game.getScreenGrayscale()

        # Line 4 of pseudocode
        for t in range(max_t):
            state = game.getScreenGrayscale()
            action, log_prob = policy.act(state)
            # state, state_ = state - state_, state
            # action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)

            reward = game.act(action_map[action])
            rewards.append(reward)

            images.append(state)

            if game.game_over():
                break


        if i_episode % print_every == 0:
            images_to_video(images, f"/content/training/pong-training-{i_episode}.mp4")

        scores_deque.append(sum(rewards))
        scores.append(sum(rewards))

        # Line 6 of pseudocode: calculate the return
        returns = deque(maxlen=max_t)
        n_steps = len(rewards)
        # Compute the discounted returns at each timestep,
        # as the sum of the gamma-discounted return at time t (G_t) + the reward at time t
        #
        # In O(N) time, where N is the number of time steps
        # (this definition of the discounted return G_t follows the definition of this quantity
        # shown at page 44 of Sutton&Barto 2017 2nd draft)
        # G_t = r_(t+1) + r_(t+2) + ...

        # Given this formulation, the returns at each timestep t can be computed
        # by re-using the computed future returns G_(t+1) to compute the current return G_t
        # G_t = r_(t+1) + gamma*G_(t+1)
        # G_(t-1) = r_t + gamma* G_t
        # (this follows a dynamic programming approach, with which we memorize solutions in order
        # to avoid computing them multiple times)

        # This is correct since the above is equivalent to (see also page 46 of Sutton&Barto 2017 2nd draft)
        # G_(t-1) = r_t + gamma*r_(t+1) + gamma*gamma*r_(t+2) + ...


        ## Given the above, we calculate the returns at timestep t as:
        #               gamma[t] * return[t] + reward[t]
        #
        ## We compute this starting from the last timestep to the first, in order
        ## to employ the formula presented above and avoid redundant computations that would be needed
        ## if we were to do it from first to last.

        ## Hence, the queue "returns" will hold the returns in chronological order, from t=0 to t=n_steps
        ## thanks to the appendleft() function which allows to append to the position 0 in constant time O(1)
        ## a normal python list would instead require O(N) to do this.
        for t in range(n_steps)[::-1]:
            disc_return_t = (returns[0] if len(returns)>0 else 0)
            returns.appendleft( gamma*disc_return_t + rewards[t]   )

        ## standardization of the returns is employed to make training more stable
        eps = np.finfo(np.float32).eps.item()
        ## eps is the smallest representable float, which is
        # added to the standard deviation of the returns to avoid numerical instabilities
        returns = torch.tensor(returns)
        returns = (returns - returns.mean()) / (returns.std() + eps)

        # Line 7:
        policy_loss = []
        for log_prob, disc_return in zip(saved_log_probs, returns):
            policy_loss.append(-log_prob * disc_return)
        policy_loss = torch.cat(policy_loss).sum()

        # Line 8: PyTorch prefers gradient descent
        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))

    return scores

In [67]:
scores = reinforce(
    model,
    model.optimizer,
    hyperparameters["n_training_episodes"],
    hyperparameters["max_t"],
    hyperparameters["gamma"],
    5
)

Episode 5	Average Score: -12.00
Episode 10	Average Score: -11.80
Episode 15	Average Score: -11.73
Episode 20	Average Score: -11.55
Episode 25	Average Score: -11.48
Episode 30	Average Score: -11.67
Episode 35	Average Score: -11.57
Episode 40	Average Score: -11.93
Episode 45	Average Score: -11.82
Episode 50	Average Score: -12.06
Episode 55	Average Score: -11.84
Episode 60	Average Score: -11.95
Episode 65	Average Score: -12.26
Episode 70	Average Score: -11.91
Episode 75	Average Score: -11.71
Episode 80	Average Score: -11.74
Episode 85	Average Score: -11.80
Episode 90	Average Score: -11.72
Episode 95	Average Score: -11.88
Episode 100	Average Score: -11.95
Episode 105	Average Score: -12.15
Episode 110	Average Score: -12.15
Episode 115	Average Score: -12.20
Episode 120	Average Score: -12.30
Episode 125	Average Score: -12.31
Episode 130	Average Score: -12.28
Episode 135	Average Score: -12.21
Episode 140	Average Score: -12.29
Episode 145	Average Score: -12.46
Episode 150	Average Score: -12.49


In [63]:
from IPython.display import Video

Video("/content/training/pong-training-200.mp4", embed=True)