In [2]:
# Install the notebook's runtime dependencies: Gym (pinned to 0.24.1, with all
# extras), the AutoROM installer with the ROM licence accepted, and ale-py.
# NOTE(review): autorom and ale-py are not version-pinned here, so a fresh
# runtime may resolve different versions than the ones in the saved output.
!pip install gym[all]==0.24.1
!pip install autorom[accept-rom-license]
!pip install ale-py

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gym[all]==0.24.1
  Downloading gym-0.24.1.tar.gz (696 kB)
[K     |████████████████████████████████| 696 kB 4.7 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting pygame==2.1.0
  Downloading pygame-2.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[K     |████████████████████████████████| 18.3 MB 97 kB/s 
[?25hCollecting box2d-py==2.3.5
  Downloading box2d_py-2.3.5-cp37-cp37m-manylinux1_x86_64.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 49.0 MB/s 
Collecting mujoco-py<2.2,>=2.1
  Downloading mujoco_py-2.1.2.14-py3-none-any.whl (2.4 MB)
[K     |████████████████████████████████| 2.4 MB 48.6 MB/s 
[?25hCollecting mujoco==2.2.0
  Downloading mujoco-2.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting autorom[accept-rom-license]
  Downloading AutoROM-0.4.2-py3-none-any.whl (16 kB)
Collecting AutoROM.accept-rom-license
  Downloading AutoROM.accept-rom-license-0.4.2.tar.gz (9.8 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: AutoROM.accept-rom-license
  Building wheel for AutoROM.accept-rom-license (PEP 517) ... [?25l[?25hdone
  Created wheel for AutoROM.accept-rom-license: filename=AutoROM.accept_rom_license-0.4.2-py3-none-any.whl size=441027 sha256=ed35846dfff86e9f98b362febb0792963c4baa9d2384ef5821a7d298a3c2c1f9
  Stored in directory: /root/.cache/pip/wheels/87/67/2e/6147e7912fe37f5408b80d07527dab807c1d25f5c403a9538a
Successfully built AutoROM.accept-rom-license
Installing collected packages: AutoROM.accept-rom-licens

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Switch the working directory to the Colab notebooks folder on Drive and list
# it. Presumably this is where the local `utils` package imported by later
# cells lives -- TODO confirm.
%cd /content/drive/MyDrive/Colab\ Notebooks/
%ls

# 1. Dependencies

In [None]:
import gym
import ale_py 
import random

import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output

import torch
import torch.nn as nn

#from utils.wrapper import make_atari_env
from utils.wrapper import make_atari, wrap_deepmind, wrap_pytorch

# 2. Environment

In [None]:
# Build the Pong environment by composing the three wrapper helpers imported
# from utils.wrapper: raw Atari env -> DeepMind-style wrapping with frame
# stacking enabled -> PyTorch-oriented wrapping.
# An earlier variant of this cell used a single helper and seeded the env:
#   env = make_atari_env(env_id); env.seed(31)
# No seed is set in the current version.
env_id = "PongNoFrameskip-v4"
env = wrap_pytorch(
    wrap_deepmind(
        make_atari(env_id),
        frame_stack=True,
    )
)

# 3. Hyperparameters

In [None]:
# Single configuration dictionary read by every later cell.
args = {
    # gpu: use CUDA device GPU_NUM when available, otherwise fall back to CPU
    "USE_GPU": torch.cuda.is_available(),
    "GPU_NUM": 0,
}
# The device entry is derived from the two GPU settings above.
args["device"] = torch.device(f'cuda:{args["GPU_NUM"]}' if args["USE_GPU"] else 'cpu')

args.update({
    # replay buffer: total capacity, number of stored transitions required
    # before training starts, and the minibatch size sampled per update
    "replay_buffer_capacity": 50000,
    "replay_initial": 10000,
    "batch_size": 32,
})

args.update({
    # epsilon (exploration rate): start/end values, the number of frames over
    # which it decays, and the decay curve type handed to the generator
    "eps_start": 1,
    "eps_end": 0.01,
    "eps_dec_frame": 3e5,
    "eps_type": "exp",
})

args.update({
    # train: total frames, optimizer learning rate, and discount factor
    "frame_num": 1400000,
    "learning_rate": 0.00001,
    "discount_factor": 0.99,
})

# 4. Q-value Approximator

In [None]:
class Q_approximator(nn.Module):
    """Dueling Q-network over a stack of four square image frames.

    Three convolutions feed a 512-unit fully connected layer. Its output is
    split into two 256-wide halves: the first half drives a scalar state-value
    head, the second half drives a per-action advantage head. The two heads
    are combined as ``V + (A - mean(A))``.

    Args:
        frame_size: Side length (in pixels) of the square input frames.
        action_number: Number of discrete actions (width of the output).
    """

    # Width of each half of the 512-unit hidden layer (value / advantage).
    _STREAM_FEATURES = 256

    def __init__(self, frame_size=84, action_number=6):
        super(Q_approximator, self).__init__()

        self.conv_layer = nn.Sequential(
            nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4),
            nn.ReLU(),

            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.ReLU(),

            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )

        # Spatial side length after each un-padded convolution:
        # out = (in - kernel) // stride + 1, applied for (8, 4), (4, 2), (3, 1).
        side = frame_size
        for kernel, stride in ((8, 4), (4, 2), (3, 1)):
            side = (side - kernel) // stride + 1
        in_features = side ** 2 * 64

        self.fc_layer = nn.Sequential(
            nn.Linear(in_features=in_features, out_features=2 * self._STREAM_FEATURES),
            nn.ReLU(),
        )
        self.value_approximator = nn.Linear(
            in_features=self._STREAM_FEATURES, out_features=1)
        self.action_advantage_approximator = nn.Linear(
            in_features=self._STREAM_FEATURES, out_features=action_number)

    def forward(self, x):
        """Return Q-values of shape (batch, action_number) for a batch of frames.

        Args:
            x: Float tensor of shape (batch, 4, frame_size, frame_size).
        """
        x = self.conv_layer(x)
        x = self.fc_layer(torch.flatten(x, 1))
        value = self.value_approximator(x[:, :self._STREAM_FEATURES])
        action_advantage = self.action_advantage_approximator(x[:, self._STREAM_FEATURES:])
        # Subtract the mean advantage so the value head carries the state value.
        return value + (action_advantage - action_advantage.mean(1, True))

    def action(self, x) -> int:
        """Return the greedy action index for a single state.

        The argmax is taken over the flattened output, so ``x`` is expected to
        hold exactly one state (batch dimension of 1). No gradient graph is
        built, since this is only used for action selection.
        """
        with torch.no_grad():
            q_values = self(x)
        return torch.argmax(q_values).item()

In [None]:
from torchsummary import summary

# Instantiate the Q-network with its default arguments and move it to the
# configured device; this is the model trained by the cells below.
q_val = Q_approximator()
q_val = q_val.to(args["device"])

# Print a per-layer summary for one stack of four 84x84 frames.
summary(
    q_val,
    input_size=(4, 84, 84),
    device='cuda' if args["USE_GPU"] else 'cpu',
)

# 5. Replay buffer and epsilon

In [None]:
from utils.common import Replay

# Replay buffer that the training loop pushes transitions into and samples
# minibatches from; its capacity comes from args["replay_buffer_capacity"].
replay_buffer = Replay(capacity=args["replay_buffer_capacity"])

In [None]:
from utils.common import EpsilonGenerator

eps = EpsilonGenerator(
    start=args["eps_start"], end=args["eps_end"],
    frame_num=args["eps_dec_frame"], ftype=args["eps_type"])


def epsilon(frame):
    """Return the exploration probability to use at the given frame index.

    While ``frame`` is below ``args["eps_dec_frame"]`` the value comes from the
    generator; from that frame onwards it is held at ``args["eps_end"]``.
    """
    if frame < args["eps_dec_frame"]:
        return eps.epsilon(frame)
    return args["eps_end"]


# Preview the schedule over the whole training horizon.
plt.plot([epsilon(i) for i in range(args["frame_num"])])
plt.xlabel("frame")
plt.ylabel("epsilon")
plt.title("Exploration schedule")
plt.show()

# 6. Train

In [None]:
def reform_replay(replay: tuple, device=None):
    """Collate a batch of transitions into tensors on the target device.

    Args:
        replay: Non-empty sequence of 5-tuples
            ``(prev_obs, action, reward, curr_obs, done)``. The observations
            must be array-likes of identical shape; the other three entries
            are scalars.
        device: Device to place the tensors on. Defaults to ``args['device']``
            so the result follows the notebook configuration and also works
            on CPU-only machines.

    Returns:
        Tuple ``(prev_obs, action, reward, curr_obs, done)`` where
        ``prev_obs`` / ``curr_obs`` are float32 of shape (B, *obs_shape),
        ``action`` is int64 of shape (B, 1), ``reward`` is float32 of shape
        (B, 1) and ``done`` is float32 of shape (B,). Note that ``done`` has
        no trailing axis: reshape it to (B, 1) before combining it with the
        (B, 1) tensors, otherwise broadcasting yields a (B, B) result.
    """
    if device is None:
        device = args['device']

    # Transpose the batch once instead of walking it five times.
    prev_obs, action, reward, curr_obs, done = zip(*replay)

    def _to_device(values, dtype):
        # Going through one contiguous NumPy array avoids building a tensor
        # per sample and accepts Python as well as NumPy scalars.
        return torch.as_tensor(np.asarray(values)).to(device=device, dtype=dtype)

    return (
        _to_device(np.stack(prev_obs), torch.float32),
        _to_device(action, torch.int64).unsqueeze(1),
        _to_device(reward, torch.float32).unsqueeze(1),
        _to_device(np.stack(curr_obs), torch.float32),
        _to_device(done, torch.float32),
    )

In [None]:
def plot(rewards, losses):
    """Redraw the training curves in place of the previous cell output.

    The left panel shows the per-episode reward sums, titled with the mean of
    the last ten entries. The middle panel shows the loss history with its
    first five entries dropped. The third slot of the 1x3 grid stays empty.
    """
    clear_output(True)
    figure = plt.figure(figsize=(20,5))

    # Panel 1: episode rewards.
    reward_axes = figure.add_subplot(1, 3, 1)
    reward_axes.set_title('reward: %s' % (np.mean(rewards[-10:])))
    reward_axes.plot(range(len(rewards)), rewards)

    # Panel 2: losses, skipping the first five recorded values.
    loss_axes = figure.add_subplot(1, 3, 2)
    loss_axes.set_title('loss')
    trimmed_losses = losses[5:]
    loss_axes.plot(range(len(losses) - 5), trimmed_losses)

    plt.show()

In [None]:
# Training loop: act epsilon-greedily, store every transition in the replay
# buffer, and once the buffer holds more than args["replay_initial"] items run
# one optimisation step per environment step.
observation = env.reset()
optimizer = torch.optim.Adam(q_val.parameters(), lr = args["learning_rate"])

episode_num = []
loss_list = []
reward_sum_list = []
reward_sum = 0

# Show only the observation's shape and dtype rather than dumping the array.
print(observation.shape, observation.dtype)
q_val = q_val.eval()
for i in range(args["frame_num"]):
    prev_observation = observation

    # epsilon-greedy behaviour policy
    if epsilon(i) > random.uniform(0, 1):
        action = env.action_space.sample()
    else:
        # Device/dtype come from the configuration so this also runs on CPU;
        # no gradient graph is needed for action selection.
        with torch.no_grad():
            state = torch.from_numpy(observation).unsqueeze(0).to(
                device=args["device"], dtype=torch.float32)
            action = q_val.action(state)

    observation, reward, done, information = env.step(action)
    reward_sum += reward

    # save to replay buffer as (prev_obs, action, reward, curr_obs, done)
    replay_buffer.push((prev_observation, action, reward, observation, done))

    # reset environment
    if done:
        reward_sum_list.append(reward_sum)
        reward_sum = 0
        observation = env.reset()

    # train
    if len(replay_buffer) > args["replay_initial"]:
        # get data from replay buffer
        prev_obs, actions, rewards, curr_obs, dones = reform_replay(replay_buffer.batch_replay(args["batch_size"]))

        # Bootstrapped target, computed without tracking gradients.
        # `dones` is reshaped to (B, 1): multiplying a (B,) tensor with the
        # (B, 1) max-Q column would broadcast to (B, B) and mix the terminal
        # flags of different samples into every target.
        with torch.no_grad():
            next_q = q_val(curr_obs).max(1, keepdim=True)[0]
            not_done = 1 - dones.reshape(-1, 1)
            expected_gain = rewards + args["discount_factor"] * next_q * not_done

        # loss function: mean squared TD error on the actions actually taken
        current_gain = q_val(prev_obs).gather(1, actions)
        loss = (expected_gain - current_gain).pow(2).mean()

        # optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_list.append(loss.item())
    if (i+1) % 10000 == 0:
        plot(reward_sum_list, loss_list)
        print(i)