In [None]:
import sys, os
import gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import urllib.request
urllib.request.urlretrieve('http://www.atarimania.com/roms/Roms.rar','Roms.rar')
!pip install unrar
!unrar x Roms.rar
!mkdir rars
!mv HC\ ROMS.zip   rars
!mv ROMS.zip  rars
!python -m atari_py.import_roms rars

In [None]:
import torch
import torchvision
import numpy as np
import random
import gym
from gym.spaces import Box
from collections import deque


class SkipFrame(gym.Wrapper):
    """Repeat every action for `skip` consecutive frames.

    The rewards of the repeated frames are summed; the observation, done flag
    and info dict of the last executed frame are returned.
    """

    def __init__(self, env, skip):
        """
        Args:
            env: environment to wrap.
            skip: number of frames each action is repeated for (must be >= 1).

        Raises:
            ValueError: if `skip` is smaller than 1.
        """
        # With zero repetitions `step` would have no observation to return,
        # so reject the value up front instead of failing later inside `step`.
        if skip < 1:
            raise ValueError(f"skip must be >= 1, got {skip}")
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        """Apply `action` up to `skip` times and accumulate the reward.

        Returns:
            (obs, total_reward, done, info) where `total_reward` is the sum of
            the rewards of all frames executed in this call.
        """
        total_reward = 0.0
        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            # Stop repeating as soon as the episode ends.
            if done:
                break
        return obs, total_reward, done, info


class GrayScaleObservation(gym.ObservationWrapper):
    """Convert channel-last image observations to a grayscale float tensor."""

    def __init__(self, env):
        super().__init__(env)
        # Only the two leading (spatial) dimensions of the wrapped space are kept.
        obs_shape = self.observation_space.shape[:2]
        # NOTE(review): the declared space is uint8 without a channel axis, while
        # `observation` returns a float tensor - confirm nothing relies on the dtype.
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)
        # Built once here instead of on every observation.
        self._to_gray = torchvision.transforms.Grayscale()

    def observation(self, observation):
        """Return the grayscale version of `observation` as a float tensor.

        Args:
            observation: array with the channel axis last (presumably an
                H x W x 3 RGB frame - confirm against the wrapped env).
        """
        # Move the channel axis to the front, as the torchvision transform expects.
        channel_first = np.transpose(observation, (2, 0, 1))
        # The transpose is a view; copy it before handing it to torch.
        frame = torch.tensor(channel_first.copy(), dtype=torch.float)
        return self._to_gray(frame)


class ResizeObservation(gym.ObservationWrapper):
    """Resize observations to a fixed size and divide pixel values by 255."""

    def __init__(self, env, shape):
        """
        Args:
            env: environment to wrap.
            shape: target size; an int `n` means `(n, n)`, otherwise an
                iterable of sizes.
        """
        super().__init__(env)
        self.shape = (shape, shape) if isinstance(shape, int) else tuple(shape)
        # Keep any trailing dimensions of the wrapped space after the first two.
        obs_shape = self.shape + self.observation_space.shape[2:]
        # NOTE(review): the space is declared as uint8 in [0, 255] although the
        # returned values are divided by 255 - confirm nothing relies on the bounds.
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)
        # Built once here instead of on every observation.
        # Normalize(0, 255) computes (x - 0) / 255.
        self._transforms = torchvision.transforms.Compose(
            [torchvision.transforms.Resize(self.shape),
             torchvision.transforms.Normalize(0, 255)])

    def observation(self, observation):
        """Resize and rescale `observation`, then drop its leading axis if it has size 1."""
        return self._transforms(observation).squeeze(0)


class ExperienceReplayMemory(object):
    """Fixed-capacity buffer of transitions with uniform random sampling."""

    def __init__(self, capacity):
        # A bounded deque drops the oldest transition once `capacity` is reached.
        self.memory = deque([], maxlen=capacity)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

    def store(self, state, next_state, action, reward, done):
        """Append one transition; both states are converted via `__array__()`."""
        state = state.__array__()
        next_state = next_state.__array__()
        self.memory.append((state, next_state, action, reward, done))

    def sample(self, batch_size):
        """Uniformly sample `batch_size` distinct transitions.

        Indices are drawn with `np.random`, so the result follows the NumPy seed.

        Args:
            batch_size: number of transitions to draw (1 <= batch_size <= len(self)).

        Returns:
            Tuple of tensors `(state, next_state, action, reward, done)`, each
            with a leading batch dimension. `action` is int64, `reward` is
            float32, `done` is bool; the states keep the dtype they were stored with.

        Raises:
            ValueError: if `batch_size` is smaller than 1 or larger than the
                number of stored transitions.
        """
        if batch_size < 1 or batch_size > len(self.memory):
            raise ValueError(
                f"cannot sample {batch_size} transitions from a buffer of size {len(self.memory)}")

        # Draw without replacement so a batch never contains the same transition twice.
        indices = np.random.choice(len(self.memory), size=batch_size, replace=False)
        transitions = [self.memory[int(i)] for i in indices]
        # Turn the list of 5-tuples into 5 tuples of length `batch_size`.
        state, next_state, action, reward, done = zip(*transitions)

        return (torch.tensor(np.stack(state)),
                torch.tensor(np.stack(next_state)),
                torch.tensor(np.asarray(action), dtype=torch.long),
                torch.tensor(np.asarray(reward), dtype=torch.float),
                torch.tensor(np.asarray(done), dtype=torch.bool))

In [None]:
import torch
import gym
import numpy as np
import copy
from gym.wrappers import FrameStack


# Build the environment. Each wrapper consumes the output of the previous one:
# frame skipping -> grayscale -> resize/rescale -> stack of the last 4 frames.
env = gym.make("BreakoutNoFrameskip-v4")
env = SkipFrame(env, skip=4)
env = GrayScaleObservation(env)
env = ResizeObservation(env, shape=84)
env = FrameStack(env, num_stack=4)
# The stacked observation space is unpacked as (frames, height, width).
image_stack, h, w = env.observation_space.shape
num_actions = env.action_space.n

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Seed every random number generator the notebook imports, so runs are repeatable.
seed = 61
env.seed(seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.backends.cudnn.enabled:
    # Trade cuDNN auto-tuning for deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Parameters
batch_size = 32               # transitions drawn from the buffer per update
alpha = 0.00025               # step size (presumably the optimizer learning rate)
gamma = 0.99                  # presumably the reward discount factor
eps, eps_decay = 1.0, 0.999   # exploration rate and its per-episode decay factor
max_train_episodes = 1000000
max_test_episodes = 10
max_train_frames = 10000      # upper bound on the steps of a single episode
burn_in_phase = 50000         # steps collected before any gradient update happens
sync_target = 10000           # step interval at which the target network is synced
curr_step = 0                 # global environment-step counter
buffer = ExperienceReplayMemory(50000)


In [None]:
def convert(x):
    """Copy an array-like object (anything exposing `__array__`) into a float32 tensor."""
    as_array = x.__array__()
    copied = torch.tensor(as_array)
    return copied.to(torch.float32)


class DeepQNet(torch.nn.Module):
    """Convolutional Q-network mapping a stack of frames to one Q-value per action.

    Layout: three convolution + ReLU stages followed by two fully connected
    layers (the layer sizes follow the commonly used DQN architecture for Atari).
    """

    def __init__(self, h, w, image_stack, num_actions):
        """
        Args:
            h: height of the input frames.
            w: width of the input frames.
            image_stack: number of stacked frames (input channels).
            num_actions: number of discrete actions (output size).
        """
        super(DeepQNet, self).__init__()
        self.features = torch.nn.Sequential(
            torch.nn.Conv2d(image_stack, 32, kernel_size=8, stride=4),
            torch.nn.ReLU(),
            torch.nn.Conv2d(32, 64, kernel_size=4, stride=2),
            torch.nn.ReLU(),
            torch.nn.Conv2d(64, 64, kernel_size=3, stride=1),
            torch.nn.ReLU(),
            torch.nn.Flatten(),
        )
        # Infer the flattened feature size from a dummy input so that any
        # (h, w) large enough for the convolutions is supported.
        with torch.no_grad():
            num_features = self.features(torch.zeros(1, image_stack, h, w)).shape[1]
        self.head = torch.nn.Sequential(
            torch.nn.Linear(num_features, 512),
            torch.nn.ReLU(),
            torch.nn.Linear(512, num_actions),
        )

    def forward(self, x):
        """Return Q-values of shape (batch, num_actions) for `x` of shape (batch, image_stack, h, w)."""
        return self.head(self.features(x))


# Online network: the one that is trained and used to select actions.
online_dqn = DeepQNet(h, w, image_stack, num_actions)
# Target network: a copy of the online network that provides the bootstrap
# targets. It is never optimized directly, so its parameters are frozen.
target_dqn = copy.deepcopy(online_dqn)
for param in target_dqn.parameters():
    param.requires_grad = False
target_dqn.eval()
online_dqn.to(device)
target_dqn.to(device)


# Only the online network's parameters are handed to the optimizer.
optimizer = torch.optim.Adam(online_dqn.parameters(), lr=alpha)
criterion = torch.nn.MSELoss()


In [None]:
def policy(state, is_training):
    """Select an action with an epsilon-greedy strategy.

    Args:
        state: current observation (anything `convert` accepts).
        is_training: when True, a uniformly random action is taken with
            probability `eps`; when False, the action is always greedy.

    Returns:
        The chosen action as a Python int.
    """
    global eps

    # Explore: uniform random action (uses the NumPy RNG).
    if is_training and np.random.rand() < eps:
        return int(np.random.randint(num_actions))

    # Exploit: action with the highest predicted Q-value.
    state = convert(state).unsqueeze(0).to(device)
    with torch.no_grad():
        q_values = online_dqn(state)
    return int(q_values.argmax(dim=1).item())


def compute_loss(state, action, reward, next_state, done):
    """Compute the Double-DQN loss for a batch of transitions.

    Args:
        state: batch of observations, shape (batch, ...).
        action: tensor of taken action indices, shape (batch,).
        reward: tensor of rewards, shape (batch,).
        next_state: batch of successor observations, shape (batch, ...).
        done: tensor of episode-termination flags, shape (batch,).

    Returns:
        Scalar tensor: `criterion` applied to the predicted and target Q-values.
    """
    state = convert(state).to(device)
    next_state = convert(next_state).to(device)
    action = action.to(device)
    reward = reward.to(device)
    done = done.to(device)

    # Q-value the online network predicts for the action that was actually taken.
    q_taken = online_dqn(state).gather(1, action.long().view(-1, 1)).squeeze(1)

    # The target is a constant with respect to the optimization step.
    with torch.no_grad():
        # Double DQN: the online network picks the next action,
        # the target network evaluates it.
        best_next_action = online_dqn(next_state).argmax(dim=1, keepdim=True)
        next_q = target_dqn(next_state).gather(1, best_next_action).squeeze(1)
        # No bootstrapping from terminal states.
        target = reward.float() + gamma * next_q * (1.0 - done.float())

    return criterion(q_taken, target)


def run_episode(curr_step, buffer, is_training, is_rendering=False):
    """Play one episode and, when training, update the online network.

    Args:
        curr_step: number of environment steps taken before this episode.
        buffer: replay memory that transitions are stored in and sampled from.
        is_training: True to store transitions and optimize once `curr_step`
            exceeds `burn_in_phase`; False to only evaluate.
        is_rendering: when True, `env.render("rgb_array")` is called after
            the reset and after every step.

    Returns:
        dict with `reward` (sum of the episode's rewards) and `loss`
        (accumulated loss divided by the number of steps taken).

    Side effects:
        When training, the advanced step counter is written back to the
        module-level `curr_step`. Integers are passed by value, so without the
        write-back every episode would start from the caller's unchanged
        count and the burn-in phase would never be left.
    """
    episode_reward, episode_loss = 0, 0.
    steps_taken = 0
    state = env.reset()
    if is_rendering:
        env.render("rgb_array")

    for _ in range(max_train_frames):
        action = policy(state, is_training)
        curr_step += 1
        steps_taken += 1

        next_state, reward, done, _info = env.step(action)
        if is_rendering:
            env.render("rgb_array")

        episode_reward += reward

        if is_training:
            buffer.store(state, next_state, action, reward, done)

            if curr_step > burn_in_phase:
                state_batch, next_state_batch, action_batch, reward_batch, done_batch = buffer.sample(batch_size)

                if curr_step % sync_target == 0:
                    # Copy the online weights into the target network.
                    target_dqn.load_state_dict(online_dqn.state_dict())

                loss = compute_loss(state_batch, action_batch, reward_batch, next_state_batch, done_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                episode_loss += loss.item()
        else:
            # compute_loss works on batched tensors, so wrap the single
            # transition into a batch of one.
            with torch.no_grad():
                episode_loss += compute_loss(
                    convert(state).unsqueeze(0),
                    torch.as_tensor([int(action)]),
                    torch.as_tensor([float(reward)]),
                    convert(next_state).unsqueeze(0),
                    torch.as_tensor([bool(done)]),
                ).item()

        state = next_state

        if done:
            break

    if is_training:
        # Persist the counter for the next call (see "Side effects" above).
        globals()["curr_step"] = curr_step

    # Average over the steps actually taken; max() guards the zero-step case.
    return dict(reward=episode_reward, loss=episode_loss / max(steps_taken, 1))


def update_metrics(metrics, episode):
    """Append each value of `episode` to the list stored under the same key in `metrics`."""
    for name in episode:
        history = metrics[name]
        history.append(episode[name])


def print_metrics(it, metrics, is_training, window=100):
    """Print the mean reward and loss over the last `window` recorded episodes.

    Args:
        it: episode number shown in the output line.
        metrics: dict holding the `reward` and `loss` histories.
        is_training: selects the "train" or "test" label.
        window: number of most recent entries to average over.
    """
    recent_rewards = metrics['reward'][-window:]
    recent_losses = metrics['loss'][-window:]
    reward_mean, loss_mean = np.mean(recent_rewards), np.mean(recent_losses)
    if is_training:
        mode = "train"
    else:
        mode = "test"
    print(f"Episode {it:4d} | {mode:5s} | reward {reward_mean:5.5f} | loss {loss_mean:5.5f}")


In [None]:
# Training loop: run up to max_train_episodes episodes and record, per episode,
# the total reward and the mean loss returned by run_episode.
train_metrics = dict(reward=[], loss=[])
for it in range(max_train_episodes):
    # NOTE(review): this loop never reassigns curr_step itself; the burn-in and
    # target-sync checks inside run_episode depend on the value passed here.
    episode_metrics = run_episode(curr_step, buffer, is_training=True)
    update_metrics(train_metrics, episode_metrics)
    # Report the running averages every 10th episode.
    if it % 10 == 0:
        print_metrics(it, train_metrics, is_training=True)
    # Decay the exploration rate once per episode (no lower bound is applied).
    eps *= eps_decay

In [None]:
# Evaluation: run the policy without exploration or learning.
test_metrics = dict(reward=[], loss=[])
for it in range(max_test_episodes):
    # run_episode takes the step counter as its first positional argument;
    # leaving it out raises a TypeError.
    episode_metrics = run_episode(curr_step, buffer, is_training=False)
    update_metrics(test_metrics, episode_metrics)
    print_metrics(it + 1, test_metrics, is_training=False)

# Plot reward and loss per episode: one row per metric, one column per phase.
fig, axes = plt.subplots(2, 2, figsize=(12, 8), squeeze=False)
for row, key in enumerate(("reward", "loss")):
    for col, (label, metrics) in enumerate((("train", train_metrics), ("test", test_metrics))):
        ax = axes[row, col]
        ax.plot(metrics[key])
        ax.set(title=f"{label} {key} per episode", xlabel="episode", ylabel=key)
fig.tight_layout()
plt.show()
