<h1>Dueling DQN with prioritized experience replay in Atari 2600 games</h1>

<h3>Imports</h3>

In [None]:
import math, random

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import torch.nn.functional as F
from torch.autograd import Variable

import json

from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

device

<h3>Setting up the environment with preprocessing</h3>

In [None]:
# Gym environment id; passed to gym.make for both the training and the evaluation environment.
env_id = 'PongNoFrameskip-v4'

In [None]:
# Raw emulator -> Atari preprocessing (84x84 grayscale, frame skip 4) -> stack of 4 frames.
env = gym.wrappers.FrameStack(
    gym.wrappers.AtariPreprocessing(
        gym.make(env_id, render_mode='rgb_array'),
        noop_max=30,
        frame_skip=4,
        screen_size=84,
        terminal_on_life_loss=False,
        grayscale_obs=True,
        grayscale_newaxis=False,
        scale_obs=False,
    ),
    4,
)

<h3>Deep Q Network</h3>

In [None]:
class DQN(nn.Module):
    """Dueling Q-network over a stack of four 84x84 frames.

    A shared convolutional trunk (``self.c``) feeds two linear heads: a scalar
    state value (``self.v``) and one advantage per action (``self.adv``).
    ``forward`` combines them into Q-values.

    Args:
        n_actions: Number of discrete actions. When omitted it is read from
            the module-level ``env.action_space.n``, as before.
    """

    def __init__(self, n_actions=None):
        super().__init__()

        if n_actions is None:
            n_actions = env.action_space.n
        n_actions = int(n_actions)

        # With 84x84 inputs the three convolutions yield 20x20, 9x9 and
        # finally 7x7 feature maps, hence the 7 * 7 * 64 flattened size.
        self.c = nn.Sequential(
            nn.Conv2d(4, 32, kernel_size=8, stride=4),
            nn.LeakyReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.LeakyReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.LeakyReLU(),
            nn.Flatten(),
            nn.Linear(7 * 7 * 64, 512),
            nn.LeakyReLU()
        )

        self.v = nn.Sequential(
            nn.Linear(512, 1)
        )

        self.adv = nn.Sequential(
            nn.Linear(512, n_actions)
        )

    def forward(self, state):
        """Return Q-values of shape (batch, n_actions) for a batch of states.

        Args:
            state: Float tensor of shape (batch, 4, 84, 84).
        """
        # Follow the network's own parameters instead of a module-level
        # device name, so the model works wherever it has been moved.
        state = state.to(next(self.parameters()).device)
        features = self.c(state)

        value = self.v(features)
        advantage = self.adv(features)

        # Centre the advantages per sample (over the action axis). A global
        # mean would mix every sample of the batch into each Q-value.
        return value + advantage - advantage.mean(dim=1, keepdim=True)

    def act(self, state, epsilon):
        """Pick an action epsilon-greedily for a single (unbatched) state.

        Args:
            state: Array-like observation of shape (4, 84, 84).
            epsilon: Probability threshold for taking a uniformly random action.

        Returns:
            The chosen action index as a Python int.
        """
        if random.random() > epsilon:
            # Inference only: skip building an autograd graph.
            with torch.no_grad():
                batch = torch.tensor(np.asarray(state), dtype=torch.float32).unsqueeze(0)
                q_value = self.forward(batch)
            return q_value.max(1)[1].item()
        # Derive the action count from the head itself so that models
        # unpickled from older checkpoints keep working.
        return random.randrange(self.adv[-1].out_features)

<h3>Prioritized Experience Replay</h3>

In [None]:
class ReplayBuffer(object):
    """Fixed-capacity ring buffer with proportional prioritized sampling.

    Transitions are stored as ``(state, action, reward, next_state, done)``
    tuples; ``priorities[i]`` holds the sampling priority of ``buffer[i]``.

    Args:
        capacity: Maximum number of stored transitions; the oldest entry is
            overwritten once the buffer is full.
        prob_alpha: Exponent applied to priorities before normalising them
            into sampling probabilities (0 gives uniform sampling).
    """

    def __init__(self, capacity, prob_alpha=0.6):
        self.prob_alpha = prob_alpha
        self.capacity = capacity
        self.buffer = []
        self.pos = 0  # index the next push writes to
        self.priorities = np.zeros((capacity,), dtype=np.float32)

    def push(self, state, action, reward, next_state, done):
        """Store one transition, giving it the current maximum priority."""
        # A leading axis lets sample() build a batch with np.concatenate.
        state = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)

        max_prio = self.priorities.max() if self.buffer else 1.0

        transition = (state, action, reward, next_state, done)
        if len(self.buffer) < self.capacity:
            self.buffer.append(transition)
        else:
            self.buffer[self.pos] = transition

        self.priorities[self.pos] = max_prio
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size, beta=0.4):
        """Draw ``batch_size`` transitions with probability proportional to priority ** alpha.

        Args:
            batch_size: Number of transitions to draw (with replacement).
            beta: Exponent of the importance-sampling correction.

        Returns:
            ``(states, actions, rewards, next_states, dones, indices, weights)``.
            States are concatenated arrays, actions/rewards/dones are tuples,
            ``indices`` are buffer positions, and ``weights`` are float32
            importance weights scaled so the largest is 1.

        Raises:
            ValueError: If the buffer is empty.
        """
        size = len(self.buffer)
        if size == 0:
            raise ValueError('cannot sample from an empty replay buffer')

        # Normalise in float64: np.random.choice checks that p sums to 1,
        # and float32 rounding over a large buffer can fail that check.
        scaled = self.priorities[:size].astype(np.float64) ** self.prob_alpha
        probs = scaled / scaled.sum()

        indices = np.random.choice(size, batch_size, p=probs)
        samples = [self.buffer[idx] for idx in indices]

        # Importance weight (1 / (N * P(i))) ** beta, scaled by the batch maximum.
        weights = (size * probs[indices]) ** (-beta)
        weights /= weights.max()
        weights = weights.astype(np.float32)

        states, actions, rewards, next_states, dones = zip(*samples)

        states = np.concatenate(states)
        next_states = np.concatenate(next_states)

        return states, actions, rewards, next_states, dones, indices, weights

    def update_priorities(self, batch_indices, batch_priorities):
        """Overwrite the priorities at ``batch_indices`` with ``batch_priorities``."""
        for idx, prio in zip(batch_indices, batch_priorities):
            # float() accepts Python numbers, NumPy scalars and 0-d tensors alike.
            self.priorities[idx] = float(prio)

    def __len__(self):
        return len(self.buffer)

<h3>Hyperparameters over time</h3>

In [None]:
epsilon_start = 1
epsilon_final = 0.01
epsilon_decay = 30000


def epsilon_by_frame(frame_idx):
    """Exploration rate at ``frame_idx``: exponential decay from epsilon_start towards epsilon_final."""
    decay_factor = math.exp(-1. * frame_idx / epsilon_decay)
    return epsilon_final + (epsilon_start - epsilon_final) * decay_factor


plt.plot(list(map(epsilon_by_frame, range(1000000))))

In [None]:
beta_start = 0.4
beta_frames = 300000


def beta_by_frame(frame_idx):
    """Importance-sampling exponent at ``frame_idx``: linear ramp from beta_start, capped at 1.0."""
    annealed = beta_start + frame_idx * (1.0 - beta_start) / beta_frames
    return min(1.0, annealed)


plt.plot(list(map(beta_by_frame, range(1000000))))

<h3>Setting up the models</h3>

In [None]:
def update_target(current_model, target_model):
    """Copy every parameter and buffer of ``current_model`` into ``target_model``."""
    online_weights = current_model.state_dict()
    target_model.load_state_dict(online_weights)

In [None]:
# Online network (trained by the optimizer) and target network (bootstrap targets).
current_model = DQN()
current_model = current_model.to(device)
target_model = DQN()
target_model = target_model.to(device)

# Discount factor applied to the bootstrapped next-state value.
gamma = 0.99

optimizer = optim.Adam(current_model.parameters(), lr=1e-5)

# Start both networks from identical weights.
target_model.load_state_dict(current_model.state_dict())

<h3>Filling the Replay Buffer</h3>

In [None]:
# Amount subtracted from a transition's reward when info['lives'] drops (a life was lost);
# such transitions are also stored with done=True.
penalty = 100

In [None]:
buffer_len = 50000

replay_buffer = ReplayBuffer(buffer_len)

# Warm-up: fill the whole buffer with transitions from a uniformly random policy.
state, info = env.reset()
lives = info['lives']
for fill_step in range(buffer_len):
    action = random.randrange(env.action_space.n)
    # Gym's step API reports natural termination and time-limit truncation separately.
    next_state, reward, terminated, truncated, info = env.step(action)

    if info['lives'] < lives:
        # Losing a life is stored as a terminal, penalised transition.
        replay_buffer.push(state, action, reward - penalty, next_state, True)
    else:
        # Only true termination is marked terminal for bootstrapping.
        replay_buffer.push(state, action, reward, next_state, terminated)

    if terminated or truncated:
        state, info = env.reset()
    else:
        # Advance the observation; otherwise every stored transition would
        # start from the stale reset state.
        state = next_state
    lives = info['lives']

<h3>Learning step (TD loss)</h3>

In [None]:
def compute_td_loss(batch_size, beta):
    """Run one prioritized-replay gradient step on ``current_model``.

    Samples a batch from ``replay_buffer``, regresses Q(s, a) towards the
    one-step target computed with ``target_model``, applies the optimizer,
    and writes the new |TD error|-based priorities back to the buffer.

    Args:
        batch_size: Number of transitions to sample.
        beta: Importance-sampling exponent forwarded to ``replay_buffer.sample``.

    Returns:
        The scalar loss tensor of this step.
    """
    state, action, reward, next_state, done, indices, weights = replay_buffer.sample(batch_size, beta)

    state = torch.FloatTensor(np.float32(state)).to(device)
    next_state = torch.FloatTensor(np.float32(next_state)).to(device)
    action = torch.LongTensor(action).to(device)
    reward = torch.FloatTensor(reward).to(device)
    done = torch.FloatTensor(done).to(device)
    weights = torch.FloatTensor(weights).to(device)

    q_values = current_model(state)
    q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)

    # The target is a constant for this update: no graph through the target
    # network, so no gradients pile up on its parameters.
    with torch.no_grad():
        next_q_value = target_model(next_state).max(1)[0]
        expected_q_value = reward + gamma * next_q_value * (1 - done)

    diff = q_value - expected_q_value
    # Importance weights scale the squared error linearly; squaring
    # (diff * weights) would apply the correction twice.
    loss = (weights * diff.pow(2)).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Hand the buffer plain NumPy values rather than grad-tracking device tensors.
    # The 0.1 offset keeps every sampled transition's priority strictly positive.
    prios = diff.detach().abs().cpu().numpy() + 0.1
    replay_buffer.update_priorities(indices, prios)

    return loss

<h3>Plot the progress</h3>

In [None]:
def plot(frame_idx, rewards, losses):
    """Redraw the training dashboard: episode rewards and per-step losses.

    The previous cell output is cleared first. The reward panel's title shows
    the current frame and the mean of the last ten entries of ``rewards``.
    """
    clear_output(True)
    plt.figure(figsize=(20, 5))

    reward_title = 'frame %s. reward: %s' % (frame_idx, np.mean(rewards[-10:]))
    panels = (
        (131, reward_title, rewards),
        (132, 'loss', losses),
    )
    for position, title, series in panels:
        plt.subplot(position)
        plt.title(title)
        plt.plot(series)

    plt.show()

<h3>Training</h3>

In [None]:
num_frames = 200000
batch_size = 32
update_freq = 2000  # frames between target-network syncs

# Action sent after a reset or a lost life (the literal 1 in the original loop).
# NOTE(review): presumably FIRE in the ALE action set - confirm for the chosen game.
fire_action = 1

losses = []
all_rewards = []
episode_reward = 0

state, info = env.reset()
lives = info['lives']

for frame_idx in range(1, num_frames + 1):
    epsilon = epsilon_by_frame(frame_idx)
    action = current_model.act(state, epsilon)

    # Gym's step API reports natural termination and time-limit truncation separately.
    next_state, reward, terminated, truncated, info = env.step(action)
    episode_over = terminated or truncated
    life_lost = info['lives'] < lives

    if life_lost:
        # Losing a life is stored as a terminal, penalised transition.
        replay_buffer.push(state, action, reward - penalty, next_state, True)
    else:
        # Only true termination is marked terminal for bootstrapping.
        replay_buffer.push(state, action, reward, next_state, terminated)

    episode_reward += reward

    if life_lost and not episode_over:
        # Keep the observation produced by the restart step so the agent
        # never acts on a frame the emulator has already moved past.
        next_state, fire_reward, terminated, truncated, info = env.step(fire_action)
        episode_reward += fire_reward
        episode_over = terminated or truncated

    if episode_over:
        all_rewards.append(episode_reward)
        episode_reward = 0
        env.reset()
        next_state, _, _, _, info = env.step(fire_action)

    state = next_state
    lives = info['lives']

    if len(replay_buffer) > batch_size:
        beta = beta_by_frame(frame_idx)
        loss = compute_td_loss(batch_size, beta)
        losses.append(loss.item())

    if frame_idx % 1000 == 0:
        plot(frame_idx, all_rewards, losses)

    if frame_idx % update_freq == 0:
        update_target(current_model, target_model)

In [None]:
# Persist the trained network (whole pickled module) and the training curves.
torch.save(current_model, 'model')

for filename, series in (('losses.json', losses), ('rewards.json', all_rewards)):
    with open(filename, 'w') as f:
        json.dump(series, f, indent=2)

# Plot episode rewards averaged over consecutive windows of k episodes.
k = 50
window_means = [np.mean(all_rewards[start:start + k]) for start in range(0, len(all_rewards), k)]
plt.title('rewards')
plt.plot(window_means)

<h3>Seeing the results</h3>

In [None]:
env1 = gym.make(env_id, render_mode='rgb_array')

env1 = gym.wrappers.AtariPreprocessing(env1, noop_max=30, frame_skip=4, screen_size=84, terminal_on_life_loss=False, grayscale_obs=True, grayscale_newaxis=False, scale_obs=False)
env1 = gym.wrappers.FrameStack(env1, 4)
#env1 = gym.wrappers.RecordVideo(env1, 'video')

# NOTE: 'model' is a fully pickled nn.Module; unpickling runs code from the
# file, so only load checkpoints you produced yourself.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects pickled modules - pass weights_only=False there if loading fails.
model = torch.load('model', map_location=torch.device(device))

scores = []

games = 100
try:
    for game in range(1, games + 1):
        # Each "game" is the mean score over 10 episodes with random seeds in [0, 100].
        seeds = [random.randint(0, 100) for _ in range(10)]
        cur_scores = []
        for seed in seeds:
            state, info = env1.reset(seed=seed)
            lives = info['lives']
            episode_over = False
            score = 0
            while not episode_over:
                action = model.act(state, 0)  # epsilon 0: greedy policy
                state, reward, terminated, truncated, info = env1.step(action)
                score += reward
                # Stop on truncation too; stepping a finished episode is invalid.
                episode_over = terminated or truncated
                if lives > info['lives'] and not episode_over:
                    # Restart after a lost life and keep the fresh observation.
                    state, reward, terminated, truncated, info = env1.step(1)
                    score += reward
                    episode_over = terminated or truncated
                lives = info['lives']
            cur_scores.append(score)
        scores.append(np.mean(cur_scores))
finally:
    # Release the emulator even if evaluation is interrupted.
    env1.close()

In [None]:
# Evaluation curve: one point per game (mean over its seeded episodes).
plt.title('rewards')
plt.plot(scores)

with open('scores.json', 'w') as scores_file:
    json.dump(scores, scores_file, indent=2)

# Summary shown as the cell output: (mean score, best score).
mean_score = np.mean(scores)
best_score = np.max(scores)
mean_score, best_score