In [None]:
# Imports and initializations
import os, math
import random
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import time
from torch.utils.tensorboard import SummaryWriter
from gym.wrappers import atari_preprocessing, FrameStack, TransformReward, RecordEpisodeStatistics

# Select the compute device once for the whole notebook: CUDA when PyTorch
# reports an available GPU, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Atari ROM setup -------------------------------------------------------
# Download the ROM archive into the current working directory as 'Roms.rar'.
# NOTE(review): the download uses plain HTTP and performs no integrity check.
import urllib.request
urllib.request.urlretrieve('http://www.atarimania.com/roms/Roms.rar','Roms.rar')
# NOTE(review): installs the 'unrar' package from PyPI without a version pin;
# the next line relies on an `unrar` command being on PATH — confirm that this
# install (and not the host image) is what provides it.
!pip install unrar
# Extract the archive; per the recorded output it contains 'HC ROMS.zip' and 'ROMS.zip'.
!unrar x Roms.rar
# Collect both zip files in a 'rars' directory (mkdir fails if it already exists,
# so this cell is not re-runnable as-is).
!mkdir rars
!mv HC\ ROMS.zip   rars
!mv ROMS.zip  rars
# Register the ROMs found under 'rars' with atari_py so the gym Atari
# environments can load them.
!python -m atari_py.import_roms rars


Collecting unrar
  Downloading unrar-0.4-py3-none-any.whl (25 kB)
Installing collected packages: unrar
Successfully installed unrar-0.4

UNRAR 5.50 freeware      Copyright (c) 1993-2017 Alexander Roshal


Extracting from Roms.rar

Extracting  HC ROMS.zip                                                   36%  OK 
Extracting  ROMS.zip                                                      74% 99%  OK 
All OK
copying adventure.bin from HC ROMS/BY ALPHABET (PAL)/A-G/Adventure (PAL).bin to /usr/local/lib/python3.7/dist-packages/atari_py/atari_roms/adventure.bin
copying air_raid.bin from HC ROMS/BY ALPHABET (PAL)/A-G/Air Raid (PAL).bin to /usr/local/lib/python3.7/dist-packages/atari_py/atari_roms/air_raid.bin
copying alien.bin from HC ROMS/BY ALPHABET (PAL)/A-G/REMAINING NTSC ORIGINALS/Alien.bin to /usr/local/lib/python3.7/dist-packages/atari_py/atari_roms/alien.bin
copying crazy_climber.bin from HC ROMS/BY ALPHABET (PAL)/A-G/REMAINING NTSC ORIGINALS/Crazy Climber.bin to 

In [None]:
# Hyperparameters
# --- Training length and optimisation ---
num_frames = 2_000_000       # total environment steps run by the training loop
batch_size = 32              # transitions sampled from replay memory per update
learning_rate = 2.5e-4       # step size handed to the Adam optimizer
gamma = 0.99                 # discount factor applied to the bootstrapped target

# --- Replay memory ---
memory_capacity = 1_000_000  # maximum number of transitions kept in memory
replay_initial = 500         # learning starts once memory holds more than this

# --- Periodic events (measured in frames) ---
update_freq = 525            # target network is synchronised every this many frames
checkpoint_freq = 525        # intended checkpoint interval (checkpointing is currently disabled)

# --- Epsilon-greedy exploration schedule ---
epsilon_start = 1.0          # initial exploration probability
epsilon_final = 0.01         # asymptotic exploration probability
epsilon_decay = 30_000       # time constant (in frames) of the exponential decay


In [None]:
# Deep Q-Network
class NatureDQN(nn.Module):
    """Convolutional Q-network for single-channel 84x84 observations.

    Three convolution + ReLU stages feed a two-layer fully connected head
    that emits one Q-value per action. The module also owns the Adam
    optimizer used to train it.

    Args:
        num_actions: Number of discrete actions (size of the output layer).
        learning_rate: Learning rate passed to the Adam optimizer.
    """

    def __init__(self, num_actions, learning_rate):
        super(NatureDQN, self).__init__()

        self.num_actions = num_actions
        self.learning_rate = learning_rate

        self.features = nn.Sequential(

            nn.Conv2d(1, 32, kernel_size=8, stride=4),
            nn.ReLU(),

            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),

            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )

        self.fully_connected = nn.Sequential(
            nn.Linear(self.feature_size(), 512),
            nn.ReLU(),

            nn.Linear(512, self.num_actions)
        )

        self.optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)

    def feature_size(self):
        """Return the flattened size of the conv stack's output for a 1x84x84 input."""
        # Probe on whatever device the conv weights live on so this also works
        # after the module has been moved; no_grad avoids building a useless graph.
        probe_device = next(self.features.parameters()).device
        with torch.no_grad():
            probe = torch.zeros(1, 1, 84, 84, device=probe_device)
            return self.features(probe).view(1, -1).size(1)

    def forward(self, x):
        """Compute Q-values for a batch shaped (batch, 1, 84, 84).

        Returns:
            Tensor of shape (batch, num_actions).
        """
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.fully_connected(x)
        return x

    def select_best_action(self, state, writer=None, step=None):
        """Return the index of the greedy action for a single observation.

        Args:
            state: One observation without a batch dimension (array-like of
                shape (1, 84, 84)); a batch dimension is added here.
            writer: Optional TensorBoard-style writer. When given, the
                Q-value of every action is logged via ``add_scalars``.
            step: Optional global step for the logged scalars. When omitted,
                the module-level ``frame_idx`` is used if it exists (this is
                what the training loop relied on previously); otherwise the
                scalars are logged without an explicit step.

        Returns:
            int: index of the action with the highest Q-value.
        """
        # Use the device the weights are on instead of a notebook global.
        model_device = next(self.parameters()).device
        state_ = torch.tensor(state, dtype=torch.float32, device=model_device).unsqueeze(0)
        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            q_values = self(state_)

        if writer is not None:
            if step is None:
                # Backward compatibility with callers that depend on the
                # training loop's global counter; yields None (instead of a
                # NameError) when that global has not been defined yet.
                step = globals().get('frame_idx')
            q_action_dict = {'action' + str(i): x for i, x in enumerate(q_values[0].tolist())}
            writer.add_scalars('Q value of each action', q_action_dict, step)
            writer.flush()

        return q_values.max(1)[1].item()

    def save_variables(self, model_file):
        """Save model and optimizer state to ``<model_file>/natureDQN.pt``.

        Args:
            model_file: Directory that will contain the checkpoint; it is
                created if it does not exist.
        """
        # torch.save does not create missing parent directories.
        os.makedirs(model_file, exist_ok=True)
        torch.save({
            'model_state_dict': self.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
        }, os.path.join(model_file, 'natureDQN.pt'))

    def load_variables(self, PATH):
        """Restore model and optimizer state from a checkpoint file and switch to eval mode.

        Args:
            PATH: Path to a file written by ``save_variables``.
        """
        # Map tensors onto this model's device so a checkpoint written on a
        # GPU can still be loaded on a CPU-only machine (and vice versa).
        checkpoint = torch.load(PATH, map_location=next(self.parameters()).device)
        self.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.eval()

    def turn_on_training(self):
        """Put the module back into training mode (e.g. after ``load_variables``)."""
        self.train()


In [None]:
# ReplayMemory buffer for agent
class ReplayMemory():
    """Fixed-capacity store of (state, action, reward, next_state, done) transitions.

    Once ``capacity`` transitions are stored, each new transition overwrites
    the oldest one. Storage is a ring buffer, so insertion stays O(1) even
    when the buffer is full (``list.pop(0)`` would shift every element).

    Args:
        capacity: Maximum number of transitions to keep; must be positive.

    Raises:
        ValueError: If ``capacity`` is not a positive number.
    """

    def __init__(self, capacity):
        if capacity <= 0:
            raise ValueError("capacity must be positive, got %r" % (capacity,))
        self.capacity = capacity
        self.memory = []
        # Index of the slot that the next push overwrites once the buffer is full.
        self._next_slot = 0

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

    def push(self, state, action, reward, next_state, done):
        """Store one transition, evicting the oldest one when at capacity.

        ``state`` and ``next_state`` get a leading axis of length 1 so that
        sampled states can be concatenated into a batch.
        """
        state = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)
        transition = (state, action, reward, next_state, done)
        if len(self.memory) < self.capacity:
            self.memory.append(transition)
        else:
            # older transitions are constantly replaced with new ones when we reach the size limit
            self.memory[self._next_slot] = transition
        self._next_slot = (self._next_slot + 1) % self.capacity

    def get_batch_sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` where the
            states are concatenated along axis 0 into arrays and the other
            three entries are tuples of length ``batch_size``.

        Raises:
            ValueError: If fewer than ``batch_size`` transitions are stored
                (raised by ``random.sample``).
        """
        state, action, reward, next_state, done = zip(*random.sample(self.memory, batch_size))
        return np.concatenate(state), action, reward, np.concatenate(next_state), done


In [None]:
# Agent
class Agent():
    """Epsilon-greedy DQN agent with an online network, a target network and replay memory.

    Args:
        env: Environment exposing ``action_space.n``.
        epsilon: Initial probability of taking a random action.
        learning_rate: Learning rate forwarded to each network's optimizer.
        gamma: Discount factor used when bootstrapping the target value.
        capacity: Maximum number of transitions held in replay memory.
    """

    def __init__(self, env, epsilon, learning_rate, gamma, capacity):
        self.env = env
        self.epsilon = epsilon
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.dqn = NatureDQN(env.action_space.n, learning_rate).to(device)
        self.target_dqn = NatureDQN(env.action_space.n, learning_rate).to(device)
        # Start the target network as an exact copy of the online network.
        self.target_dqn.load_state_dict(self.dqn.state_dict())
        self.agent_memory = ReplayMemory(capacity)

    def get_action(self, state, writer):
        """Choose an action: random with probability ``epsilon``, otherwise greedy.

        Args:
            state: Current observation, forwarded to the online network.
            writer: Logging writer forwarded to ``select_best_action``.

        Returns:
            int: index of the chosen action.
        """
        if random.uniform(0, 1) < self.epsilon:
            # Explore. Use this agent's own environment rather than a
            # notebook-level global that may be missing or a different env.
            action = random.randrange(self.env.action_space.n)
        else:
            action = self.dqn.select_best_action(state, writer)
        return action

    def push_to_memory(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.agent_memory.push(state, action, reward, next_state, done)

    def update_target(self):
        """Copy the online network's weights into the target network."""
        self.target_dqn.load_state_dict(self.dqn.state_dict())

    def compute_loss(self, state, action, reward, next_state, done):
        """Run one optimisation step on a batch of transitions.

        Args:
            state: Batch of observations.
            action: Sequence of integer action indices, one per transition.
            reward: Sequence of rewards, one per transition.
            next_state: Batch of successor observations.
            done: Sequence of episode-termination flags, one per transition.

        Returns:
            Scalar tensor holding the Smooth L1 loss of this batch (detached).
        """
        # Place the batch on the online network's device instead of relying
        # on a notebook-level global.
        model_device = next(self.dqn.parameters()).device
        states = torch.tensor(state, dtype=torch.float32, device=model_device)
        next_states = torch.tensor(next_state, dtype=torch.float32, device=model_device)
        actions = torch.tensor(action, dtype=torch.long, device=model_device)
        rewards = torch.tensor(reward, dtype=torch.float32, device=model_device)
        dones = torch.tensor(done, dtype=torch.float32, device=model_device)

        q_values = self.dqn(states)
        # The target is a constant for this update: skip graph construction.
        with torch.no_grad():
            next_q_values = self.target_dqn(next_states)

        # Q-value of the action that was actually taken in each transition.
        q_value = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
        next_q_value = next_q_values.max(1)[0]

        # (1 - dones) removes the bootstrap term for terminal transitions.
        expected_q_value = rewards + self.gamma * next_q_value * (1 - dones)

        # PyTorch convention is criterion(input, target).
        criterion = nn.SmoothL1Loss()
        loss = criterion(q_value, expected_q_value)

        self.dqn.optimizer.zero_grad()
        # Calculate gradients
        loss.backward()
        self.dqn.optimizer.step()

        return loss.detach()


In [None]:
# Game settings and initialization
game = "PongNoFrameskip-v4"
render_video = True
env = gym.make(game)
env = atari_preprocessing.AtariPreprocessing(env)
agent = Agent(env, epsilon_start, learning_rate, gamma, memory_capacity)


def epsilon_by_frame(frame_idx):
    """Exploration rate for a given frame.

    Decays exponentially from ``epsilon_start`` towards ``epsilon_final``
    with time constant ``epsilon_decay`` (in frames).
    """
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)


all_rewards = []
episode_reward = 0
state = env.reset()
# Add the channel axis the network expects.
state = np.expand_dims(state, 0)

# FOR LOGGING
# Set up logging before any rendering attempt so that a display failure can
# never leave `writer` / `model_file` undefined for the training cell.
PATH_to_log_dir = 'log'
# Declare Tensorboard writer
timestr = time.strftime('%Y%m%d_%H%M%S')
# Each run logs into its own timestamped sub-directory of the log directory
# (joined as a path, consistent with model_file below).
writer = SummaryWriter(os.path.join(PATH_to_log_dir, timestr))
model_file = os.path.join(PATH_to_log_dir, game[:10]+'model_test')

if render_video:
    try:
        env.render()
    except Exception as exc:  # e.g. pyglet's NoSuchDisplayException on a headless host
        # Rendering is optional: report the problem and continue without it
        # instead of aborting the whole cell.
        print('Rendering disabled, env.render() failed: {!r}'.format(exc))
        render_video = False


NoSuchDisplayException: ignored

In [None]:
# Train
# The try/finally guarantees the TensorBoard writer is closed (and its events
# written out) even when training is interrupted, e.g. via KeyboardInterrupt.
try:
    for frame_idx in range(1, num_frames + 1):
        # Anneal exploration according to the schedule.
        agent.epsilon = epsilon_by_frame(frame_idx)

        # Get action to take
        action = agent.get_action(state, writer)
        next_state, reward, done, info = env.step(action)
        # Add the channel axis so stored states match what the network expects.
        next_state = np.expand_dims(next_state, 0)
        agent.push_to_memory(state, action, reward, next_state, done)

        if render_video:
            env.render()

        state = next_state
        episode_reward += reward

        if done:
            # Episode finished: start a new one and log episode statistics.
            state = env.reset()
            state = np.expand_dims(state, 0)
            all_rewards.append(episode_reward)
            num_episodes = len(all_rewards)
            writer.add_scalar('Reward of each episode', episode_reward, num_episodes)
            writer.add_scalar('Average Reward All Time', np.mean(all_rewards), frame_idx)
            if num_episodes % 10 == 0:
                # Slice the last ten episodes; indexing with [-10] would pick
                # a single episode and log its reward as the "average".
                writer.add_scalar('Average reward of past 10 episodes', np.mean(all_rewards[-10:]), num_episodes // 10)
            writer.flush()
            episode_reward = 0

        # Only start updating the network based on parameter replay_initial
        if len(agent.agent_memory.memory) > replay_initial:
            loss = agent.compute_loss(*agent.agent_memory.get_batch_sample(batch_size))

        # Periodically synchronise the target network with the online network.
        if frame_idx % update_freq == 0:
            agent.update_target()

        # if (frame_idx > batch_size and frame_idx % checkpoint_freq == 0):
            # agent.dqn.save_variables(model_file)
finally:
    writer.close()
# agent.dqn.save_variables(model_file)


KeyboardInterrupt: ignored