In [None]:
import os
import warnings
warnings.filterwarnings("ignore")

import gym
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
class FramePreprocessor:
    """TensorFlow graph that turns a raw RGB game frame into a small grayscale image.

    The pipeline takes a ``(210, 160, 3)`` uint8 frame, converts it to
    grayscale, crops a 160x160 window starting 34 rows from the top and
    resizes the result with nearest-neighbour sampling.
    """

    def __init__(self, session, resize_width, resize_height):
        """Build the preprocessing ops.

        Args:
            session: object whose ``run(fetches, feed_dict)`` evaluates the ops.
            resize_width: first entry of the size passed to the resize op.
            resize_height: second entry of the size passed to the resize op.
        """
        # NOTE(review): tf.image.resize_images documents `size` as
        # (new_height, new_width); this file passes (width, height) everywhere,
        # which is only equivalent for square targets -- confirm before using
        # a non-square size.
        target_size = (resize_width, resize_height)

        self.session = session
        self.frame = tf.placeholder(dtype=tf.uint8, shape=(210, 160, 3))
        self.gray = tf.image.rgb_to_grayscale(self.frame)
        self.crop = tf.image.crop_to_bounding_box(
            self.gray,
            offset_height=34,
            offset_width=0,
            target_height=160,
            target_width=160,
        )
        self.resize = tf.image.resize_images(
            self.crop,
            target_size,
            method=tf.image.ResizeMethod.NEAREST_NEIGHBOR,
        )

    def process(self, frame):
        """Run the preprocessing graph on one raw frame and return the result."""
        feed = {self.frame: frame}
        return self.session.run(self.resize, feed)

In [None]:
class DQN:
    """Dueling Q-network graph: four conv layers feeding value and advantage heads.

    Attributes created here (all TensorFlow graph nodes):
        input: float32 placeholder of stacked frames,
            shape ``(batch, input_width, input_height, num_stacked_frames)``.
        q_actual: Q-values for every action, ``(batch, num_actions)``.
        best_action: index of the highest Q-value per batch row.
        q_target, action: 1-D placeholders fed during a training step.
        q_pred: Q-value of the fed ``action`` per batch row.
        loss, optimizer, update: Huber loss between ``q_target`` and
            ``q_pred`` and the Adam step minimising it.
    """

    @staticmethod
    def _conv(inputs, filters, kernel_size, strides, name):
        """Bias-free ReLU conv layer with 'valid' padding and variance-scaling init."""
        return tf.layers.conv2d(
            inputs=inputs,
            filters=filters,
            kernel_size=kernel_size,
            strides=strides,
            kernel_initializer=tf.initializers.variance_scaling(scale=2),
            padding='valid',
            activation=tf.nn.relu,
            use_bias=False,
            name=name,
        )

    def __init__(self, input_width, input_height, num_stacked_frames, num_actions, learning_rate):
        """Build the network, the loss and the training op in the current variable scope.

        Args:
            input_width: size of axis 1 of the input batch.
            input_height: size of axis 2 of the input batch.
            num_stacked_frames: number of frames stacked along the last axis.
            num_actions: number of discrete actions (width of the advantage head).
            learning_rate: learning rate handed to the Adam optimizer.
        """
        self.input = tf.placeholder(shape=(None, input_width, input_height, num_stacked_frames), dtype=tf.float32)
        # Pixel values are fed as 0..255; scale them into 0..1.
        self.normalized = self.input / 255

        # Layer order and names are kept fixed: FixedDQNUpdater pairs the
        # variables of two DQN instances by their position in the scope.
        self.conv1 = self._conv(self.normalized, 32, (8, 8), 4, 'conv1')
        self.conv2 = self._conv(self.conv1, 64, (4, 4), 2, 'conv2')
        self.conv3 = self._conv(self.conv2, 64, (3, 3), 1, 'conv3')
        # For 84x84 inputs the 7x7 'valid' kernel collapses the feature map to 1x1.
        self.conv4 = self._conv(self.conv3, 1024, (7, 7), 1, 'conv4')

        # Split the channels in half: one half feeds the state-value head,
        # the other the per-action advantage head.
        self.split1, self.split2 = tf.split(self.conv4, 2, 3)
        self.split1 = tf.layers.flatten(self.split1)
        self.split2 = tf.layers.flatten(self.split2)
        self.valuefn = tf.layers.dense(inputs=self.split1, units=1, kernel_initializer=tf.initializers.variance_scaling(scale=2), name='valuefn')
        self.advantagefn = tf.layers.dense(inputs=self.split2, units=num_actions, kernel_initializer=tf.initializers.variance_scaling(scale=2), name='advantagefn')

        # Q = V + (A - mean(A)); subtracting the mean advantage keeps V and A identifiable.
        self.q_actual = self.valuefn + tf.subtract(self.advantagefn, tf.reduce_mean(self.advantagefn, axis=1, keepdims=True))
        self.best_action = tf.argmax(self.q_actual, axis=1)

        # `(None)` is just `None` (fully unknown shape); `(None,)` declares the
        # intended 1-D batch vector so shape errors surface at feed time.
        self.q_target = tf.placeholder(shape=(None,), dtype=tf.float32)
        self.action = tf.placeholder(shape=(None,), dtype=tf.int32)
        # Select the Q-value of the action that was actually taken.
        self.q_pred = tf.reduce_sum(tf.multiply(self.q_actual, tf.one_hot(self.action, num_actions, dtype=tf.float32)), axis=1)
        self.loss = tf.reduce_mean(tf.losses.huber_loss(labels=self.q_target, predictions=self.q_pred))
        self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        self.update = self.optimizer.minimize(self.loss)

In [None]:
class Breakout:
    """Wrapper around the Gym Breakout environment that yields stacked, preprocessed frames.

    The current state is an array whose last axis holds the most recent
    ``num_stacked_frames`` preprocessed frames (oldest first).
    """

    def __init__(self, session, frame_width, frame_height, num_stacked_frames):
        """Create the environment and the frame preprocessor.

        Args:
            session: TensorFlow session used by the frame preprocessor.
            frame_width: width passed to ``FramePreprocessor``.
            frame_height: height passed to ``FramePreprocessor``.
            num_stacked_frames: number of frames kept in the state.
        """
        self.env = gym.make("BreakoutDeterministic-v4")
        self.fp = FramePreprocessor(session, frame_width, frame_height)
        self.lives = self.env.ale.lives()
        self.state = None
        self.num_stacked_frames = num_stacked_frames

    def reset(self):
        """Reset the environment and return a state made of the first frame repeated."""
        frame = self.env.reset()
        self.lives = self.env.ale.lives()
        # The preprocessor returns a single-channel frame; repeat it along the
        # channel axis so the very first state already has a full stack.
        self.state = np.repeat(self.fp.process(frame), self.num_stacked_frames, axis=2)
        return self.state

    def step(self, action):
        """Advance the environment by one action.

        Returns:
            Tuple ``(state, reward, done, life_lost)`` where ``life_lost`` is
            True when the episode ended or the life counter decreased.
        """
        frame, reward, done, _ = self.env.step(action)
        life_lost = done or (self.env.ale.lives() < self.lives)
        self.lives = self.env.ale.lives()
        # Drop the oldest frame and append the newest one.
        self.state = np.append(self.state[:, :, 1:], self.fp.process(frame), axis=2)
        return self.state, reward, done, life_lost

    def close(self):
        """Close the underlying Gym environment."""
        # The original signature lacked `self`, so calling `game.close()` raised TypeError.
        self.env.close()

In [None]:
class ActionSelector:
    """Epsilon-greedy action selection with a piecewise-linear epsilon schedule.

    ``epsilon_decay_intervals`` holds increasing frame numbers and
    ``epsilon_values_at_intervals`` the epsilon value at each of them; epsilon
    is linearly interpolated in between.
    """

    def __init__(self, session, dqn, num_actions, epsilon_decay_intervals, epsilon_values_at_intervals):
        """Store the session, the network used for greedy actions and the schedule.

        Args:
            session: object whose ``run`` evaluates ``dqn.best_action``.
            dqn: network exposing ``input`` and ``best_action``.
            num_actions: number of discrete actions.
            epsilon_decay_intervals: increasing frame numbers of the schedule knots.
            epsilon_values_at_intervals: epsilon at each knot (same length).
        """
        self.session = session
        self.dqn = dqn
        self.num_actions = num_actions
        self.edi = epsilon_decay_intervals
        self.evai = epsilon_values_at_intervals

    def epsilon(self, frame_number):
        """Return the exploration rate for ``frame_number``.

        Values before the first knot or at/after the last knot are clamped to
        the first/last epsilon. The previous hand-rolled interpolation indexed
        past the end of the schedule (IndexError) once ``frame_number`` reached
        the last knot.
        """
        return float(np.interp(frame_number, self.edi, self.evai))

    def epsilon_greedy_action(self, frame_number, state):
        """Return a random action with probability epsilon, else the greedy action."""
        if np.random.uniform() < self.epsilon(frame_number):
            return np.random.randint(0, self.num_actions)
        # Must go through `self`; the bare name raised NameError.
        return self.greedy_action(state)

    def greedy_action(self, state):
        """Return the action with the highest predicted Q-value for ``state``."""
        # The network expects a batch, so wrap the single state in a list and
        # unwrap the single result.
        return self.session.run(self.dqn.best_action, {self.dqn.input: [state]})[0]

    def plot_epsilon(self):
        """Plot epsilon sampled every 1000 frames up to the last schedule knot."""
        plt.plot([self.epsilon(i) for i in range(0, self.edi[-1], 1000)])

In [None]:
class ReplayMemory:
    """Ring buffer of single frames from which stacked-frame transitions are sampled.

    Slot ``t`` stores the frame the agent saw, the action it took from that
    frame, the reward it received and whether that action ended a life or
    the episode. A transition is rebuilt from ``num_stacked_frames + 1``
    consecutive slots.
    """

    def __init__(self, capacity, batch_size, frame_width, frame_height, num_stacked_frames):
        """Allocate the ring buffer and the reusable batch buffers.

        Args:
            capacity: maximum number of stored frames.
            batch_size: number of transitions returned by ``sample``.
            frame_width: size of axis 0 of a stored frame.
            frame_height: size of axis 1 of a stored frame.
            num_stacked_frames: frames per state.
        """
        self.capacity = capacity
        self.batch_size = batch_size
        self.num_stacked_frames = num_stacked_frames
        self.frames = np.empty((self.capacity, frame_width, frame_height), dtype=np.uint8)
        self.actions = np.empty(self.capacity, dtype=np.int32)
        self.rewards = np.empty(self.capacity, dtype=np.float32)
        # `np.bool` was removed from NumPy; `np.bool_` is the supported scalar type.
        self.dones = np.empty(self.capacity, dtype=np.bool_)
        self.current = 0  # next slot to write
        self.count = 0    # number of valid slots
        self.indices = np.empty(batch_size, dtype=np.int32)
        self.prestates = np.empty((batch_size, num_stacked_frames, frame_width, frame_height), dtype=np.uint8)
        self.poststates = np.empty((batch_size, num_stacked_frames, frame_width, frame_height), dtype=np.uint8)

    def add(self, frame, action, reward, done):
        """Store one step, overwriting the oldest slot once the buffer is full."""
        self.frames[self.current] = frame
        self.actions[self.current] = action
        self.rewards[self.current] = reward
        self.dones[self.current] = done
        self.count = max(self.count, self.current + 1)
        self.current = (self.current + 1) % self.capacity

    def sample(self):
        """Draw ``batch_size`` random transitions.

        Returns:
            Tuple ``(prestates, actions, rewards, poststates, dones)``; the
            state arrays have shape
            ``(batch_size, frame_width, frame_height, num_stacked_frames)``.
            The state arrays are views of internal buffers that the next
            call overwrites.

        Raises:
            ValueError: if fewer than ``num_stacked_frames + 2`` steps are stored.
        """
        history = self.num_stacked_frames
        upper = self.count - (history + 1)
        if upper <= 0:
            raise ValueError(
                "not enough transitions stored to sample: have %d, need at least %d"
                % (self.count, history + 2)
            )
        for i in range(self.batch_size):
            while True:
                index = np.random.randint(0, upper)
                # A done flag on any frame *before* the last pre-state frame
                # means the stack mixes two lives/episodes. The flag on the
                # last frame belongs to the sampled transition itself and must
                # stay eligible, otherwise terminal transitions are never
                # returned.
                if self.dones[index : index + history - 1].any():
                    continue
                # Once the buffer has wrapped, slots on either side of the
                # write pointer are not consecutive in time.
                if index < self.current <= index + history:
                    continue
                break
            self.indices[i] = index + history - 1
            self.prestates[i] = self.frames[index : index + history, ...]
            self.poststates[i] = self.frames[index + 1 : index + history + 1, ...]
        return (np.transpose(self.prestates, axes=(0, 2, 3, 1)),
                self.actions[self.indices],
                self.rewards[self.indices],
                np.transpose(self.poststates, axes=(0, 2, 3, 1)),
                self.dones[self.indices])

In [None]:
class FixedDQNUpdater:
    """Copies the trainable variables of the main network into the fixed network."""

    def __init__(self, session, main_dqn_scope, fixed_dqn_scope):
        """Collect both variable lists and build the copy op once.

        Args:
            session: TensorFlow session used to run the copy.
            main_dqn_scope: variable scope name of the network being trained.
            fixed_dqn_scope: variable scope name of the network being overwritten.

        Raises:
            ValueError: if the two scopes hold a different number of trainable variables.
        """
        self.session = session
        self.main_dqn_vars = tf.trainable_variables(scope=main_dqn_scope)
        self.fixed_dqn_vars = tf.trainable_variables(scope=fixed_dqn_scope)
        if len(self.main_dqn_vars) != len(self.fixed_dqn_vars):
            raise ValueError(
                "scopes %r and %r hold a different number of trainable variables (%d vs %d)"
                % (main_dqn_scope, fixed_dqn_scope,
                   len(self.main_dqn_vars), len(self.fixed_dqn_vars))
            )
        # Build the assign ops a single time. Creating them inside update()
        # added new nodes to the graph on every call, so the graph kept growing
        # for the whole training run.
        # Variables are paired by position, which relies on both networks
        # having been built by the same code in the same order.
        self.sync_op = tf.group(*[
            fixed_var.assign(main_var.value())
            for main_var, fixed_var in zip(self.main_dqn_vars, self.fixed_dqn_vars)
        ])

    def update(self):
        """Overwrite every fixed-network variable with its main-network counterpart."""
        self.session.run(self.sync_op)

\begin{align}
Q_\text{target}(s,a) &= r + \gamma \; \max_{a'} \; Q_\text{fixed}(s',a') &\text{Normal DQN}\\
Q_\text{target}(s,a) &= r + \gamma \; Q_\text{fixed} \left(s', \operatorname*{argmax}_{a'} \; Q_\text{main}(s',a') \right)&\text{Double DQN} \\
Q_\text{target}(s,a) &= r \quad\quad \text{if} \; s' \; \text{is terminal} &\text{Double DQN}
\end{align}

In [None]:
class Trainer:
    """Trains a double dueling DQN on Breakout with experience replay.

    The main network selects actions and is trained; the fixed network
    supplies the bootstrap values and is periodically overwritten with the
    main network's weights.
    """

    # Preprocessed frame geometry and number of frames stacked into one state.
    FRAME_WIDTH = 84
    FRAME_HEIGHT = 84
    NUM_STACKED_FRAMES = 4
    BATCH_SIZE = 32

    DISCOUNT_FACTOR = 0.99
    LEARNING_RATE = 0.00001

    REPLAY_MEMORY_CAPACITY = 1000000

    # Piecewise-linear exploration schedule: frame numbers and the epsilon at each.
    EPSILON_DECAY_INTERVALS = [0, 50000, 1000000, 30000000]
    EPSILON_VALUES_AT_INTERVALS = [1, 1, 0.1, 0.01]

    MAIN_DQN_SCOPE = "main_dqn"
    FIXED_DQN_SCOPE = "fixed_dqn"
    MAX_TRAIN_FRAMES = 30000000
    MAX_EPISODE_LENGTH = 18000
    # No gradient step is taken until this many frames have been played.
    MIN_REPLAY_MEMORY_SIZE = 50000
    # Frames between weight copies into the fixed network / between gradient steps.
    FIXED_DQN_UPDATE_FREQ = 10000
    MAIN_DQN_UPDATE_FREQ = 4

    # Print metrics every N episodes, averaging over the last M episode rewards.
    METRICS_OUTPUT_FREQ = 10
    METRICS_RUNNING_MEAN_LOOKBACK = 100

    def __init__(self, session):
        """Build the environment, both networks, the replay memory and the helpers.

        Args:
            session: TensorFlow session shared by every component.
        """
        self.session = session
        self.game = Breakout(session, self.FRAME_WIDTH, self.FRAME_HEIGHT, self.NUM_STACKED_FRAMES)
        num_actions = self.game.env.action_space.n
        with tf.variable_scope(self.MAIN_DQN_SCOPE):
            self.main_dqn = DQN(self.FRAME_WIDTH, self.FRAME_HEIGHT, self.NUM_STACKED_FRAMES, num_actions, self.LEARNING_RATE)
        with tf.variable_scope(self.FIXED_DQN_SCOPE):
            self.fixed_dqn = DQN(self.FRAME_WIDTH, self.FRAME_HEIGHT, self.NUM_STACKED_FRAMES, num_actions, self.LEARNING_RATE)
        self.action_selector = ActionSelector(session, self.main_dqn, num_actions, self.EPSILON_DECAY_INTERVALS, self.EPSILON_VALUES_AT_INTERVALS)
        self.memory = ReplayMemory(self.REPLAY_MEMORY_CAPACITY, self.BATCH_SIZE, self.FRAME_WIDTH, self.FRAME_HEIGHT, self.NUM_STACKED_FRAMES)
        self.fixed_dqn_updator = FixedDQNUpdater(session, self.MAIN_DQN_SCOPE, self.FIXED_DQN_SCOPE)
        self.frame_number = 0
        self.episode_number = 0
        self.losses = []
        self.rewards = []

    def train(self):
        """Play episodes until ``MAX_TRAIN_FRAMES`` frames have been seen, training along the way.

        The frame limit is only checked between episodes, so the final episode
        may run past it.
        """
        self.session.run(tf.global_variables_initializer())
        # Both networks are initialised with independent random weights; copy
        # the main weights across so the first updates do not bootstrap from an
        # unrelated network.
        self.fixed_dqn_updator.update()
        while self.frame_number < self.MAX_TRAIN_FRAMES:
            state = self.game.reset()
            episode_reward = 0
            for _ in range(self.MAX_EPISODE_LENGTH):
                action = self.action_selector.epsilon_greedy_action(self.frame_number, state)
                next_state, reward, done, life_lost = self.game.step(action)
                self.frame_number += 1
                episode_reward += reward
                # Store only the newest frame of the state the action was taken
                # from; the replay memory rebuilds the stacks. A lost life is
                # stored as terminal even though the episode continues.
                self.memory.add(state[:, :, -1], action, reward, life_lost)
                state = next_state
                if self.frame_number % self.MAIN_DQN_UPDATE_FREQ == 0 and self.frame_number > self.MIN_REPLAY_MEMORY_SIZE:
                    self.losses.append(self.run_update())
                if self.frame_number % self.FIXED_DQN_UPDATE_FREQ == 0 and self.frame_number > self.MIN_REPLAY_MEMORY_SIZE:
                    self.fixed_dqn_updator.update()
                if done:
                    break
            self.rewards.append(episode_reward)
            self.episode_number += 1
            if self.episode_number % self.METRICS_OUTPUT_FREQ == 0:
                print(f"Episodes run: {self.episode_number} | Frames seen: {self.frame_number} | Rewards running mean: {np.mean(self.rewards[-self.METRICS_RUNNING_MEAN_LOOKBACK:])}")

    def run_update(self):
        """Sample a batch, build double-DQN targets and take one gradient step.

        Returns:
            The loss value reported by the main network for this batch.
        """
        prestates, actions, rewards, poststates, dones = self.memory.sample()
        # Double DQN: the main network picks the next action, the fixed
        # network evaluates it.
        best_actions_in_poststates = self.session.run(self.main_dqn.best_action, {self.main_dqn.input: poststates})
        all_actions_q_values = self.session.run(self.fixed_dqn.q_actual, {self.fixed_dqn.input: poststates})
        # Index by the batch actually returned rather than the BATCH_SIZE constant.
        batch_rows = np.arange(len(best_actions_in_poststates))
        best_action_q_values = all_actions_q_values[batch_rows, best_actions_in_poststates]
        # Terminal transitions (dones == True) get no bootstrap term.
        q_target = rewards + (self.DISCOUNT_FACTOR * best_action_q_values * (1 - dones))
        loss, _ = self.session.run([self.main_dqn.loss, self.main_dqn.update],
                                   {self.main_dqn.input: prestates,
                                    self.main_dqn.q_target: q_target,
                                    self.main_dqn.action: actions})
        return loss

In [None]:
# Clear the default graph first so re-running this cell does not add a second
# copy of the networks to the graph left over from a previous run.
tf.reset_default_graph()
# The session is closed automatically when the `with` block exits.
with tf.Session() as session:
    trainer = Trainer(session)
    trainer.train()