In [None]:
import gym
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from skimage.color import rgb2gray
from skimage import transform
import keras
from collections import deque
from IPython import display
import imageio
from skimage.transform import resize
import os

In [None]:
# Create the Atari environment and keep the first raw frame (displayed in the next cell).
env = gym.make('BreakoutDeterministic-v4')
frame = env.reset()
# print(env.unwrapped.get_action_meanings())

# Initialize parameters

# Size of a preprocessed frame and number of consecutive frames stacked into one state.
FRAME_HEIGHT = 84
FRAME_WIDTH = 84
STACK_SIZE = 4
# Shape of a single network input (without the batch dimension).
STATE_SIZE = [FRAME_HEIGHT, FRAME_WIDTH, STACK_SIZE]
ACTION_SIZE = env.action_space.n
LEARNING_RATE = 0.0001
DISCOUNT_RATE = 0.99

# e-greedy with frames (disabled).
# NOTE: the block below is an inert string literal, not executed code, so
# FRAME_START / FRAME_END are NOT defined as module-level names by this cell.
'''
EPS_START = 1
EPS_END = 0.1
FRAME_START = 50 * 1000
FRAME_END = 1 * 1000 * 1000
'''
# e-greedy with multiplicative decay: the training cell multiplies `eps` by
# EPS_FACTOR each time the target network is synced, never going below EPS_END.

EPS_START = 1
EPS_END = 0.1

# Current exploration rate; mutated by the training cell.
eps = EPS_START
EPS_FACTOR = 0.986

BATCH_SIZE = 32
# Total number of training frames, and the per-episode frame cap.
MAX_FRAMES = 5 * 1000 * 1000
MAX_FRAMES_PER_EP = 18 * 1000

# Capacity (in frames) of the replay buffer.
EXPERIENCE_SIZE = 8 * 1000

# Exclusive upper bound on the random number of initial env.step(1) calls
# performed by GameWrapper.reset(test=True).
NO_OP_STEPS = 10

# Target network sync (and eps decay) period, in frames.
UPDATE_NETWORK_FREQ = 15000

# Number of training frames between two evaluation phases.
EVALUATE_FREQ = 200 * 1000

# Number of frames played in each evaluation phase.
EVALUATE_STEPS = 10000

# Gradient updates only start once more than this many frames have been played.
START_TRAINING = 50000

# Output locations: checkpoints/GIFs go to results/run_<N>, TensorBoard logs to summaries/run_<N>.
RUN_NUM = 14
PATH = 'results'
SUMMARIES = 'summaries'
RUN_ID = 'run_' + str(RUN_NUM)
# PATH is rebound here from the base directory to the run-specific directory.
PATH = os.path.join(PATH, RUN_ID)
os.makedirs(PATH, exist_ok=True)
os.makedirs(os.path.join(SUMMARIES, RUN_ID), exist_ok=True)
print(PATH)

In [None]:
# Sanity check: print the observation space and the number of discrete actions,
# then display the raw frame returned by env.reset() in the configuration cell.
print(env.observation_space)
print(env.action_space.n)
plt.imshow(frame)
plt.show()

In [None]:
'''
def process(frame):
    gray = rgb2gray(frame)
    crop = gray[4:-15,5:-5]
    norm = crop / 255.0
    transformed = transform.resize(norm, [FRAME_HEIGHT, FRAME_WIDTH, 1])
    return transformed
'''

In [None]:
class ProcessFrame:
    """TensorFlow graph that turns a raw RGB frame into a small grayscale frame.

    The pipeline is: uint8 placeholder of shape [210, 160, 3] -> grayscale ->
    crop of the 160x160 box whose top-left corner is at row 34, column 0 ->
    nearest-neighbour resize to (FRAME_HEIGHT, FRAME_WIDTH).

    The intermediate tensors are exposed as ``frame``, ``gray``, ``crop`` and
    ``transformed``.
    """

    def __init__(self, FRAME_HEIGHT, FRAME_WIDTH):
        self.frame_height = FRAME_HEIGHT
        self.frame_width = FRAME_WIDTH

        output_size = [self.frame_height, self.frame_width]

        self.frame = tf.placeholder(dtype=tf.uint8, shape=[210, 160, 3])
        self.gray = tf.image.rgb_to_grayscale(self.frame)
        self.crop = tf.image.crop_to_bounding_box(
            self.gray,
            offset_height=34,
            offset_width=0,
            target_height=160,
            target_width=160,
        )
        self.transformed = tf.image.resize_images(
            self.crop,
            output_size,
            method=tf.image.ResizeMethod.NEAREST_NEIGHBOR,
        )

    def process(self, sess, frame):
        """Run the preprocessing graph on ``frame`` in ``sess`` and return the result."""
        feed = {self.frame: frame}
        return sess.run(self.transformed, feed_dict=feed)

In [None]:
class Qnetwork:
    """Convolutional Q-network built with the TensorFlow 1.x layers API.

    Architecture (in graph-construction order): input placeholder scaled by
    1/255, three bias-free VALID-padded ReLU convolutions (``conv1``..``conv3``),
    a flatten, a 512-unit ReLU dense layer (``ful_con1``) and a linear output
    layer with one unit per action.

    Training tensors: ``action`` (int32 placeholder) selects, via a one-hot
    mask, the predicted Q-value that is compared to ``target_Q`` with a Huber
    loss; ``update`` is the Adam minimisation op for that loss.

    Args:
        STATE_SIZE: shape of one input state (without the batch dimension).
        ACTION_SIZE: number of output units.
        LEARNING_RATE: learning rate passed to the Adam optimizer.
        name: accepted for API compatibility; not used by the constructor.
    """

    # (layer name, filters, kernel size, stride) for the convolutional stack.
    _CONV_SPECS = (
        ('conv1', 32, 8, 4),
        ('conv2', 64, 4, 2),
        ('conv3', 64, 3, 1),
    )

    def __init__(self, STATE_SIZE, ACTION_SIZE, LEARNING_RATE, name='Qnetwork'):
        self.state_size = STATE_SIZE
        self.action_size = ACTION_SIZE
        self.learning_rate = LEARNING_RATE

        # Inputs: a batch of states and the regression targets for the chosen actions.
        batched_shape = [None, *self.state_size]
        self.input = tf.placeholder(dtype=tf.float32, shape=batched_shape, name='input')
        self.scaled = self.input / 255
        self.target_Q = tf.placeholder(dtype=tf.float32, shape=[None], name='target')

        # Convolutional stack; each layer is also exposed as self.conv1 / conv2 / conv3.
        features = self.scaled
        for layer_name, n_filters, kernel, stride in self._CONV_SPECS:
            features = tf.layers.conv2d(
                inputs=features,
                filters=n_filters,
                kernel_size=kernel,
                strides=stride,
                kernel_initializer=tf.variance_scaling_initializer(scale=2),
                padding='VALID',
                activation=tf.nn.relu,
                use_bias=False,
                name=layer_name,
            )
            setattr(self, layer_name, features)

        # Fully connected head.
        self.flatten = tf.contrib.layers.flatten(self.conv3)
        self.ful_con1 = tf.layers.dense(
            inputs=self.flatten,
            units=512,
            kernel_initializer=tf.variance_scaling_initializer(scale=2),
            activation=tf.nn.relu,
            name='ful_con1',
        )
        self.output = tf.layers.dense(
            inputs=self.ful_con1,
            kernel_initializer=tf.variance_scaling_initializer(scale=2),
            units=self.action_size,
            activation=None,
        )

        # Greedy action for each state in the batch.
        self.best_action = tf.argmax(self.output, axis=1)

        # Q-value of the action that was actually taken, picked out with a one-hot mask.
        self.action = tf.placeholder(shape=[None], dtype=tf.int32)
        action_mask = tf.one_hot(self.action, self.action_size, dtype=tf.float32)
        masked_q = tf.multiply(self.output, action_mask)
        self.predicted_Q = tf.reduce_sum(masked_q, axis=1)

        # Huber loss between targets and predictions, then its mean.
        huber = tf.losses.huber_loss(labels=self.target_Q, predictions=self.predicted_Q)
        self.loss = tf.reduce_mean(huber)

        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.update = self.optimizer.minimize(self.loss)

In [None]:
class ExperienceBuffer:
    """Fixed-capacity circular replay buffer storing one frame per transition.

    Each slot ``i`` holds the action taken, the frame observed after that
    action, the reward received and an end-of-life flag. States are rebuilt on
    demand by stacking ``stack_size`` consecutive frames.

    Args:
        size: capacity of the buffer, in frames.
        FRAME_HEIGHT: height of a stored frame.
        frame_width: width of a stored frame.
        STACK_SIZE: number of consecutive frames forming one state.
        BATCH_SIZE: number of transitions returned by ``get_batch``.
    """

    def __init__(self, size, FRAME_HEIGHT, frame_width, STACK_SIZE, BATCH_SIZE):
        self.size = size
        self.frame_height = FRAME_HEIGHT
        # Use the constructor argument (previously a module-level global was read instead).
        self.frame_width = frame_width
        self.stack_size = STACK_SIZE
        self.batch_size = BATCH_SIZE
        self.now_size = 0   # number of valid slots
        self.now_pos = 0    # next slot to be written
        self.actions = np.empty(self.size, dtype=np.int32)
        self.frames = np.empty((self.size, self.frame_height, self.frame_width), dtype=np.uint8)
        self.rewards = np.empty(self.size, dtype=np.float32)
        # np.bool_ instead of the np.bool alias, which no longer exists in NumPy >= 1.24.
        self.end_life = np.empty(self.size, dtype=np.bool_)

        # Pre-allocated mini-batch storage, reused (overwritten) by every get_batch() call.
        batch_shape = (self.batch_size, self.stack_size, self.frame_height, self.frame_width)
        self.states = np.empty(batch_shape, dtype=np.uint8)
        self.next_states = np.empty(batch_shape, dtype=np.uint8)
        self.indices = np.empty(self.batch_size, dtype=np.int32)

    def add(self, action, frame, reward, end_life):
        """Store one transition at the write position, overwriting the oldest when full."""
        self.actions[self.now_pos] = action
        self.frames[self.now_pos, ...] = frame
        self.rewards[self.now_pos] = reward
        self.end_life[self.now_pos] = end_life
        self.now_size = max(self.now_size, self.now_pos + 1)
        self.now_pos = (self.now_pos + 1) % self.size

    def get_indices(self):
        """Fill ``self.indices`` with ``batch_size`` randomly drawn valid slot indices.

        An index is rejected when the ``stack_size`` frames preceding it contain
        an end-of-life flag (the state would mix two lives) or when its frame
        window overlaps the write position (the frames are not contiguous in time).

        Raises:
            ValueError: if the buffer does not yet hold enough frames to sample from.
        """
        if self.now_size - 1 <= self.stack_size:
            raise ValueError(
                'ExperienceBuffer holds {} frame(s); more than {} are needed to sample'.format(
                    self.now_size, self.stack_size + 1))

        for i in range(self.batch_size):
            while True:
                # Lower bound stack_size guarantees a full window of earlier frames.
                index = np.random.randint(self.stack_size, self.now_size - 1)
                # do not accept frames from different life spans
                if self.end_life[index - self.stack_size:index].any():
                    continue
                # do not accept windows that straddle the write position
                if index >= self.now_pos and (index - self.stack_size) <= self.now_pos:
                    continue
                break
            self.indices[i] = index

    def get_batch(self):
        """Sample a mini-batch of transitions.

        Returns:
            Tuple ``(states, actions, rewards, next_states, end_life)``. ``states``
            and ``next_states`` have shape (batch, height, width, stack) and are
            transposed views of internal arrays that the next call overwrites.
        """
        self.get_indices()
        for i, j in enumerate(self.indices):
            self.states[i] = self.frames[j - self.stack_size:j, ...]
            self.next_states[i] = self.frames[j - self.stack_size + 1:j + 1, ...]
        return (
            np.transpose(self.states, axes=(0, 2, 3, 1)),
            self.actions[self.indices],
            self.rewards[self.indices],
            np.transpose(self.next_states, axes=(0, 2, 3, 1)),
            self.end_life[self.indices],
        )

In [None]:
def get_eps(count_frames, test, frame_start=50 * 1000, frame_end=1 * 1000 * 1000,
            eps_start=None, eps_end=None):
    """Return the exploration rate for a frame-based linear annealing schedule.

    The rate is ``eps_start`` before ``frame_start``, decreases linearly to
    ``eps_end`` between ``frame_start`` and ``frame_end``, and stays at
    ``eps_end`` afterwards.

    Args:
        count_frames: number of frames played so far.
        test: when truthy, 0 is returned regardless of the other arguments.
        frame_start: frame at which annealing begins. The default mirrors the
            FRAME_START value from the disabled schedule in the configuration cell.
        frame_end: frame at which annealing ends. The default mirrors the
            FRAME_END value from the disabled schedule in the configuration cell.
        eps_start: initial rate; ``None`` falls back to the module-level EPS_START.
        eps_end: final rate; ``None`` falls back to the module-level EPS_END.

    Returns:
        The exploration rate as a number.
    """
    if test:
        return 0

    # Resolve the module-level defaults lazily so explicit arguments never need them.
    if eps_start is None:
        eps_start = EPS_START
    if eps_end is None:
        eps_end = EPS_END

    if count_frames < frame_start:
        return eps_start
    if count_frames < frame_end:
        # Linear interpolation; frame_end > frame_start is guaranteed on this branch.
        progress = count_frames - frame_start
        return (progress * (eps_end - eps_start)) / (frame_end - frame_start) + eps_start
    return eps_end
'''    
def get_action(sess, count_frames, state, network, test=False):
    eps = get_eps(count_frames, test)
    
    if np.random.rand(1) < eps:
        action = np.random.randint(0, ACTION_SIZE)
    else:
        action = sess.run(network.best_action, feed_dict={network.input : state.reshape((1, *state.shape))})[0]
    return action, eps
'''

def get_action(sess, count_frames, state, network, eps):
    """Choose an action epsilon-greedily.

    With probability ``eps`` a uniformly random action index in
    [0, ACTION_SIZE) is returned; otherwise ``network.best_action`` is
    evaluated in ``sess`` on a batch containing only ``state``.

    ``count_frames`` is accepted but not used.
    """
    explore = np.random.rand(1) < eps
    if not explore:
        greedy_batch = sess.run(network.best_action, feed_dict={network.input: [state]})
        return greedy_batch[0]
    return np.random.randint(0, ACTION_SIZE)
        

In [None]:
'''
def env_reset(sess, env, lives_left, test=False):
    frame = env.reset()
    lives_left = 0
    if_life_lost = True
    
    if test:
        for i in range(np.random.randint(1, NO_OP_STEPS)):
            frame, _, _, _ = env.step(1)
    
    processed_frame = frame_processor.process(sess, frame)
    state = np.repeat(processed_frame, STACK_SIZE, axis=2)
    return state, if_life_lost

def env_step(sess, env, action, lives_left, state):
    next_frame, reward, done, info = env.step(action)
    if info['ale.lives'] < lives_left:
        if_life_lost = True
    else:
        if_life_lost = done
    lives_left = info['ale.lives']
    
    processed_next_frame = frame_processor.process(sess, next_frame)
    next_state = np.append(state[:, :, 1:], processed_next_frame, axis=2)
    return next_state, reward, done, lives_left, if_life_lost, processed_next_frame, next_frame
'''

In [None]:
# wrapper for environment

class GameWrapper():
    """Wraps a gym environment and maintains the current stacked-frame state.

    Args:
        env: environment exposing ``reset()`` and ``step(action)``; ``step`` must
            return an info dict containing the key ``'ale.lives'``.
        NO_OP_STEPS: exclusive upper bound on the random number of initial
            ``env.step(1)`` calls performed by ``reset(test=True)``.
        STACK_SIZE: number of frames stacked along the last axis of ``state``.
    """

    def __init__(self, env, NO_OP_STEPS, STACK_SIZE):
        self.env = env
        # Frame size comes from the module-level FRAME_HEIGHT / FRAME_WIDTH constants.
        self.frame_processor = ProcessFrame(FRAME_HEIGHT, FRAME_WIDTH)
        self.state = None
        self.lives_left = 0
        self.no_op_steps = NO_OP_STEPS
        self.stack_size = STACK_SIZE

    def reset(self, sess, test=False):
        """Reset the environment and rebuild ``self.state`` from the first frame.

        When ``test`` is true, a random number (at least 1, fewer than
        ``no_op_steps``) of ``env.step(1)`` calls is made first; this is skipped
        when ``no_op_steps`` is 1 or less because no valid count exists.

        Returns:
            ``True`` (the life-lost flag is always raised after a reset).
        """
        frame = self.env.reset()
        self.lives_left = 0
        if_life_lost = True

        # Use the values given to the constructor rather than module-level globals.
        if test and self.no_op_steps > 1:
            for _ in range(np.random.randint(1, self.no_op_steps)):
                frame, _, _, _ = self.env.step(1)

        processed_frame = self.frame_processor.process(sess, frame)

        # The initial state is the same processed frame repeated along the last axis.
        self.state = np.repeat(processed_frame, self.stack_size, axis=2)

        return if_life_lost

    def step(self, sess, action):
        """Apply ``action`` and slide the newest processed frame into ``self.state``.

        Returns:
            Tuple ``(reward, done, if_life_lost, processed_next_frame, next_frame)``.
            ``if_life_lost`` is true when the reported life count dropped, and
            otherwise equals ``done``.
        """
        next_frame, reward, done, info = self.env.step(action)

        if info['ale.lives'] < self.lives_left:
            if_life_lost = True
        else:
            if_life_lost = done

        self.lives_left = info['ale.lives']

        processed_next_frame = self.frame_processor.process(sess, next_frame)
        # Drop the oldest frame and append the newest one.
        next_state = np.append(self.state[:, :, 1:], processed_next_frame, axis=2)
        self.state = next_state

        return reward, done, if_life_lost, processed_next_frame, next_frame

In [None]:
# update target network variables
'''
def update_network_variables(sess, network_vars, target_network_vars):
    assigns = []
    for i, var in enumerate(network_vars):
        copy = target_network_vars[i].assign(var.value())
        assigns.append(copy)
    for var in assigns:
        sess.run(var)
'''

In [None]:
class TargetNetworkUpdater:
    """Copies the online network's variables into the target network.

    The assign ops are created once, at construction time, and reused on every
    update. Creating them inside ``update_network`` would add a fresh set of
    ops to the graph on each call, making the graph grow for the whole run.

    Args:
        network_vars: variables to copy from.
        target_network_vars: variables to copy into; paired with
            ``network_vars`` by position.
    """

    def __init__(self, network_vars, target_network_vars):
        self.n_vars = network_vars
        self.tn_vars = target_network_vars
        # One assign op per source variable, pairing variables by index.
        self._assign_ops = [
            self.tn_vars[i].assign(var.value())
            for i, var in enumerate(self.n_vars)
        ]

    def update_network(self, sess):
        """Run all pre-built assign ops in ``sess`` with a single ``run`` call."""
        sess.run(self._assign_ops)

In [None]:
# Build the whole TensorFlow graph from scratch: frame preprocessing (inside
# GameWrapper), the online network and the target network.
tf.reset_default_graph()

game = GameWrapper(env, NO_OP_STEPS, STACK_SIZE)

# The two networks are identical in structure; the variable scopes keep their
# variables apart so they can be selected by scope name below.
with tf.variable_scope('network'):
    network = Qnetwork(STATE_SIZE, ACTION_SIZE, LEARNING_RATE)
    
with tf.variable_scope('target_network'):
    target_network = Qnetwork(STATE_SIZE, ACTION_SIZE, LEARNING_RATE)

# Created after both networks so that they cover the variables defined above.
init = tf.global_variables_initializer()
saver = tf.train.Saver()
WRITER = tf.summary.FileWriter(os.path.join(SUMMARIES, RUN_ID))

network_variables = tf.trainable_variables(scope='network')
target_network_variables = tf.trainable_variables(scope='target_network')

# Copies network_variables into target_network_variables, pairing them by position.
network_updater = TargetNetworkUpdater(network_variables, target_network_variables)

In [None]:
# Scalars in TensorBoard.
# Values are computed in Python and fed through placeholders when the summary is run.

with tf.name_scope('Performance'):
    LOSS_PH = tf.placeholder(tf.float32, shape=None, name='loss_summary')
    LOSS_SUMMARY = tf.summary.scalar('loss', LOSS_PH)
    REWARD_PH = tf.placeholder(tf.float32, shape=None, name='reward_summary')
    REWARD_SUMMARY = tf.summary.scalar('reward', REWARD_PH)
    EVAL_SCORE_PH = tf.placeholder(tf.float32, shape=None, name='evaluation_summary')
    EVAL_SCORE_SUMMARY = tf.summary.scalar('evaluation_score', EVAL_SCORE_PH)

# Only loss and reward are merged; EVAL_SCORE_SUMMARY is run on its own after each evaluation.
PERFORMANCE_SUMMARIES = tf.summary.merge([LOSS_SUMMARY, REWARD_SUMMARY])


# Histograms in TensorBoard.

# Labels are matched to network_variables purely by position (LAYERS[i] <-> network_variables[i]).
# NOTE(review): this assumes the trainable variables are ordered as three conv kernels
# followed by kernel/bias pairs of the two dense layers - confirm against the graph.
LAYERS = ['conv1', 'conv2', 'conv3', 'dense1Value', 'dense1Bias', 'dense2Value', 'dense2Bias']

with tf.name_scope('Parameters'):
    ALL_PARAM_SUMMURIES = []
    for i, j in enumerate(LAYERS):
        with tf.name_scope('DQN/'):
            # Flatten each variable to 1-D before building its histogram.
            DQN_KERNEL = tf.summary.histogram(j, tf.reshape(network_variables[i], shape=[-1]))
        ALL_PARAM_SUMMURIES.extend([DQN_KERNEL])
PARAM_SUMMURIES = tf.summary.merge(ALL_PARAM_SUMMURIES)

In [None]:
def generate_gif(frame_count, frames_list, reward, path):
    """Upscale the given frames and save them as an animated GIF.

    The output file is ``<path>ATARI_frame_<frame_count>_reward_<reward>.gif``.
    ``path`` is used as a plain prefix, so a directory must end with a path
    separator.

    Args:
        frame_count: number embedded in the file name.
        frames_list: sequence of frames to write; it is NOT modified (an
            upscaled copy is built instead of overwriting the caller's list).
        reward: value embedded in the file name.
        path: prefix of the output file name.
    """
    # Nearest-neighbour (order=0) upscaling to 420x320x3, keeping the original value range.
    upscaled_frames = [
        resize(frame, (420, 320, 3), preserve_range=True, order=0).astype(np.uint8)
        for frame in frames_list
    ]

    imageio.mimsave(f'{path}{"ATARI_frame_{0}_reward_{1}.gif".format(frame_count, reward)}',
                    upscaled_frames, duration=1/30)

In [None]:
# TRAIN
#
# Alternates EVALUATE_FREQ frames of training with an EVALUATE_STEPS-frame
# evaluation until MAX_FRAMES frames have been played. After each evaluation a
# GIF of the first evaluation episode and a checkpoint are written to PATH.

experience = ExperienceBuffer(EXPERIENCE_SIZE, FRAME_HEIGHT, FRAME_WIDTH, STACK_SIZE, BATCH_SIZE)

with tf.Session() as sess:
    sess.run(init)

    frame_count = 0
    episode_count = 0
    episodes_rewards = []
    losses = []

    while frame_count < MAX_FRAMES:

        # TRAIN

        epoch_frame = 0
        while epoch_frame < EVALUATE_FREQ:

            episode_reward = 0
            if_life_lost = game.reset(sess)

            for i in range(MAX_FRAMES_PER_EP):
                # Explore with the current (decaying) eps while training.
                # The original passed 0 here, which disabled exploration entirely.
                action = get_action(sess, frame_count, game.state, network, eps)

                reward, done, if_life_lost, processed_next_frame, next_frame = game.step(sess, action)

                frame_count += 1
                episode_reward += reward
                epoch_frame += 1

                experience.add(action, processed_next_frame[:, :, 0], reward, if_life_lost)

                # learning: one gradient step every 4 frames once the warm-up is over
                if frame_count % 4 == 0 and frame_count > START_TRAINING:
                    states, actions, rewards, next_states, end_life = experience.get_batch()

                    # Double DQN target: the online network picks the next action,
                    # the target network supplies its value.
                    best_actions = sess.run(network.best_action, feed_dict={network.input : next_states})
                    q_values = sess.run(target_network.output, feed_dict={target_network.input : next_states})
                    double_q_values = q_values[range(BATCH_SIZE), best_actions]
                    # No bootstrapping from transitions that ended a life.
                    target_q = rewards + (DISCOUNT_RATE * double_q_values * (1 - end_life))
                    loss, _ = sess.run([network.loss, network.update],
                                       feed_dict={network.input : states,
                                                  network.action : actions,
                                                  network.target_Q : target_q})
                    losses.append(loss)

                # update target network and eps
                if frame_count % UPDATE_NETWORK_FREQ == 0 and frame_count > START_TRAINING:
                    network_updater.update_network(sess)
                    eps = max(eps * EPS_FACTOR, EPS_END)

                if done:
                    done = False
                    break

            episodes_rewards.append(episode_reward)
            episode_count += 1

            if episode_count % 10 == 0:
                # scalar summaries for tensorboard
                if frame_count > 0:
                    summ = sess.run(PERFORMANCE_SUMMARIES,
                                    feed_dict={LOSS_PH:np.mean(losses),
                                               REWARD_PH:np.mean(episodes_rewards[-100:])})

                    WRITER.add_summary(summ, frame_count)
                    losses = []


                # histogram summaries for tensorboard
                summ_param = sess.run(PARAM_SUMMURIES)
                WRITER.add_summary(summ_param, frame_count)

                print('Episodes =', episode_count, 'Frames =', frame_count, 'Mean reward =',
                      np.mean(episodes_rewards[-100:]), 'Eps =', eps)


        # EVALUATE

        done = True
        gif = True              # record frames of the first evaluation episode only
        frames_list = []
        eval_rewards = []
        eval_frame_count = 0

        for i in range(EVALUATE_STEPS):
            if done:
                episode_reward = 0
                done = False
                if_life_lost = game.reset(sess, test=True)

            if if_life_lost:
                # Action 1 is sent right after a reset or a lost life.
                action = 1
            else:
                # Evaluate the greedy policy (eps = 0), as the final test cell does.
                # The original passed the training eps here.
                action = get_action(sess, frame_count, game.state, network, 0)

            reward, done, if_life_lost, processed_next_frame, next_frame = game.step(sess, action)
            eval_frame_count += 1
            episode_reward += reward

            if gif:
                frames_list.append(next_frame)

            if done:
                eval_rewards.append(episode_reward)
                gif = False

        # If no episode finished within EVALUATE_STEPS, score the unfinished one so
        # that the mean is defined and eval_rewards[0] below does not raise IndexError.
        if not eval_rewards:
            eval_rewards.append(episode_reward)

        print('      EVALUATION SCORE:\n', '      ', np.mean(eval_rewards))

        generate_gif(frame_count, frames_list, eval_rewards[0], PATH + '/')

        saver.save(sess, PATH + '/model', global_step=frame_count)

        summ = sess.run(EVAL_SCORE_SUMMARY, feed_dict={EVAL_SCORE_PH : np.mean(eval_rewards)})
        WRITER.add_summary(summ, frame_count)

In [None]:
# TEST
#
# Plays one greedy episode with the most recent checkpoint of this run and
# saves it as a GIF under gif_path.

gif_path = 'GIF/'
os.makedirs(gif_path, exist_ok=True)

with tf.Session() as sess:
    # Restore into the graph built earlier in this notebook, using the Saver created
    # with it. The previous code imported a hardcoded '.meta' file, which loads a second
    # copy of the graph; NOTE(review): that presumably left the variables behind
    # `network` unrestored - confirm if the import is ever reinstated.
    checkpoint = tf.train.latest_checkpoint(PATH + '/')
    if checkpoint is None:
        raise FileNotFoundError('No checkpoint found in ' + PATH)
    saver.restore(sess, checkpoint)

    frames_list = []
    total_reward = 0
    # GameWrapper owns the stacked state; the env_reset helper called here before
    # exists only inside a disabled string literal.
    if_life_lost = game.reset(sess)

    # Cap the episode length like the training loop does, so an agent that never
    # finishes the episode cannot loop forever.
    for _ in range(MAX_FRAMES_PER_EP):
        if if_life_lost:
            action = 1
        else:
            # eps = 0 -> greedy action; get_action does not use the frame counter.
            action = get_action(sess, 0, game.state, network, 0)

        #print(action, end=' ')

        reward, done, if_life_lost, processed_next_frame, next_frame = game.step(sess, action)
        total_reward += reward
        frames_list.append(next_frame)
        if done:
            break

    env.close()
    print('Reward:', total_reward)
    generate_gif(RUN_NUM, frames_list, total_reward, gif_path)
    print('Gif generated')