In [None]:
import gym
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from skimage.color import rgb2gray
from skimage import transform
import keras
from collections import deque
from IPython import display
import imageio
from skimage.transform import resize
import os

In [None]:
# --- Environment ---------------------------------------------------------
env = gym.make('BreakoutDeterministic-v4')
frame = env.reset()  # first raw observation, displayed in the next cell
# print(env.unwrapped.get_action_meanings())

# --- Network / learning hyper-parameters ---------------------------------
frame_height, frame_width = 84, 84
state_size = [frame_height, frame_width, 4]
action_size = env.action_space.n
learning_rate = 1e-5
discount_rate = 0.99

# e-greedy with frames
'''
eps_start = 1
eps_end1 = 0.1
eps_end2 = 0.01
frame_start1 = 1 * 1000
frame_end1 = 5 * 1000
'''
# --- Exploration: eps is multiplied by eps_factor in the training loop,
# --- never dropping below eps_end
eps_start = 1
eps_end = 0.1
eps_factor = 0.98
eps = eps_start

# --- Training schedule (all counts are in frames) ------------------------
batch_size = 32
max_frames = 5_000_000
max_frames_per_ep = 18_000
experience_size = 800_000      # capacity of the replay buffer
stack_size = 4                 # frames stacked into one state
no_op_steps = 10               # upper bound used by env_reset in test mode
update_network_freq = 10000    # target-network sync period
evaluate_freq = 200_000        # training frames between evaluations
evaluate_steps = 10000         # frames played per evaluation

# --- Output locations ----------------------------------------------------
RUN_NUM = 11
SUMMARIES = 'summaries'
RUN_ID = f'run_{RUN_NUM}'
PATH = os.path.join('results', RUN_ID)
for directory in (PATH, os.path.join(SUMMARIES, RUN_ID)):
    os.makedirs(directory, exist_ok=True)
print(PATH)

In [None]:
# Sanity check of the environment: observation space, number of actions,
# and the first raw frame returned by env.reset().
for env_fact in (env.observation_space, env.action_space.n):
    print(env_fact)
plt.imshow(frame)
plt.show()

In [None]:
def process(frame):
    """Turn a raw RGB frame into a (frame_height, frame_width, 1) uint8 image.

    Steps: grayscale -> crop (rows 4:-15, columns 5:-5) -> resize -> quantize
    to 0..255.

    The previous version divided the output of ``rgb2gray`` by 255 although,
    per the scikit-image documentation, ``rgb2gray`` already returns floats
    in [0, 1] for uint8 input. The resulting values (<= 1/255) were then
    written into ``ExperienceBuffer.frames``, which is a uint8 array, so every
    stored frame was truncated to all zeros. Returning uint8 in 0..255 keeps
    the replay buffer lossless and makes the states used for acting and for
    training identical.

    NOTE(review): ``Qnetwork`` feeds its input placeholder straight into
    ``conv1`` without rescaling, so the network now sees values in 0..255.

    Args:
        frame: RGB image from the environment; assumed to be uint8
            (TODO confirm if another env/wrapper is used).

    Returns:
        np.ndarray of dtype uint8 and shape (frame_height, frame_width, 1).
    """
    gray = rgb2gray(frame)  # float image, [0, 1] for uint8 input
    crop = gray[4:-15, 5:-5]
    resized = transform.resize(crop, [frame_height, frame_width, 1])
    # Round before casting; the clip guards against interpolation overshoot.
    return np.clip(np.rint(resized * 255.0), 0, 255).astype(np.uint8)

In [None]:
class Qnetwork:
    """Q-value network built as TF1 graph ops inside ``tf.variable_scope(name)``.

    Layout: three bias-free VALID convolutions with ReLU, a flatten, a
    512-unit ReLU dense layer and a linear dense layer with ``action_size``
    outputs.

    Tensors exposed as attributes:
        input:        float32 placeholder of shape [None, *state_size].
        target_Q:     float32 placeholder of shape [None].
        action:       int32 placeholder of shape [None].
        output:       per-action values from the last dense layer.
        best_action:  argmax of ``output`` along axis 1.
        predicted_Q:  the entry of ``output`` selected by ``action``.
        loss:         mean of the Huber loss between target_Q and predicted_Q.
        update:       Adam minimisation step for ``loss``.
    """

    def __init__(self, state_size, action_size, learning_rate, name='Qnetwork'):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate

        def _scaled_init():
            # A fresh initializer per layer, exactly as each layer had its own.
            return tf.variance_scaling_initializer(scale=2)

        def _conv(inputs, filters, kernel_size, strides, layer_name):
            # All convolutions share everything except these four settings.
            return tf.layers.conv2d(
                inputs=inputs, filters=filters, kernel_size=kernel_size,
                strides=strides, kernel_initializer=_scaled_init(),
                padding='VALID', activation=tf.nn.relu, use_bias=False,
                name=layer_name)

        with tf.variable_scope(name):
            self.input = tf.placeholder(
                dtype=tf.float32, shape=[None, *self.state_size], name='input')
            self.target_Q = tf.placeholder(
                dtype=tf.float32, shape=[None], name='target')

            self.conv1 = _conv(self.input, 32, 8, 4, 'conv1')
            self.conv2 = _conv(self.conv1, 64, 4, 2, 'conv2')
            self.conv3 = _conv(self.conv2, 64, 3, 1, 'conv3')

            self.flatten = tf.contrib.layers.flatten(self.conv3)

            self.ful_con1 = tf.layers.dense(
                inputs=self.flatten, units=512,
                kernel_initializer=_scaled_init(),
                activation=tf.nn.relu, name='ful_con1')

            #self.ful_con2 = tf.layers.dense(inputs=self.ful_con1, units=128,
            #                        activation=tf.nn.relu, name='ful_con2')

            self.output = tf.layers.dense(
                inputs=self.ful_con1,
                kernel_initializer=_scaled_init(),
                units=self.action_size, activation=None)

            self.best_action = tf.argmax(self.output, axis=1)

            # Select the value of the action that was actually taken.
            self.action = tf.placeholder(shape=[None], dtype=tf.int32)
            taken_mask = tf.one_hot(self.action, action_size, dtype=tf.float32)
            self.predicted_Q = tf.reduce_sum(
                tf.multiply(self.output, taken_mask), axis=1)

            huber = tf.losses.huber_loss(
                labels=self.target_Q, predictions=self.predicted_Q)
            self.loss = tf.reduce_mean(huber)

            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
            self.update = self.optimizer.minimize(self.loss)

In [None]:
class ExperienceBuffer:
    """Fixed-capacity ring buffer of single frames with per-step metadata.

    Slot ``k`` holds what one call to ``add`` received: the action, the frame
    passed together with it (the training loop passes the frame observed
    *after* the action), the reward and the end-of-life flag. Stacked states
    are rebuilt from consecutive frames when a batch is sampled, so each frame
    is stored only once.

    For a sampled slot ``j`` with stack size ``S``:
        state      = frames[j-S   : j]      (ends just before the action)
        next_state = frames[j-S+1 : j+1]    (ends with the frame it produced)
    and the returned action / reward / flag are those of slot ``j``.

    Fixes relative to the previous version:
      * the state pair was shifted one step into the past relative to the
        returned action/reward/flag, so every transition was misaligned;
      * ``now_size`` stayed at ``size - 1`` after the buffer wrapped;
      * ``np.bool`` (removed in NumPy 1.24) replaced by the builtin ``bool``.
    """

    def __init__(self, size, frame_height, frame_width, stack_size, batch_size):
        self.size = size
        self.frame_height = frame_height
        self.frame_width = frame_width
        self.stack_size = stack_size
        self.batch_size = batch_size
        self.now_size = 0  # number of valid slots
        self.now_pos = 0   # slot the next add() writes to
        self.actions = np.zeros(self.size, dtype=np.uint8)
        self.frames = np.zeros((self.size, frame_height, frame_width), dtype=np.uint8)
        self.rewards = np.zeros(self.size, dtype=np.float32)
        self.end_life = np.zeros(self.size, dtype=bool)

        # Scratch arrays reused by every get_batch() call.
        self.states = np.zeros((self.batch_size, self.stack_size, frame_height,
                                frame_width), dtype=np.uint8)
        self.next_states = np.zeros((self.batch_size, self.stack_size, frame_height,
                                     frame_width), dtype=np.uint8)
        self.indices = np.zeros(self.batch_size, dtype=np.int32)

    def add(self, action, frame, reward, end_life):
        """Write one step into the current slot and advance the write position."""
        self.actions[self.now_pos] = action
        self.frames[self.now_pos, ...] = frame
        self.rewards[self.now_pos] = reward
        self.end_life[self.now_pos] = end_life
        self.now_pos = (self.now_pos + 1) % self.size
        # Saturates at capacity; max(now_size, now_pos) lost the last slot
        # once now_pos wrapped back to 0.
        self.now_size = min(self.now_size + 1, self.size)

    def get_indices(self):
        """Fill ``self.indices`` with ``batch_size`` valid slot indices.

        A slot is rejected when the frames making up its state contain an
        end-of-life flag, or when its frame window spans the write position
        (the boundary between the newest and the oldest data).

        Raises:
            ValueError: if fewer than ``stack_size + 1`` steps are stored.
        """
        if self.now_size <= self.stack_size:
            raise ValueError(
                'ExperienceBuffer needs more than {} stored steps to sample, '
                'has {}'.format(self.stack_size, self.now_size))
        # do not accept frames from different life spans
        for i in range(self.batch_size):
            while True:
                index = np.random.randint(self.stack_size, self.now_size)
                if self.end_life[index - self.stack_size:index].any():
                    continue
                if index >= self.now_pos and (index - self.stack_size) <= self.now_pos:
                    continue
                break
            self.indices[i] = index

    def get_batch(self):
        """Sample a batch of transitions.

        Returns:
            (states, actions, rewards, next_states, end_life); the two state
            arrays have shape (batch, height, width, stack) and are transposed
            views of internal scratch arrays that the next call overwrites.
        """
        self.get_indices()
        for i, j in enumerate(self.indices):
            self.states[i] = self.frames[j - self.stack_size:j, ...]
            self.next_states[i] = self.frames[j - self.stack_size + 1:j + 1, ...]
        return (np.transpose(self.states, axes=(0, 2, 3, 1)),
                self.actions[self.indices],
                self.rewards[self.indices],
                np.transpose(self.next_states, axes=(0, 2, 3, 1)),
                self.end_life[self.indices])

In [None]:
'''
def get_eps1(count_frames, test):
    eps = 0
    if test:
        eps = 0
    elif count_frames < frame_start1:
        eps = eps_start
    elif count_frames >= frame_start1 and count_frames < frame_end1:
        eps = ((count_frames - frame_start1) * (eps_end1 - eps_start)) / (frame_end1 - frame_start1) + eps_start
    else:
        eps = eps_end1
    return eps
'''

def get_action(sess, count_frames, state, network, test, eps):
    """Pick an action epsilon-greedily and return it as a plain Python int.

    With probability ``eps`` a uniformly random action in
    ``[0, action_size)`` is returned; otherwise the argmax of
    ``network.output`` evaluated on ``state``.

    The greedy branch used to return a shape-(1,) array while the random
    branch returned a scalar; both now return ``int`` so callers can pass the
    result to ``env.step`` or store it without special-casing.

    Args:
        sess: session used to evaluate ``network.output``.
        count_frames: unused; kept for the commented-out frame-based schedule.
        state: a single state; a leading batch axis of size 1 is added here.
        network: object exposing ``input`` and ``output`` tensors.
        test: unused; kept for interface compatibility.
        eps: exploration probability.

    Returns:
        int: index of the chosen action.
    """
    #eps = get_eps(count_frames, test)

    if np.random.rand() < eps:
        return int(np.random.randint(0, action_size))

    batch_of_one = state.reshape((1, *state.shape))
    q_values = sess.run(network.output, feed_dict={network.input: batch_of_one})
    return int(np.argmax(q_values, axis=1)[0])
        

In [None]:
def env_reset(env, lives_left, test=False):
    """Reset ``env`` and build the initial stacked state.

    Args:
        env: environment exposing ``reset()`` and ``step(action)``.
        lives_left: accepted for call-site symmetry; not used.
        test: when true, take a random number of steps (drawn with
            ``np.random.randint(1, no_op_steps)``) with action 1 before
            building the state.

    Returns:
        (state, if_life_lost): ``state`` is the processed frame repeated
        ``stack_size`` times along axis 2; ``if_life_lost`` is always True.
    """
    frame = env.reset()

    if test:
        warmup_steps = np.random.randint(1, no_op_steps)
        for _step in range(warmup_steps):
            frame, _reward, _done, _info = env.step(1)

    first_frame = process(frame)
    return np.repeat(first_frame, stack_size, axis=2), True

def env_step(env, action, lives_left, state):
    """Advance ``env`` by one action and roll the stacked state forward.

    Args:
        env: environment whose ``step`` returns (frame, reward, done, info)
            with an ``'ale.lives'`` entry in ``info``.
        action: action forwarded to ``env.step``.
        lives_left: life count known before this step.
        state: current stacked state; its oldest channel is dropped.

    Returns:
        (next_state, reward, done, lives_left, if_life_lost,
        processed_next_frame, next_frame), where ``lives_left`` is the
        updated count and ``if_life_lost`` is True when the count dropped,
        otherwise equal to ``done``.
    """
    next_frame, reward, done, info = env.step(action)

    lives_now = info['ale.lives']
    if_life_lost = True if lives_now < lives_left else done

    processed_next_frame = process(next_frame)
    # Drop the oldest channel and append the newest frame as the last one.
    next_state = np.concatenate(
        (state[:, :, 1:], processed_next_frame), axis=2)

    return (next_state, reward, done, lives_now, if_life_lost,
            processed_next_frame, next_frame)

In [None]:
# update target network variables

# Assign ops already built, keyed by the identity of the variable lists.
# Each entry also keeps the variables themselves alive so their ids stay unique.
_ASSIGN_OPS_CACHE = {}


def update_network_variables(sess, network_vars, target_network_vars):
    """Copy every variable in ``network_vars`` into ``target_network_vars``.

    The previous version called ``assign`` on every invocation, which adds a
    fresh set of ops to the graph each time the target network is synced, and
    ran them one ``sess.run`` at a time. The ops are now built once per pair
    of variable lists and executed in a single ``sess.run``.

    Args:
        sess: session used to run the assign ops.
        network_vars: source variables.
        target_network_vars: destination variables, matched by position.

    Raises:
        ValueError: if the two lists differ in length.
    """
    network_vars = list(network_vars)
    target_network_vars = list(target_network_vars)
    if len(network_vars) != len(target_network_vars):
        raise ValueError(
            'variable lists differ in length: {} vs {}'.format(
                len(network_vars), len(target_network_vars)))

    key = (tuple(id(v) for v in network_vars),
           tuple(id(v) for v in target_network_vars))
    cached = _ASSIGN_OPS_CACHE.get(key)
    if cached is None:
        assigns = [target.assign(source.value())
                   for source, target in zip(network_vars, target_network_vars)]
        cached = (assigns, network_vars, target_network_vars)
        _ASSIGN_OPS_CACHE[key] = cached

    assigns = cached[0]
    if assigns:
        sess.run(assigns)

In [None]:
# Start from an empty default graph so re-running this cell does not stack
# a second copy of the networks on top of the first.
tf.reset_default_graph()

# Online network: the one that is trained and used to choose actions.
with tf.variable_scope('network'):
    network = Qnetwork(state_size=state_size,
                       action_size=action_size,
                       learning_rate=learning_rate)

# Target network: same architecture under its own variable scope.
with tf.variable_scope('target_network'):
    target_network = Qnetwork(state_size=state_size,
                              action_size=action_size,
                              learning_rate=learning_rate)

init = tf.global_variables_initializer()
saver = tf.train.Saver()
WRITER = tf.summary.FileWriter(os.path.join(SUMMARIES, RUN_ID))

# Trainable variables of each network, in creation order.
network_variables, target_network_variables = (
    tf.trainable_variables(scope=scope_name)
    for scope_name in ('network', 'target_network')
)

In [None]:
# scalars in tensorboard
# Loss and reward are computed in Python and fed through placeholders.

with tf.name_scope('Performance'):
    LOSS_PH = tf.placeholder(tf.float32, shape=None, name='loss_summary')
    LOSS_SUMMARY = tf.summary.scalar('loss', LOSS_PH)
    REWARD_PH = tf.placeholder(tf.float32, shape=None, name='reward_summary')
    REWARD_SUMMARY = tf.summary.scalar('reward', REWARD_PH)

PERFORMANCE_SUMMARIES = tf.summary.merge([LOSS_SUMMARY, REWARD_SUMMARY])


# histograms in tensorboard
# One histogram per entry of LAYERS; entry k labels network_variables[k].

LAYERS = [
    'conv1', 'conv2', 'conv3',
    'dense1Value', 'dense1Bias',
    'dense2Value', 'dense2Bias',
]

ALL_PARAM_SUMMURIES = []
with tf.name_scope('Parameters'):
    for position, layer_name in enumerate(LAYERS):
        with tf.name_scope('DQN/'):
            flat_values = tf.reshape(network_variables[position], shape=[-1])
            DQN_KERNEL = tf.summary.histogram(layer_name, flat_values)
        ALL_PARAM_SUMMURIES.append(DQN_KERNEL)
PARAM_SUMMURIES = tf.summary.merge(ALL_PARAM_SUMMURIES)

In [None]:
def generate_gif(frame_count, frames_list, reward, path):
    """Upscale the collected frames and save them as a GIF.

    Every entry of ``frames_list`` is replaced in place by a uint8 copy
    resized to (420, 320, 3) with ``order=0`` and ``preserve_range=True``.
    The file name is ``ATARI_frame_<frame_count>_reward_<reward>.gif``,
    appended directly to ``path`` (no separator is inserted).

    Args:
        frame_count: number used in the file name.
        frames_list: list of frames; modified in place.
        reward: value used in the file name.
        path: prefix the file name is appended to.
    """
    for position in range(len(frames_list)):
        enlarged = resize(frames_list[position], (420, 320, 3),
                          preserve_range=True, order=0)
        frames_list[position] = enlarged.astype(np.uint8)

    file_name = "ATARI_frame_{0}_reward_{1}.gif".format(frame_count, reward)
    imageio.mimsave(f'{path}{file_name}', frames_list, duration=1/30)

In [None]:
# TRAIN
#
# Alternates between a training phase of at least `evaluate_freq` frames and
# an evaluation phase of `evaluate_steps` frames until `max_frames` is reached.
#
# Fixes relative to the previous version of this cell:
#   * `state` is advanced to `next_state` after every step; before, every
#     action in an episode was chosen from the episode's very first state.
#   * the greedy action computed during evaluation is assigned to `action`;
#     before, the result of get_action() was discarded.
#   * evaluation no longer raises IndexError / reports NaN when no episode
#     finishes within `evaluate_steps`.
#   * the loss summary is skipped when no learning step has produced a loss.

experience = ExperienceBuffer(experience_size, frame_height, frame_width, stack_size, batch_size)

with tf.Session() as sess:
    sess.run(init)

    frame_count = 0
    episode_count = 0
    episodes_rewards = []
    losses = []

    while frame_count < max_frames:

        # TRAIN

        epoch_frame = 0
        while epoch_frame < evaluate_freq:

            episode_reward = 0
            lives_left = 0
            state, if_life_lost = env_reset(env, lives_left)
            for i in range(max_frames_per_ep):
                action = get_action(sess, frame_count, state, network, False, eps)

                next_state, reward, done, lives_left, if_life_lost, processed_next_frame, next_frame = env_step(env, action, lives_left, state)
                # The next action must be chosen from the state just observed.
                state = next_state

                frame_count += 1
                episode_reward += reward
                epoch_frame += 1

                experience.add(action, processed_next_frame[:, :, 0], reward, if_life_lost)

                # learning: every 4th frame, once eps has started to decay
                if frame_count % 4 == 0 and eps < eps_start:
                    states, actions, rewards, next_states, end_life = experience.get_batch()

                    # The online network picks the next action, the target
                    # network supplies the value of that action.
                    best_actions = sess.run(network.best_action, feed_dict={network.input : next_states})
                    q_values = sess.run(target_network.output, feed_dict={target_network.input : next_states})
                    double_q_values = q_values[range(batch_size), best_actions]
                    # No bootstrapping past a step flagged as end of life.
                    target_q = rewards + (discount_rate * double_q_values * (1 - end_life))
                    loss, _ = sess.run([network.loss, network.update],
                                       feed_dict={network.input : states,
                                                  network.action : actions,
                                                  network.target_Q : target_q})
                    losses.append(loss)

                # update target network and eps
                if frame_count % update_network_freq == 0 and frame_count > 0:
                    update_network_variables(sess, network_variables, target_network_variables)
                    eps = max(eps * eps_factor, eps_end)

                if done:
                    done = False
                    break

            episodes_rewards.append(episode_reward)
            episode_count += 1

            if episode_count % 10 == 0:
                # scalar summuries for tensorboard
                if eps < eps_start and losses:
                    summ = sess.run(PERFORMANCE_SUMMARIES,
                                    feed_dict={LOSS_PH:np.mean(losses),
                                               REWARD_PH:np.mean(episodes_rewards[-100:])})

                    WRITER.add_summary(summ, frame_count)
                    losses = []


                # histogram summuries for tensorboard
                summ_param = sess.run(PARAM_SUMMURIES)
                WRITER.add_summary(summ_param, frame_count)

                print('Episodes =', episode_count, 'Frames =', frame_count, 'Mean reward =',
                      np.mean(episodes_rewards[-10:]), 'Eps =', eps)

                saver.save(sess, PATH + '/model', global_step=frame_count)


        # EVALUATE

        done = True
        gif = True          # record frames of the first evaluation episode only
        frames_list = []
        eval_rewards = []
        eval_frame_count = 0

        for i in range(evaluate_steps):
            if done:
                lives_left = 0
                episode_reward = 0
                done = False
                state, if_life_lost = env_reset(env, lives_left)

            if if_life_lost:
                # Action 1 is sent after a reset or a lost life.
                action = 1
            else:
                action = get_action(sess, 0, state, network, False, 0)

            next_state, reward, done, lives_left, if_life_lost, processed_next_frame, next_frame = env_step(env, action, lives_left, state)
            state = next_state
            eval_frame_count += 1
            episode_reward += reward

            if gif:
                frames_list.append(next_frame)

            if done:
                eval_rewards.append(episode_reward)
                gif = False

        if not eval_rewards:
            # No episode finished within evaluate_steps: report the partial one.
            eval_rewards.append(episode_reward)

        print('      EVALUATION SCORE:\n', '      ', np.mean(eval_rewards))

        generate_gif(frame_count, frames_list, eval_rewards[0], PATH + '/')

        saver.save(sess, PATH + '/model', global_step=frame_count)

In [None]:
# TEST
#
# Plays one greedy episode with the most recent checkpoint and saves it as a
# GIF.
#
# Fixes relative to the previous version of this cell:
#   * `state` is advanced to `next_state` after every step; before, every
#     greedy action was computed from the initial state.
#   * the weights are restored with the `saver` created together with the
#     graph instead of `tf.train.import_meta_graph` on a hard-coded
#     'model-1029685.meta'. That file name only exists for one particular
#     run, and importing the meta graph adds a second copy of the graph to
#     the one that `network` already lives in.
#   * a missing checkpoint is reported explicitly.

gif_path = 'GIF/'
os.makedirs(gif_path, exist_ok=True)

with tf.Session() as sess:
    checkpoint = tf.train.latest_checkpoint(PATH + '/')
    if checkpoint is None:
        raise FileNotFoundError('No checkpoint found in ' + PATH + '/')
    saver.restore(sess, checkpoint)

    frames_list = []
    total_reward = 0
    lives_left = 0
    state, if_life_lost = env_reset(env, lives_left)

    while True:
        if if_life_lost:
            # Action 1 is sent after a reset or a lost life.
            action = 1
        else:
            action = get_action(sess, 0, state, network, True, 0)
            action = int(action)
        print(action, end=' ')
        next_state, reward, done, lives_left, if_life_lost, processed_next_frame, next_frame = env_step(env, action, lives_left, state)
        state = next_state
        total_reward += reward
        frames_list.append(next_frame)
        if done:
            break

    env.close()
    print('Reward:', total_reward)
    generate_gif(RUN_NUM, frames_list, total_reward, gif_path)
    print('Gif generated')