In [27]:
import os
import random
from collections import deque
import gym
import cv2
import tqdm
import numpy as np
from PIL import Image
import tensorflow as tf
import keras
tf.__version__, keras.__version__

('1.3.0', '2.0.8')

In [28]:
from keras.models import Model
from keras.layers import Input, Conv2D, Flatten, Dense

In [29]:
import ipywidgets
from io import BytesIO
def to_png(a):
    """Encode an image array as PNG and return the encoded bytes.

    The array is handed to ``Image.fromarray`` and written into an in-memory
    buffer, which is always closed before returning.
    """
    buffer = BytesIO()
    try:
        picture = Image.fromarray(a)
        picture.save(buffer, 'png')
        return buffer.getvalue()
    finally:
        buffer.close()

In [39]:
# Shell escape: recursively delete the `log` directory so that a new run starts
# with no previous log files (the configuration cell re-creates the directory).
! rm -rf log

In [40]:
# ---- Environment / preprocessing -------------------------------------------
ENV_ID = 'Pong-v0'
RESIZE_WIDTH, RESIZE_HEIGHT = (84, 84)  # size every frame is resized to
AGENT_HISTORY_LENGTH = 4  # number of most recent frames experienced by the agent that are given as input to the Q network
ACTION_REPEAT = 1  # a new action is selected every ACTION_REPEAT environment steps

# ---- Q-learning hyper-parameters -------------------------------------------
γ = 0.99  # discount factor
INIT_ϵ = 1.  # initial exploration rate
FINAL_ϵ = .1  # exploration rate once annealing has finished
FINAL_ϵ_FRAME = 1000000  # number of frames over which ϵ is annealed linearly
REPLAY_START_SIZE = 20000  # 50000 in the original paper
REPLAY_MEMORY_SIZE = 500000  # 1000000 in the original paper
BATCH_SIZE = 32
TARGET_NET_UPDATE_FREQ = 10000  # steps between copies of the Q weights into the target network
UPDATE_FREQ = 4  # steps between two gradient updates
LEARNING_RATE = 0.00025
GRAD_MOMENTUM = 0.95
SQUARED_GRAD_MOMENTUM = 0.95  # paper value for the squared-gradient momentum of RMSProp
MIN_SQUARED_GRAD = 0.01
NO_OP_MAX = 30  # upper bound for the random number of no-op steps at episode start
NUM_EVAL = 30  # The trained agents were evaluated by playing each game 30 times

# ---- Checkpointing / logging -----------------------------------------------
SAVE_FREQ = 100  # a checkpoint is written when the episode count is a multiple of this
TRAINING = True
SAVE_NN_PATH = 'nn/%s'%ENV_ID
SAVE_LOG_PATH = 'log/%s'%ENV_ID

# exist_ok=True avoids the check-then-create race of `if not os.path.exists(...)`
# and makes re-running this cell harmless.
os.makedirs(SAVE_NN_PATH, exist_ok=True)
os.makedirs(SAVE_LOG_PATH, exist_ok=True)

EPISODES = 15000  # Number of episodes the agent plays

In [41]:
class DQN_Agent:
    """Deep Q-Network agent built on a TF1 graph with Keras layers.

    The agent owns the online Q network, a target network, the replay memory,
    the ϵ-greedy exploration schedule and the TensorBoard summaries.
    Hyper-parameters are read from the module-level constants (``INIT_ϵ``,
    ``REPLAY_START_SIZE``, ``BATCH_SIZE``, ...).
    """

    def __init__(self, env, restore=False, episode=None):
        """Build both networks, the training ops and the TF session.

        Args:
            env: environment object; only ``env.action_space.n`` is read.
            restore: when True, load the checkpoint written for ``episode``.
            episode: episode number of the checkpoint to restore.
        """
        self.action_dim = env.action_space.n  # number of discrete actions
        self.ϵ = INIT_ϵ
        # Amount ϵ shrinks per action selection so that it reaches FINAL_ϵ
        # after FINAL_ϵ_FRAME frames.
        self.ϵ_step = ((INIT_ϵ - FINAL_ϵ) / FINAL_ϵ_FRAME) * ACTION_REPEAT
        self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)
        self.T = 0  # global step counter

        # Online Q network
        self.s, self.q_values, q_network = self.create_Q_network(nn_name='q')
        q_network_weights = q_network.trainable_weights

        # Target network
        self.st, self.target_q_values, target_network = self.create_Q_network(nn_name='target')
        target_network_weights = target_network.trainable_weights

        # Ops copying every Q weight into the matching target weight
        self.update_target_network = [
            target_weight.assign(q_weight)
            for target_weight, q_weight in zip(target_network_weights, q_network_weights)
        ]

        # Loss function and gradient-descent op
        self.act, self.y, self.loss, self.grad_des = self.loss_function(q_network_weights)

        self.sess = tf.InteractiveSession()

        # The saver only stores the Q network weights (max_to_keep=0 keeps all checkpoints)
        self.saver = tf.train.Saver(q_network_weights, max_to_keep=0)

        self.sess.run(tf.global_variables_initializer())
        # Start the target network from the Q network weights
        self.sess.run(self.update_target_network)

        if restore:
            self.restore_network(episode)

        # Per-episode statistics for logging
        self.total_reward = 0
        self.total_q_max = 0
        self.total_loss = 0
        self.duration = 0
        self.q_samples = 0     # number of max-Q values added to total_q_max
        self.loss_updates = 0  # number of losses added to total_loss
        self.episode = 0

        # The summary variables are created after the initializer ran; each is
        # written through its assign op before the merged summary reads it.
        self.summary_placeholders, self.update_ops, self.summary_op = self.setup_summary()
        self.summary_writer = tf.summary.FileWriter(SAVE_LOG_PATH, self.sess.graph)

    def create_Q_network(self, nn_name):
        """Create one convolutional Q network.

        Args:
            nn_name: suffix used in every layer name and as the model name.

        Returns:
            Tuple ``(input tensor, Q-value output tensor, keras Model)``.
        """
        input_state = Input(shape=(RESIZE_WIDTH, RESIZE_HEIGHT, AGENT_HISTORY_LENGTH), dtype='float32', name='inputs_%s'%nn_name)
        conv = Conv2D(filters=32, kernel_size=8, strides=(4,4), activation='relu', name='conv1_%s'%nn_name)(input_state)
        conv = Conv2D(filters=64, kernel_size=4, strides=(2,2), activation='relu', name='conv2_%s'%nn_name)(conv)
        conv = Conv2D(filters=64, kernel_size=3, strides=(1,1), activation='relu', name='conv3_%s'%nn_name)(conv)
        flat = Flatten(name='flatten_%s'%nn_name)(conv)
        fc = Dense(512, activation='relu',  name='fc1_%s'%nn_name)(flat)
        # One linear output per action
        Q_pred = Dense(self.action_dim, name='q_pred_%s'%nn_name)(fc)

        model = Model(inputs=[input_state], outputs=[Q_pred], name=nn_name)
        return input_state, Q_pred, model

    def loss_function(self, q_network_weights):
        """Build the clipped-error loss and the RMSProp training op.

        Args:
            q_network_weights: variables the optimizer is allowed to update.

        Returns:
            Tuple ``(action placeholder, target placeholder, loss, train op)``.
        """
        with tf.name_scope('action'):
            act = tf.placeholder(tf.int64, [None])
        with tf.name_scope('y'):
            y = tf.placeholder(tf.float32, [None])
        with tf.name_scope('action_one_hot'):
            act_one_hot = tf.one_hot(act, self.action_dim, 1.0, 0.0)
        with tf.name_scope('q_action'):
            # Q value of the action that was actually taken
            q_action = tf.reduce_sum(tf.multiply(self.q_values, act_one_hot),
                                     reduction_indices=1)
        with tf.name_scope('loss_function'):
            # Error clipping: quadratic for |error| <= 1, linear beyond that.
            error = tf.abs(y - q_action)
            quadratic_part = tf.clip_by_value(error, 0.0, 1.0)
            linear_part = error - quadratic_part
            loss = tf.reduce_mean(0.5 * tf.square(quadratic_part) + linear_part)
        with tf.name_scope('RMSprop'):
            # `decay` is the moving-average coefficient of the squared gradient;
            # without it SQUARED_GRAD_MOMENTUM was never applied.
            optimizer = tf.train.RMSPropOptimizer(LEARNING_RATE,
                                                  decay=SQUARED_GRAD_MOMENTUM,
                                                  momentum=GRAD_MOMENTUM,
                                                  epsilon=MIN_SQUARED_GRAD)
        with tf.name_scope('minimize_loss'):
            gradient_descent = optimizer.minimize(loss, var_list=q_network_weights)
        return act, y, loss, gradient_descent

    def preprocessing_init_state(self, state, previous_state):
        """Build the first stacked state of an episode.

        The preprocessed frame is repeated AGENT_HISTORY_LENGTH times along
        the last axis.
        """
        preprocessing_state = self.preprocessing(state, previous_state)
        return np.repeat(preprocessing_state, AGENT_HISTORY_LENGTH, axis=-1)

    def experience_replay(self, ϕt, action, reward, gg, state_t):
        """Store one transition, update the episode statistics and return ϕt+1.

        Args:
            ϕt: stacked state the action was chosen from.
            action: action that was executed.
            reward: raw environment reward (clipped to [-1, 1] before storing).
            gg: True when the episode ended with this step.
            state_t: newly preprocessed frame, appended to the stack.

        Returns:
            The next stacked state ϕt+1.
        """
        ϕt1 = self.preprocessing_ϕ(ϕt, state_t)

        # Clipping the rewards limits the scale of the error derivatives and
        # makes it easier to use the same learning rate across multiple games.
        reward = np.clip(reward, -1, 1)

        # Store (ϕt, at, rt, ϕt+1) plus the terminal flag needed for the target y
        self.replay_memory.append((ϕt, action, reward, ϕt1, gg))

        # Statistics (note: the logged episode reward is the clipped reward)
        self.total_reward += reward
        if self.duration % ACTION_REPEAT == 0:
            self.total_q_max += np.max(self.q_values.eval(feed_dict={self.s: [ϕt.astype(np.float32)/255.]}))
            self.q_samples += 1
        self.duration += 1

        if gg:
            self.episode += 1
            if self.T >= REPLAY_START_SIZE:
                # Average over the number of values that were actually summed.
                avg_q_max = self.total_q_max / max(self.q_samples, 1)
                avg_loss = self.total_loss / max(self.loss_updates, 1)
                statics = [self.total_reward, avg_q_max, self.duration, avg_loss, self.ϵ]
                for update_op, placeholder, value in zip(self.update_ops, self.summary_placeholders, statics):
                    self.sess.run(update_op, feed_dict={placeholder: value})
                summary_str = self.sess.run(self.summary_op)
                self.summary_writer.add_summary(summary_str, self.episode)

            self.total_reward = 0
            self.total_q_max = 0
            self.total_loss = 0
            self.duration = 0
            self.q_samples = 0
            self.loss_updates = 0

        return ϕt1

    def preprocessing_ϕ(self, ϕt, state_t):
        """Drop the oldest frame of ``ϕt`` and append ``state_t`` on the last axis."""
        return np.append(ϕt[:, :, 1:], state_t, axis=2)

    def training(self):
        """Run one training step and advance the global step counter.

        Nothing is trained before REPLAY_START_SIZE steps. Afterwards a
        minibatch update happens every UPDATE_FREQ steps, the target network
        is refreshed every TARGET_NET_UPDATE_FREQ steps, and a checkpoint is
        attempted on every step (see ``save_dqn_sess``).
        """
        if self.T >= REPLAY_START_SIZE:
            if self.T % UPDATE_FREQ == 0:
                # Sample a random minibatch and unzip it column-wise; this
                # avoids building an object ndarray out of mixed tuples.
                samples = random.sample(self.replay_memory, BATCH_SIZE)
                states, actions, rewards, next_states, dones = zip(*samples)
                ϕt_batch = np.asarray(states, dtype=np.float32) / 255.
                action_batch = np.asarray(actions, dtype=np.int64)
                reward_batch = np.asarray(rewards, dtype=np.float32)
                ϕt1_batch = np.asarray(next_states, dtype=np.float32) / 255.
                # y = r + γ max_a Q^(ϕ', a; θ') for non-terminal steps, y = r
                # otherwise: terminal True -> 0, False -> 1, then multiply.
                gg_batch = 1. - np.asarray(dones, dtype=np.float32)

                target_q_values_batch = self.target_q_values.eval(feed_dict={self.st: ϕt1_batch})
                y_batch = reward_batch + γ*np.max(target_q_values_batch, axis=1)*gg_batch

                loss, _ = self.sess.run([self.loss, self.grad_des],
                                        feed_dict={
                                            self.s: ϕt_batch,
                                            self.act: action_batch,
                                            self.y: y_batch
                                        })
                # Log loss
                self.total_loss += loss
                self.loss_updates += 1

            # Update target network
            if self.T % TARGET_NET_UPDATE_FREQ == 0:
                self.sess.run(self.update_target_network)

            # Save the Q network so it can be restored later
            self.save_dqn_sess()

        self.T += 1

    def save_dqn_sess(self):
        """Write a checkpoint when an episode just ended and its number is a multiple of SAVE_FREQ."""
        # duration == 0 holds right after experience_replay reset the episode statistics
        if self.episode % SAVE_FREQ == 0 and self.duration == 0:
            self.saver.save(self.sess, os.path.join(SAVE_NN_PATH, ENV_ID), global_step=self.episode)

    def ϵ_greedy_action(self, ϕ):
        """Select an action ϵ-greedily and anneal ϵ.

        A random action is taken during the replay warm-up
        (``T < REPLAY_START_SIZE``) or, afterwards, with probability ϵ. The
        previous ``and`` condition switched exploration off completely once
        the warm-up was over.

        Args:
            ϕ: current stacked state.

        Returns:
            Index of the selected action.
        """
        if self.T < REPLAY_START_SIZE or np.random.random() <= self.ϵ:
            action = np.random.randint(self.action_dim)
        else:
            action = self.action(ϕ)

        # Anneal linearly after the warm-up, never going below FINAL_ϵ
        if self.ϵ > FINAL_ϵ and self.T >= REPLAY_START_SIZE:
            self.ϵ = max(FINAL_ϵ, self.ϵ - self.ϵ_step)
        return action

    def action_at_test(self, ϕ):
        """Select an action for evaluation with a fixed ϵ of 0.05.

        According to the original paper the small ϵ avoids overfitting during
        evaluation. The global step counter is incremented here as well.
        """
        if np.random.random() <= 0.05:
            action = np.random.randint(self.action_dim)
        else:
            action = self.action(ϕ)
        self.T += 1  # not sure whether testing needs to count
        return action

    def action(self, ϕ):
        """Return the greedy action of the Q network for the stacked state ``ϕ``."""
        # Pixel values are scaled by 1/255 before being fed to the network
        return np.argmax(self.q_values.eval(feed_dict={self.s: [ϕ.astype(np.float32)/255.]}))

    def preprocessing(self, state, previous_state):
        """Turn two consecutive raw frames into one network input frame.

        Takes the element-wise maximum of both frames, keeps the Y channel of
        the RGB->YUV conversion, resizes it and appends a channel axis.
        """
        encode_frame = np.maximum(state, previous_state)
        extract_Y_channel = cv2.cvtColor(encode_frame, cv2.COLOR_RGB2YUV)[:,:,0]
        resize_frame = cv2.resize((extract_Y_channel), (RESIZE_WIDTH, RESIZE_HEIGHT))
        return resize_frame[:,:,None]

    def restore_network(self, episode):
        """Load the Q network weights saved for ``episode``.

        The checkpoint only contains the Q network, so the target network is
        re-synchronised afterwards; otherwise it would keep its old weights.
        """
        save_path = '{}-{}'.format(os.path.join(SAVE_NN_PATH, ENV_ID), episode)
        self.saver.restore(self.sess, save_path=save_path)
        self.sess.run(self.update_target_network)
        print("Restore DQN episode:{}".format(episode))

    def setup_summary(self):
        """Create the TensorBoard scalars for the per-episode statistics.

        Returns:
            Tuple ``(placeholders, assign ops, merged summary op)``; the i-th
            assign op writes the i-th placeholder into the i-th summary variable.
        """
        tags = ['/Episode_Reward', '/Episode_Max_Q', '/Episode_Frame_Count',
                '/Episode_Loss', '/Epsilon']
        summary_vars = []
        for tag in tags:
            summary_var = tf.Variable(0.)
            tf.summary.scalar(ENV_ID + tag, summary_var)
            summary_vars.append(summary_var)
        summary_placeholders = [tf.placeholder(tf.float32) for _ in summary_vars]
        update_ops = [summ_var.assign(placeholder)
                      for summ_var, placeholder in zip(summary_vars, summary_placeholders)]
        summary_op = tf.summary.merge_all()
        return summary_placeholders, update_ops, summary_op

In [42]:
def flat_obj_array(arr, dtype=np.float32):
    """Stack the elements of ``arr`` into one regular ndarray of ``dtype``.

    ``arr`` is iterated once and its items become the entries along the first
    axis of the result.
    """
    items = list(arr)
    return np.array(items, dtype=dtype)

In [43]:
# Box holding a single Image widget; the training and testing cells write PNG
# bytes into `display_box.children[0].value` to show the current frame.
display_box = ipywidgets.Box([ipywidgets.Image()])
# Last expression of the cell, so the widget is rendered as the cell output.
display_box

In [44]:
# Create the Gym environment named by ENV_ID.
env = gym.make(ENV_ID)
# Build the agent with the default restore=False, i.e. no checkpoint is loaded.
agent = DQN_Agent(env)

In [45]:
# When True, frames are pushed to the image widget in `display_box` while playing.
SHOW_TRAINING = True

# Training: play 50 episodes (hard-coded here; the EPISODES constant is not used).
for _ in tqdm.tqdm_notebook(range(50)):
    gg = False  # becomes True when the environment reports the episode as done
    act_repeat_count = 0
    state = env.reset()

    if SHOW_TRAINING:
        display_box.children[0].value = to_png(state)
    # Random number of initial steps with action 0 (presumably NOOP — confirm for
    # this env). randint's upper bound is exclusive, so 1 .. NO_OP_MAX-1 steps run;
    # at least one step always runs, which guarantees `previous_state` is set.
    # The inner loop reuses the throw-away name `_` of the outer loop.
    for _ in range(np.random.randint(1, NO_OP_MAX)):
        previous_state = state
        state, reward, gg, info = env.step(0)
        if SHOW_TRAINING:
            display_box.children[0].value = to_png(state)
    # Initial stacked state built from the last two raw frames.
    ϕ = agent.preprocessing_init_state(state, previous_state)
    while not gg:
        # Keep the previous raw frame: preprocessing combines two consecutive frames.
        previous_state = state
        # A new action is chosen only every ACTION_REPEAT steps; otherwise the
        # last action is repeated.
        if act_repeat_count % ACTION_REPEAT == 0:
            action = agent.ϵ_greedy_action(ϕ)
            if SHOW_TRAINING:
                display_box.children[0].value = to_png(state)
        act_repeat_count += 1
        state, reward, gg, info = env.step(action)
        preprocessing_state = agent.preprocessing(state, previous_state)
        # Stores the transition in the replay memory and returns the next stacked state.
        ϕ = agent.experience_replay(ϕ, action, reward, gg,  preprocessing_state)
        # One training step; this also advances the agent's global step counter.
        agent.training()




In [9]:
## Testing: restore a saved Q network and play 30 episodes without training.
# NOTE(review): the episode number is hard-coded. With SAVE_FREQ = 100 a
# checkpoint for episode 20 is not written by the current configuration, so this
# presumably relies on a checkpoint from an earlier run — confirm it exists.
agent.restore_network(episode=20)
for _ in tqdm.tqdm_notebook(range(30)):
    gg = False  # becomes True when the environment reports the episode as done
    act_repeat_count = 0
    state = env.reset()
    display_box.children[0].value = to_png(state)

    # A single step with action 0 provides the two frames needed for the
    # initial stacked state.
    previous_state = state
    state, reward, gg, info = env.step(0)
    ϕ = agent.preprocessing_init_state(state, previous_state)
    
    while not gg:
        previous_state = state
        if act_repeat_count % ACTION_REPEAT == 0:
            # Purely greedy action; `agent.action_at_test` (ϵ = 0.05) is not used here.
            action = agent.action(ϕ)
            display_box.children[0].value = to_png(state)
        act_repeat_count += 1
        state, reward, gg, info = env.step(action)
        preprocessing_state = agent.preprocessing(state, previous_state)
        # Only the frame stack is advanced; nothing is stored in the replay memory.
        # The method name is spelled with a different phi code point than its
        # definition; Python NFKC-normalises identifiers, so both resolve to the
        # same method.
        ϕ = agent.preprocessing_φ(ϕ, preprocessing_state)

INFO:tensorflow:Restoring parameters from nn/Pong-v0/Pong-v0-20
Restore DQN episode:20



