<h1 align="center">Implementing a standard DQN</h1> 

A basic implementation of the original DQN. The code was written based on, and inspired by, the contributions of [Denny Britz](https://github.com/dennybritz/reinforcement-learning/tree/master/DQN) and [Devsisters](https://github.com/devsisters/DQN-tensorflow).

In [1]:
import gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
import random
import cv2
import os
import time
from tqdm import tqdm



## Configuration parameters

Configuration parameters as presented in the [original](http://www.nature.com/nature/journal/v518/n7540/abs/nature14236.html) paper.

In [2]:
# Total number of agent steps the training loop iterates over
max_steps = 50000000
# Interval (in steps) at which performance is logged and a checkpoint may be saved
test_step = 50000

# Input size: each screen is resized to screen_width x screen_height and
# history_length consecutive screens are stacked to form one network input
screen_width = 84
screen_height = 84
history_length = 4

memory_size = 1000000       # Replay memory size
batch_size = 32             # Number of training cases over which each SGD update is computed
gamma = 0.99                # Discount factor

learning_rate = 0.00025
# NOTE(review): the two decay settings below are not referenced by any code
# visible in this notebook (the original comment only said "Missing")
learning_rate_decay = 0.96
learning_rate_decay_step = 50000

learn_start = 50000          # Replay start size: no training happens before this step
random_start = 30            # no-op max: upper bound on the number of 'do nothing' actions at the start of an episode

# Exploration parameters (epsilon-greedy)
ep_min = 0.1                 # Final exploration
ep_start = 1.0               # Initial exploration
ep_end_time = 1000000        # Final exploration frame: number of steps over which epsilon is annealed

target_q_update_step = 10000 # Target network update frequency (in steps)

# Train batch
train_frequency = 4          # Update frequency: one mini-batch update every train_frequency steps

# Clip rewards to the range [min_reward, max_reward]
min_reward = -1.0
max_reward = 1.0

# How many times should the same action be taken (avoids taking a new decision after every environment iteration)
action_repeat = 4

# Whether or not to render the environment 
display = False

# Directory where model checkpoints are written (created if it does not exist)
checkpoint_dir = 'Models/'
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)
    
# Directory where Tensorboard summaries are written (created if it does not exist)
log_dir = 'Logs/'
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
    
# Plain-text file to which periodic performance statistics are appended
log_performance_file = 'log_file.txt'

## Load the game environment

Wrapper class around the gym environment.

In [3]:
class GymEnvironment():
    '''Thin wrapper around a gym (Atari) environment.

    Adds action repetition, optional rendering, screen preprocessing and
    helpers to start (randomised) new games. Relies on the notebook-level
    settings action_repeat, random_start, display, screen_width and screen_height.
    '''
    def __init__(self,name):
        '''
        name: gym environment id passed to gym.make (e.g. 'Breakout-v0').
        '''
        self.env = gym.make(name)            # Initialize Gym environment
        self._screen = None                  # Last raw frame returned by the environment
        
    def execute_action(self,action,is_training=True):
        '''
        Executes the selected action up to 'action_repeat' times.

        When is_training is True, losing a life costs an extra -1 reward and is
        reported as a terminal transition.

        Returns a tuple (preprocessed screen, cumulative reward, terminal).
        '''
        cum_reward = 0
        terminal = False                     # Defined up front so it is bound even if the loop body never runs
        start_lives = self.env.ale.lives()
        
        # range (not xrange) keeps the loop valid on both Python 2 and Python 3
        for _ in range(action_repeat):
            self._screen, reward, terminal, _ = self.env.step(action)
            cum_reward += reward
            
            if is_training and start_lives > self.env.ale.lives():
                # A life was lost: penalise it and end the training episode here
                cum_reward -= 1
                terminal = True
                
            if terminal:
                break
                
        reward = cum_reward
        self.render()
        return self.screen, reward, terminal
    
    @property
    def action_size(self):
        '''Number of available actions.'''
        return self.env.action_space.n

    @property
    def screen(self):
        '''Last frame converted to grayscale, divided by 255 and resized to (screen_width, screen_height).'''
        return cv2.resize(cv2.cvtColor(self._screen,cv2.COLOR_RGB2GRAY)/255.,(screen_width,screen_height))
    
    def new_game(self):
        '''
        Starts a new game and performs a single step with action 0.

        The underlying environment is only reset when no lives are left, so that
        an episode ended by a life loss continues from the same game.

        Returns a tuple (preprocessed screen, 0, 0, terminal).
        '''
        if self.env.ale.lives() == 0:
            self._screen = self.env.reset()
        
        self._screen, reward, terminal, _ = self.env.step(0)
        self.render()
        return self.screen, 0, 0, terminal
    
    def render(self):
        '''Renders the environment only if display == True.'''
        if display:
            self.env.render()
    
    def new_random_game(self):
        '''
        Starts a new game and then steps with action 0 for a random number of
        frames, drawn uniformly from [0, random_start-1].

        Returns a tuple (preprocessed screen, 0, 0, terminal).
        '''
        _,_,_,terminal = self.new_game()
        for _ in range(random.randint(0,random_start-1)):
            self._screen, reward, terminal, _ = self.env.step(0)
        
        self.render()
        return self.screen, 0, 0, terminal

## Experience Replay
This class will allow us to store the experiences and to take random samples from them to update our Q-network parameters.

In [4]:
class Experience_Buffer():
    '''Circular replay memory of (screen, reward, action, terminal) experiences.

    Single screens are stored; states of 'history_length' stacked screens are
    rebuilt on demand when sampling. Relies on the notebook-level settings
    batch_size, history_length, screen_height and screen_width.
    '''
    def __init__(self,memory_size):
        '''
        memory_size: maximum number of experiences kept before the oldest ones are overwritten.
        '''
        self.memory_size = memory_size
        # These are the arrays where we will store the experiences
        self.actions = np.empty(self.memory_size,dtype=np.uint8)
        # Concrete dtypes are used instead of the abstract np.integer / removed np.bool alias,
        # which are deprecated or rejected by recent NumPy releases.
        # NOTE: rewards are stored as integers, so fractional rewards would be truncated.
        self.rewards = np.empty(self.memory_size,dtype=np.int_)
        self.screens = np.empty((self.memory_size, screen_height, screen_width),dtype=np.float16)
        self.terminals = np.empty(self.memory_size,dtype=np.bool_)
        
        # Pre-allocated mini-batch buffers, overwritten by every call to sample_from_replay
        self.prestates = np.empty((batch_size,history_length,screen_height, screen_width),dtype=np.float16)
        self.poststates = np.empty((batch_size,history_length,screen_height, screen_width),dtype=np.float16)
        self.current = 0     # Pointer to the current saving location
        self.count = 0       # Number of collected experiences
        
    def add(self, screen, reward, action, terminal):
        '''Adds an experience to replay memory and advances the write pointer.'''
        self.actions[self.current] = action
        self.rewards[self.current] = reward
        self.screens[self.current,...] = screen
        self.terminals[self.current] = terminal
        
        self.count = max(self.count, self.current+1)
        self.current = (self.current + 1) % self.memory_size    # Pointer resets when reaching memory_size
        
    def getState(self,index):
        '''
        Returns the 'history_length' consecutive screens ending at 'index'
        (inclusive), with shape (history_length, screen_height, screen_width).
        '''
        index = index % self.count
        # If index is not in the beginning, just use simple slicing
        if index >= history_length-1:
            return self.screens[(index-(history_length-1)):(index+1),...]
        # Otherwise the window wraps around the start of the buffer: build the index list explicitly
        else:
            indexes = [(index-i) % self.count for i in reversed(range(history_length))]
            return self.screens[indexes,...]
        
    def sample_from_replay(self):
        '''
        Samples 'batch_size' random transitions.

        Returns (prestates, actions, rewards, poststates, terminals), where both
        state arrays have shape (batch_size, screen_height, screen_width, history_length).
        The state arrays are views on internal buffers that are reused by the next call.
        '''
        # Sample random indexes
        indexes = []
        while len(indexes) < batch_size:
            while True:
                index = random.randint(history_length,self.count-1)
                # If index wraps over current pointer, get new one
                if index >= self.current and index - history_length < self.current:
                    continue
                # If index wraps over terminal state, get new one
                if self.terminals[(index-history_length):index].any():
                    continue
                # Use the index otherwise
                break
            self.prestates[len(indexes),...] = self.getState(index-1)
            self.poststates[len(indexes),...] = self.getState(index)
            indexes.append(index)
            
        actions = self.actions[indexes]
        rewards = self.rewards[indexes]
        terminals = self.terminals[indexes]
        
        # Move the history axis last: (batch, history, H, W) -> (batch, H, W, history)
        return np.transpose(self.prestates,(0,2,3,1)),actions,rewards,np.transpose(self.poststates,(0,2,3,1)),terminals

## History
This class will allow us to stack the last K screens to use them as the input to the network (history of states).

In [5]:
class History:
    '''Rolling window holding the last 'history_length' screens (oldest first).'''
    def __init__(self):
        window_shape = (history_length, screen_height, screen_width)
        self.history = np.zeros(window_shape, dtype=np.float32)

    def add(self, screen):
        '''Drops the oldest screen, shifts the others one slot and appends 'screen'.'''
        self.history[:-1, ...] = self.history[1:, ...]
        self.history[-1, ...] = screen

    def reset(self):
        '''Clears the window in place by multiplying every entry by zero.'''
        np.multiply(self.history, 0, out=self.history)

    def get(self):
        '''Returns the window with the history axis last: (screen_height, screen_width, history_length).'''
        return self.history.transpose(1, 2, 0)

## Defining our network architecture

In [6]:
class ConvNet_Estimator():
    '''Q-value estimator neural network
       This architecture will be used both for the Q-network and the Target network.
    '''
    
    def __init__(self,num_actions,scope='estimator',summaries_dir=None):
        '''
        num_actions: number of available actions (size of the output layer).
        scope: variable scope under which the whole network is created.
        summaries_dir: accepted for interface compatibility; not used by this class.
        '''
        self.scope = scope
        self.num_actions = num_actions

        with tf.variable_scope(scope):
            # Build the graph
            self._build_model()
                
    def _build_model(self):
        '''
        Building the network architecture, the loss, the optimizer and the summaries
        '''
        initializer = tf.constant_initializer(0.0)  # Change to None to remove biases from conv layers 
        
        # Our input: 4(or history_length) Frames taken from the environment
        self.X_pl = tf.placeholder(tf.float32,[None, screen_height, screen_width, history_length],name='X')
        # The TD target value
        self.y_pl = tf.placeholder(tf.float32,[None],name='y')
        # Integer id of selected action
        self.action_pl = tf.placeholder(tf.int32,[None],name='action')
        
        # Three convolutional layers (32 8x8 stride 4, 64 4x4 stride 2, 64 3x3 stride 1)
        conv1 = tf.contrib.layers.conv2d(self.X_pl,32,8,4,padding = 'VALID',
                                         activation_fn=tf.nn.relu,biases_initializer=initializer)
        conv2 = tf.contrib.layers.conv2d(conv1,64,4,2,padding = 'VALID',
                                         activation_fn=tf.nn.relu,biases_initializer=initializer)
        # 64 filters (was 63, a typo with respect to the original architecture)
        conv3 = tf.contrib.layers.conv2d(conv2,64,3,1,padding = 'VALID',
                                         activation_fn=tf.nn.relu,biases_initializer=initializer)
        
        # Fully connected layers
        flattened = tf.contrib.layers.flatten(conv3)
        fc1 = tf.contrib.layers.fully_connected(flattened,512,activation_fn=tf.nn.relu,biases_initializer=initializer)
        # The output layer must be linear: Q-values can be negative (e.g. the life-loss penalty),
        # so the activation is disabled explicitly instead of relying on the layer's default
        self.predictions = tf.contrib.layers.fully_connected(fc1,self.num_actions,activation_fn=None,
                                                             biases_initializer=initializer)
        self.best_action = tf.argmax(self.predictions,dimension=1)
        
        # One hot of the action which was taken
        action_one_hot = tf.one_hot(self.action_pl,self.num_actions,1.0,0.0)
        # Get the prediction of the chosen actions only
        self.action_predictions = tf.reduce_sum(self.predictions * action_one_hot,reduction_indices=1)
        
        # TD error
        self.delta = self.y_pl-self.action_predictions
        # Kept as an attribute for backward compatibility; no longer used to build the loss
        self.clipped_delta = tf.clip_by_value(self.delta,-1.0,1.0)
        # Error clipping implemented as a Huber loss: quadratic for |delta| <= 1, linear beyond.
        # Squaring the clipped delta instead would give a zero gradient whenever |delta| > 1.
        abs_delta = tf.abs(self.delta)
        quadratic_part = tf.minimum(abs_delta,1.0)
        linear_part = abs_delta - quadratic_part
        self.loss = tf.reduce_mean(0.5 * tf.square(quadratic_part) + linear_part)
        
        # Optimizer using parameteres from original paper
        self.optimizer = tf.train.RMSPropOptimizer(learning_rate,momentum=0.95,epsilon=0.01)
        self.train_op = self.optimizer.minimize(self.loss,global_step=tf.contrib.framework.get_global_step())
        
        # Summaries for Tensorboard
        self.summaries = tf.merge_summary([
                tf.scalar_summary('loss', self.loss),
                tf.histogram_summary('q_values_hist',self.predictions),
                tf.scalar_summary('max_q_value',tf.reduce_max(self.predictions))
            ])
        
    def predict(self,sess,state):
        '''
        Predicts action values for a batch of states.
        '''
        return sess.run(self.predictions,{self.X_pl:state})
    
    def determine_action(self,sess,state,current_step,test_ep=None):
        '''
        Predicts action based on q values for current state and exploration strategy

        state: a single state (not a batch).
        current_step: training step, used to anneal epsilon when test_ep is None.
        test_ep: fixed epsilon to use instead of the annealed one; 0.0 means fully greedy.
        '''
        if test_ep is not None:
            # An explicit epsilon always wins, including 0.0 (which "test_ep or ..." silently ignored)
            ep = test_ep
        else:
            # Decaying epsilon: stays at ep_start until learn_start, then decreases linearly
            # down to ep_min over ep_end_time steps
            ep = ep_min + max(0.,(ep_start-ep_min)*(ep_end_time-max(0.,current_step-learn_start))/ep_end_time)
        if random.random() < ep:
            # Explore: random action
            action = random.randrange(self.num_actions)
        else:            
            # Exploit: action with the highest predicted Q-value
            action = sess.run(self.best_action,{self.X_pl:[state]})[0]

        return action
    
    def update(self,sess,state,action,target):
        '''
        Updates the estimator towards the given targets

        Returns a tuple (predicted q-values, loss, serialized summaries).
        '''
        feed_dict = {self.X_pl:state, self.y_pl:target, self.action_pl:action}
        q_t, summaries, _, loss = sess.run([self.predictions, self.summaries,self.train_op,self.loss],feed_dict)

        return q_t, loss, summaries

## Learning agent

In [7]:
class Agent():
    '''DQN learning agent.

    Owns the replay memory, the frame history, the Q-network and the target
    network, and implements the training loop (train) and evaluation (play).
    '''
    def __init__(self,environment,sess):
        '''
        environment: GymEnvironment wrapper the agent interacts with.
        sess: TensorFlow session; it must be the default session when the agent is created.
        '''
        self.sess = sess
        self.env = environment
        self.exp_replay = Experience_Buffer(memory_size)
        self.history = History()
        self.q_estimator = ConvNet_Estimator(self.env.action_size,scope='q',summaries_dir=log_dir)
        self.target_estimator = ConvNet_Estimator(self.env.action_size,scope='target_q')
        # Ops copying the Q-network weights into the target network; built on first use
        self._target_update_ops = None
        
        # Step counter stored in the graph so that it is saved/restored with the checkpoints
        with tf.variable_scope('step'):
            self.step_op = tf.Variable(0,trainable=False,name='step')
            self.step_input = tf.placeholder(tf.int32,None,name='step_input')
            self.step_assign_op = self.step_op.assign(self.step_input)
        
        # Add summaries
        with tf.variable_scope('summary'):
            scalar_summary_tags = ['average.reward', 'average.loss', 'average.q', 'episode.max reward', 'episode.min reward',\
                                  'episode.avg reward', 'episode.num of games']
            self.summary_placeholders = {}
            self.summary_ops = {}
            for tag in scalar_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder(tf.float32,None,name=tag.replace(' ','_'))
                self.summary_ops[tag] = tf.scalar_summary(tag, self.summary_placeholders[tag])
            
            histogram_summary_tags = ['episode.rewards','episode.actions']
            for tag in histogram_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder(tf.float32,None,name=tag.replace(' ','_'))
                self.summary_ops[tag] = tf.histogram_summary(tag, self.summary_placeholders[tag])
                
            self.writer = tf.train.SummaryWriter(log_dir,self.sess.graph)
        
        tf.initialize_all_variables().run()
        self.saver = tf.train.Saver(max_to_keep = 3)
        
        self.load_model()
        self.update_target_q_network(self.sess)

    def update_target_q_network(self,sess):
        '''
        Copies the trainable parameters of the Q-network into the target network.
        '''
        if self._target_update_ops is None:
            # Build the assign ops only once: creating them on every call would add
            # new nodes to the graph at each target update
            q_prefix = self.q_estimator.scope + '/'
            target_prefix = self.target_estimator.scope + '/'
            e1_params = sorted((t for t in tf.trainable_variables() if t.name.startswith(q_prefix)),
                               key=lambda v: v.name)
            e2_params = sorted((t for t in tf.trainable_variables() if t.name.startswith(target_prefix)),
                               key=lambda v: v.name)
            # Both networks share the same architecture, so sorting by name pairs matching variables
            self._target_update_ops = [e2_v.assign(e1_v) for e1_v, e2_v in zip(e1_params,e2_params)]

        sess.run(self._target_update_ops)
            
    def load_model(self,step=None):
        '''
        Restores the latest checkpoint found in checkpoint_dir, if any.
        The 'step' argument is accepted but not used.
        '''
        latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
        if latest_checkpoint:
            print('[*] Loading Checkpoint {}...\n'.format(latest_checkpoint))
            self.saver.restore(self.sess,latest_checkpoint)
        else:
            print('[!] Could not find any previous checkpoint')
        
    def save_model(self,step=None):
        '''
        Saves a checkpoint to checkpoint_dir (tagged with 'step') and notes it in the log file.
        '''
        # 'with' guarantees the log file is closed even if the write fails
        with open(log_performance_file, 'a') as f:
            f.write('[*] Saving checkpoints...')

        self.saver.save(self.sess,os.path.join(checkpoint_dir,'model'),global_step=step)    
        
    def train(self):
        '''
        Runs the training loop from the stored step up to max_steps.

        Every test_step steps (once learning has started) performance statistics are
        logged, summaries are written and a checkpoint is saved if the average episode
        reward is at least 90% of the best one seen so far.
        '''
        start_step = self.step_op.eval()

        num_game, self.update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        max_avg_ep_reward = 0
        ep_rewards, actions = [], []
        
        screen,_,_,_ = self.env.new_random_game()
        
        # Stacking the screen in the first input buffer
        for _ in range(history_length):
            self.history.add(screen)
            
        # Training:
        for self.step in tqdm(range(start_step,max_steps),ncols=70,initial=start_step):
            if self.step == learn_start:
                # Discard the statistics gathered during the purely random warm-up phase
                num_game, self.update_count, ep_reward = 0, 0, 0.
                total_reward, self.total_loss, self.total_q = 0., 0., 0.
                ep_rewards, actions = [], []
                
            # 1. Predict: Use our training network to select an action
            action = self.q_estimator.determine_action(self.sess,self.history.get(),self.step)
            
            # 2. Execute the action
            screen, reward, terminal = self.env.execute_action(action,is_training=True)
            
            # 3. New observation
            self.observe(screen,reward,action,terminal)
            
            if terminal:
                screen,_,_,_ = self.env.new_random_game()
                num_game += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0
            else:
                ep_reward += reward
                total_reward += reward
                actions.append(action)
            
            if self.step >= learn_start and self.step % test_step == test_step - 1:
                avg_reward = total_reward / test_step
                # Guard against a reporting window in which no update was performed
                n_updates = max(self.update_count,1)
                avg_loss = self.total_loss / n_updates
                avg_q = self.total_q / n_updates

                if ep_rewards:
                    max_ep_reward = np.max(ep_rewards)
                    min_ep_reward = np.min(ep_rewards)
                    avg_ep_reward = np.mean(ep_rewards)
                else:
                    # No episode finished during this window
                    max_ep_reward, min_ep_reward, avg_ep_reward = 0,0,0
                
                with open(log_performance_file, 'a') as f:
                    f.write('\nAvg. reward: %.4f, Avg. loss: %.6f, Avg. Q: %3.6f' % (avg_reward, avg_loss, avg_q))
                    f.write('Avg. Ep. Reward: %.4f, Max Ep. Reward: %.4f, Min Ep. Reward: %.4f, # Game: %d' \
                            % (avg_ep_reward, max_ep_reward, min_ep_reward, num_game))

                if max_avg_ep_reward * 0.9 <= avg_ep_reward:
                    # Store the step in the graph so that training can resume from the checkpoint
                    self.step_assign_op.eval({self.step_input: self.step+1})
                    self.save_model(self.step+1)
                    max_avg_ep_reward = max(max_avg_ep_reward, avg_ep_reward)

                self.inject_summary({
                        'average.reward' : avg_reward,
                        'average.loss': avg_loss,
                        'average.q': avg_q,
                        'episode.max reward':max_ep_reward,
                        'episode.min reward':min_ep_reward,
                        'episode.avg reward':avg_ep_reward,
                        'episode.num of games':num_game,
                        'episode.rewards': ep_rewards,
                        'episode.actions': actions,
                    })

                # Start a fresh reporting window
                num_game = 0
                total_reward = 0.
                self.total_loss = 0.
                self.total_q = 0.
                self.update_count = 0
                ep_reward = 0
                ep_rewards = []       
                actions = []
    
    def observe(self,screen,reward,action,terminal):
        '''
        Stores the new transition and, once learning has started, trains the
        Q-network and refreshes the target network at their respective frequencies.
        '''
        # Clip reward
        reward = max(min_reward,min(max_reward,reward))
        
        # Add to history
        self.history.add(screen)
        # Add to exp. replay
        self.exp_replay.add(screen,reward,action,terminal)
        
        if self.step > learn_start:
            # If it is time to train the network
            if self.step % train_frequency == 0:
                self.q_learning_mini_batch()
                
            # If it is time to update Target network
            if self.step % target_q_update_step == target_q_update_step -1:
                self.update_target_q_network(self.sess)
        
    def q_learning_mini_batch(self):
        '''
        Performs one Q-learning update on a mini-batch sampled from the replay memory.
        '''
        if self.exp_replay.count < history_length:
            # Not enough experiences
            return

        init_state, action, reward, end_state, terminal = self.exp_replay.sample_from_replay()
        
        # Standard DQN implementation
        # Get the Q-value of the next state
        q_t_plus_1 = self.target_estimator.predict(self.sess,end_state)
        terminal = np.array(terminal) + 0.      # Booleans -> floats
        # Get max Q_t+1
        max_q_t_plus_1 = np.max(q_t_plus_1, axis=1)
        # The target q-value (if is not terminal state) will be:
        target_q_t = (1. - terminal) * gamma * max_q_t_plus_1 + reward
        
        q_t, loss, summaries = self.q_estimator.update(self.sess,init_state,action,target_q_t)

        self.writer.add_summary(summaries,self.step)
        self.total_loss += loss
        self.total_q += q_t.mean()
        self.update_count += 1      
            
    def inject_summary(self,tag_dir):
        '''
        Writes the values in tag_dir (summary tag -> value) as Tensorboard summaries for the current step.
        '''
        summary_str_lists = self.sess.run([self.summary_ops[tag] for tag in tag_dir.keys()], {
                self.summary_placeholders[tag]: value for tag , value in tag_dir.items()
            })
        for summary_str in summary_str_lists:
            self.writer.add_summary(summary_str,self.step)
            
    def play(self,n_step=10000,n_episode=100,test_epsilon=None):
        '''
        Plays n_episode games of at most n_step steps each with a fixed epsilon
        (ep_min when test_epsilon is None) and prints the best reward so far.
        '''
        if test_epsilon is None:
            test_epsilon = ep_min
        
        test_history = History()
        best_reward, best_idx = 0, 0
        for idx in range(n_episode):
            screen,_,_,_ = self.env.new_random_game()
            current_reward = 0
            
            for _ in range(history_length):
                test_history.add(screen)
                
            for t in tqdm(range(n_step),ncols=70):
                # 1. Predict
                action = self.q_estimator.determine_action(self.sess,test_history.get(),0,test_epsilon)
            
                # 2. Execute the action
                screen, reward, terminal = self.env.execute_action(action,is_training=False)                
                
                # 3. New observation
                test_history.add(screen)
                
                current_reward += reward
                if terminal:
                    break
            
            if current_reward > best_reward:
                best_reward = current_reward
                best_idx = idx
                
            print('='*30)
            print('[%d] Best reward: %d' % (best_idx, best_reward))
            print('='*30)
            
        self.env.env.render(close=True)        

## Main code:

In [None]:
# Set to False to run the evaluation loop instead of training
is_train = True

# GPU session options: log op placement, allow the process to use the whole
# GPU memory and let the allocation grow on demand
gpu_config = tf.ConfigProto()
gpu_config.log_device_placement = True
gpu_config.gpu_options.per_process_gpu_memory_fraction = 1.0
gpu_config.gpu_options.allow_growth = True

with tf.Session(config=gpu_config) as sess:
    env = GymEnvironment('Breakout-v0')
    agent = Agent(env,sess)

    # Rendering stays disabled in both modes
    display = False
    if not is_train:
        agent.play()
    else:
        agent.train()

[2016-12-12 16:46:48,065] Making new env: Breakout-v0


[!] Could not find any previous checkpoint


  0%|                    | 14333/50000000 [01:25<87:28:39, 158.73it/s]

## Test on CPU

In [1]:
# Set to False to run the evaluation loop instead of training
is_train = True

with tf.Session() as sess:
    env = GymEnvironment('Breakout-v0')
    agent = Agent(env,sess)

    # Render the game only when evaluating
    display = not is_train
    if not is_train:
        agent.play()
    else:
        agent.train()

NameError: name 'tf' is not defined

In [6]:
def conv2d(x,output_dim,kernel_size,stride,
           initializer=tf.contrib.layers.xavier_initializer(),
           activation_fn = tf.nn.relu,
           padding = 'VALID',
           name = 'conv2d'):
    '''
    2D convolution layer with bias and optional activation.

    x: input tensor; its last dimension is used as the number of input channels.
    output_dim: number of filters.
    kernel_size: (height, width) of the filters.
    stride: (vertical, horizontal) stride.
    initializer: initializer for the filter weights.
    activation_fn: activation applied to the output, or None for a linear layer.
    padding: padding mode passed to tf.nn.conv2d.
    name: variable scope in which the layer variables are created.

    Returns a tuple (output tensor, weights variable, bias variable).
    '''
    with tf.variable_scope(name):
        # tf.nn.conv2d expects one stride per input dimension
        strides = [1,stride[0],stride[1],1]
        kernel_shape = [kernel_size[0],kernel_size[1],x.get_shape()[-1],output_dim]
        
        w = tf.get_variable('w',kernel_shape,tf.float32,initializer=initializer)
        conv = tf.nn.conv2d(x,w,strides,padding)
        b = tf.get_variable('biases',[output_dim],initializer=tf.constant_initializer(0.0))
        out = tf.nn.bias_add(conv,b)
        # activation_fn=None yields a linear layer, matching the behaviour of linear()
        if activation_fn is not None:
            out = activation_fn(out)
        
    return out,w,b       

def linear(input_,output_size,stddev=0.02,bias_start=0.0,activation_fn=None,name='linear'):
    '''
    Fully connected layer with bias and optional activation.

    input_: 2D input tensor; its second dimension is used as the input size.
    output_size: number of output units.
    stddev: standard deviation of the normal initializer used for the weights.
    bias_start: constant used to initialize the bias.
    activation_fn: activation applied to the output, or None for a linear layer.
    name: variable scope in which the layer variables are created.

    Returns a tuple (output tensor, weights variable, bias variable).
    '''
    shape = input_.get_shape().as_list()
    
    with tf.variable_scope(name):
        w = tf.get_variable('Matrix',[shape[1],output_size],tf.float32,
                            tf.random_normal_initializer(stddev=stddev))
        b = tf.get_variable('bias',[output_size],initializer=tf.constant_initializer(bias_start))
        
        out = tf.nn.bias_add(tf.matmul(input_,w),b)
        
    # Identity check against None (PEP 8) instead of '!='
    if activation_fn is not None:
        return activation_fn(out), w, b
    else:
        return out,w,b

In [7]:
class Init_Agent():
    def __init__(self,environment,sess):
        """Store the collaborators, create the step counter and build the networks.

        Args:
            environment: Environment wrapper the agent interacts with.
            sess: TensorFlow session used to run the graph.
        """
        self.env = environment
        self.sess = sess

        # Replay memory and frame history.
        self.exp_replay = Experience_Buffer(memory_size)
        self.history = History()

        # Non-trainable step counter plus a placeholder/assign pair so that
        # its value can be set from Python.
        with tf.variable_scope('step'):
            self.step_op = tf.Variable(0, trainable=False, name='step')
            self.step_input = tf.placeholder(tf.int32, None, name='step_input')
            self.step_assign_op = self.step_op.assign(self.step_input)

        # Build the deep Q-network graph.
        self.build_dqn()
        
    def build_dqn(self):
        """Build the whole graph: both Q-networks, the optimizer and the summaries.

        Creates, in this order:
          * the prediction network (three conv layers, two linear layers),
          * the target network with the same layout under its own scope,
          * placeholder/assign pairs that copy prediction weights into the target,
          * the loss and the RMSProp training op,
          * scalar and histogram summaries and the summary writer.

        It then initialises all variables, creates the saver for the
        prediction weights and the step counter, tries to restore a
        checkpoint and synchronises the target network.
        """
        self.w = {}
        self.t_w = {}

        initializer = tf.truncated_normal_initializer(0, 0.02)
        activation_fn = tf.nn.relu

        # TRAINING NETWORK
        with tf.variable_scope('prediction'):
            self.input_state = tf.placeholder(tf.float32,[None, screen_height, screen_width, history_length],
                                              name='input_state')
            self.l1, self.w['l1_w'], self.w['l1_b'] = conv2d(self.input_state,32,[8,8],[4,4],initializer,
                                                             activation_fn, name='l1')
            self.l2, self.w['l2_w'], self.w['l2_b'] = conv2d(self.l1,64,[4,4],[2,2],initializer,
                                                             activation_fn, name='l2')
            self.l3, self.w['l3_w'], self.w['l3_b'] = conv2d(self.l2,64,[3,3],[1,1],initializer,
                                                             activation_fn, name='l3')

            # Flatten everything but the batch dimension. np.prod replaces the
            # builtin reduce(), which only exists as a builtin in Python 2.
            shape = self.l3.get_shape().as_list()
            self.l3_flat = tf.reshape(self.l3,[-1,int(np.prod(shape[1:]))])

            # Standard DQN implementation
            self.l4, self.w['l4_w'], self.w['l4_b'] = linear(self.l3_flat,512,activation_fn=activation_fn,name='l4')
            self.q, self.w['q_w'],self.w['q_b'] = linear(self.l4,self.env.action_size,name='q')

            # Greedy action: index of the largest Q-value of each row.
            self.q_action = tf.argmax(self.q,dimension=1)

            # Add output summaries
            q_summary = []
            avg_q = tf.reduce_mean(self.q,0)  # Mean q_value per action over the batch
            for idx in range(self.env.action_size):
                q_summary.append(tf.histogram_summary('q/%s' % idx, avg_q[idx]))
            self.q_summary = tf.merge_summary(q_summary,'q_summary')

        # TARGET NETWORK
        with tf.variable_scope('target'):
            self.target_state = tf.placeholder(tf.float32,[None, screen_height, screen_width, history_length],
                                              name='target_state')
            self.target_l1, self.t_w['l1_w'], self.t_w['l1_b'] = conv2d(self.target_state,32,[8,8],[4,4],initializer,
                                                             activation_fn, name='target_l1')
            self.target_l2, self.t_w['l2_w'], self.t_w['l2_b'] = conv2d(self.target_l1,64,[4,4],[2,2],initializer,
                                                             activation_fn, name='target_l2')
            self.target_l3, self.t_w['l3_w'], self.t_w['l3_b'] = conv2d(self.target_l2,64,[3,3],[1,1],initializer,
                                                             activation_fn, name='target_l3')

            shape = self.target_l3.get_shape().as_list()
            self.target_l3_flat = tf.reshape(self.target_l3,[-1,int(np.prod(shape[1:]))])

            # Standard DQN
            self.target_l4, self.t_w['l4_w'], self.t_w['l4_b'] = linear(self.target_l3_flat,512,
                                                                    activation_fn=activation_fn,name='target_l4')
            self.target_q, self.t_w['q_w'],self.t_w['q_b'] = linear(self.target_l4,self.env.action_size,
                                                                    name='target_q')

        # COPY TRAINING NETWORK INTO TARGET
        # One placeholder/assign pair per weight; see update_target_q_network().
        with tf.variable_scope('pred_to_target'):
            self.t_w_input = {}
            self.t_w_assign_op = {}

            for name in self.w:
                self.t_w_input[name] = tf.placeholder(tf.float32,self.t_w[name].get_shape().as_list(),name=name)
                self.t_w_assign_op[name] = self.t_w[name].assign(self.t_w_input[name])

        # OPTIMIZER
        with tf.variable_scope('optimizer'):
            self.target_q_t = tf.placeholder(tf.float32,[None],name='target_q_t')
            self.action = tf.placeholder(tf.int64,[None],name='action')

            # One hot of the action which was taken
            action_one_hot = tf.one_hot(self.action,self.env.action_size,1.0,0.0,name='action_one_hot')
            # Extract the q_value of the action
            q_acted = tf.reduce_sum(self.q * action_one_hot,reduction_indices=1,name='q_acted')

            self.delta = self.target_q_t - q_acted      # TD error
            # Kept as an attribute for inspection; it is no longer used in the loss.
            self.clipped_delta = tf.clip_by_value(self.delta,-1.0,1.0,name='clipped_delta')

            # Created but not used by any op in this method.
            self.global_step = tf.Variable(0,trainable=False)

            # Squaring clip_by_value(delta) gives a zero gradient whenever
            # |delta| > 1, so the largest errors would never be corrected.
            # This loss equals delta^2 for |delta| <= 1 (same value as before)
            # and 2*|delta| - 1 beyond, so its gradient is the gradient of
            # delta^2 clipped in magnitude instead of being dropped.
            abs_delta = tf.abs(self.delta)
            quadratic_part = tf.minimum(abs_delta, 1.0)
            linear_part = abs_delta - quadratic_part
            self.loss = tf.reduce_mean(
                tf.square(quadratic_part) + 2.0 * quadratic_part * linear_part, name='loss')

            self.learning_rate_step = tf.placeholder(tf.int64,None,name='learning_rate_step')
            # NOTE: the floor 0.00025 equals the configured initial learning_rate
            # and the decay factor is below 1, so with the current configuration
            # the effective learning rate stays constant.
            self.learning_rate_op = tf.maximum(0.00025,tf.train.exponential_decay(
                learning_rate,self.learning_rate_step,learning_rate_decay_step,learning_rate_decay,staircase=True))
            self.optim = tf.train.RMSPropOptimizer(self.learning_rate_op,momentum=0.95,epsilon=0.01).minimize(self.loss)

        # Add summaries
        with tf.variable_scope('summary'):
            scalar_summary_tags = ['average.reward', 'average.loss', 'average.q', 'episode.max reward', 'episode.min reward',
                                   'episode.avg reward', 'episode.num of games', 'training.learning_rate']
            self.summary_placeholders = {}
            self.summary_ops = {}
            for tag in scalar_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder(tf.float32,None,name=tag.replace(' ','_'))
                self.summary_ops[tag] = tf.scalar_summary(tag, self.summary_placeholders[tag])

            histogram_summary_tags = ['episode.rewards','episode.actions']
            for tag in histogram_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder(tf.float32,None,name=tag.replace(' ','_'))
                self.summary_ops[tag] = tf.histogram_summary(tag, self.summary_placeholders[tag])

            self.writer = tf.train.SummaryWriter(log_dir,self.sess.graph)

        # Run the initializer on the agent's own session instead of relying on
        # an implicit default session.
        self.sess.run(tf.initialize_all_variables())
        # Only the prediction weights and the step counter are checkpointed.
        # list(...) keeps the concatenation valid when values() is a view.
        self._saver = tf.train.Saver(list(self.w.values()) + [self.step_op], max_to_keep = 10)

        self.load_model()
        self.update_target_q_network()
                    
    def update_target_q_network(self):
        for name in self.w.keys():
            self.t_w_assign_op[name].eval({self.t_w_input[name]:self.w[name].eval()})
            
    def load_model(self,step=None):
        """Restore the latest checkpoint found in ``checkpoint_dir``.

        Args:
            step: Not used by this method.

        Returns:
            True when a checkpoint was restored, False otherwise.
        """
        print('[*] Loading Checkpoints...')

        state = tf.train.get_checkpoint_state(checkpoint_dir)
        if not (state and state.model_checkpoint_path):
            print('[!] Load failed: %s' % checkpoint_dir)
            return False

        # Only the file name recorded in the checkpoint state is used; it is
        # resolved against checkpoint_dir.
        checkpoint_file = os.path.join(checkpoint_dir,
                                       os.path.basename(state.model_checkpoint_path))
        self._saver.restore(self.sess, checkpoint_file)
        print('[*] Load succesfull: %s' % checkpoint_file)
        return True
        
    def save_model(self,step=None):
        """Write a checkpoint under ``checkpoint_dir`` with the prefix ``model``.

        Args:
            step: Forwarded to the saver as ``global_step``.
        """
        print('[*] Saving checkpoints...')
        checkpoint_prefix = os.path.join(checkpoint_dir, 'model')
        self._saver.save(self.sess, checkpoint_prefix, global_step=step)
        
    def train(self):
        """Run the training loop from the stored step counter up to ``max_steps``.

        Every step selects an action, executes it, and hands the transition
        to ``observe`` (which stores it and triggers learning). Every
        ``test_step`` steps after ``learn_start`` the running statistics are
        printed and written as summaries, a checkpoint is saved when the
        average episode reward is at least 90% of the best seen so far, and
        the statistics are reset.
        """
        start_step = self.step_op.eval()

        num_game, self.update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        max_avg_ep_reward = 0
        ep_rewards, actions = [], []

        # New game started with a random prefix (see env.new_random_game).
        screen,_,_,_ = self.env.new_random_game()

        # Fill the whole history buffer with the first screen.
        for _ in range(history_length):
            self.history.add(screen)

        # Training:
        for self.step in tqdm(range(start_step,max_steps),ncols=70,initial=start_step):
            if self.step == learn_start:
                # Discard the statistics gathered before learning starts.
                num_game, self.update_count, ep_reward = 0, 0, 0.
                total_reward, self.total_loss, self.total_q = 0., 0., 0.
                ep_rewards, actions = [], []

            # 1. Predict: Use our training network to select an action
            action = self.predict(self.history.get())

            # 2. Execute the action
            screen, reward, terminal = self.env.execute_action(action,is_training=True)

            # 3. New observation
            self.observe(screen,reward,action,terminal)

            # Account for every step, including the terminal one. Previously
            # the reward and action of the terminal step were dropped from
            # the statistics.
            ep_reward += reward
            total_reward += reward
            actions.append(action)

            if terminal:
                # NOTE(review): the first screen of the new game is not pushed
                # into self.history here - confirm this is intended.
                screen,_,_,_ = self.env.new_random_game()
                num_game += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.

            if self.step >= learn_start and self.step % test_step == test_step - 1:
                avg_reward = total_reward / test_step
                # Avoid a ZeroDivisionError when no update ran in this window.
                update_count = max(self.update_count, 1)
                avg_loss = self.total_loss / update_count
                avg_q = self.total_q / update_count

                # Explicit empty check instead of a bare except around np.max.
                if ep_rewards:
                    max_ep_reward = np.max(ep_rewards)
                    min_ep_reward = np.min(ep_rewards)
                    avg_ep_reward = np.mean(ep_rewards)
                else:
                    max_ep_reward, min_ep_reward, avg_ep_reward = 0,0,0

                print('\nAvg. reward: %.4f, Avg. loss: %.6f, Avg. Q: %3.6f' % (avg_reward, avg_loss, avg_q))
                print('\nAvg. Ep. Reward: %.4f, Max Ep. Reward: %.4f, Min Ep. Reward: %.4f, # Game: %d'\
                     % (avg_ep_reward, max_ep_reward, min_ep_reward, num_game))

                # Checkpoint when performance is within 90% of the best so far.
                if max_avg_ep_reward * 0.9 <= avg_ep_reward:
                    self.step_assign_op.eval({self.step_input: self.step+1})
                    self.save_model(self.step+1)
                    max_avg_ep_reward = max(max_avg_ep_reward, avg_ep_reward)

                if self.step > 180:
                    self.inject_summary({
                            'average.reward' : avg_reward,
                            'average.loss': avg_loss,
                            'average.q': avg_q,
                            'episode.max reward':max_ep_reward,
                            'episode.min reward':min_ep_reward,
                            'episode.avg reward':avg_ep_reward,
                            'episode.num of games':num_game,
                            'episode.rewards': ep_rewards,
                            'episode.actions': actions,
                            'training.learning_rate': self.learning_rate_op.eval({self.learning_rate_step:self.step})
                        })

                # Start a fresh statistics window.
                num_game = 0
                total_reward = 0.
                self.total_loss = 0.
                self.total_q = 0.
                self.update_count = 0
                ep_reward = 0.
                ep_rewards = []
                actions = []
            
    def predict(self,current_state,test_ep=None):
        # Calculate exploration prob. epsilon => This is a decaying epsilon starting at 1 and decreasing until 0.1
        # once the learning process start
        ep = test_ep or (ep_min + max(0.,(ep_start-ep_min)*(ep_end_time-max(0.,self.step-learn_start))/ep_end_time))
        if random.random() < ep:
            # Explore: random action
            action = random.randrange(self.env.action_size)
        else:
            action = self.q_action.eval({self.input_state:[current_state]})[0]
            
        return action
    
    def observe(self,screen,reward,action,terminal):
        """Record a transition and trigger learning when it is due.

        Args:
            screen: Screen observed after the action.
            reward: Raw reward; it is clipped to [min_reward, max_reward]
                before being stored.
            action: Action that was executed.
            terminal: Whether the episode ended with this step.
        """
        clipped_reward = max(min_reward, min(max_reward, reward))

        # Keep both the frame history and the replay memory up to date.
        self.history.add(screen)
        self.exp_replay.add(screen, clipped_reward, action, terminal)

        # No learning before the replay start size has been passed.
        if self.step <= learn_start:
            return

        # Train on a mini-batch every train_frequency steps.
        if self.step % train_frequency == 0:
            self.q_learning_mini_batch()

        # Refresh the target network on the last step of each update period.
        if self.step % target_q_update_step == target_q_update_step - 1:
            self.update_target_q_network()
        
    def q_learning_mini_batch(self):
        """Run one optimisation step on a batch sampled from the replay memory.

        Does nothing while the replay memory reports fewer than
        ``history_length`` entries. Otherwise it builds the targets from the
        target network, runs the training op, writes the Q summary and
        updates the running loss / Q totals and the update counter.
        """
        if self.exp_replay.count < history_length:
            # Not enough experiences
            return

        init_state, action, reward, end_state, terminal = self.exp_replay.sample_from_replay()

        # Standard DQN target: r + gamma * max_a Q_target(s', a), where the
        # bootstrap term is dropped for terminal transitions.
        next_q_values = self.target_q.eval({self.target_state: end_state})
        best_next_q = np.max(next_q_values, axis=1)
        is_terminal = np.array(terminal) + 0.   # booleans -> floats
        target_q_t = (1. - is_terminal) * gamma * best_next_q + reward

        fetches = [self.optim, self.q, self.loss, self.q_summary]
        feed = {
            self.target_q_t: target_q_t,
            self.action: action,
            self.input_state: init_state,
            self.learning_rate_step: self.step,
        }
        _, q_t, loss, summary_str = self.sess.run(fetches, feed)

        self.writer.add_summary(summary_str, self.step)
        self.total_loss += loss
        self.total_q += q_t.mean()
        self.update_count += 1
            
    def inject_summary(self,tag_dir):
        summary_str_lists = self.sess.run([self.summary_ops[tag] for tag in tag_dir.keys()], {
                self.summary_placeholders[tag]: value for tag , value in tag_dir.items()
            })
        for summary_str in summary_str_lists:
            self.writer.add_summary(summary_str,self.step)
            
    def play(self,n_step=10000,n_episode=100,test_epsilon=None,render=False):
        """Play episodes with the current network and report the best reward.

        Args:
            n_step: Maximum number of steps per episode.
            n_episode: Number of episodes to play.
            test_epsilon: Exploration probability during play; defaults to
                ``ep_min`` when None.
            render: Currently has no effect - the environment is rendered on
                every step regardless of this flag.
        """
        if test_epsilon is None:
            test_epsilon = ep_min

        test_history = History()
        # None (rather than 0) so that the first episode always sets the
        # baseline, which keeps the report correct for negative rewards.
        best_reward, best_idx = None, 0
        for idx in range(n_episode):
            screen = self.env.new_game()
            self.env.env.render()
            current_reward = 0

            # Fill the whole history buffer with the first screen.
            for _ in range(history_length):
                test_history.add(screen)

            for t in tqdm(range(n_step),ncols=70):
                # 1. Predict
                action = self.predict(test_history.get(),test_epsilon)

                # 2. Execute the action
                screen, reward, terminal = self.env.execute_action(action,is_training=False)
                self.env.env.render()

                # 3. New observation
                test_history.add(screen)

                current_reward += reward
                if terminal:
                    break

            if best_reward is None or current_reward > best_reward:
                best_reward = current_reward
                best_idx = idx

            print('='*30)
            print('[%d] Best reward: %d' % (best_idx, best_reward))
            print('='*30)

        # Close the render window once all episodes are done.
        self.env.env.render(close=True)
        