# Implementation of asynchronous one-step Q-learning

In [1]:
import itertools
import shutil
import threading
import multiprocessing
import gym
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim
import matplotlib.pyplot as plt
%matplotlib inline
import random
import sys
import cv2
import os
import time
from tqdm import tqdm
import Gym_Auxiliar as Aux_Gym

# Locate this source file on disk. `getsourcefile(lambda:0)` asks `inspect`
# for the file in which the throw-away lambda was defined, i.e. this file.
from inspect import getsourcefile
current_path = os.path.dirname(os.path.abspath(getsourcefile(lambda:0)))
# Parent directory of the directory that contains this file.
import_path = os.path.abspath(os.path.join(current_path, ".."))

# Make the parent directory importable, without adding a duplicate entry.
if import_path not in sys.path:
    sys.path.append(import_path)

# NOTE(review): presumably commonOps lives in the parent directory added above -- confirm.
import commonOps as cops

In [2]:
# Run configuration shared by the networks, the monitoring worker and the
# training workers.
conf_parameters = dict(
    # Output locations
    checkpoint_dir='Models/',
    log_dir='Logs/',

    # Environment and run length
    env_name='Breakout-v0',
    max_global_episodes=1000000,

    # Checkpoint the model every `save_rate` episodes
    save_rate=5000,

    # Episodes between copies of the training weights into the target network
    update_target=1000,
    # Number of buffered transitions that triggers an asynchronous update
    async_update=10,

    # Episodes between evaluations, and episodes played per evaluation
    eval_freq=300,
    eval_episodes=20,

    # When set, wipe the checkpoint and log directories and train from scratch
    reset=False,
    # Number of threads to run (a falsy value falls back to the CPU count)
    num_threads=4,

    # Network input size
    screen_width=84,
    screen_height=84,
    history_length=4,

    # Gym environment
    pool_frame_size=1,
    random_start=30,    # Upper bound on 'do nothing' actions at episode start
    action_repeat=1,    # How many times each chosen action is repeated

    gamma=0.99,               # Discount factor
    learning_rate=0.00025,    # Learning rate

    # Reward clipping range
    min_reward=-1,
    max_reward=1,
)

# Worker count: the configured thread count wins; otherwise one per CPU core.
num_workers = multiprocessing.cpu_count()
num_workers = conf_parameters['num_threads'] or num_workers

_run_dirs = (conf_parameters['checkpoint_dir'], conf_parameters['log_dir'])

# Optionally start from a clean slate.
if conf_parameters['reset']:
    for _run_dir in _run_dirs:
        shutil.rmtree(_run_dir, ignore_errors=True)

# Make sure both output directories exist.
for _run_dir in _run_dirs:
    if not os.path.exists(_run_dir):
        os.makedirs(_run_dir)

## Class defining global shared networks

In [3]:
class ConvNet_Estimator():
    '''Q-value estimator neural network.

    The same architecture is used both for the Q-network (net_type == 0,
    variable scope 'Global_Training') and for the Target network (any other
    net_type, variable scope 'Global_Target'). Only the Q-network gets the
    loss, the training op and the Tensorboard summaries.
    '''

    def __init__(self,conf,num_actions,net_type=0):
        '''
        Stores the hyper-parameters and builds the graph inside the network's
        variable scope.

        conf: configuration dict; reads 'screen_width', 'screen_height',
              'history_length', 'learning_rate' and 'gamma'.
        num_actions: number of discrete actions (size of the output layer).
        net_type: 0 for the training network, anything else for the target network.
        '''
        if net_type == 0:
            self.scope = 'Global_Training'
            self.collection = 'Normal'
        else:
            self.scope = 'Global_Target'
            self.collection = 'Target'

        self.num_actions = num_actions
        self.screen_width = conf['screen_width']
        self.screen_height = conf['screen_height']
        self.history_length = conf['history_length']
        self.learning_rate = conf['learning_rate']
        self.gamma = conf['gamma']

        with tf.variable_scope(self.scope):
            # Build the graph
            self._build_model()

    def _build_model(self):
        '''
        Builds the network architecture: two convolutions, one hidden fully
        connected layer and a linear Q-value output, plus (for the training
        network only) the TD loss, the training op and the summaries.
        '''
        # Batch of stacked screens: [batch, height, width, history_length]
        self.state_ph = tf.placeholder(tf.float32,[None, self.screen_height, self.screen_width, self.history_length],name='X')

        self.conv1 = slim.conv2d(activation_fn=tf.nn.elu,
                inputs=self.state_ph,num_outputs=16,
                kernel_size=[8,8],stride=[4,4],padding='VALID', scope="conv1")
        self.conv2 = slim.conv2d(activation_fn=tf.nn.elu,
                inputs=self.conv1,num_outputs=32,
                kernel_size=[4,4],stride=[2,2],padding='VALID', scope="conv2")
        hidden = slim.fully_connected(slim.flatten(self.conv2),256,activation_fn=tf.nn.elu, scope="FC")

        # Linear output layer. Q-values are regressed towards
        # reward + gamma * max Q, which is neither bounded to [0, 1] nor
        # normalised across actions, so a softmax here would make the targets
        # unreachable. The bias is still omitted to keep the variable set unchanged.
        self.Q_output = slim.fully_connected(hidden,self.num_actions,
                activation_fn=None,
                biases_initializer=None, scope='Output')

        with tf.name_scope("Best_Action"):
            # Greedy action: index of the largest Q-value of each batch row
            self.best_action = tf.argmax(self.Q_output,dimension=1)

        if self.scope != 'Global_Target':
            with tf.name_scope("loss"):
                with tf.name_scope("Inputs"):
                    # max_a Q_target(s', a), computed by the caller with the target network
                    self.QT_ph = tf.placeholder(tf.float32,[None],name='QT_ph')
                    # Integer id of selected action
                    self.action_ph = tf.placeholder(tf.int32,[None],name='action_ph')

                    self.reward_ph = tf.placeholder(tf.float32, [None], name="reward_ph")
                    # 1.0 where the transition ended the episode, 0.0 otherwise
                    self.terminal_ph = tf.placeholder(tf.float32, [None], name="terminal_ph")

                with tf.name_scope("Acted_Q"):
                    # One hot of the action which was taken
                    action_one_hot = tf.one_hot(self.action_ph,self.num_actions, 1., 0., name='action_one_hot')
                    # Get the prediction of the chosen actions only
                    acted_Q = tf.reduce_sum(self.Q_output * action_one_hot, reduction_indices=1, name='DQN_acted')

                with tf.name_scope("Target_Q"):
                    # One-step TD target; no bootstrapping from terminal transitions
                    Y = self.reward_ph + self.gamma * self.QT_ph * (1 - self.terminal_ph)
                    # The target is a constant with respect to the optimised weights
                    Y = tf.stop_gradient(Y)

                # NOTE(review): clipped_l2 is defined in commonOps; presumably a
                # per-sample clipped squared error -- confirm there.
                loss_batch = cops.clipped_l2(Y, acted_Q)
                loss = tf.reduce_sum(loss_batch, name="loss")

                # NOTE(review): the meaning of the positional arguments
                # (0.95, 0.01, 1) is defined in commonOps -- confirm there.
                self.train_op, grads = cops.graves_rmsprop_optimizer(loss, self.learning_rate, 0.95, 0.01, 1)

            with tf.name_scope("Summaries"):
                # Summaries for Tensorboard
                self.summaries = tf.merge_summary([
                        tf.scalar_summary('losses/loss', loss),
                        tf.scalar_summary("losses/loss_max", tf.reduce_max(loss_batch)),
                        tf.scalar_summary('Q/avg_q',tf.reduce_mean(self.Q_output)),
                        tf.histogram_summary('Q/q_values_hist',self.Q_output),
                        tf.scalar_summary('Q/max_q_value',tf.reduce_max(self.Q_output)),
                        tf.scalar_summary('Others/reward_max', tf.reduce_max(self.reward_ph))
                        ])

    def predict(self,sess,state):
        '''
        Predicts action values.

        state: batch of states fed to the state placeholder.
        Returns the evaluated Q_output tensor (one row of Q-values per state).
        '''
        return sess.run(self.Q_output,{self.state_ph:state})

    def determine_action(self,sess,state,epsilon):
        '''
        Predicts action based on q values for current state and exploration strategy.

        state: a single state (it is wrapped into a batch of one here).
        epsilon: probability of taking a uniformly random action.
        Returns the id of the chosen action.
        '''
        if random.random() < epsilon:
            # Explore: random action
            action = random.randrange(self.num_actions)
        else:
            # Exploit: greedy action for this single state
            action = sess.run(self.best_action,{self.state_ph:[state]})[0]

        return action

    def update(self,sess,state,action,reward,target,terminal):
        '''
        Updates the estimator towards the given targets.

        Runs one training step on the batch and returns the evaluated
        summaries. Only usable on the training network, since the target
        network has no training op.
        '''
        feed_dict = {self.state_ph:state, self.QT_ph:target,
                     self.action_ph:action, self.reward_ph:reward,
                     self.terminal_ph:terminal}
        _, summaries = sess.run([self.train_op, self.summaries],feed_dict)

        return summaries

## History
This class will allow us to stack the last K screens to use them as the input to the network (history of states).

In [4]:
class History:
    '''Rolling buffer with the last `history_length` screens.

    Screens are stored oldest first in an array of shape
    [history_length, screen_height, screen_width].
    '''

    def __init__(self, conf):
        '''Allocates a zero-filled float32 buffer sized from `conf`.'''
        self.history = np.zeros([conf['history_length'],conf['screen_height'],conf['screen_width']],dtype=np.float32)

    def add(self, screen):
        '''Drops the oldest screen and appends `screen` as the newest one.'''
        self.history[:-1] = self.history[1:]
        self.history[-1] = screen

    def get(self):
        '''
        Returns the stacked screens as [screen_height, screen_width, history_length].

        The result is a copy. np.transpose alone returns a view of the
        internal buffer, which add() overwrites in place, so every state a
        caller had stored earlier would silently turn into the latest state.
        '''
        return np.transpose(self.history,(1,2,0)).copy()

## Evaluation worker

This worker will be in charge of updating the target network, evaluating the performance of the training network and saving the model.

In [5]:
class Monitoring_Worker():
    '''
    Monitors the global episode counter, updates the target network, tests
    progress and saves checkpoints.
    '''
    def __init__(self, name, conf, Q_net, Target_net, global_episodes, summary_writer=None):
        '''
        name: label used in log messages.
        conf: configuration dict; reads 'max_global_episodes', 'update_target',
              'env_name', 'eval_freq', 'eval_episodes', 'history_length',
              'checkpoint_dir' and 'save_rate'.
        Q_net / Target_net: the shared training and target estimators.
        global_episodes: tensor holding the shared episode counter.
        summary_writer: optional writer for the evaluation scores.
        '''
        self.name = name
        self.max_global_episodes = conf['max_global_episodes']
        self.update_target_freq = conf['update_target']
        self.global_Q_net = Q_net
        self.global_Target_net = Target_net
        self.global_episodes = global_episodes
        self.writer = summary_writer
        # True while the target network is already synced for the current counter value
        self.updated = False

        self.eval_env = Aux_Gym.GymEnvironment(conf['env_name'],conf)
        self.eval_freq = conf['eval_freq']
        self.eval_episodes = conf['eval_episodes']
        self.history_length = conf['history_length']

        self.eval_history = History(conf)

        self.saver = tf.train.Saver(max_to_keep = 3)
        self.checkpoint_dir = conf['checkpoint_dir']
        self.save_rate = conf['save_rate']

        with tf.name_scope("Copy_Parameters"):
            self.update_target_ops = self.update_target_graph(self.global_Q_net.scope,self.global_Target_net.scope)

    def update_target_graph(self,from_scope,to_scope):
        '''
        Builds the ops that copy every trainable variable of `from_scope` into
        the variable at the same position in `to_scope`.
        Returns the list of assign ops.
        '''
        from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
        to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)

        return [to_var.assign(from_var) for from_var,to_var in zip(from_vars,to_vars)]

    def play(self,sess,test_epsilon=0.01):
        '''
        Plays `eval_episodes` evaluation games with the training network.

        test_epsilon: exploration probability used while testing.
        Returns the list with the total (unclipped) reward of each episode.
        '''
        ep_rewards = []

        for episode in range(self.eval_episodes):
            # New testing game
            screen, terminal, score = self.eval_env.new_game(), False, 0.
            # Fill the whole history with the first screen
            for _ in range(self.history_length):
                self.eval_history.add(screen)

            while not terminal:
                # Take action with exploration e = test_epsilon
                action = self.global_Q_net.determine_action(sess,self.eval_history.get(),epsilon=test_epsilon)
                # Play game in test mode (Episodes do not end when losing a life)
                screen, reward, terminal = self.eval_env.execute_action(action,is_training=False)
                self.eval_history.add(screen)
                # Record every reward
                score += reward

            ep_rewards.append(score)

        return ep_rewards

    def save_model(self,sess,step=None):
        '''Saves a checkpoint named 'model' in the checkpoint directory, tagged with `step`.'''
        # %s rather than %d: the default step=None must not break the log line
        print('Saving model at step %s' % step)
        self.saver.save(sess,os.path.join(self.checkpoint_dir,'model'),global_step=step)

    def run(self, sess, coord):
        '''
        Polls the global episode counter until the coordinator stops or the
        maximum number of episodes is reached. Any exception is reported to
        the coordinator, which stops the other threads.
        '''
        print("Starting " + str(self.name))
        # This loop polls much faster than the counter advances. Remember the
        # counter value already handled so evaluation and saving happen once
        # per matching value instead of on every poll.
        last_evaluated = None
        last_saved = None
        with sess.as_default(), sess.graph.as_default():
            try:
                while not coord.should_stop():
                    episode_count = sess.run(self.global_episodes)

                    # Copy parameters from training to target network
                    if episode_count % self.update_target_freq == 0:
                        if not self.updated:
                            print('Updating target network')
                            sess.run(self.update_target_ops)
                            self.updated = True
                    else:
                        self.updated = False

                    # Evaluate performance
                    eval_due = episode_count % self.eval_freq == self.eval_freq - 1
                    if eval_due and episode_count != last_evaluated:
                        last_evaluated = episode_count
                        print('Evaluating at step %d' % episode_count)
                        ep_rewards = self.play(sess,test_epsilon=0.01)

                        # Nothing to report without a writer or without played episodes
                        if self.writer is not None and ep_rewards:
                            performance_summary = tf.Summary(
                                value=[
                                    tf.Summary.Value(
                                        tag="score/average",
                                        simple_value=sum(ep_rewards)/len(ep_rewards)),
                                    tf.Summary.Value(
                                        tag="score/max",
                                        simple_value=max(ep_rewards)),
                                    tf.Summary.Value(
                                        tag="score/min",
                                        simple_value=min(ep_rewards)),
                                ])

                            self.writer.add_summary(performance_summary, episode_count)

                    # Save the model
                    save_due = ((episode_count % self.save_rate == 0 and episode_count != 0)
                                or (episode_count == self.max_global_episodes))
                    if save_due and episode_count != last_saved:
                        last_saved = episode_count
                        self.save_model(sess,episode_count)

                    # Stop thread
                    if episode_count >= self.max_global_episodes:
                        print("Finishing " + str(self.name))
                        coord.request_stop()
                        return

            except Exception as e:
                # Report exceptions to the coordinator.
                coord.request_stop(e)
                return

## Training workers

Each worker will run in a different training thread.

In [6]:
class Worker():
    '''
    Training worker: plays episodes in its own environment with a fixed
    exploration rate and updates the shared Q-network asynchronously.
    '''
    def __init__(self,name, conf, Q_net, Target_net, global_episodes, epsilon, summary_writer=None):
        '''
        name: label of the worker; only 'Worker_1' advances the episode counter.
        conf: configuration dict; reads 'max_global_episodes', 'async_update',
              'env_name', 'history_length', 'min_reward' and 'max_reward'.
        Q_net / Target_net: the shared training and target estimators.
        global_episodes: tensor holding the shared episode counter.
        epsilon: exploration probability of this worker.
        summary_writer: optional writer for the per-episode statistics.
        '''
        self.name = name
        self.max_global_episodes = conf['max_global_episodes']
        self.global_Q_net = Q_net
        self.global_Target_net = Target_net
        self.global_episodes = global_episodes
        self.summary_writer = summary_writer
        self.async_update = conf['async_update']

        self.env = Aux_Gym.GymEnvironment(conf['env_name'],conf)
        self.history_length = conf['history_length']
        self.history = History(conf)

        self.min_reward = conf['min_reward']
        self.max_reward = conf['max_reward']

        self.epsilon = epsilon

        if self.name == 'Worker_1':
            with tf.name_scope("Increment_Episode"):
                self.increment = self.global_episodes.assign_add(1)

    def train(self,sess,episode_buffer):
        '''
        Runs one update of the Q-network on the buffered transitions.

        episode_buffer: list of [state, action, reward, end_state, terminal].
        Returns the summaries produced by the update.
        '''
        state, action , reward, end_state, terminal = map(np.array, zip(*episode_buffer))

        # Get the Q-value of the next state
        q_t_plus_1 = self.global_Target_net.predict(sess,end_state)
        # Get max Q_t+1
        target_q = np.max(q_t_plus_1, axis=1)

        summary = self.global_Q_net.update(sess,state,action,reward,target_q,terminal)

        return summary

    def run(self,sess,coord):
        '''
        Plays and learns from episodes until the coordinator stops or the
        maximum number of global episodes is reached. Any exception is
        reported to the coordinator, which stops the other threads.
        '''
        print("Starting " + str(self.name))
        with sess.as_default(), sess.graph.as_default():
            try:
                while not coord.should_stop():
                    episode_count = sess.run(self.global_episodes)
                    episode_buffer = []
                    num_steps = 0
                    ep_rewards = []

                    screen, terminal = self.env.new_game(), False

                    # Fill the whole history with the first screen
                    for _ in range(self.history_length):
                        self.history.add(screen)

                    while not terminal:
                        # Snapshot of the current state. It must be a copy:
                        # the history buffer is overwritten in place by add(),
                        # so a stored view would end up showing the latest
                        # screens for every buffered transition.
                        prev_state = np.array(self.history.get())

                        # 1. Predict: Use our training network to select an action or explore according to epsilon
                        action = self.global_Q_net.determine_action(sess,prev_state,self.epsilon)

                        # 2. Execute the action
                        screen, reward, terminal = self.env.execute_action(action,is_training=True)

                        # Compute stats to add to summaries (unclipped reward)
                        num_steps += 1
                        ep_rewards.append(reward)

                        # Clip reward
                        reward = max(self.min_reward,min(self.max_reward,reward))

                        # 3. Add screen to buffer
                        self.history.add(screen)
                        next_state = np.array(self.history.get())

                        episode_buffer.append([prev_state,action,reward,next_state,terminal])

                        # Asynchronous update every `async_update` transitions;
                        # the terminal case is handled right after the loop.
                        if len(episode_buffer) == self.async_update and not terminal:
                            self.train(sess,episode_buffer)
                            episode_buffer = []

                    # Update the network using the experience buffer at the end of the episode.
                    if episode_buffer:
                        self.train(sess,episode_buffer)

                    if self.summary_writer is not None:
                        performance_summary = tf.Summary(
                            value=[
                                tf.Summary.Value(
                                    tag="Online/average",
                                    # float(): avoid integer division if rewards are ints
                                    simple_value=sum(ep_rewards)/float(len(ep_rewards))),
                                tf.Summary.Value(
                                    tag="Online/max",
                                    simple_value=max(ep_rewards)),
                                tf.Summary.Value(
                                    tag="Online/min",
                                    simple_value=min(ep_rewards)),
                                tf.Summary.Value(
                                    tag="Online/num_steps",
                                    simple_value=num_steps),
                                tf.Summary.Value(
                                    tag="Online/Score",
                                    simple_value=sum(ep_rewards)),
                            ])
                        self.summary_writer.add_summary(performance_summary, episode_count)

                    # Stop thread
                    if episode_count >= self.max_global_episodes:
                        print("Finishing " + str(self.name))
                        coord.request_stop()
                        return

                    # Only the first training worker advances the shared counter
                    if self.name == 'Worker_1':
                        sess.run(self.increment)

            except Exception as e:
                # Report exceptions to the coordinator.
                coord.request_stop(e)
                return

In [None]:
# Open the environment once just to read the number of available actions.
env = Aux_Gym.GymEnvironment(conf_parameters['env_name'],conf_parameters)
num_actions = env.env.action_space.n
env.env.close()

# Shared networks: net_type 0 is trained, net_type 1 provides the TD targets.
Q_Network = ConvNet_Estimator(conf_parameters,num_actions,net_type=0)
Target_Network = ConvNet_Estimator(conf_parameters,num_actions,net_type=1)

with tf.device("/cpu:0"):
    global_episodes = tf.Variable(0,dtype=tf.int32,name='global_episodes',trainable=False)
    summary_writer = tf.train.SummaryWriter(conf_parameters['log_dir'], flush_secs=30)

    workers = []
    # Worker_0 does not train: it syncs the target network, evaluates and saves.
    Eval_worker = Monitoring_Worker(
            name='Worker_0',
            conf=conf_parameters,
            Q_net=Q_Network,
            Target_net=Target_Network,
            global_episodes=global_episodes,
            summary_writer = summary_writer)

    workers.append(Eval_worker)

    # We are going to assign different exploration probabilities to each worker,
    # spread linearly from 0.1 up to 0.9. With a single training worker there
    # is nothing to spread, so the slope is 0 (this also avoids dividing by zero).
    num_training_workers = num_workers - 1
    if num_training_workers > 1:
        epsilon_slope = 0.8 / float(num_training_workers - 1)
    else:
        epsilon_slope = 0.0
    # Create workers
    for worker_id in range(1,num_workers):
        # Only the first training worker reports its episode statistics
        worker_summary_writer = None
        if worker_id == 1:
            worker_summary_writer = summary_writer

        worker = Worker(
            name="Worker_{}".format(worker_id),
            conf=conf_parameters,
            Q_net=Q_Network,
            Target_net=Target_Network,
            global_episodes=global_episodes,
            epsilon=(worker_id-1)*epsilon_slope + 0.1,
            summary_writer = worker_summary_writer)

        workers.append(worker)

with tf.Session() as sess:
    summary_writer.add_graph(sess.graph)

    sess.run(tf.initialize_all_variables())
    coord = tf.train.Coordinator()

    # Start one thread per worker. The bound method is passed directly: a
    # `lambda: worker.run(...)` would look `worker` up only when the thread
    # actually runs, by which time the loop may already point at the next worker.
    worker_threads = []
    for worker in workers:
        t = threading.Thread(target=worker.run, args=(sess, coord))
        t.start()
        worker_threads.append(t)

    try:
        # Wait for all workers to finish
        coord.join(worker_threads)
    finally:
        # Flush the buffered events even when a worker reported an exception
        summary_writer.close()