# Implementation of asynchronous one-step Q-learning

In [1]:
import sys
import cv2
import os
import itertools
import shutil
import threading
import multiprocessing
import gym
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import random
import time

from inspect import getsourcefile

# Locate the directory holding this file (getsourcefile on a throw-away lambda
# works even where __file__ is not defined) and make its parent importable.
# NOTE(review): the parent directory is presumably where commonOps lives - confirm.
current_path = os.path.dirname(os.path.abspath(getsourcefile(lambda: 0)))
import_path = os.path.abspath(os.path.join(current_path, ".."))

if import_path not in sys.path:
    sys.path.append(import_path)

# Drop one machine-specific TensorFlow build directory from the search path.
# list.remove raises ValueError when the entry is missing, which is the case on
# every machine but the original author's, so a missing entry is ignored.
try:
    sys.path.remove('/home/camilog/tensorflow/_python_build')
except ValueError:
    pass

import commonOps as cops


import tensorflow as tf
import tensorflow.contrib.slim as slim



In [2]:
# Every tunable of the experiment lives in this single dictionary.
conf_parameters = {
    # Length of the run and evaluation schedule (both counted in episodes)
    'num_episodes': 50000,
    'eval_freq': 500,
    'eval_episodes': 5,

    # Shape of the network input: frame size and number of stacked frames
    'screen_width': 84,
    'screen_height': 84,
    'history_length': 4,

    # Learning
    'gamma': 0.99,                 # discount factor
    'learning_rate': 0.00025,      # optimizer learning rate

    # Upper bound on 'do nothing' actions at the start of an episode
    'random_start': 30,

    # Exploration schedule
    'ep_min': 0.1,                 # final exploration rate
    'ep_start': 1.0,               # initial exploration rate

    # Periodic events
    'target_q_update_step': 50,    # target network update frequency
    'log_online_summary_rate': 50,
    'save_rate': 1000,
    'async_update': 10,

    # Rewards are clipped to this interval
    'min_reward': -1.0,
    'max_reward': 1.0,

    # Output locations
    'checkpoint_dir': 'Models/',
    'log_dir': 'Logs/',

    # Environment and parallelism
    'env_name': 'Breakout-v0',
    'num_threads': 10
}

# Number of workers: the configured thread count when it is set (non-zero),
# otherwise one worker per CPU core.
num_workers = (conf_parameters['num_threads']
               if conf_parameters['num_threads']
               else multiprocessing.cpu_count())

# Make sure the checkpoint and log directories exist before anything writes to them.
for _dir_key in ('checkpoint_dir', 'log_dir'):
    if not os.path.exists(conf_parameters[_dir_key]):
        os.makedirs(conf_parameters[_dir_key])

## Class defining global shared networks

In [3]:
class ConvNet_Estimator(object):
    '''Q-value estimator neural network.

    The same architecture is used both for the Q-network (net_type == 0,
    variable scope 'Global_Training') and for the target network (any other
    net_type, variable scope 'Global_Target'). Only the Q-network gets a
    loss, a training op and TensorBoard summaries.
    '''

    def __init__(self, conf, num_actions, net_type=0):
        '''
        Reads the input geometry and learning hyper-parameters from conf and
        builds the graph inside the variable scope selected by net_type.

        conf:        dict with 'screen_width', 'screen_height',
                     'history_length', 'learning_rate' and 'gamma'
        num_actions: number of discrete actions, i.e. width of the output layer
        net_type:    0 for the trained Q-network, anything else for the target
        '''
        if net_type == 0:
            self.scope = 'Global_Training'
            self.collection = 'Normal'
        else:
            self.scope = 'Global_Target'
            self.collection = 'Target'

        self.num_actions = num_actions
        self.screen_width = conf['screen_width']
        self.screen_height = conf['screen_height']
        self.history_length = conf['history_length']
        self.learning_rate = conf['learning_rate']
        self.gamma = conf['gamma']

        with tf.variable_scope(self.scope):
            # Build the graph
            self._build_model()

    def _build_model(self):
        '''
        Builds the network: two ELU convolutions (16 filters 8x8 stride 4,
        32 filters 4x4 stride 2), a 256-unit ELU dense layer and a linear
        output with one Q-value per action. For the Q-network it also adds
        the TD loss, the training op and the summaries.
        '''
        self.state_ph = tf.placeholder(
            tf.float32,
            [None, self.screen_height, self.screen_width, self.history_length],
            name='X')
        self.norm_input = tf.div(self.state_ph, 256., name='normalized_input')

        self.conv1 = slim.conv2d(activation_fn=tf.nn.elu,
                inputs=self.norm_input, num_outputs=16,
                kernel_size=[8, 8], stride=[4, 4], padding='VALID', scope="conv1")
        self.conv2 = slim.conv2d(activation_fn=tf.nn.elu,
                inputs=self.conv1, num_outputs=32,
                kernel_size=[4, 4], stride=[2, 2], padding='VALID', scope="conv2")
        hidden = slim.fully_connected(slim.flatten(self.conv2), 256,
                activation_fn=tf.nn.elu, scope="FC")

        # The output must be linear. Q-values are regressed onto
        # r + gamma * max Q', which is unbounded; a softmax would squash them
        # into (0, 1) and force them to sum to one, so the targets could never
        # be matched.
        self.Q_output = slim.fully_connected(hidden, self.num_actions,
                activation_fn=None,
                biases_initializer=None, scope='Output')

        with tf.name_scope("Best_Action"):
            self.best_action = tf.argmax(self.Q_output, dimension=1)

        if self.scope != 'Global_Target':
            with tf.name_scope("loss"):
                with tf.name_scope("Inputs"):
                    # max_a Q_target(s', a), computed by the caller
                    self.QT_ph = tf.placeholder(tf.float32, [None], name='QT_ph')
                    # Integer id of selected action
                    self.action_ph = tf.placeholder(tf.int32, [None], name='action_ph')

                    self.reward_ph = tf.placeholder(tf.float32, [None], name="reward_ph")
                    # 1.0 where the transition ended the episode, else 0.0
                    self.terminal_ph = tf.placeholder(tf.float32, [None], name="terminal_ph")

                with tf.name_scope("Acted_Q"):
                    # One hot of the action which was taken
                    action_one_hot = tf.one_hot(self.action_ph, self.num_actions, 1., 0.,
                                                name='action_one_hot')
                    # Get the prediction of the chosen actions only
                    acted_Q = tf.reduce_sum(self.Q_output * action_one_hot,
                                            reduction_indices=1, name='DQN_acted')

                with tf.name_scope("Target_Q"):
                    # One-step TD target; the bootstrap term is dropped on terminal steps
                    Y = self.reward_ph + self.gamma * self.QT_ph * (1 - self.terminal_ph)
                    Y = tf.stop_gradient(Y)

                # NOTE(review): clipped_l2 and graves_rmsprop_optimizer come from
                # commonOps; the trailing optimizer arguments are presumably decay,
                # epsilon and a gradient-clipping value - confirm there.
                loss_batch = cops.clipped_l2(Y, acted_Q)
                loss = tf.reduce_sum(loss_batch, name="loss")

                self.train_op, grads = cops.graves_rmsprop_optimizer(
                    loss, self.learning_rate, 0.95, 0.01, 1)

            with tf.name_scope("Summaries"):
                # Summaries for Tensorboard
                self.summaries = tf.merge_summary([
                        tf.scalar_summary('losses/loss', loss),
                        tf.scalar_summary("losses/loss_max", tf.reduce_max(loss_batch)),
                        tf.scalar_summary('Q/avg_q', tf.reduce_mean(self.Q_output)),
                        tf.histogram_summary('Q/q_values_hist', self.Q_output),
                        tf.scalar_summary('Q/max_q_value', tf.reduce_max(self.Q_output)),
                        tf.scalar_summary('Others/reward_max', tf.reduce_max(self.reward_ph))
                        ])

    def predict(self, sess, state):
        '''
        Predicts action values.

        Returns the Q_output tensor evaluated on a batch of states, i.e. one
        row of num_actions values per state.
        '''
        return sess.run(self.Q_output, {self.state_ph: state})

    def determine_action(self, sess, state, epsilon):
        '''
        Epsilon-greedy action selection.

        With probability epsilon returns a uniformly random action id;
        otherwise returns the greedy action for the first state of the batch.
        '''
        if random.random() < epsilon:
            # Explore: random action
            action = random.randrange(self.num_actions)
        else:
            # Exploit: argmax over the predicted Q-values
            action = sess.run(self.best_action, {self.state_ph: state})[0]

        return action

    def update(self, sess, state, action, reward, target, terminal):
        '''
        Runs one training step towards the given targets and returns the
        serialized summaries produced by that step. Only valid on the
        Q-network; the target network has no train_op.
        '''
        feed_dict = {self.state_ph: state, self.QT_ph: target,
                     self.action_ph: action, self.reward_ph: reward,
                     self.terminal_ph: terminal}
        _, summaries = sess.run([self.train_op, self.summaries], feed_dict)

        return summaries

## Training workers

Each worker runs in its own training thread.

In [4]:
class Worker(object):
    '''One actor-learner thread.

    Each worker owns a gym environment and a frame history, plays episodes
    with its own fixed exploration rate and applies updates to the shared
    Q-network. The worker named 'Worker_0' additionally copies the Q-network
    into the target network, advances the global episode counter, writes
    summaries, evaluates and saves checkpoints.
    '''

    def __init__(self, name, conf, Q_net, Target_net, global_episodes, epsilon, summary_writer=None):
        '''
        name:            worker name; 'Worker_0' gets the housekeeping duties
        conf:            configuration dictionary
        Q_net:           shared estimator that is trained
        Target_net:      shared estimator used for the bootstrap targets
        global_episodes: shared TensorFlow variable counting episodes
        epsilon:         exploration rate used by this worker while training
        summary_writer:  writer used by 'Worker_0'; may be None for the others
        '''
        self.name = name
        self.config = conf
        self.global_Q_net = Q_net
        self.global_Target_net = Target_net
        self.global_episodes = global_episodes
        self.summary_writer = summary_writer

        self.env = gym.make(conf['env_name'])
        # Laid out as (batch, height, width, history): cv2.resize in observe()
        # yields (height, width) frames and the estimator's state placeholder
        # uses the same order. The previous (width, height) order only worked
        # because the screen is square.
        self.game_state = np.zeros(
            (1, conf['screen_height'], conf['screen_width'], conf['history_length']),
            dtype=np.uint8)
        self.reset_game()

        self.epsilon = epsilon

        if self.name == 'Worker_0':
            self.saver = tf.train.Saver(max_to_keep=5)

            with tf.name_scope("Increment_Episode"):
                self.increment = self.global_episodes.assign_add(1)

            with tf.name_scope("Copy_Parameters"):
                self.update_target_ops = self.update_target_graph(
                    self.global_Q_net.scope, self.global_Target_net.scope)

    def train(self, sess, episode_buffer):
        '''
        Applies one update of the Q-network from a list of transitions
        [state, action, reward, next_state, terminal] and returns the
        summaries produced by that update.
        '''
        # Transpose the list of transitions into one array per field
        state, action, reward, end_state, terminal = map(np.array, zip(*episode_buffer))

        # Get the Q-value of the next state
        q_t_plus_1 = self.global_Target_net.predict(sess, end_state)
        # Get max Q_t+1
        target_q = np.max(q_t_plus_1, axis=1)

        summary = self.global_Q_net.update(sess, state, action, reward, target_q, terminal)

        return summary

    def update_target_graph(self, from_scope, to_scope):
        '''
        Returns the list of assign ops that copy every trainable variable of
        from_scope into the variable at the same position in to_scope.
        '''
        from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
        to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)

        return [to_var.assign(from_var) for from_var, to_var in zip(from_vars, to_vars)]

    def test_play(self, sess, test_epsilon=0.01):
        '''
        Plays conf['eval_episodes'] evaluation episodes with exploration rate
        test_epsilon and returns the list of unclipped episode scores, one
        entry per episode. The frame history is cleared before every episode
        and once more before returning.
        '''
        ep_scores = []

        for _ in range(self.config['eval_episodes']):
            # New game
            self.reset_game()
            screen, terminal, score = self.env.reset(), False, 0

            while not terminal:
                action = self.step(screen, test_epsilon, sess)
                screen, r, terminal, _ = self.env.step(action)
                score += r

            # Only whole-episode scores are reported; mixing in the per-step
            # rewards would distort the average and pin the minimum to 0.
            ep_scores.append(score)

        self.reset_game()
        return ep_scores

    def save_model(self, sess, step=None):
        '''Writes a checkpoint named 'model' into conf['checkpoint_dir'].'''
        # %s rather than %d so that the default step=None does not raise
        print('Saving model at step %s' % step)
        self.saver.save(sess, os.path.join(self.config['checkpoint_dir'], 'model'), global_step=step)

    def reset_game(self):
        '''Clears the frame history in place.'''
        self.game_state.fill(0)

    def observe(self, x):
        '''
        Resizes frame x to the configured screen size, converts it from RGB
        to grayscale and pushes it into the frame history as the newest frame
        (the oldest frame is dropped).
        '''
        # cv2.resize takes the target size as (width, height)
        x_ = cv2.resize(x, (self.config['screen_width'], self.config['screen_height']))
        x_ = cv2.cvtColor(x_, cv2.COLOR_RGB2GRAY)
        # np.roll returns a new array, so earlier snapshots are not modified
        self.game_state = np.roll(self.game_state, -1, axis=3)
        self.game_state[0, :, :, -1] = x_

    def step(self, screen, eps, sess=None):
        '''
        Observes screen and returns the epsilon-greedy action for the
        resulting state. When sess is omitted the thread's default session is
        used instead of relying on a module-level global.
        '''
        if sess is None:
            sess = tf.get_default_session()
        self.observe(screen)
        self.game_action = self.global_Q_net.determine_action(sess, self.game_state, eps)
        return self.game_action

    def _play_training_episode(self, sess):
        '''
        Plays one training episode, updating the Q-network every
        conf['async_update'] steps and once more at the end of the episode.
        Returns (unclipped per-step rewards, number of steps).
        '''
        episode_buffer = []
        ep_rewards = []
        num_steps = 0

        self.reset_game()
        # Observe the first frame up front so that every stored transition is
        # (state the action was chosen from, action, reward, state after it).
        self.observe(self.env.reset())
        terminal = False

        while not terminal:
            # Copies, because reset_game() clears the history array in place
            state = self.game_state[0].copy()
            action = self.global_Q_net.determine_action(sess, self.game_state, self.epsilon)
            self.game_action = action

            screen, r, terminal, _ = self.env.step(action)
            self.observe(screen)
            num_steps += 1
            ep_rewards.append(r)
            # Clip the reward used for learning; the raw one is kept for logging
            r = max(self.config['min_reward'], min(self.config['max_reward'], r))

            new_state = self.game_state[0].copy()
            episode_buffer.append([state, action, r, new_state, terminal])

            if len(episode_buffer) == self.config['async_update'] and not terminal:
                self.train(sess, episode_buffer)
                episode_buffer = []

        self.reset_game()

        # Update the network using what is left in the buffer at the end of the episode.
        if episode_buffer:
            self.train(sess, episode_buffer)

        return ep_rewards, num_steps

    def _write_summary(self, tagged_values, step):
        '''Writes (tag, value) pairs as one scalar summary at the given step.'''
        summary = tf.Summary(
            value=[tf.Summary.Value(tag=tag, simple_value=value)
                   for tag, value in tagged_values])
        self.summary_writer.add_summary(summary, step)

    def run(self, sess, coord):
        '''
        Thread body: plays and learns from episodes until the coordinator
        asks to stop or the global episode counter reaches
        conf['num_episodes']. Any exception is reported to the coordinator.
        '''
        print("Starting " + str(self.name))
        is_chief = self.name == 'Worker_0'
        with sess.as_default(), sess.graph.as_default():
            try:
                while not coord.should_stop():
                    episode_count = sess.run(self.global_episodes)

                    # UPDATE TARGET NETWORK
                    if is_chief and episode_count % self.config['target_q_update_step'] == 0:
                        print(self.name + ': Updating target network')
                        sess.run(self.update_target_ops)

                    # TRAIN
                    ep_rewards, num_steps = self._play_training_episode(sess)

                    # STOP THREAD
                    if episode_count >= self.config['num_episodes']:
                        if is_chief:
                            # Final checkpoint. It has to be written here: the
                            # periodic save below is never reached once the
                            # episode limit has been hit.
                            self.save_model(sess, episode_count)
                        print("Finishing " + str(self.name))
                        coord.request_stop()
                        return

                    if is_chief:
                        if episode_count % self.config['log_online_summary_rate'] == 0:
                            self._write_summary([
                                ("Online/average", sum(ep_rewards) / float(len(ep_rewards))),
                                ("Online/max", max(ep_rewards)),
                                ("Online/min", min(ep_rewards)),
                                ("Online/num_steps", num_steps),
                                ("Online/Score", sum(ep_rewards)),
                            ], episode_count)

                        # TEST PERFORMANCE
                        if episode_count % self.config['eval_freq'] == 0:
                            print('Evaluating at step %d' % episode_count)
                            ep_scores = self.test_play(sess, test_epsilon=0.01)

                            # Nothing to report when eval_episodes is 0
                            if ep_scores:
                                self._write_summary([
                                    ("score/average", sum(ep_scores) / float(len(ep_scores))),
                                    ("score/max", max(ep_scores)),
                                    ("score/min", min(ep_scores)),
                                ], episode_count)

                        # SAVE THE MODEL
                        if episode_count % self.config['save_rate'] == 0 and episode_count != 0:
                            self.save_model(sess, episode_count)

                        sess.run(self.increment)

            except Exception as e:
                # Report exceptions to the coordinator.
                coord.request_stop(e)
                return

In [None]:
# A throw-away environment is created only to read the size of the action space.
env = gym.make(conf_parameters['env_name'])
num_actions = env.action_space.n
env.close()

# Shared networks: the one being trained and the one providing the targets
Q_Network = ConvNet_Estimator(conf_parameters, num_actions, net_type=0)
Target_Network = ConvNet_Estimator(conf_parameters, num_actions, net_type=1)

with tf.device("/cpu:0"):
    global_episodes = tf.Variable(0, dtype=tf.int32, name='global_episodes', trainable=False)
    summary_writer = tf.train.SummaryWriter(conf_parameters['log_dir'], flush_secs=30)

    workers = []
    # We are going to assign different exploration probabilities to each worker,
    # spread evenly from 0.1 to 0.9. With a single worker there is nothing to
    # spread, so the divisor is kept at 1 to avoid a division by zero.
    epsilon_slope = 0.8 / float(max(num_workers - 1, 1))
    # Create workers
    for worker_id in range(num_workers):
        # Only the first worker writes summaries
        worker_summary_writer = summary_writer if worker_id == 0 else None

        worker = Worker(
            name="Worker_{}".format(worker_id),
            conf=conf_parameters,
            Q_net=Q_Network,
            Target_net=Target_Network,
            global_episodes=global_episodes,
            epsilon=worker_id * epsilon_slope + 0.1,
            summary_writer=worker_summary_writer)

        workers.append(worker)

with tf.Session() as sess:
    summary_writer.add_graph(sess.graph)

    sess.run(tf.initialize_all_variables())
    coord = tf.train.Coordinator()

    # Start one thread per worker. The bound method is passed directly: a
    # lambda closing over the loop variable is evaluated only when the thread
    # actually starts running and could then pick up a later worker.
    worker_threads = []
    for worker in workers:
        t = threading.Thread(target=worker.run, args=(sess, coord))
        t.start()
        worker_threads.append(t)

    # Wait for all workers to finish; join re-raises an exception reported by
    # a worker, so the writer is closed in a finally clause.
    try:
        coord.join(worker_threads)
    finally:
        summary_writer.close()