In [1]:
import gym
import numpy as np
import tensorflow as tf
import random
import sys
import cv2
import os
import time
from tqdm import tqdm
import threading
import time

from inspect import getsourcefile
current_path = os.path.dirname(os.path.abspath(getsourcefile(lambda:0)))
import_path = os.path.abspath(os.path.join(current_path, ".."))

if import_path not in sys.path:
    sys.path.append(import_path)

import commonOps as cops

In [2]:
# Hyper-parameters and bookkeeping settings shared by the replay memory, the
# agent and the training loop.  Keys marked "(unused here)" are not referenced
# by any code visible in this notebook.
conf_parameters = {
    # --- Run length and evaluation schedule -------------------------------
    'num_episodes': 100000,            # training episodes to play
    'eval_freq': 1000,                 # run an evaluation every N episodes
    'eval_steps': 20,                  # evaluation episodes per evaluation run

    # --- Network input ----------------------------------------------------
    'screen_width': 84,                # frames are resized to width x height
    'screen_height': 84,
    'history_length': 4,               # frames stacked as input channels
    'pool_frame_size': 1,              # (unused here)

    # --- Learning ---------------------------------------------------------
    'memory_size': 1000000,            # replay memory capacity, in frames
    'batch_size': 32,                  # transitions per SGD update
    'gamma': 0.99,                     # discount factor
    'learning_rate': 0.00025,          # optimizer learning rate

    'random_start': 30,                # max 'do nothing' actions at episode start (unused here)

    # --- Exploration (linear epsilon annealing) ---------------------------
    'ep_min': 0.1,                     # final epsilon
    'ep_start': 1.0,                   # initial epsilon
    'exploration_steps': 250000,       # steps over which epsilon is annealed

    # --- Schedules --------------------------------------------------------
    'target_q_update_step': 10000,     # target network update frequency (unused here)
    'log_online_summary_rate': 100,    # write the online summary every N episodes
    'steps_before_training': 12500,    # agent steps collected before updates start
    'save_rate': 1000,                 # checkpoint every N episodes
    'update_summary_rate': 50000,      # write training summaries every N steps
    'sync_rate': 2500,                 # copy Q -> target network every N steps

    # --- Reward clipping --------------------------------------------------
    'min_reward': -1.0,
    'max_reward': 1.0,

    # --- Action repeat (unused here) --------------------------------------
    'action_repeat': 1,

    # --- Output locations and device placement ----------------------------
    'checkpoint_dir': 'Models/',
    'log_dir': 'Logs/',
    'device': '/gpu:0'
}

# Create the output directories up front so the saver and the summary writer
# always have somewhere to write.
for _dir_key in ('checkpoint_dir', 'log_dir'):
    _dir_path = conf_parameters[_dir_key]
    if not os.path.exists(_dir_path):
        os.makedirs(_dir_path)

In [3]:
class ReplayMemory:
    """Circular buffer of single frames from which stacked-frame transitions are sampled.

    Each slot holds one record ``(screen, action, reward, terminal)``.  A
    *state* at slot ``i`` is the stack of the ``history_length`` frames ending
    at ``i``; the *next state* is the stack ending at ``i + 1``.  A ``True``
    terminal flag at slot ``i`` marks the record whose action ended the episode.

    Config keys read: ``memory_size``, ``batch_size``, ``history_length``,
    ``screen_width``, ``screen_height``.
    """

    def __init__(self, config):
        self.capacity = self.memory_size = config['memory_size']
        self.batch_size = config['batch_size']
        self.buff_size = config['history_length']
        frame_shape = (config['screen_width'], config['screen_height'])
        self.screens = np.empty((self.capacity,) + frame_shape, dtype=np.uint8)
        self.actions = np.empty((self.capacity), dtype=np.uint8)
        # float32 rather than int8: rewards are clipped to a float range by the
        # caller, and an integer buffer would silently truncate e.g. 0.5 to 0.
        self.rewards = np.empty((self.capacity), dtype=np.float32)
        # Builtin bool: the np.bool alias was removed in NumPy 1.24.
        self.terminals = np.empty((self.capacity), dtype=bool)
        batch_shape = (self.batch_size,) + frame_shape + (self.buff_size,)
        # Pre-allocated output buffers; they are overwritten by every call to
        # sample_transition_batch().
        self.next_state_batch = np.empty(batch_shape, dtype=np.uint8)
        self.state_batch = np.empty(batch_shape, dtype=np.uint8)
        self.current = 0      # slot that the next add() will write
        self.step = 0         # total number of add() calls
        self.filled = False   # True once the buffer has wrapped at least once

    def add(self, screen, action, reward, terminal):
        """Store one record in the current slot and advance the write head."""
        self.screens[self.current] = screen
        self.actions[self.current] = action
        self.rewards[self.current] = reward
        self.terminals[self.current] = terminal
        self.current += 1
        self.step += 1
        if self.current == self.capacity:
            self.current = 0
            self.filled = True

    def get_state(self, index):
        """Return the frame stack ending at ``index`` with shape (width, height, history)."""
        if self.filled == False:
            assert index < self.current, "%i index has note been added yet"%index
        #Fast slice read
        if index >= self.buff_size - 1:
            state = self.screens[(index - self.buff_size+1):(index + 1), ...]
        #Slow list read
        else:
            # The history wraps around slot 0, so gather the slots explicitly.
            indexes = [(index - i) % self.capacity for i in reversed(range(self.buff_size))]
            state = self.screens[indexes, ...]
        # different screens should be in the 3rd dim as channels
        return np.transpose(state, [1,2,0])

    def sample_transition_batch(self):
        """Sample ``batch_size`` transitions uniformly at random.

        Returns:
            ``(state_batch, action_batch, reward_batch, next_state_batch,
            terminal_batch, indexes)``.  The two state batches are the
            instance's reusable buffers and are only valid until the next call.

        A candidate slot is rejected when
          * its state or next state would straddle the write head (the frames
            on either side of the head are not consecutive in time), or
          * any frame *older* than the slot itself is terminal (the stack would
            mix two episodes).
        The slot itself may be terminal, so episode-ending transitions are
        sampled with ``terminal == True``; the previous filter included the
        slot and therefore never returned a terminal transition.
        """
        if self.filled == False:
            assert self.current >= self.batch_size, "There is not enough to sample. Call add bathc_size times"
        indexes = []
        while len(indexes) != self.batch_size:
            # index, is the index of state, and index + 1 of next_state
            if self.filled:
                index = random.randint(0, self.capacity - 2) # -2 because index +1 will be used for next state
                # Frames index-buff_size+1 .. index+1 are read below.  They span
                # the discontinuity between slot current-1 (newest) and slot
                # current (oldest) exactly when index lies in
                # [current-1, current+buff_size-2], modulo the capacity.
                if (index - self.current + 1) % self.capacity < self.buff_size:
                    continue
            else:
                # can't start from 0 because get_state would loop back to the end -- which is uninitialized.
                # index + 1 is at most current - 1, the newest written slot.
                index = random.randint(self.buff_size -1, self.current -2)

            # Reject stacks that contain a terminal frame before `index`.
            # Modular indexing keeps the check correct when the history wraps
            # around slot 0 (a plain negative-start slice would be empty).
            older = [(index - i) % self.capacity for i in range(1, self.buff_size)]
            if self.terminals[older].any():
                continue
            self.state_batch[len(indexes)] = self.get_state(index)
            self.next_state_batch[len(indexes)] = self.get_state(index + 1)
            indexes.append(index)
        action_batch = self.actions[indexes]
        reward_batch = self.rewards[indexes]
        terminal_batch = self.terminals[indexes]
        return self.state_batch, action_batch, reward_batch, self.next_state_batch, terminal_batch, indexes

In [4]:
class Agent():
    """DQN agent: online network ``Q``, target network ``QT`` and a replay memory.

    The agent is driven through ``step(screen, reward)`` once per environment
    step and ``done()`` once per episode.  In training mode each step stores
    the previous transition in the replay memory and, after
    ``steps_before_training`` steps, launches one gradient update on a
    background thread.  In testing mode nothing is stored or trained.
    """

    def __init__(self, config, session, num_actions):
        self.config = config
        self.sess = session
        self.num_actions = num_actions
        
        self.gamma = config['gamma']
        self.learning_rate = config['learning_rate']
        
        self.exp_replay = ReplayMemory(self.config)
        self.game_state = np.zeros((1, config['screen_width'], config['screen_height'], config['history_length']), dtype=np.uint8)

        # Defined up front so done() cannot hit an AttributeError if it is
        # called before the first step().
        self.game_action = 0
        self.game_reward = 0
        
        # Start (and immediately finish) a no-op thread so that step() can
        # unconditionally join() the previous update.
        self.update_thread = threading.Thread(target=lambda: 0)
        self.update_thread.start()
        
        self.step_count = 0
        self.episode = 0
        self.isTesting = False
        
        self.reset_game()
        self.timeout_option = tf.RunOptions(timeout_in_ms=5000)
        
        # build the net
        with tf.device(config['device']):
            # Create all variables 
            self.state_ph = tf.placeholder(tf.float32, [None, config['screen_width'], config['screen_height'], config['history_length']], name='state_ph')
            self.stateT_ph = tf.placeholder(tf.float32, [None, config['screen_width'], config['screen_height'], config['history_length']], name='stateT_ph')
            self.action_ph = tf.placeholder(tf.int64, [None], name='action_ph')
            self.reward_ph = tf.placeholder(tf.float32, [None], name='reward_ph')
            self.terminal_ph = tf.placeholder(tf.float32, [None], name='terminal_ph')
            # Define training network
            with tf.variable_scope('Q'):
                self.Q = self.Q_network(self.state_ph, config, 'Normal')
            # Define Target network
            with tf.variable_scope('QT'):
                self.QT = self.Q_network(self.stateT_ph, config, 'Target')
            
            # Define training operation
            # NOTE: this rebinds the instance attribute `train_op` from the
            # builder method to the op it returns; the method is not callable
            # on this instance afterwards.
            self.train_op = self.train_op(self.Q, self.QT, self.action_ph, self.reward_ph, self.terminal_ph, config, 'Normal')
            
            # Define operation to copy parameters from training to target net.
            # Relies on the 'Normal_weights' / 'Target_weights' collections
            # listing matching variables in the same order -- presumably
            # populated by the commonOps layer helpers; verify there.
            with tf.variable_scope('Copy_parameters'):
                self.sync_QT_op = []
                for W_pair in zip(tf.get_collection('Target_weights'),tf.get_collection('Normal_weights')):
                    self.sync_QT_op.append(W_pair[0].assign(W_pair[1]))
                
            # Define the summary ops
            self.Q_summary_op = tf.merge_summary(tf.get_collection('Normal_summaries'))

        self.summary_writter = tf.train.SummaryWriter(config['log_dir'], self.sess.graph, flush_secs=20)

    def update(self, step_count=None):
        """Run one training step on a sampled batch; periodically log and sync the target net.

        Args:
            step_count: the agent step this update belongs to.  step() passes a
                snapshot because it increments ``self.step_count`` right after
                starting the update thread; reading the live counter from the
                thread could skip or repeat a summary / sync step.  Defaults to
                the current ``self.step_count`` for direct callers.
        """
        if step_count is None:
            step_count = self.step_count

        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch, _ = self.exp_replay.sample_transition_batch()
    
        feed_dict={self.state_ph: state_batch,
                    self.stateT_ph: next_state_batch,
                    self.action_ph: action_batch,
                    self.reward_ph: reward_batch,
                    self.terminal_ph: terminal_batch}
        if step_count % self.config['update_summary_rate'] == 0:
            _, Q_summary_str = self.sess.run([self.train_op, self.Q_summary_op], feed_dict, options=self.timeout_option)
            self.summary_writter.add_summary(Q_summary_str, step_count)
        else:
            _ = self.sess.run(self.train_op, feed_dict, options=self.timeout_option)

        if step_count % self.config['sync_rate'] == 0:
            self.sess.run(self.sync_QT_op)

    def Q_network(self, input_state, config, Collection=None):
        """Build the Q-value head on top of ``input_state`` and return the Q tensor.

        Three conv layers followed by a 512-unit ReLU layer and a linear layer
        with one output per action.  The tuples below are passed straight to
        ``cops.conv_stack`` -- presumably (filters, kernel size, stride);
        confirm in commonOps.
        """
        conv_stack_shape=[(32,8,4),
                    (64,4,2),
                    (64,3,1)]

        head = cops.conv_stack(input_state, conv_stack_shape, Collection)
        head = cops.flatten(head)
        head = cops.add_relu_layer(head, size=512, Collection=Collection)
        Q = cops.add_linear_layer(head, self.num_actions, Collection, layer_name="Q")

        return Q

    def train_op(self, Q, QT, action, reward, terminal, config, Collection):
        """Build the loss, its summaries and the optimizer op; return the training op.

        Target: ``Y = reward + gamma * max_a QT * (1 - terminal)`` with the
        gradient stopped, compared against the Q-value of the action taken.
        """
        with tf.name_scope('Loss'):
            action_one_hot = tf.one_hot(action, self.num_actions, 1., 0., name='action_one_hot')
            acted_Q = tf.reduce_sum(Q * action_one_hot, reduction_indices=1, name='DQN_acted')

            QT_max_action = tf.reduce_max(QT, 1)
            # (1 - terminal) removes the bootstrap term for episode-ending transitions.
            Y = reward + self.gamma * QT_max_action * (1 - terminal)
            Y = tf.stop_gradient(Y)

            loss_batch = cops.clipped_l2(Y, acted_Q)
            loss = tf.reduce_sum(loss_batch, name='loss')

            tf.scalar_summary('losses/loss', loss, collections=[Collection + '_summaries'])
            tf.scalar_summary('losses/loss_0', loss_batch[0],collections=[Collection + '_summaries'])
            tf.scalar_summary('losses/loss_max', tf.reduce_max(loss_batch),collections=[Collection + '_summaries'])
            tf.scalar_summary('main/Y_0', Y[0], collections=[Collection + '_summaries'])
            tf.scalar_summary('main/Y_max', tf.reduce_max(Y), collections=[Collection + '_summaries'])
            tf.scalar_summary('main/QT_max_action_0', QT_max_action[0], collections=[Collection + '_summaries'])
            tf.scalar_summary('main/acted_Q_0', acted_Q[0], collections=[Collection + '_summaries'])
            tf.scalar_summary('main/acted_Q_max', tf.reduce_max(acted_Q), collections=[Collection + '_summaries'])
            tf.scalar_summary('main/reward_max', tf.reduce_max(reward), collections=[Collection + '_summaries'])

        # Positional arguments after the learning rate are defined by
        # commonOps.graves_rmsprop_optimizer -- TODO confirm their meaning there.
        train_op, grads = cops.graves_rmsprop_optimizer(loss, self.learning_rate, 0.95, 0.01, 1)

        return train_op
    
    def testing(self, t=True):
        """Switch testing mode on or off (no replay writes or training while on)."""
        self.isTesting = t

    def reset_game(self):
        """Mark the start of a new episode and clear the stacked-frame state."""
        self.episode_begining = True
        self.game_state.fill(0)

    def epsilon(self):
        """Return the exploration rate, annealed linearly from ep_start to ep_min."""
        if self.step_count < self.config['exploration_steps']:
            return self.config['ep_start'] - ((self.config['ep_start'] - self.config['ep_min']) / self.config['exploration_steps']) * self.step_count
        else:
            return self.config['ep_min']

    def e_greedy_action(self, epsilon):
        """Pick a uniformly random action with probability ``epsilon``, else argmax Q."""
        if np.random.uniform() < epsilon:
            action = random.randint(0, self.num_actions - 1)
        else:
            action = np.argmax(self.sess.run(self.Q, feed_dict={self.state_ph: self.game_state})[0])
        return action

    def _clip_reward(self, r):
        """Clamp ``r`` to the configured [min_reward, max_reward] range."""
        return max(self.config['min_reward'], min(self.config['max_reward'], r))
    
    def done(self, r=None):
        """Finish the episode: store the terminal transition (training only) and reset.

        Args:
            r: optional reward returned by the environment together with the
                terminal flag.  When omitted, 0.0 is stored: the last reward
                seen by step() has already been credited to the preceding
                transition and must not be counted twice.
        """
        if not self.isTesting:
            terminal_reward = 0.0 if r is None else self._clip_reward(r)
            self.exp_replay.add(self.game_state[:, :, :, -1], self.game_action, terminal_reward, True)
        self.reset_game()

    def observe(self, x, r):
        """Resize ``x``, convert it to grayscale and push it onto the frame stack."""
        self.game_reward = r
        x_ = cv2.resize(x, (self.config['screen_width'], self.config['screen_height']))
        x_ = cv2.cvtColor(x_, cv2.COLOR_RGB2GRAY)
        # Shift the history one slot towards the front and write the new frame last.
        self.game_state = np.roll(self.game_state, -1, axis=3)
        self.game_state[0, :, :, -1] = x_
            
    def step(self, x, r):
        """Consume one observation and return the next action.

        Args:
            x: the new screen.
            r: the reward produced by the action returned from the previous
               call (0 on the first call of an episode).

        In training mode the transition that just completed -- previous
        screen, previous action, and ``r`` -- is written to the replay memory
        before the new screen is observed.  Storing ``r`` (not the reward that
        arrived with the previous screen) keeps each action paired with the
        reward it actually earned.
        """
        r = self._clip_reward(r)
        if not self.isTesting:
            if not self.episode_begining:
                self.exp_replay.add(self.game_state[:, :, :, -1], self.game_action, r, False)
            else:
                for i in range(self.config['history_length'] - 1):
                    # add the resetted buffer
                    self.exp_replay.add(self.game_state[:, :, :, i], 0, 0, False)
                self.episode_begining = False
            self.observe(x, r)
            self.game_action = self.e_greedy_action(self.epsilon())
            if self.step_count > self.config['steps_before_training']:
                # At most one update in flight: wait for the previous one, then
                # start the next with a snapshot of the step counter.
                self.update_thread.join()
                self.update_thread = threading.Thread(target=self.update, args=(self.step_count,))
                self.update_thread.start()
            self.step_count += 1
        else:
            self.observe(x, r)
            self.game_action = self.e_greedy_action(0.01)
        return self.game_action

In [5]:
def test_run(agent, env, n):
    agent.testing(True)
    score_list = []
    for episode in range(n):
        screen, r, terminal, score = env.reset(), 0, False, 0
        while not terminal:
            action = agent.step(screen, r)
            screen, r, terminal, _ = env.step(action)
            score += r
        agent.done()
        score_list.append(score)
    agent.testing(False)
    return score_list

In [None]:
# Training driver: build the environment and the agent, then play
# `num_episodes` episodes while periodically logging, checkpointing and
# evaluating.
env = gym.make('Breakout-v0')
num_actions = env.action_space.n 

sess_config = tf.ConfigProto()
sess_config.allow_soft_placement = True
sess_config.gpu_options.allow_growth = True
sess_config.log_device_placement = False

with tf.Session(config=sess_config) as sess:
    agent = Agent(conf_parameters, sess, num_actions)
    saver = tf.train.Saver(max_to_keep=20)
    
    sess.run(tf.initialize_all_variables())

    # range() stops at num_episodes - 1, so that is the last episode index.
    # Comparing against num_episodes itself never matched, which meant the
    # final summary, checkpoint and evaluation were silently skipped.
    last_episode = conf_parameters['num_episodes'] - 1

    try:
        for episode in tqdm(range(conf_parameters['num_episodes']),ncols=80,initial=0):  
            screen, r, terminal, score = env.reset(), 0, False, 0
            
            ep_begin_t = time.time()
            ep_begin_step_count = agent.step_count
            while not terminal:
                action = agent.step(screen, r)
                screen, r, terminal, _ = env.step(action)
                score += r
            agent.done()
            ep_duration = time.time() - ep_begin_t

            is_final_episode = episode == last_episode
            # online Summary
            if episode % conf_parameters['log_online_summary_rate'] == 0 or is_final_episode:
                episode_online_summary = tf.Summary(
                    value=[
                        tf.Summary.Value(
                            tag="online/epsilon",
                            simple_value=agent.epsilon()),
                        tf.Summary.Value(
                            tag="score/online",
                            simple_value=score),
                        tf.Summary.Value(
                            tag="online/global_step",
                            simple_value=agent.step_count),
                        tf.Summary.Value(
                            tag="online/step_duration",
                            simple_value=ep_duration/(agent.step_count - ep_begin_step_count)),
                        tf.Summary.Value(
                            tag="online/ep_duration_seconds",
                            simple_value=ep_duration)])
                agent.summary_writter.add_summary(episode_online_summary, episode)

            # save
            if (episode % conf_parameters['save_rate'] == 0 and episode != 0) or is_final_episode:
                saver.save(sess,os.path.join(conf_parameters['checkpoint_dir'],'model'), global_step=episode) 
                
            # performance summary
            if episode % conf_parameters['eval_freq'] == 0 or is_final_episode:
                score_list = test_run(agent, env, conf_parameters['eval_steps'])
                performance_summary = tf.Summary(
                    value=[
                        tf.Summary.Value(
                            tag="score/average",simple_value=sum(score_list)/len(score_list)),
                        tf.Summary.Value(
                            tag="score/max",simple_value=max(score_list)),
                        tf.Summary.Value(
                            tag="score/min",simple_value=min(score_list)),
                    ])
                agent.summary_writter.add_summary(performance_summary, episode)
    finally:
        # The agent trains on a background thread; wait for the in-flight
        # update before the `with` block closes the session underneath it.
        agent.update_thread.join()