In [1]:
import gym
import numpy as np
import tensorflow as tf
import random
import sys
import cv2
import os
import time
from tqdm import tqdm
import threading
import time

from inspect import getsourcefile
# Work out which directory this source lives in and put its parent on the
# module search path (presumably where the shared helper modules imported
# right after this live -- confirm against the repository layout).
current_path = os.path.dirname(os.path.abspath(getsourcefile(lambda: 0)))
import_path = os.path.abspath(os.path.join(current_path, os.pardir))

# Only register the directory once, even if this cell is re-run.
if sys.path.count(import_path) == 0:
    sys.path.append(import_path)

import commonOps as cops

In [2]:
# Hyper-parameters and bookkeeping settings shared by the environment wrapper,
# the replay memory, the agent and the training loop.
conf_parameters = {
    'num_episodes': 100000,       # Number of training episodes to play
    
    'eval_freq': 1000,            # Run an evaluation every this many episodes
    'eval_steps': 20,             # Number of evaluation episodes per evaluation run
    
    # Input size
    'screen_width': 84,
    'screen_height': 84,
    'history_length': 4,          # Number of screens stacked into one state
    'pool_frame_size': 1,         # Number of most recent frames max-pooled into one screen
    
    'memory_size': 1000000,       # Replay memory size
    'batch_size': 32,             # Number of training cases over which SGD update is computed
    'gamma': 0.99,                # Discount factor
    'learning_rate': 0.00025,     # Learning rate
         
    'random_start': 30,           # Maximum number of 'do nothing' actions at the start of an episode
    
    # Exploration parameters
    'ep_min': 0.1,                 # Final exploration
    'ep_start': 1.0,               # Initial exploration
    'exploration_steps': 250000,   # Number of agent steps over which epsilon is annealed linearly
 
    # NOTE(review): 'target_q_update_step' is not read by the code visible in
    # this notebook; the agent syncs the target network on 'sync_rate' instead.
    'target_q_update_step': 10000, # Target network update frequency
    'log_online_summary_rate': 100,   # Episodes between online (per-episode) summaries
    'steps_before_training': 12500,   # Agent steps collected before gradient updates start
    'save_rate': 1000,                # Episodes between checkpoints
    'update_summary_rate': 50000,     # Agent steps between training-summary writes
    'sync_rate': 2500,                # Agent steps between copies of the online weights into the target net
    
    # Clip rewards
    'min_reward': -1.0,
    'max_reward': 1.0,

    # How many times should the same action be taken
    'action_repeat': 1,
    
    'checkpoint_dir': 'Models/',   # Where model checkpoints are written
    'log_dir': 'Logs/',            # Where summaries are written
    'device': '/gpu:0'             # Device on which the graph is built
}

# Make sure the checkpoint and log folders exist before anything writes to them.
for _dir_key in ('checkpoint_dir', 'log_dir'):
    if not os.path.exists(conf_parameters[_dir_key]):
        os.makedirs(conf_parameters[_dir_key])

In [3]:
class GameScreen:
    """Rolling buffer of the latest frames, collapsed into one image by a pixel-wise maximum."""

    def __init__(self, conf):
        """Allocate a zero-filled buffer with one slot per pooled frame.

        Reads 'pool_frame_size', 'screen_height' and 'screen_width' from `conf`.
        """
        buffer_shape = (conf['pool_frame_size'], conf['screen_height'], conf['screen_width'])
        self.screens = np.zeros(buffer_shape, dtype=np.float32)

    def add(self, screen):
        """Discard the oldest frame and store `screen` in the newest slot."""
        frames = self.screens
        frames[:-1] = frames[1:]   # shift every frame one slot towards the front
        frames[-1] = screen

    def get(self):
        """Return the element-wise maximum over all buffered frames (height x width)."""
        return self.screens.max(axis=0)
    
class GymEnvironment():
    """Wrapper around a Gym environment that yields pre-processed screens.

    Every raw observation is converted to grayscale, divided by 255 and
    resized to (screen_height, screen_width) before being pushed into a
    GameScreen buffer; `screen` exposes the pooled result of that buffer.
    """

    def __init__(self,name,conf):
        """Create the Gym environment `name` and read the settings from `conf`.

        Keys read: 'screen_width', 'screen_height', 'random_start',
        'action_repeat', 'pool_frame_size' (plus whatever GameScreen reads).
        """
        self.env = gym.make(name)            # Initialize Gym environment
        self.game_screen = GameScreen(conf)
        self.screen_width = conf['screen_width']
        self.screen_height = conf['screen_height']
        self.random_start = conf['random_start']
        self.action_repeat = conf['action_repeat']
        self.pool_frame_size = conf['pool_frame_size']
        self.display = False

    def set_display(self,value=False):
        """Enable or disable rendering of the environment after each call."""
        self.display = value

    def new_game(self):
        """Reset the environment and return the first pooled screen."""
        self._observation = self.env.reset()

        # Fill every pooling slot with the first frame so that nothing from
        # the previous episode leaks into the pooled screen.
        for _slot in range(self.pool_frame_size):
            self.game_screen.add(self.observation)

        self.render()
        return self.screen

    def new_random_game(self):
        """Reset, then play a random number of 'do nothing' (action 0) steps.

        The number of no-op steps is drawn uniformly from
        [0, random_start - 1]; a `random_start` of 0 or less means no no-ops
        (the original code raised ValueError in that case).

        Returns:
            (screen, 0, terminal) where `terminal` is the flag reported by the
            last no-op step, or False when no step was taken.
        """
        self.new_game()
        terminal = False

        max_noops = max(self.random_start - 1, 0)
        for _noop in range(random.randint(0, max_noops)):
            self._observation, _reward, terminal, _info = self.env.step(0)
            self.game_screen.add(self.observation)
            if terminal:
                # Do not keep stepping an environment that already finished.
                break

        self.render()
        return self.screen, 0, terminal

    def execute_action(self,action,is_training=True):
        """Repeat `action` up to 'action_repeat' times and accumulate the reward.

        Args:
            action: action index forwarded to the Gym environment.
            is_training: when True, losing a life is reported as terminal.

        Returns:
            (screen, cumulative_reward, terminal). The repetition stops early
            as soon as a step is terminal.
        """
        cum_reward = 0
        start_lives = self.num_lives
        # Defined up front so that action_repeat < 1 returns cleanly instead
        # of failing on an unbound local.
        terminal = False

        for _repeat in range(self.action_repeat):
            self._observation, reward, terminal, _info = self.env.step(action)
            self.game_screen.add(self.observation)
            cum_reward += reward

            if is_training and start_lives > self.num_lives:
                terminal = True

            if terminal:
                break

        self.render()

        return self.screen, cum_reward, terminal

    @property
    def screen(self):
        """Pooled, pre-processed screen taken from the GameScreen buffer."""
        return self.game_screen.get()

    @property
    def action_size(self):
        """Number of available actions."""
        return self.env.action_space.n

    @property
    def num_lives(self):
        """Remaining lives as reported by the environment's ALE handle."""
        return self.env.ale.lives()

    @property
    def observation(self):
        """Latest raw observation as a grayscale image divided by 255 and resized.

        cv2.resize takes its target size as (width, height).
        """
        gray = cv2.cvtColor(self._observation,cv2.COLOR_RGB2GRAY)/255.
        return cv2.resize(gray,(self.screen_width,self.screen_height))

    def render(self):
        """Render the environment, but only when display is enabled."""
        if self.display:
            self.env.render()

In [4]:
class Experience_Replay():
    """Fixed-capacity ring buffer of transitions with uniform random sampling."""

    def __init__(self,conf):
        """Read 'memory_size' and 'batch_size' from `conf` and start empty."""
        self.memory_size = conf['memory_size']
        self.batch_size = conf['batch_size']
        # Transitions are kept in a dict keyed by their slot number.
        self._experience = {}
        self.current = 0     # Slot that the next transition will be written to
        self.count = 0       # Number of slots that have been filled so far

    def add(self, experience):
        '''
        Store one transition, a tuple of (s1,a,r,s2,t), in the current slot.
        Once the memory is full the oldest slot is overwritten.
        '''
        slot = self.current
        self._experience.pop(slot, None)
        self._experience[slot] = experience

        self.count = max(self.count, slot + 1)
        self.current = (slot + 1) % self.memory_size

    def retrieve(self,indices):
        '''
        Return the stored transitions for the given slot numbers, in order.
        '''
        picked = []
        for slot in indices:
            picked.append(self._experience[slot])
        return picked

    def sample_from_replay(self):
        '''
        Draw `batch_size` transitions uniformly at random (with replacement)
        and return them as five arrays: state, action, reward, next_state, terminal.
        '''
        chosen = [random.randint(0, self.count - 1) for _ in range(self.batch_size)]
        columns = zip(*self.retrieve(chosen))
        state, action, reward, next_state, terminal = [np.array(column) for column in columns]

        return state, action, reward, next_state, terminal

In [5]:
class History:
    """Rolling stack of the most recent screens that together form one state."""

    def __init__(self, conf):
        """Allocate a zero-filled (history_length, screen_height, screen_width) buffer."""
        self.history = np.zeros([conf['history_length'],conf['screen_height'],conf['screen_width']],dtype=np.float32)

    def add(self, screen):
        """Drop the oldest screen and append `screen` as the newest one."""
        self.history[:-1] = self.history[1:]
        self.history[-1] = screen

    def get(self):
        """Return a snapshot of the stack with shape (height, width, history_length).

        np.transpose only returns a view of the internal buffer, and that
        buffer is mutated in place by `add` (and by callers that fill it
        with zeros). Handing out the view meant that every previously
        returned state silently changed on the next `add`, which corrupts any
        caller that keeps states around (for example a replay memory). A copy
        is returned instead, so each snapshot is independent; callers that
        store many snapshots should budget memory accordingly.
        """
        return np.transpose(self.history,(1,2,0)).copy()

In [6]:
class Agent():
    """DQN agent: online and target Q-networks, replay memory and epsilon-greedy control.

    The agent keeps a rolling History of screens as its current state, stores
    transitions in an Experience_Replay and, once enough steps have been
    collected, runs one training update per step in a background thread.
    """

    def __init__(self, config, session, num_actions):
        """Store the settings, create the helpers and build the TensorFlow graph.

        Args:
            config: dict of hyper-parameters; see the keys read throughout the class.
            session: TensorFlow session used for every `run` call.
            num_actions: size of the discrete action space.
        """
        self.config = config
        self.sess = session
        self.num_actions = num_actions

        self.gamma = config['gamma']
        self.learning_rate = config['learning_rate']

        self.exp_replay = Experience_Replay(self.config)
        self.history = History(self.config)

        # Start a no-op thread so that select_action can always join() the
        # previous update without special-casing the very first one.
        self.update_thread = threading.Thread(target=lambda: 0)
        self.update_thread.start()

        self.step_count = 0
        self.episode = 0
        self.isTesting = False

        self.reset_game()
        self.timeout_option = tf.RunOptions(timeout_in_ms=5000)

        # build the net
        with tf.device(config['device']):
            # States are fed exactly as History.get() produces them, i.e.
            # (height, width, history_length). The original listed width
            # before height, which only works for square screens.
            state_shape = [None, config['screen_height'], config['screen_width'], config['history_length']]
            self.state_ph = tf.placeholder(tf.float32, state_shape, name='state_ph')
            self.stateT_ph = tf.placeholder(tf.float32, state_shape, name='stateT_ph')
            self.action_ph = tf.placeholder(tf.int64, [None], name='action_ph')
            self.reward_ph = tf.placeholder(tf.float32, [None], name='reward_ph')
            self.terminal_ph = tf.placeholder(tf.float32, [None], name='terminal_ph')
            # Define training network
            with tf.variable_scope('Q'):
                self.Q = self.Q_network(self.state_ph, config, 'Normal')
            # Define Target network
            with tf.variable_scope('QT'):
                self.QT = self.Q_network(self.stateT_ph, config, 'Target')

            # Define training operation.
            # NOTE: from here on the instance attribute `train_op` (the op)
            # shadows the method of the same name on this instance.
            self.train_op = self.train_op(self.Q, self.QT, self.action_ph, self.reward_ph, self.terminal_ph, config, 'Normal')

            # Define operation to copy parameters from training to target net.
            # Presumably the 'Normal_weights' / 'Target_weights' collections
            # are filled by the commonOps layer helpers -- confirm there.
            with tf.variable_scope('Copy_parameters'):
                self.sync_QT_op = []
                for W_pair in zip(tf.get_collection('Target_weights'),tf.get_collection('Normal_weights')):
                    self.sync_QT_op.append(W_pair[0].assign(W_pair[1]))

            # Define the summary ops
            self.Q_summary_op = tf.merge_summary(tf.get_collection('Normal_summaries'))

        self.summary_writter = tf.train.SummaryWriter(config['log_dir'], self.sess.graph, flush_secs=20)

    def update(self, step=None):
        """Run one training step on a sampled batch and sync the target net when due.

        Args:
            step: step number used for the summary / sync schedule and as the
                summary's global step. Defaults to the current `step_count`.
                select_action passes it explicitly because this method runs in
                a background thread while the main thread keeps incrementing
                `step_count`; reading the attribute here could skip or repeat
                a scheduled summary or target sync.
        """
        if step is None:
            step = self.step_count

        init_state, action, reward, next_state, terminal = self.exp_replay.sample_from_replay()

        feed_dict={self.state_ph: init_state,
                    self.stateT_ph: next_state,
                    self.action_ph: action,
                    self.reward_ph: reward,
                    self.terminal_ph: terminal}
        if step % self.config['update_summary_rate'] == 0:
            _, Q_summary_str = self.sess.run([self.train_op, self.Q_summary_op], feed_dict, options=self.timeout_option)
            self.summary_writter.add_summary(Q_summary_str, step)
        else:
            _ = self.sess.run(self.train_op, feed_dict, options=self.timeout_option)

        if step % self.config['sync_rate'] == 0:
            self.sess.run(self.sync_QT_op)

    def Q_network(self, input_state, config, Collection=None):
        """Build a Q-network on `input_state` and return its output tensor.

        Three convolutional layers, a 512-unit ReLU layer and a linear layer
        with one output per action, all built through commonOps helpers.
        """
        # Presumably (filters, kernel size, stride) per layer -- confirm in commonOps.
        conv_stack_shape=[(32,8,4),
                    (64,4,2),
                    (64,3,1)]

        head = cops.conv_stack(input_state, conv_stack_shape, Collection)
        head = cops.flatten(head)
        head = cops.add_relu_layer(head, size=512, Collection=Collection)
        Q = cops.add_linear_layer(head, self.num_actions, Collection, layer_name="Q")

        return Q

    def train_op(self, Q, QT, action, reward, terminal, config, Collection):
        """Build the loss, its summaries and the optimizer op.

        The target is Y = reward + gamma * max_a QT * (1 - terminal), with the
        gradient stopped at Y; the loss compares Y with the Q-value of the
        action actually taken. Returns the training op.
        """
        with tf.name_scope('Loss'):
            action_one_hot = tf.one_hot(action, self.num_actions, 1., 0., name='action_one_hot')
            acted_Q = tf.reduce_sum(Q * action_one_hot, reduction_indices=1, name='DQN_acted')

            QT_max_action = tf.reduce_max(QT, 1)
            Y = reward + self.gamma * QT_max_action * (1 - terminal)
            Y = tf.stop_gradient(Y)

            loss_batch = cops.clipped_l2(Y, acted_Q)
            loss = tf.reduce_sum(loss_batch, name='loss')

            tf.scalar_summary('losses/loss', loss, collections=[Collection + '_summaries'])
            tf.scalar_summary('losses/loss_0', loss_batch[0],collections=[Collection + '_summaries'])
            tf.scalar_summary('losses/loss_max', tf.reduce_max(loss_batch),collections=[Collection + '_summaries'])
            tf.scalar_summary('main/Y_0', Y[0], collections=[Collection + '_summaries'])
            tf.scalar_summary('main/Y_max', tf.reduce_max(Y), collections=[Collection + '_summaries'])
            tf.scalar_summary('main/QT_max_action_0', QT_max_action[0], collections=[Collection + '_summaries'])
            tf.scalar_summary('main/acted_Q_0', acted_Q[0], collections=[Collection + '_summaries'])
            tf.scalar_summary('main/acted_Q_max', tf.reduce_max(acted_Q), collections=[Collection + '_summaries'])
            tf.scalar_summary('main/reward_max', tf.reduce_max(reward), collections=[Collection + '_summaries'])

        # NOTE(review): the meaning of the trailing constants is defined by
        # commonOps.graves_rmsprop_optimizer -- confirm there.
        train_op, grads = cops.graves_rmsprop_optimizer(loss, self.learning_rate, 0.95, 0.01, 1)

        return train_op

    def testing(self, t=True):
        """Switch evaluation mode on or off (no learning, fixed small epsilon)."""
        self.isTesting = t

    def reset_game(self):
        """Clear the screen history at an episode boundary."""
        self.history.history.fill(0)

    def epsilon(self):
        """Exploration rate: linear decay from 'ep_start' to 'ep_min' over 'exploration_steps'."""
        if self.step_count < self.config['exploration_steps']:
            return self.config['ep_start'] - ((self.config['ep_start'] - self.config['ep_min']) / self.config['exploration_steps']) * self.step_count
        else:
            return self.config['ep_min']

    def e_greedy_action(self, epsilon):
        """Pick a uniformly random action with probability `epsilon`, else the greedy one."""
        if np.random.uniform() < epsilon:
            action = random.randint(0, self.num_actions - 1)
        else:
            action = np.argmax(self.sess.run(self.Q, feed_dict={self.state_ph: [self.history.get()]})[0])
        return action

    def observe(self,screen,r,terminal):
        """Record the outcome of the last action.

        While training, the reward is clipped to ['min_reward', 'max_reward']
        and the transition is stored in the replay memory; while testing only
        the screen history is advanced.
        """
        if not self.isTesting:
            r = max(self.config['min_reward'], min(self.config['max_reward'], r))
            # Copy both states: the history buffer is mutated in place, so
            # storing what get() hands out directly could leave every stored
            # transition pointing at the same, ever-changing memory.
            prev_state = np.array(self.history.get())
            # Add to history
            self.history.add(screen)
            next_state = np.array(self.history.get())
            # Add to Exp replay. The order must be (s1, a, r, s2, t) because
            # that is how Experience_Replay.sample_from_replay unpacks it; the
            # original stored (s1, r, a, s2, t), which swapped actions and
            # rewards in every training batch.
            self.exp_replay.add((prev_state,self.game_action,r,next_state,terminal))
        else:
            self.history.add(screen)

    def select_action(self):
        """Choose the next action and, while training, schedule one background update.

        Returns the chosen action, which is also kept in `self.game_action`.
        """
        if not self.isTesting:
            self.game_action = self.e_greedy_action(self.epsilon())
            if self.step_count > self.config['steps_before_training']:
                # At most one update runs at a time: wait for the previous one.
                self.update_thread.join()
                # Hand the current step to the worker so that its schedule
                # checks do not race with the increment below.
                self.update_thread = threading.Thread(target=self.update, args=(self.step_count,))
                self.update_thread.start()
            self.step_count += 1
        else:
            self.game_action = self.e_greedy_action(0.01)
        return self.game_action

In [7]:
def test_run(agent, env, n):
    agent.testing(True)
    score_list = []
    for episode in range(n):
        screen, r, terminal, score = env.new_game(), 0, False, 0
        agent.history.add(screen)
        
        while not terminal:
            action = agent.select_action()
            screen, r, terminal = env.execute_action(action,is_training=False)
            agent.observe(screen,r,terminal)
            score += r
        agent.reset_game()
        score_list.append(score)
    agent.testing(False)
    return score_list

In [None]:
# Build the environment and the agent, then alternate between training
# episodes, periodic logging / checkpointing and evaluation runs.
env = GymEnvironment('Breakout-v0',conf_parameters)
num_actions = env.action_size

sess_config = tf.ConfigProto()
sess_config.allow_soft_placement = True
sess_config.gpu_options.allow_growth = True
sess_config.log_device_placement = False

with tf.Session(config=sess_config) as sess:
    agent = Agent(conf_parameters, sess, num_actions)
    saver = tf.train.Saver(max_to_keep=20)

    sess.run(tf.initialize_all_variables())

    num_episodes = conf_parameters['num_episodes']
    try:
        for episode in tqdm(range(num_episodes),ncols=70,initial=0):
            screen, r, terminal, score = env.new_game(), 0, False, 0
            agent.history.add(screen)

            ep_begin_t = time.time()
            ep_begin_step_count = agent.step_count
            while not terminal:
                action = agent.select_action()
                # NOTE(review): is_training=False means a lost life is never
                # reported as terminal here, so the env's is_training branch
                # is unused during training. Left unchanged on purpose:
                # enabling it would end (and reset) the episode at the first
                # life lost. Confirm the intended behaviour.
                screen, r, terminal = env.execute_action(action,is_training=False)
                agent.observe(screen,r,terminal)
                score += r
            agent.reset_game()
            ep_duration = time.time() - ep_begin_t

            # range() stops at num_episodes - 1, so that is the last episode.
            # The original compared against num_episodes and therefore never
            # triggered the final summary / checkpoint / evaluation.
            is_final_episode = episode == num_episodes - 1
            # online Summary
            if episode % conf_parameters['log_online_summary_rate'] == 0 or is_final_episode:
                episode_online_summary = tf.Summary(
                    value=[
                        tf.Summary.Value(
                            tag="online/epsilon",
                            simple_value=agent.epsilon()),
                        tf.Summary.Value(
                            tag="score/online",
                            simple_value=score),
                        tf.Summary.Value(
                            tag="online/global_step",
                            simple_value=agent.step_count),
                        tf.Summary.Value(
                            tag="online/step_duration",
                            simple_value=ep_duration/(agent.step_count - ep_begin_step_count)),
                        tf.Summary.Value(
                            tag="online/ep_duration_seconds",
                            simple_value=ep_duration)])
                agent.summary_writter.add_summary(episode_online_summary, episode)

            # save
            if (episode % conf_parameters['save_rate'] == 0 and episode != 0) or is_final_episode:
                saver.save(sess,os.path.join(conf_parameters['checkpoint_dir'],'model'), global_step=episode)

            # performance summary
            if episode % conf_parameters['eval_freq'] == 0 or is_final_episode:
                score_list = test_run(agent, env, conf_parameters['eval_steps'])
                # Skip the summary when no evaluation episode was played
                # ('eval_steps' of 0) instead of dividing by zero.
                if score_list:
                    performance_summary = tf.Summary(
                        value=[
                            tf.Summary.Value(
                                tag="score/average",simple_value=float(sum(score_list))/len(score_list)),
                            tf.Summary.Value(
                                tag="score/max",simple_value=max(score_list)),
                            tf.Summary.Value(
                                tag="score/min",simple_value=min(score_list)),
                        ])
                    agent.summary_writter.add_summary(performance_summary, agent.step_count)
    finally:
        # Let the last background update finish before the session is closed
        # by the enclosing `with` block; otherwise the worker thread may call
        # sess.run on a closed session.
        agent.update_thread.join()