In [1]:
import gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
import random
import cv2
import os
import time
import tqdm

## Load the game environment

In [2]:
# Create the 'Breakout-v0' Gym environment. `env` is the module-level handle that
# new_game() and execute_action() use to reset and step the game.
env = gym.make('Breakout-v0')

[2016-11-24 14:05:29,514] Making new env: Breakout-v0


In [7]:
# Sanity check: reset the environment, inspect the raw observation shape
# (the recorded output below shows (210, 160, 3)) and draw the current frame.
screen = env.reset()
print(screen.shape)
env.render()

(210, 160, 3)


In [5]:
# NOTE(review): presumably closes the viewer window opened by the env.render() call above -- confirm
# against the installed gym version (the `close` keyword is not available in every release).
env.render(close=True)

## Configuration parameters

In [11]:
# Size of the preprocessed frames: new_game()/execute_action() resize every screen to
# (screen_width, screen_height) and History allocates its buffer with these dimensions.
screen_width = 84
screen_height = 84
# Number of consecutive frames History stacks to form one network input
history_length = 4

# Capacity handed to Experience_Buffer by Agent.__init__
memory_size = 1000000
# NOTE(review): batch_size and gamma are not referenced by the code visible in this notebook;
# presumably the mini-batch size and the discount factor of the Q-learning update -- confirm.
batch_size = 32
gamma = 0.99

# Initial learning rate and the staircase exponential decay applied to it in Agent.build_dqn
learning_rate = 0.0005
learning_rate_decay = 0.96
learning_rate_decay_step = 50000

# Upper bound of the step range iterated by Agent.train
max_steps = 50000000
# Step at which Agent.train resets its statistics; Agent.observe only triggers
# learning updates once the step counter has passed this value
learn_start = 50000

# Exploration parameters
# Agent.predict decays epsilon linearly from ep_start down to ep_min over
# ep_end_time steps, counted from learn_start
ep_min = 0.1
ep_start = 1.0
ep_end_time = memory_size

# Update Target network
# Agent.observe copies the prediction weights into the target network once every
# target_q_update_step steps
target_q_update_step = 10000

# Train batch
# Agent.observe runs a mini-batch update on every train_frequency-th step
train_frequency = 4

# Clip rewards
# Agent.observe clamps each reward into [min_reward, max_reward]
min_reward = -1.0
max_reward = 1.0

## Experience Replay
This class allows us to store experiences and to draw random samples from them to update our network parameters.

In [4]:
class Experience_Buffer():
    """Fixed-capacity FIFO store of experiences with uniform random sampling.

    Experiences are kept in the plain list `self.memory`, oldest first. Each
    experience is expected to have 5 fields, because sample_from_replay()
    reshapes a sample to [size, 5].
    """

    def __init__(self,memory_size = 50000):
        """Create an empty buffer that keeps at most `memory_size` experiences."""
        self.memory = []
        self.memory_size = memory_size

    def add(self, experience):
        """Append a batch of experiences, evicting the oldest ones on overflow.

        Args:
            experience: a list of experiences (it is passed to list.extend, so
                a single experience must be wrapped in a list).
        """
        self.memory.extend(experience)
        # Trim after extending so the capacity is honoured even when one batch
        # alone is larger than the whole buffer.
        overflow = len(self.memory) - self.memory_size
        if overflow > 0:
            del self.memory[:overflow]

    def sample_from_replay(self,size):
        """Return `size` distinct experiences drawn uniformly at random.

        Returns:
            A numpy array of shape [size, 5].

        Raises:
            ValueError: if `size` is larger than the number of stored
                experiences (raised by random.sample).
        """
        batch = random.sample(self.memory, size)
        return np.reshape(np.array(batch),[size,5])

## History
This class will allow us to stack the last K screens to use them as the input to the network (history of states).

In [5]:
class History:
    """Rolling window over the most recent screens.

    The frames live in `self.history`, a float32 array laid out as
    [history_length, screen_height, screen_width] with the newest frame last.
    """

    def __init__(self):
        # Start from an all-zero window
        window_shape = [history_length,screen_height,screen_width]
        self.history = np.zeros(window_shape,dtype=np.float32)

    def add(self, screen):
        """Drop the oldest frame and append `screen` as the newest one (in place)."""
        frames = self.history
        frames[:-1] = frames[1:]
        frames[-1] = screen

    def reset(self):
        """Zero the window in place by multiplying every frame by 0."""
        np.multiply(self.history,0,out=self.history)

    def get(self):
        """Return the window with the frame axis moved last: [height, width, frames]."""
        return self.history.transpose(1,2,0)

## Auxiliary functions

In [13]:
# Directory used by Agent.save_model()/load_model() for checkpoints;
# it is created here if it does not exist yet.
checkpoint_dir = 'Models/'
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)
    
def conv2d(x,output_dim,kernel_size,stride,
           initializer=tf.contrib.layers.xavier_initializer(),
           activation_fn = tf.nn.relu,
           padding = 'VALID',
           name = 'conv2d'):
    """Build a 2-D convolution layer inside the variable scope `name`.

    Args:
        x: input tensor; its last dimension is used as the number of input channels.
        output_dim: number of output channels (filters).
        kernel_size: pair (kernel_height, kernel_width).
        stride: pair of strides for the two middle dimensions.
        initializer: initializer for the kernel variable 'w'.
        activation_fn: callable applied to the biased convolution output.
        padding: padding mode forwarded to tf.nn.conv2d.
        name: variable scope holding the 'w' and 'biases' variables.

    Returns:
        Tuple (activated output, kernel variable, bias variable).
    """
    # Pure-Python shape bookkeeping; no graph ops are created here
    full_strides = [1,stride[0],stride[1],1]
    in_channels = x.get_shape()[-1]
    filter_shape = [kernel_size[0],kernel_size[1],in_channels,output_dim]

    with tf.variable_scope(name):
        w = tf.get_variable('w',filter_shape,tf.float32,initializer=initializer)
        convolved = tf.nn.conv2d(x,w,full_strides,padding)
        b = tf.get_variable('biases',[output_dim],initializer=tf.constant_initializer(0.0))
        biased = tf.nn.bias_add(convolved,b)
        out = activation_fn(biased)

    return out,w,b

def linear(input_,output_size,stddev=0.02,bias_start=0.0,activation_fn=None,name='linear'):
    """Build a fully connected layer inside the variable scope `name`.

    Args:
        input_: 2-D input tensor; its second static dimension is the input width.
        output_size: number of output units.
        stddev: standard deviation of the normal initializer of the weight 'Matrix'.
        bias_start: constant initial value of the 'bias' variable.
        activation_fn: optional callable applied to the affine output; None leaves
            the output linear.
        name: variable scope holding the 'Matrix' and 'bias' variables.

    Returns:
        Tuple (output, weight variable, bias variable).
    """
    shape = input_.get_shape().as_list()

    with tf.variable_scope(name):
        w = tf.get_variable('Matrix',[shape[1],output_size],tf.float32,
                            tf.random_normal_initializer(stddev=stddev))
        b = tf.get_variable('bias',[output_size],initializer=tf.constant_initializer(bias_start))

        out = tf.nn.bias_add(tf.matmul(input_,w),b)

    # Identity test against None: `!= None` would invoke a user-defined __ne__
    if activation_fn is not None:
        out = activation_fn(out)
    return out,w,b
    
def _preprocess_screen(screen):
    """Convert a raw RGB frame to grayscale, scale it by 1/255 and resize it to
    (screen_width, screen_height)."""
    gray = cv2.cvtColor(screen,cv2.COLOR_RGB2GRAY)/255.
    return cv2.resize(gray,(screen_width,screen_height))

def new_game():
    """Reset the global `env` and return the preprocessed first screen."""
    return _preprocess_screen(env.reset())

def execute_action(action):
    """Apply `action` to the global `env`.

    Returns:
        Tuple (preprocessed screen, reward, terminal). The fourth value
        returned by env.step is discarded.
    """
    screen, reward, terminal, _ = env.step(action)
    return _preprocess_screen(screen), reward, terminal

## Learning agent

In [9]:
class Agent():
    """DQN agent: builds the prediction and target Q-networks and runs the training loop.

    The class relies on the module-level configuration constants, on the helpers
    conv2d/linear/new_game/execute_action, and on a default TensorFlow session
    being active (graph values are read and written through `.eval()`/`.run()`).
    """

    def __init__(self,environment,sess):
        """Create the replay buffer, the frame history, the step counter ops and the networks.

        Args:
            environment: environment object; only `action_space.n` is read from it here.
            sess: TensorFlow session used to save and restore checkpoints.
        """
        self.sess = sess
        self.env = environment
        self.exp_replay = Experience_Buffer(memory_size)
        self.history = History()

        with tf.variable_scope('step'):
            self.step_op = tf.Variable(0,trainable=False,name='step')
            self.step_input = tf.placeholder(tf.int32,None,name='step_input')
            self.step_assign_op = self.step_op.assign(self.step_input)

        # Build Deep network
        self.build_dqn()

        # predict(), observe() and save_model() read self.step; seed it from the
        # (possibly restored) step variable so they work before train() is called.
        self.step = self.step_op.eval()

    def build_dqn(self):
        """Build both networks, the weight-copy ops and the optimizer, then initialise.

        After the graph is built, all variables are initialised, a checkpoint is
        restored if one exists, and the prediction weights are copied into the
        target network.
        """
        self.w = {}
        self.t_w = {}

        initializer = tf.truncated_normal_initializer(0, 0.02)
        activation_fn = tf.nn.relu

        # TRAINING NETWORK
        with tf.variable_scope('prediction'):
            self.input_state = tf.placeholder(tf.float32,[None, screen_height, screen_width, history_length],
                                              name='input_state')
            self.l1, self.w['l1_w'], self.w['l1_b'] = conv2d(self.input_state,32,[8,8],[4,4],initializer,
                                                             activation_fn, name='l1')
            self.l2, self.w['l2_w'], self.w['l2_b'] = conv2d(self.l1,64,[4,4],[2,2],initializer,
                                                             activation_fn, name='l2')
            self.l3, self.w['l3_w'], self.w['l3_b'] = conv2d(self.l2,64,[3,3],[1,1],initializer,
                                                             activation_fn, name='l3')

            # Flatten everything but the batch dimension. np.prod replaces the
            # Python 2-only builtin reduce().
            shape = self.l3.get_shape().as_list()
            self.l3_flat = tf.reshape(self.l3,[-1,int(np.prod(shape[1:]))])

            # Standard DQN implementation
            self.l4, self.w['l4_w'], self.w['l4_b'] = linear(self.l3_flat,512,activation_fn=activation_fn,name='l4')
            self.q, self.w['q_w'],self.w['q_b'] = linear(self.l4,self.env.action_space.n,name='q')

            # Index of the highest Q-value for each input state
            self.q_action = tf.argmax(self.q,dimension=1)

            # Add output summaries (disabled)
            # q_summary = []
            # avg_q = tf.reduce_mean(self.q,0)  # Mean q_value per action for each batch
            # for idx in xrange(self.env.action_space.n):
            #     q_summary.append(tf.histogram_summary('q/%s' % idx, avg_q[idx]))
            # self.q_summary = tf.merge_summary(q_summary,'q_summary')

        # TARGET NETWORK (same architecture, separate variables)
        with tf.variable_scope('target'):
            self.target_state = tf.placeholder(tf.float32,[None, screen_height, screen_width, history_length],
                                              name='target_state')
            self.target_l1, self.t_w['l1_w'], self.t_w['l1_b'] = conv2d(self.target_state,32,[8,8],[4,4],initializer,
                                                             activation_fn, name='target_l1')
            self.target_l2, self.t_w['l2_w'], self.t_w['l2_b'] = conv2d(self.target_l1,64,[4,4],[2,2],initializer,
                                                             activation_fn, name='target_l2')
            self.target_l3, self.t_w['l3_w'], self.t_w['l3_b'] = conv2d(self.target_l2,64,[3,3],[1,1],initializer,
                                                             activation_fn, name='target_l3')

            shape = self.target_l3.get_shape().as_list()
            self.target_l3_flat = tf.reshape(self.target_l3,[-1,int(np.prod(shape[1:]))])

            # Standard DQN
            self.target_l4, self.t_w['l4_w'], self.t_w['l4_b'] = linear(self.target_l3_flat,512,
                                                                    activation_fn=activation_fn,name='target_l4')
            self.target_q, self.t_w['q_w'],self.t_w['q_b'] = linear(self.target_l4,self.env.action_space.n,
                                                                    name='target_q')

            self.target_q_idx = tf.placeholder(tf.int32,[None,None],'outputs_idx')
            self.target_q_with_idx = tf.gather_nd(self.target_q,self.target_q_idx)  # Gets q value according to index

        # COPY TRAINING NETWORK INTO TARGET
        # One placeholder and one assign op per weight; see update_target_q_network()
        with tf.variable_scope('pred_to_target'):
            self.t_w_input = {}
            self.t_w_assign_op = {}

            for name in self.w.keys():
                self.t_w_input[name] = tf.placeholder(tf.float32,self.t_w[name].get_shape().as_list(),name=name)
                self.t_w_assign_op[name] = self.t_w[name].assign(self.t_w_input[name])

        # OPTIMIZER
        with tf.variable_scope('optimizer'):
            self.target_q_t = tf.placeholder(tf.float32,[None],name='target_q_t')
            self.action = tf.placeholder(tf.int64,[None],name='action')

            # One hot of the action which was taken
            action_one_hot = tf.one_hot(self.action,self.env.action_space.n,1.0,0.0,name='action_one_hot')
            # Extract the q_value of the action
            q_acted = tf.reduce_sum(self.q * action_one_hot,reduction_indices=1,name='q_acted')

            self.delta = self.target_q_t - q_acted      # Error
            self.clipped_delta = tf.clip_by_value(self.delta,-1.0,1.0,name='clipped_delta')

            # PAY ATTENTION TO THIS ONE!
            # NOTE(review): global_step is created but not passed to the optimizer below
            self.global_step = tf.Variable(0,trainable=False)

            # Mean squared clipped error
            self.loss = tf.reduce_mean(tf.square(self.clipped_delta),name='loss')
            self.learning_rate_step = tf.placeholder(tf.int64,None,name='learning_rate_step')
            # Staircase exponential decay with a floor of 0.00025
            self.learning_rate_op = tf.maximum(0.00025,tf.train.exponential_decay(
                learning_rate,self.learning_rate_step,learning_rate_decay_step,learning_rate_decay,staircase=True))
            self.optim = tf.train.RMSPropOptimizer(self.learning_rate_op,momentum=0.95,epsilon=0.01).minimize(self.loss)

        tf.initialize_all_variables().run()
        # Only the prediction weights and the step counter are checkpointed.
        # list(...) keeps the concatenation valid where dict.values() is a view.
        self._saver = tf.train.Saver(list(self.w.values()) + [self.step_op], max_to_keep = 10)

        self.load_model()
        self.update_target_q_network()

    def update_target_q_network(self):
        """Copy the current value of every prediction weight into the target network."""
        for name in self.w.keys():
            self.t_w_assign_op[name].eval({self.t_w_input[name]:self.w[name].eval()})

    def load_model(self,step=None):
        """Restore the latest checkpoint found in `checkpoint_dir`, if any.

        Args:
            step: unused; kept for interface compatibility.

        Returns:
            True when a checkpoint was restored, False otherwise.
        """
        print('[*] Loading Checkpoints...')

        ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
        if ckpt and ckpt.model_checkpoint_path:
            ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
            fname = os.path.join(checkpoint_dir,ckpt_name)
            self._saver.restore(self.sess,fname)
            print('[*] Load succesfull: %s' % fname)
            return True
        else:
            print('[!] Load failed: %s' % checkpoint_dir)
            return False

    def save_model(self,step=None):
        """Write a checkpoint into `checkpoint_dir`.

        Args:
            step: value used as the checkpoint's global step; defaults to the
                agent's current step counter (`self.step`).
        """
        if step is None:
            step = self.step
        print('[*] Saving checkpoints...')
        self._saver.save(self.sess,checkpoint_dir,global_step=step)

    def train(self):
        """Run the interaction loop from the stored step counter up to `max_steps`.

        Each iteration picks an action, executes it, and passes the result to
        observe(), which triggers the learning updates.
        """
        start_step = self.step_op.eval()
        start_time = time.time()

        # VERIFY THE NEED OF THE FOLLOWING VARIABLES
        num_game, self.update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        max_avg_ep_reward = 0
        ep_rewards, actions = [], []

        # New game (we could modify this to start randomly -> taking random initial actions)
        screen = new_game()

        # Stacking the screen in the first input buffer
        for _ in range(history_length):
            self.history.add(screen)

        # Training:
        # `tqdm` is imported as a module, so the progress bar class is tqdm.tqdm.
        # total=max_steps keeps the bar consistent with initial=start_step.
        # The loop variable is stored on the instance because predict(), observe()
        # and save_model() read self.step.
        for self.step in tqdm.tqdm(range(start_step,max_steps),ncols=70,
                                   initial=start_step,total=max_steps):
            if self.step == learn_start:
                # Discard the statistics gathered before learning starts
                num_game, self.update_count, ep_reward = 0, 0, 0.
                total_reward, self.total_loss, self.total_q = 0., 0., 0.
                ep_rewards, actions = [], []

            # 1. Predict: Use our training network to select an action
            action = self.predict(self.history.get())

            # 2. Execute the action
            screen, reward, terminal = execute_action(action)

            # 3. New observation
            self.observe(screen,reward,action,terminal)

            if terminal:
                screen = new_game()
                num_game += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0
            else:
                ep_reward += reward
                total_reward += reward
                actions.append(action)

            if self.step >= learn_start:
                # TODO: this branch was left without a body in the original notebook
                # (which made the class fail to parse). Presumably periodic
                # reporting/checkpointing belongs here -- confirm the intended logic.
                pass

    def predict(self,current_state,test_ep=None):
        """Choose an action epsilon-greedily for `current_state`.

        Args:
            current_state: one network input (the stacked frame history).
            test_ep: optional fixed exploration probability. Any value,
                including 0, overrides the training schedule.

        Returns:
            The index of the chosen action.
        """
        # Calculate exploration prob. epsilon => This is a decaying epsilon starting at 1 and decreasing until 0.1
        # once the learning process start
        if test_ep is not None:
            # `test_ep or ...` would silently ignore an explicit epsilon of 0
            ep = test_ep
        else:
            steps_learning = max(0.,self.step-learn_start)
            ep = ep_min + max(0.,(ep_start-ep_min)*(ep_end_time-steps_learning)/ep_end_time)

        if random.random() < ep:
            # Explore: random action
            action = random.randrange(self.env.action_space.n)
        else:
            # Exploit: action with the highest predicted Q-value
            action = self.q_action.eval({self.input_state:[current_state]})[0]

        return action

    def observe(self,screen,reward,action,terminal):
        """Record a transition and trigger the periodic learning updates.

        Args:
            screen: preprocessed screen observed after the action.
            reward: raw reward; clipped to [min_reward, max_reward].
            action: action that was executed.
            terminal: whether the episode ended.
        """
        # Clip reward
        reward = max(min_reward,min(max_reward,reward))

        # Add to history
        self.history.add(screen)
        # Add to exp. replay
        # TODO: the transition is not stored in self.exp_replay yet

        if self.step > learn_start:
            if self.step % train_frequency == 0:
                # NOTE(review): q_learning_mini_batch is not defined in the visible
                # part of this class -- confirm it exists before training.
                self.q_learning_mini_batch()

            if self.step % target_q_update_step == target_q_update_step -1:
                self.update_target_q_network()
        
        
        
                