In [1]:
import numpy as np
import gym
import tensorflow as tf

from tensorflow.contrib.layers import flatten, conv2d, fully_connected
from collections import deque, Counter
import random
from datetime import datetime

  from ._conv import register_converters as _register_converters


In [50]:
mspacman_color = np.array([210, 164, 74]).mean()

def preprocess_observation(obs):
    img = obs[1:176:2, ::2]
    img = img.mean(axis=2)
    img[img==mspacman_color] = 0
    img = (img - 128) / 128 - 1
    return img.reshape(88,80,1)

In [51]:
tf.reset_default_graph()
env = gym.make("MsPacman-v0")

[2018-03-22 09:53:48,884] Making new env: MsPacman-v0


In [52]:
n_outputs = env.action_space.n

def q_network(X, name_scope):
    initializer = tf.contrib.layers.variance_scaling_initializer()
    with tf.variable_scope(name_scope) as scope: 
        layer_1 = conv2d(X, num_outputs=32, kernel_size=(8,8), stride=4, padding='SAME', weights_initializer=initializer) 
        tf.summary.histogram('fc_1',fc_1)
        layer_2 = conv2d(layer_1, num_outputs=64, kernel_size=(4,4), stride=2, padding='SAME', weights_initializer=initializer)
        tf.summary.histogram('layer_2',layer_2)
        layer_3 = conv2d(layer_2, num_outputs=64, kernel_size=(3,3), stride=1, padding='SAME', weights_initializer=initializer)
        tf.summary.histogram('layer_3',layer_3)
        
        flat = flatten(layer_3)
        fc_1 = fully_connected(flat, num_outputs=128, weights_initializer=initializer)
        tf.summary.histogram('fc_1',fc_1)
        
        outputs = fully_connected(fc_1, num_outputs=n_outputs, activation_fn=None, weights_initializer=initializer)
        tf.summary.histogram('outputs',outputs)
        
        vrs = {v.name[len(scope.name):]: v for v in tf.get_collection(key=tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)} 
        return vrs, outputs

In [53]:
maxlen = 20000
def get_memories(batch_size):
    perm_batch = np.random.permutation(len(memories))[:batch_size]
    mem = np.array(memories)[perm_batch]
    return mem[:,0], mem[:,1], mem[:,2], mem[:,3], mem[:,4]

epsilon = 0.5
eps_min = 0.05
eps_max = 1.0
eps_decay_steps = 500000

def expl_policy(action, step, print_ep=False):
    p = np.random.random(1).squeeze()
    epsilon = max(eps_min, eps_max - (eps_max-eps_min) * step/eps_decay_steps)
    if print_ep:
        print('Epsilon:',epsilon)
    if np.random.rand() < epsilon:
        return np.random.randint(n_outputs)
    else:
        return action

In [None]:
num_games = 800
batch_size = 48
global_step = 0
print_ep = 10
input_shape = (None, 88, 80, 1)
learning_rate = 0.001
X_shape = (None, 88, 80, 1)
discount_factor = 0.97
copy_steps = 100
steps_train = 4
start_steps = 2000

now = datetime.utcnow().strftime("%Y/%m/%d-%H-%M-%S")
root_logdir = "tf_logs"
logdir = "{}/run-{}/".format(root_logdir, now)

tf.reset_default_graph()

memories = deque(maxlen=maxlen)

X = tf.placeholder(tf.float32, shape=X_shape)
in_training_mode = tf.placeholder(tf.bool)

mainQ, mainQ_outputs = q_network(X, 'mainQ')
secondQ, secondQ_outputs = q_network(X, 'secondQ')


X_action = tf.placeholder(tf.int32, shape=(None,))
Q_action = tf.reduce_sum(secondQ_outputs * tf.one_hot(X_action, n_outputs), axis=-1, keep_dims=True)

copy_op = [tf.assign(main_name, secondQ[var_name]) for var_name, main_name in mainQ.items()]
copy_second_to_main = tf.group(*copy_op)

y = tf.placeholder(tf.float32, shape=(None,1))
loss = tf.reduce_mean(tf.square(y - Q_action))
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)


init = tf.global_variables_initializer()

loss_summary = tf.summary.scalar('LOSS', loss) #loss_summary
merge_summary = tf.summary.merge_all()
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

In [None]:
print(learning_rate)
with tf.Session() as sess:
    init.run()
    
    for i in range(num_games):
        done = False
        obs = env.reset()
        epoch = 0
        game_reward = 0
        actions_counter = Counter() 
        game_loss = []
        
        while not done:
            obs = preprocess_observation(obs)
            actions = mainQ_outputs.eval(feed_dict={X:[obs], in_training_mode:False})

            ## GET THE ACTION
            action = np.argmax(actions, axis=-1)
            actions_counter[str(action)] += 1 
            action = expl_policy(action, global_step)
            
            new_obs, reward, done, _ = env.step(action)

            ## ADD THE LAST MEMORIES IN THE EXPERIENCE MEMORIES STRUCTURE
            memories.append([obs, action, preprocess_observation(new_obs), reward, done])
            
            
            if global_step % steps_train == 0 and global_step > start_steps:
                ## TRAIN THE SECOND Q
                o_obs, o_act, o_next_obs, o_rew, o_done = get_memories(batch_size)

                o_obs = [x for x in o_obs]
                o_next_obs = [x for x in o_next_obs]

                next_act = mainQ_outputs.eval(feed_dict={X:o_next_obs, in_training_mode:False})
                y_batch = o_rew + discount_factor * np.max(next_act, axis=-1) * (1-o_done) 
            
                mrg_summary = merge_summary.eval(feed_dict={X:o_obs, y:np.expand_dims(y_batch, axis=-1), X_action:o_act, in_training_mode:False})
                file_writer.add_summary(mrg_summary, global_step)

                train_loss, _ = sess.run([loss, training_op], feed_dict={X:o_obs, y:np.expand_dims(y_batch, axis=-1), X_action:o_act, in_training_mode:True})
                game_loss.append(train_loss)
            
            ## COPY THE SECONDQ IN THE MAINQ
            if (global_step+1) % copy_steps == 0 and global_step > start_steps:
                copy_second_to_main.run()
                
            
            obs = new_obs
            epoch += 1
            global_step += 1
            game_reward += reward
        
        print('----', i, '---', epoch, '--', game_reward,'----', np.mean(game_loss), '----')
        expl_policy(3, global_step, print_ep=True)
        print(actions_counter)
        
        ## TEST THE MAIN Q
        obs = env.reset()
        test_ep = 0
        test_reward = 0
        test_done = False
        test_actions_counter = Counter() 
        while not test_done:
            obs = preprocess_observation(obs)
            action = mainQ_outputs.eval(feed_dict={X:[obs], in_training_mode:False})
            test_actions_counter[str(np.argmax(action))] += 1 
            new_obs, reward, test_done, _ = env.step(np.argmax(action))
            
            obs = new_obs
            test_ep += 1
            test_reward += reward
            
        print('TEST:', test_ep, '\t', test_reward, test_actions_counter)


0.001


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


---- 0 --- 484 -- 140.0 ---- nan ----
Epsilon: 0.9990804
Counter({'[7]': 484})
TEST: 679 	 70.0 Counter({'7': 679})
---- 1 --- 610 -- 250.0 ---- nan ----
Epsilon: 0.9979214
Counter({'[7]': 610})
TEST: 679 	 70.0 Counter({'7': 679})
---- 2 --- 901 -- 390.0 ---- nan ----
Epsilon: 0.9962095
Counter({'[7]': 901})
TEST: 679 	 70.0 Counter({'7': 679})
---- 3 --- 713 -- 240.0 ---- 5.91276 ----
Epsilon: 0.9948548
Counter({'[0]': 230, '[7]': 105, '[4]': 100, '[8]': 100, '[5]': 83, '[3]': 70, '[6]': 25})
TEST: 511 	 210.0 Counter({'6': 511})
---- 4 --- 625 -- 190.0 ---- 5.2930274 ----
Epsilon: 0.9936673
Counter({'[6]': 192, '[3]': 161, '[0]': 139, '[1]': 100, '[5]': 33})
TEST: 634 	 210.0 Counter({'5': 634})
---- 5 --- 793 -- 290.0 ---- 5.487555 ----
Epsilon: 0.9921606
Counter({'[6]': 200, '[3]': 166, '[5]': 101, '[7]': 100, '[8]': 100, '[4]': 74, '[2]': 26, '[1]': 26})
TEST: 463 	 60.0 Counter({'2': 463})
---- 6 --- 529 -- 130.0 ---- 4.5045366 ----
Epsilon: 0.9911555
Counter({'[0]': 200, '[2]':