In [1]:
import numpy as np
import gym
import tensorflow as tf

from tensorflow.contrib.layers import convolution2d, fully_connected, flatten
from collections import deque, Counter
import random
from datetime import datetime

  from ._conv import register_converters as _register_converters


In [2]:
mspacman_color = np.array([210, 164, 74]).mean()

def preprocess_observation(obs):
    img = obs[1:176:2, ::2]
    img = img.mean(axis=2)
    img[img==mspacman_color] = 0
    img = (img - 128) / 128 - 1
    return img.reshape(88,80,1)

In [3]:
tf.reset_default_graph()
env = gym.make("MsPacman-v0")

[2018-03-22 09:12:14,757] Making new env: MsPacman-v0


In [7]:
n_outputs = env.action_space.n

def q_network(X, name_scope):
    initializer = tf.contrib.layers.variance_scaling_initializer()
    with tf.variable_scope(name_scope) as scope: ## or tf.name_scope(name) ??
        layer_1 = convolution2d(X, num_outputs=32, kernel_size=8, stride=4, padding='SAME') #ev change weights_initializer
        layer_1_normed = tf.layers.batch_normalization(layer_1, training=in_training_mode)
        
        layer_2 = convolution2d(layer_1_normed, num_outputs=64, kernel_size=4, stride=2, padding='SAME')
        layer_2_normed = tf.layers.batch_normalization(layer_2, training=in_training_mode)
        layer_3 = convolution2d(layer_2_normed, num_outputs=64, kernel_size=3, stride=1, padding='SAME')
        layer_3_normed = tf.layers.batch_normalization(layer_3, training=in_training_mode)
        
        flat = flatten(layer_3_normed)
        fc_1 = fully_connected(flat, num_outputs=128)
        
        fc_1_normed = tf.layers.batch_normalization(fc_1, training=in_training_mode)
        tf.summary.histogram('fc_1',fc_1_normed)
        
        outputs = fully_connected(fc_1_normed, num_outputs=n_outputs, activation_fn=None)
        tf.summary.histogram('outputs',outputs)
        
        vrs = {v.name[len(scope.name):]: v for v in tf.get_collection(key=tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)} 
        return vrs, outputs

'''
input_height = 88
input_width = 80
input_channels = 1
conv_n_maps = [32, 64, 64]
conv_kernel_sizes = [(8,8), (4,4), (3,3)]
conv_strides = [4,2,1]
conv_paddings = ['SAME']*3
conv_activation = [tf.nn.relu]*3
n_hidden_in = 64 * 11 * 10
n_hidden = 512
hidden_activation = tf.nn.relu
n_outputs = env.action_space.n
initializer = tf.contrib.layers.variance_scaling_initializer()
from tensorflow.contrib.layers import convolution2d, fully_connected
def q_network(X_state, scope):
    prev_layer = X_state
    conv_layers = []
    
    with tf.variable_scope(scope) as scope:
        for n_maps, kernel_size, stride, padding, activation in zip(conv_n_maps, conv_kernel_sizes, conv_strides, 
                                                                   conv_paddings, conv_activation):
            prev_layer = convolution2d(prev_layer, num_outputs=n_maps, kernel_size=kernel_size, stride=stride,
                                      padding=padding, activation_fn=activation, weights_initializer=initializer)
            conv_layers.append(prev_layer)
        last_conv_layer_flat = tf.reshape(prev_layer, shape=[-1, n_hidden_in])
        hidden = fully_connected(last_conv_layer_flat, n_hidden, activation_fn=hidden_activation, weights_initializer=initializer)
        tf.summary.histogram('hidden',hidden)
        outputs = fully_connected(hidden, n_outputs, activation_fn=None, weights_initializer=initializer)
        tf.summary.histogram('outputs',outputs)
        
    trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)
    trainable_vars_by_name = {var.name[len(scope.name):]:var for var in trainable_vars}
    return trainable_vars_by_name, outputs'''

"\ninput_height = 88\ninput_width = 80\ninput_channels = 1\nconv_n_maps = [32, 64, 64]\nconv_kernel_sizes = [(8,8), (4,4), (3,3)]\nconv_strides = [4,2,1]\nconv_paddings = ['SAME']*3\nconv_activation = [tf.nn.relu]*3\nn_hidden_in = 64 * 11 * 10\nn_hidden = 512\nhidden_activation = tf.nn.relu\nn_outputs = env.action_space.n\ninitializer = tf.contrib.layers.variance_scaling_initializer()\nfrom tensorflow.contrib.layers import convolution2d, fully_connected\ndef q_network(X_state, scope):\n    prev_layer = X_state\n    conv_layers = []\n    \n    with tf.variable_scope(scope) as scope:\n        for n_maps, kernel_size, stride, padding, activation in zip(conv_n_maps, conv_kernel_sizes, conv_strides, \n                                                                   conv_paddings, conv_activation):\n            prev_layer = convolution2d(prev_layer, num_outputs=n_maps, kernel_size=kernel_size, stride=stride,\n                                      padding=padding, activation_fn=activation, 

In [8]:
maxlen = 20000
def get_memories(batch_size):
    perm_batch = np.random.permutation(len(memories))[:batch_size]
    mem = np.array(memories)[perm_batch]
    return mem[:,0], mem[:,1], mem[:,2], mem[:,3], mem[:,4]

epsilon = 0.5
eps_min = 0.05
eps_max = 1.0
eps_decay_steps = 500000

def expl_policy(action, step, print_ep=False):
    p = np.random.random(1).squeeze()
    epsilon = max(eps_min, eps_max - (eps_max-eps_min) * step/eps_decay_steps)
    if print_ep:
        print('Epsilon:',epsilon)
    if np.random.rand() < epsilon:
        return np.random.randint(n_outputs)
    else:
        return action

In [None]:
num_games = 400
batch_size = 48
global_step = 0
print_ep = 10
input_shape = (None, 88, 80, 1)
learning_rate = 0.001
X_shape = (None, 88, 80, 1)
discount_factor = 0.97
copy_steps = 100
steps_train = 4
start_steps = 2000

now = datetime.utcnow().strftime("%Y/%m/%d-%H-%M-%S")
root_logdir = "tf_logs"
logdir = "{}/run-{}/".format(root_logdir, now)

tf.reset_default_graph()

memories = deque(maxlen=maxlen)

X = tf.placeholder(tf.float32, shape=X_shape)
in_training_mode = tf.placeholder(tf.bool)

mainQ, mainQ_outputs = q_network(X, 'mainQ')
secondQ, secondQ_outputs = q_network(X, 'secondQ')

## copy..

## Get the one hot vector of the action taken (useful for take in consideration only the Q value of action previouly considerated)
## Of all the other actions isn't clear the estimantes Q_values


X_action = tf.placeholder(tf.int32, shape=(None,))
Q_action = tf.reduce_sum(secondQ_outputs * tf.one_hot(X_action, n_outputs), axis=-1, keep_dims=True)

copy_op = [tf.assign(main_name, secondQ[var_name]) for var_name, main_name in mainQ.items()]
copy_second_to_main = tf.group(*copy_op)

#mainQ_W = [mainQ[var_name] for var_name, _ in mainQ.items()]
#secondQ_W = [secondQ[var_name] for var_name, _ in secondQ.items()]

y = tf.placeholder(tf.float32, shape=(None,1))
loss = tf.reduce_mean(tf.square(y - Q_action))
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)


init = tf.global_variables_initializer()

loss_summary = tf.summary.scalar('LOSS', loss) #loss_summary = 
merge_summary = tf.summary.merge_all()
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

In [None]:
print(learning_rate)
with tf.Session() as sess:
    init.run()
    
    for i in range(num_games):
        done = False
        obs = env.reset()
        epoch = 0
        game_reward = 0
        actions_counter = Counter() 
        
        while not done:
            obs = preprocess_observation(obs)
            actions = mainQ_outputs.eval(feed_dict={X:[obs], in_training_mode:False})

            ## GET ACTION'S ONE HOT VECTOR
            action = np.argmax(actions, axis=-1)
            actions_counter[str(action)] += 1 
            action = expl_policy(action, global_step)
            
            new_obs, reward, done, _ = env.step(action)

            ## UPDATE EXPERIENCE MEMORIES WITH THE LAST ONE
            memories.append([obs, action, preprocess_observation(new_obs), reward, done])
            
            
            if global_step % steps_train == 0 and global_step > start_steps:
                ## TRAIN THE SECOND Q
                o_obs, o_act, o_next_obs, o_rew, o_done = get_memories(batch_size)

                o_obs = [x for x in o_obs]
                o_next_obs = [x for x in o_next_obs]

                next_act = mainQ_outputs.eval(feed_dict={X:o_next_obs, in_training_mode:False})
                y_batch = o_rew + discount_factor * np.max(next_act, axis=-1) * (1-o_done) ## IS IT CORRECT??
                '''
                if global_step % (steps_train+800) == 0:
                    print('actions', actions)
                    print('arg next_act action',np.argmax(next_act, axis=-1))
                    print('y_batch',y_batch)
                    '''
            
                mrg_summary = merge_summary.eval(feed_dict={X:o_obs, y:np.expand_dims(y_batch, axis=-1), X_action:o_act, in_training_mode:False}) ## IS IT CORRECT?
                file_writer.add_summary(mrg_summary, global_step)

                train_loss, _ = sess.run([loss, training_op], feed_dict={X:o_obs, y:np.expand_dims(y_batch, axis=-1), X_action:o_act, in_training_mode:True})
            
            ## COPY SECOND IN FIRST
            if (global_step+1) % copy_steps == 0 and global_step > start_steps:
                print('Copying..')
                copy_second_to_main.run()
                
            
            obs = new_obs
            epoch += 1
            global_step += 1
            game_reward += reward
        
        print('----', i, '---', epoch, '--', game_reward,'----')
        expl_policy(3, global_step, print_ep=True)
        print(actions_counter)
        
        ## TEST THE ACTOR
        obs = env.reset()
        test_ep = 0
        test_reward = 0
        test_done = False
        test_actions_counter = Counter() 
        while not test_done:
            obs = preprocess_observation(obs)
            action = mainQ_outputs.eval(feed_dict={X:[obs], in_training_mode:False})
            test_actions_counter[str(np.argmax(action))] += 1 
            new_obs, reward, test_done, _ = env.step(np.argmax(action))
            
            obs = new_obs
            test_ep += 1
            test_reward += reward
            
        print('TEST:', test_ep, '\t', test_reward, test_actions_counter)


0.001
---- 0 --- 647 -- 160.0 ----
Epsilon: 0.9987707
Counter({'[0]': 647})
TEST: 433 	 60.0 Counter({'0': 433})
---- 1 --- 667 -- 220.0 ----
Epsilon: 0.9975034
Counter({'[0]': 667})
TEST: 437 	 60.0 Counter({'0': 437})
---- 2 --- 638 -- 180.0 ----
Epsilon: 0.9962912
Counter({'[0]': 638})
TEST: 435 	 60.0 Counter({'0': 435})
Copying..
Copying..
Copying..
Copying..
Copying..
Copying..
Copying..
---- 3 --- 785 -- 290.0 ----
Epsilon: 0.9947997
Counter({'[5]': 300, '[2]': 200, '[0]': 148, '[8]': 100, '[3]': 37})
TEST: 440 	 60.0 Counter({'3': 440})
Copying..
Copying..
Copying..
Copying..
Copying..
Copying..
---- 4 --- 607 -- 160.0 ----
Epsilon: 0.9936464
Counter({'[7]': 424, '[3]': 183})
TEST: 432 	 60.0 Counter({'3': 432})
Copying..
Copying..
Copying..
Copying..
---- 5 --- 402 -- 70.0 ----
Epsilon: 0.9928826
Counter({'[3]': 202, '[7]': 200})
TEST: 430 	 60.0 Counter({'3': 430})
Copying..
Copying..


In [None]:
# cd C:\Users\Andrea\Jupyter notebook\Reinforcement Learning\Pacman_rl
# tensorboard --logdir tf_logs/