# iLykei Lecture Series

# Advanced Machine Learning and Artificial Intelligence (MScA 32017)

# Pac-Man Competition for Human-Machine Teams 

### Y.Balasanov, M. Tselishchev, &copy; iLykei 2018

## Preparation

In [1]:
import random
import numpy as np
import gym
import os 
from keras.models import Sequential, clone_model, Model
from keras.layers import Dense, Flatten, Conv2D, InputLayer, Lambda, Input
from keras.callbacks import CSVLogger, TensorBoard
from keras.optimizers import Adam
import keras.backend as K
from keras.initializers import VarianceScaling

Using TensorFlow backend.


In [2]:
# Record the library versions this notebook was executed with.
import keras
print(gym.__version__)
print(keras.__version__)

0.10.9
2.2.4


Load trained model (which was previously saved by `model.save()`-method) for online network:

In [3]:
import numpy as np
from keras import initializers, regularizers, activations, constraints
from keras.engine.topology import Layer
import keras.backend as K

class NoisyNetDense(Layer):
    """
    A modified fully-connected layer that injects noise into the parameter distribution
    before each prediction. This randomness forces the agent to explore - at least
    until it can adjust its parameters to learn around it.

    Each weight and bias is parameterised as ``mu + sigma * noise``, where ``mu`` and
    ``sigma`` are trainable and ``noise`` is re-sampled on every call.

    To use: replace Dense layers (like the classifier at the end of a DQN model)
    with NoisyNetDense layers and set your policy to GreedyQ.
    See examples/noisynet_pdd_dqn_atari.py
    Reference: https://arxiv.org/abs/1706.10295

    # Arguments
        units: Positive integer, dimensionality of the output space.
        activation: Activation identifier understood by `keras.activations.get`.
        kernel_constraint: Constraint applied to both `mu_weights` and `sigma_weights`.
        bias_constraint: Constraint applied to both `mu_bias` and `sigma_bias`.
        kernel_regularizer: Regularizer applied to both weight matrices.
        bias_regularizer: Regularizer applied to both bias vectors.
        mu_initializer: Accepted so that `get_config()` output can be fed back to the
            constructor; the value is not used, `build()` always derives the
            initializer from the input dimension.
        sigma_initializer: Same as `mu_initializer`: accepted but not used.
    """
    def __init__(self,
                units,
                activation=None,
                kernel_constraint=None,
                bias_constraint=None,
                kernel_regularizer=None,
                bias_regularizer=None,
                mu_initializer=None,
                sigma_initializer=None,
                **kwargs):

        super(NoisyNetDense, self).__init__(**kwargs)

        self.units = units

        self.activation = activations.get(activation)
        # Each option is resolved from its own argument. Previously the bias constraint
        # and both regularizers were discarded unless a kernel constraint was also given.
        self.kernel_constraint = constraints.get(kernel_constraint) if kernel_constraint is not None else None
        self.bias_constraint = constraints.get(bias_constraint) if bias_constraint is not None else None
        self.kernel_regularizer = regularizers.get(kernel_regularizer) if kernel_regularizer is not None else None
        self.bias_regularizer = regularizers.get(bias_regularizer) if bias_regularizer is not None else None

        # Defined here so get_config() also works on a layer that has not been built yet;
        # build() replaces both with initializers derived from the input dimension.
        self.mu_initializer = None
        self.sigma_initializer = None

    def build(self, input_shape):
        """Create the trainable mu/sigma weight matrices and bias vectors."""
        self.input_dim = input_shape[-1]

        #See section 3.2 of Fortunato et al.
        # 0.5 instead of (1/2) so the exponent does not depend on the division semantics.
        sqr_inputs = self.input_dim ** 0.5
        self.sigma_initializer = initializers.Constant(value=.5/sqr_inputs)
        self.mu_initializer = initializers.RandomUniform(minval=(-1/sqr_inputs), maxval=(1/sqr_inputs))

        self.mu_weight = self.add_weight(shape=(self.input_dim, self.units),
                                        initializer=self.mu_initializer,
                                        name='mu_weights',
                                        constraint=self.kernel_constraint,
                                        regularizer=self.kernel_regularizer)

        self.sigma_weight = self.add_weight(shape=(self.input_dim, self.units),
                                        initializer=self.sigma_initializer,
                                        name='sigma_weights',
                                        constraint=self.kernel_constraint,
                                        regularizer=self.kernel_regularizer)

        self.mu_bias = self.add_weight(shape=(self.units,),
                                        initializer=self.mu_initializer,
                                        name='mu_bias',
                                        constraint=self.bias_constraint,
                                        regularizer=self.bias_regularizer)

        self.sigma_bias = self.add_weight(shape=(self.units,),
                                        initializer=self.sigma_initializer,
                                        name='sigma_bias',
                                        constraint=self.bias_constraint,
                                        regularizer=self.bias_regularizer)

        super(NoisyNetDense, self).build(input_shape=input_shape)

    def call(self, x):
        """Apply the layer using freshly sampled noise for the weights and the bias."""
        #sample from noise distribution
        e_i = K.random_normal((self.input_dim, self.units))
        e_j = K.random_normal((self.units,))

        #We use the factorized Gaussian noise variant from Section 3 of Fortunato et al.
        # Both samples go through f(e) = sign(e) * sqrt(|e|).
        # NOTE(review): e_i is drawn per weight (input_dim x units) rather than per input
        # unit; the paper's factorised variant appears to use one value per input -
        # confirm before changing, since it alters the noise statistics of trained models.
        f_i = K.sign(e_i) * K.sqrt(K.abs(e_i))
        f_j = K.sign(e_j) * K.sqrt(K.abs(e_j))
        eW = f_i * f_j  # f_j broadcasts across the rows of f_i
        eB = f_j

        #See section 3 of Fortunato et al.
        noise_injected_weights = K.dot(x, self.mu_weight + (self.sigma_weight * eW))
        noise_injected_bias = self.mu_bias + (self.sigma_bias * eB)
        output = K.bias_add(noise_injected_weights, noise_injected_bias)
        if self.activation is not None:
            output = self.activation(output)
        return output

    def compute_output_shape(self, input_shape):
        """Return `input_shape` with its last dimension replaced by `units`."""
        output_shape = list(input_shape)
        output_shape[-1] = self.units
        return tuple(output_shape)

    def get_config(self):
        """Return the layer configuration merged with the base `Layer` configuration."""
        config = {
            'units': self.units,
            'activation': activations.serialize(self.activation),
            'mu_initializer': initializers.serialize(self.mu_initializer),
            'sigma_initializer': initializers.serialize(self.sigma_initializer),
            'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
            'bias_regularizer': regularizers.serialize(self.bias_regularizer),
            'kernel_constraint': constraints.serialize(self.kernel_constraint),
            'bias_constraint': constraints.serialize(self.bias_constraint)
        }
        base_config = super(NoisyNetDense, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [4]:
def create_dqn_model(input_shape, nb_actions, dense_layers, dense_units):
    """Build a plain fully-connected Q-network.

    The network is an input layer of shape `input_shape`, followed by
    `dense_layers` ReLU Dense layers of `dense_units` units each, and a
    linear Dense output layer with one unit per action (`nb_actions`).
    Returns the (uncompiled) Sequential model.
    """
    stack = [InputLayer(input_shape=input_shape)]
    stack.extend(Dense(units=dense_units, activation='relu') for _ in range(dense_layers))
    stack.append(Dense(nb_actions, activation='linear'))
    return Sequential(stack)

In [5]:
# Create the RAM-observation Ms. Pac-Man environment and read the problem
# dimensions from it.
env = gym.make("MsPacman-ram-v0")
input_shape = env.reset().shape  # (128,)
nb_actions = env.action_space.n  # 9
# Hidden-layer configuration.
dense_layers, dense_units = 2, 256

  result = entry_point.load(False)


In [6]:
def epsilon_greedy(q_values, epsilon, n_outputs):
    """Pick an action with the epsilon-greedy rule.

    With probability `epsilon` a uniformly random action index in
    `range(n_outputs)` is returned; otherwise the index of the largest
    entry of `q_values` is returned.
    """
    explore = random.random() < epsilon
    if not explore:
        # Exploit: q-optimal action.
        return np.argmax(q_values)
    # Explore: random action.
    return random.randrange(n_outputs)

In [7]:
#Standard DQN model architecture, but swapping the Dense classifier layers for the rl.layers.NoisyNetDense version.
def create_noisy_model(input_shape = input_shape, nb_actions = nb_actions, dense_layers = 4, dense_units = 256):
    """Build a Q-network whose last two layers are NoisyNetDense layers.

    Layout: an input layer of shape `input_shape`, then `dense_layers` ReLU
    Dense layers of `dense_units` units, one ReLU NoisyNetDense layer of
    `dense_units` units, and a linear NoisyNetDense output layer with
    `nb_actions` units. Returns the (uncompiled) Sequential model.

    The defaults for `input_shape` and `nb_actions` are taken from the
    module-level variables of the same name at definition time.
    """
    stack = [InputLayer(input_shape=input_shape)]
    stack.extend(Dense(units=dense_units, activation='relu') for _ in range(dense_layers))
    stack.append(NoisyNetDense(units=dense_units, activation='relu'))
    stack.append(NoisyNetDense(nb_actions, activation='linear'))
    return Sequential(stack)

Define $\varepsilon$-greedy strategy (using small $\varepsilon$):

## Testing model

Define a function to evaluate the trained network. 
Note that we are still using the $\varepsilon$-greedy strategy here to prevent the agent from getting stuck. 
`test_dqn` returns a list with the scores for a specified number of games.

In [8]:
def test_dqn(n_games, model, nb_actions=9, skip_start=90, eps=0.05, render=False, sleep_time=0.01):
    """Play `n_games` episodes of MsPacman-ram-v0 with `model` and return the scores.

    Arguments:
        n_games: number of episodes to play.
        model: object with a `predict` method that maps a batch of observations
            to a batch of Q-value vectors.
        nb_actions: number of actions to sample from when exploring.
        skip_start: number of initial steps taken with action 0 before the
            model takes over.
        eps: probability of taking a random action instead of the greedy one.
        render: if True, draw every frame and pause between frames.
        sleep_time: pause in seconds after each rendered frame.

    Returns:
        List with the total reward of each episode, in play order.
    """
    # Imported locally so that render=True works regardless of which notebook
    # cells have already been executed.
    import time

    env = gym.make("MsPacman-ram-v0")
    scores = []
    try:
        for _ in range(n_games):
            obs = env.reset()
            score = 0
            done = False
            # skip the start of each game (it's just freezing time before game starts)
            for _ in range(skip_start):
                obs, reward, done, info = env.step(0)
                score += reward
                if done:
                    # Do not keep stepping an episode that has already finished.
                    break
            while not done:
                q_values = model.predict(np.array([obs]))[0]
                action = epsilon_greedy(q_values, eps, nb_actions)
                obs, reward, done, info = env.step(action)
                score += reward
                if render:
                    env.render()
                    time.sleep(sleep_time)
                    if done:
                        # Hold the final frame for a moment.
                        time.sleep(1)
            scores.append(score)
    finally:
        # Close once, after the last game (previously the environment was closed
        # inside the loop and then reused for the next game).
        env.close()
    return scores

In [9]:
# Re-read the problem dimensions from the environment.
nb_actions = env.action_space.n
input_shape = env.reset().shape

# Rebuild the architecture used in training (2 Dense + 2 NoisyNetDense layers)
# and restore the saved weights into it.
fourlayer_reducedLR = create_noisy_model(
    input_shape=input_shape,
    nb_actions=nb_actions,
    dense_layers=2,
    dense_units=256,
)
fourlayer_reducedLR.load_weights(
    'models/6Layer_256_D0.9999_Upd4_reducedLR/2730.00med_2523.20avg__640.00min__2020.03.12.16.46.19.model'
)

In [10]:
# Wrap the loaded network in a new model that first divides the raw
# observation by 255 and then reuses the four trained layers unchanged.
updated_model = Sequential([
    InputLayer(input_shape),
    Lambda(lambda x: x/255, name = 'normalizer'),
    fourlayer_reducedLR.layers[0],
    fourlayer_reducedLR.layers[1],
    fourlayer_reducedLR.layers[2],
    fourlayer_reducedLR.layers[3],
])

In [11]:
# Print the layer-by-layer structure and parameter counts of the assembled model.
updated_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalizer (Lambda)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_2 (Dense)              (None, 256)               65792     
_________________________________________________________________
noisy_net_dense_1 (NoisyNetD (None, 256)               131584    
_________________________________________________________________
noisy_net_dense_2 (NoisyNetD (None, 9)                 4626      
Total params: 235,026
Trainable params: 235,026
Non-trainable params: 0
_________________________________________________________________


### Collecting scores

Run 100 games without rendering and collect necessary statistics for final score.

In [12]:
# Evaluation settings. eps = 0 disables random actions in epsilon_greedy;
# the NoisyNetDense layers still re-sample their noise on every prediction.
n_games = 100
eps = 0
nb_actions = 9
render = False 

scores = test_dqn(n_games = n_games, 
                  model = updated_model, 
                  nb_actions = nb_actions, 
                  eps=eps, render=render)

print('\nMedian score: ', np.median(scores))
print('\nMean score: ', np.mean(scores))
print('\nMax score: ', np.max(scores))
# The value computed here is the 95th percentile; the label used to say "Fifth".
print('\n95th percentile: ',np.percentile(scores, 95))
print('\nPercentiles:')
print([ np.percentile(scores, p) for p in [0, 25, 50, 75, 100]])


Median score:  2725.0

Mean score:  2629.6

Max score:  3950.0

Fifth percentile:  3351.0

Percentiles:
[430.0, 2335.0, 2725.0, 2930.0, 3950.0]


In [41]:
# Second evaluation run with the same settings. eps = 0 disables random actions
# in epsilon_greedy; the NoisyNetDense layers still re-sample their noise on
# every prediction, so scores differ between runs.
n_games = 100
eps = 0
nb_actions = 9
render = False 

scores = test_dqn(n_games = n_games, 
                  model = updated_model, 
                  nb_actions = nb_actions, 
                  eps=eps, render=render)

print('\nMedian score: ', np.median(scores))
print('\nMean score: ', np.mean(scores))
print('\nMax score: ', np.max(scores))
# The value computed here is the 95th percentile; the label used to say "Fifth".
print('\n95th percentile: ',np.percentile(scores, 95))
print('\nPercentiles:')
print([ np.percentile(scores, p) for p in [0, 25, 50, 75, 100]])


Median score:  2600.0

Mean score:  2580.1

Max score:  3850.0

Fifth percentile:  3321.5

Percentiles:
[490.0, 2375.0, 2600.0, 2827.5, 3850.0]


### Rendering

Play 5 more games with rendering:

In [None]:
import time

# Settings for the rendered games: a small eps keeps some random actions.
ngames = 5
eps = 0.05
render = True

# Use the model assembled in this notebook; `online_network` is never defined
# here, so the original call raised NameError.
scores = test_dqn(ngames, updated_model, eps=eps, render=render)

print('\nMean score: ', np.mean(scores))
print('\nMax score: ', np.max(scores))
print('\nPercentiles:')
print([ np.percentile(scores, p) for p in [0, 25, 50, 75, 100] ])