# iLykei Lecture Series

# Advanced Machine Learning and Artificial Intelligence (MScA 32017)

# Pac-Man Competition for Human-Machine Teams 

### Y.Balasanov, M. Tselishchev, &copy; iLykei 2018

## Preparation

In [1]:
import random
import numpy as np
import gym

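Re-create the architecture of the online network; the trained weights saved during training are loaded into it in a later cell (alternatively, a full model previously saved by the `model.save()` method could be restored with `load_model`):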

In [2]:
def create_dqn_model(input_shape, nb_actions, dense_layers, dense_units):
    """Build an uncompiled fully connected DQN model.

    The network is an input layer, followed by ``dense_layers`` hidden
    ``Dense`` layers of ``dense_units`` ReLU units each, followed by a linear
    ``Dense`` output layer with one unit per action. No BatchNormalization or
    Dropout layers are inserted between the hidden layers.

    Args:
        input_shape: Shape of a single observation, passed to ``InputLayer``.
        nb_actions: Number of units in the linear output layer.
        dense_layers: Number of hidden ``Dense`` layers.
        dense_units: Number of units in every hidden layer.

    Returns:
        The assembled ``Sequential`` model (not compiled, weights untrained).
    """
    network = Sequential()
    network.add(InputLayer(input_shape=input_shape))
    # Describe every Dense layer as (width, activation): the hidden stack
    # first, then the linear output head, and add them in that order.
    layer_specs = [(dense_units, 'relu')] * dense_layers + [(nb_actions, 'linear')]
    for width, activation in layer_specs:
        network.add(Dense(units=width, activation=activation))
    return network

# Hyper-parameters passed to create_dqn_model when the online network is built.
# presumably the 128-byte RAM observation of "MsPacman-ram-v0" — TODO confirm
input_shape = (128,)
# Number of network outputs; matches the default nb_actions of test_dqn.
nb_actions = 9
# Number of hidden Dense layers.
dense_layers = 7
# Number of units in each hidden Dense layer.
dense_units = 256

In [3]:
from keras.models import load_model

from keras.models import Sequential, clone_model
from keras.layers import Dense, Flatten, Conv2D, InputLayer, Dropout, BatchNormalization
from keras.callbacks import CSVLogger, TensorBoard
from keras.optimizers import Adam
import keras.backend as K

# Build the network architecture only; its weights are loaded from disk
# afterwards with online_network.load_weights(...).
# Alternative kept for reference: load_model('ram_model_4kk.h5f', compile=False)
online_network = create_dqn_model(input_shape, nb_actions, dense_layers, dense_units)

Using TensorFlow backend.


In [4]:
import os

# Folder that holds the saved weight files.
weights_folder = './Competition/MsPacman_DQN_9/weights'
# Load 'weights_last.h5f' into the freshly built architecture
# (presumably the most recent training checkpoint — verify).
online_network.load_weights(os.path.join(weights_folder, 'weights_last.h5f'))

Define $\varepsilon$-greedy strategy (using small $\varepsilon$):

In [5]:
def epsilon_greedy(q_values, epsilon, n_outputs):
    """Pick an action with the epsilon-greedy rule.

    Args:
        q_values: Q-value estimates, one per action.
        epsilon: Probability of choosing a uniformly random action.
        n_outputs: Number of actions to sample from when exploring.

    Returns:
        A random action index in ``range(n_outputs)`` with probability
        ``epsilon``; otherwise the index of the largest entry of ``q_values``.
    """
    explore = random.random() < epsilon
    if not explore:
        # Exploit: take the action with the highest estimated Q-value.
        return np.argmax(q_values)
    # Explore: take a uniformly random action.
    return random.randrange(n_outputs)

## Testing model

Define a function to evaluate the trained network. 
Note that we are still using the $\varepsilon$-greedy strategy here to prevent the agent from getting stuck. 
`test_dqn` returns a list of scores, one for each game played.

In [6]:
def test_dqn(n_games, model, nb_actions=9, skip_start=90, eps=0.05, render=False, sleep_time=0.01):
    """Play ``n_games`` games of "MsPacman-ram-v0" with ``model`` and return the scores.

    Each game starts with ``skip_start`` steps of action 0 (the game is frozen
    at the start), after which actions are chosen epsilon-greedily from the
    model's predicted Q-values until the episode is done.

    Args:
        n_games: Number of games to play.
        model: Object whose ``predict`` maps a batch of observations to a
            batch of Q-values.
        nb_actions: Number of actions to sample from on exploratory steps.
        skip_start: Number of initial steps played with action 0.
        eps: Exploration probability of the epsilon-greedy policy.
        render: If true, render every policy step and pause between frames.
        sleep_time: Pause in seconds after each rendered frame.

    Returns:
        A list with the total reward of every game, in playing order.
    """
    # Imported locally so that rendering does not depend on some other
    # notebook cell having executed ``import time`` beforehand.
    import time

    env = gym.make("MsPacman-ram-v0")
    scores = []
    try:
        for i in range(n_games):
            obs = env.reset()
            score = 0
            done = False
            # Skip the start of each game (it's just freezing time before
            # the game starts); rewards collected here still count.
            for _ in range(skip_start):
                obs, reward, done, info = env.step(0)
                score += reward
                if done:
                    # Never step an episode that has already finished.
                    break
            while not done:
                q_values = model.predict(np.array([obs]))[0]
                action = epsilon_greedy(q_values, eps, nb_actions)
                obs, reward, done, info = env.step(action)
                score += reward
                if render:
                    env.render()
                    time.sleep(sleep_time)
                    if done:
                        # Hold the final frame for a moment.
                        time.sleep(1)
            scores.append(score)
            print('{}/{}: {}'.format(i+1, n_games, score))
    finally:
        # Close the environment exactly once, after the last game (or on an
        # error/interrupt), instead of closing it after every game and then
        # reusing the closed environment.
        env.close()
    return scores

### Collecting scores

Run 100 games without rendering and collect necessary statistics for final score.

In [12]:
# Play 100 games without rendering and report a summary statistic.
ngames = 100
eps = 0.01
render = False

scores = test_dqn(ngames, online_network, eps=eps, render=render)

#print('\nMean score: ', np.mean(scores))
#print('\nMax score: ', np.max(scores))
#print('\nPercentiles:')
#print([ np.percentile(scores, p) for p in [0, 25, 50, 75, 100] ])
# np.percentile(scores, 95) is the 95th percentile (the value below which 95%
# of the scores fall), so the label now says so instead of "Fifth percentile".
# NOTE(review): if the 5th percentile was actually intended, change 95 to 5.
print('\n95th percentile: ', np.percentile(scores, 95))

1/100: 2110.0
2/100: 3050.0
3/100: 940.0
4/100: 920.0
5/100: 1560.0
6/100: 1820.0
7/100: 3500.0
8/100: 1440.0
9/100: 790.0
10/100: 1390.0
11/100: 1340.0
12/100: 1330.0
13/100: 4420.0
14/100: 1380.0
15/100: 1310.0
16/100: 1770.0
17/100: 2600.0
18/100: 1700.0
19/100: 2360.0
20/100: 2510.0
21/100: 2670.0
22/100: 1430.0
23/100: 1120.0
24/100: 2040.0
25/100: 1030.0
26/100: 1140.0
27/100: 2880.0
28/100: 2010.0
29/100: 2120.0
30/100: 2240.0
31/100: 1940.0
32/100: 2380.0
33/100: 2660.0
34/100: 1510.0
35/100: 3810.0
36/100: 2720.0
37/100: 1440.0
38/100: 2050.0
39/100: 1700.0
40/100: 4190.0
41/100: 1680.0
42/100: 2520.0
43/100: 1480.0
44/100: 3050.0
45/100: 1160.0
46/100: 4320.0
47/100: 1670.0
48/100: 3370.0
49/100: 2350.0
50/100: 1160.0
51/100: 3050.0
52/100: 2210.0
53/100: 1940.0
54/100: 1140.0
55/100: 5490.0
56/100: 1850.0
57/100: 2190.0
58/100: 1840.0
59/100: 4670.0
60/100: 2570.0
61/100: 2560.0
62/100: 1260.0
63/100: 2720.0
64/100: 3730.0
65/100: 5660.0
66/100: 2020.0
67/100: 1860.0
68/100:

### Rendering

Play 5 more games with rendering

In [7]:
import time
# Play a few rendered games with eps=0.05 and print summary statistics.
ngames, eps, render = 5, 0.05, True

scores = test_dqn(ngames, online_network, render=render, eps=eps)

print('\nMean score: ', np.mean(scores))
print('\nMax score: ', np.max(scores))
print('\nPercentiles:')
# Minimum, quartiles and maximum of the collected scores.
print([np.percentile(scores, q) for q in [0, 25, 50, 75, 100]])

1/5: 2660.0
2/5: 1530.0
3/5: 3060.0
4/5: 2580.0
5/5: 1740.0

Mean score:  2314.0

Max score:  3060.0

Percentiles:
[1530.0, 1740.0, 2580.0, 2660.0, 3060.0]
