In [24]:
import numpy as np
import random
import time
import os
import gc

from keras.models import Sequential, clone_model
from keras.layers import Dense, Flatten, Conv2D, InputLayer
from keras.callbacks import CSVLogger, TensorBoard, EarlyStopping
from keras.optimizers import Adam
import keras.backend as K

import gym
from collections import deque
from mini_pacman import PacmanGame, test
import itertools

In [27]:
def get_state(obs):
    """Flatten a game observation into a plain list of coordinates.

    The output is laid out as: player x, y, then the x, y of every monster,
    every diamond and every wall, in the order they appear in ``obs``.

    Args:
        obs: mapping with a ``'player'`` pair and ``'monsters'``,
            ``'diamonds'`` and ``'walls'`` iterables of pairs.

    Returns:
        list: the flattened coordinates.
    """
    # Gather all (x, y) pairs first, player leading, then flatten in one pass.
    positions = [obs['player']]
    for group in ('monsters', 'diamonds', 'walls'):
        positions.extend(obs[group])

    flat = []
    for px, py in positions:
        flat.extend((px, py))
    return flat


In [28]:
def create_dqn_model(input_shape, nb_actions, dense_layers, dense_units):
    """Build a fully connected Q-network (left uncompiled).

    Args:
        input_shape: shape tuple handed to the ``InputLayer``.
        nb_actions: number of units in the linear output layer.
        dense_layers: how many hidden ``Dense`` layers to stack.
        dense_units: number of units in every hidden layer.

    Returns:
        A ``Sequential`` model: input layer, ``dense_layers`` ReLU layers,
        then one linear layer with ``nb_actions`` outputs.
    """
    network = Sequential()
    network.add(InputLayer(input_shape=input_shape))

    # Hidden stack: identical ReLU layers.
    for _ in range(dense_layers):
        hidden_layer = Dense(units=dense_units, activation='relu')
        network.add(hidden_layer)

    # Linear head: one output per action.
    output_layer = Dense(nb_actions, activation='linear')
    network.add(output_layer)
    return network

In [26]:

def epsilon_greedy(q_values, epsilon, n_outputs):
    """Pick a 1-based action with an epsilon-greedy policy.

    Args:
        q_values: sequence of ``n_outputs`` Q-values, where index ``i`` holds
            the value of action ``i + 1``.
        epsilon: probability of choosing a uniformly random action.
        n_outputs: number of available actions.

    Returns:
        int: an action in ``1..n_outputs`` (inclusive).
    """
    if random.random() < epsilon:
        # randrange excludes its stop value, so use n_outputs + 1; otherwise
        # the last action would never be explored.
        return random.randrange(1, n_outputs + 1)  # random action
    # Greedy branch: shift the 0-based argmax to the 1-based action id.
    return 1 + int(np.argmax(q_values))  # q-optimal action

In [147]:
# Build the game environment, the online and target Q-networks and the
# replay buffer used by the training cell.
#env = gym.make("MsPacman-ram-v0")
env = PacmanGame(field_shape=(8, 8), nmonsters=2, ndiamonds=3, nwalls=10, monster_vision_range=2) #, max_moves=100, diamond_reward=100, survival_reward=1)
#env = PacmanGame(field_shape=(10,10), nmonsters=2,ndiamonds=3, nwalls=4, monster_vision_range=1)
obs = env.reset()
end_game = False
score = 0


# Hard-coded input width. Presumably (1 player + 2 monsters + 3 diamonds +
# 10 walls) * 2 coordinates = 32, matching what get_state() emits for the
# environment above -- TODO confirm the entity lists always keep that length.
input_shape = (32,) #obs.shape
# One network output per action; the rest of the notebook uses 1-based
# action ids (1..nb_actions).
nb_actions = 9 #env.action_space.n  # 9
dense_layers = 5
dense_units = 256

# Network that is trained and used to choose actions.
online_network = create_dqn_model(input_shape, nb_actions, dense_layers, dense_units)


# Bounded FIFO of (state, action, reward, next_state, end_game) tuples.
replay_memory_maxlen = 1000000
replay_memory = deque([], maxlen=replay_memory_maxlen)


# Target network: same architecture, starts with a copy of the online weights.
target_network = clone_model(online_network)
target_network.set_weights(online_network.get_weights())

In [148]:
def mean_q(y_true, y_pred):
    """Keras metric: mean over the batch of the largest predicted Q-value.

    ``y_true`` is accepted only because Keras metrics take both arguments;
    it is not used.
    """
    best_q_per_sample = K.max(y_pred, axis=-1)
    return K.mean(best_q_per_sample)

In [149]:
# Training hyperparameters and run configuration for the DQN training cell.
name = 'MiniPacman_DQN'  # used in naming files (weights, logs, etc)
n_steps = 5000        # total number of training steps (= n_epochs)
warmup = 1000          # start training after warmup iterations
training_interval = 4  # period (in actions) between training steps
save_steps = int(n_steps/10)  # period (in training steps) between storing weights to file
copy_steps = 100       # period (in training steps) between updating target_network weights
gamma = 0.95            # discount rate
skip_start = 10        # number of start-of-game frames to skip; NOTE(review): not read by any active code in the training loop
batch_size = 64        # size of minibatch that is taken randomly from replay memory every training step
double_dqn = False     # whether to use the Double-DQN target or the plain DQN target
# eps-greedy parameters: we slowly decrease epsilon from eps_max to eps_min in eps_decay_steps
eps_max = 1.0
eps_min = 0.05
eps_decay_steps = int(n_steps/2)

# Learning rate passed to the Adam optimizer.
learning_rate = 0.0001

In [150]:
# Compile the online network and prepare the output folders, callbacks and
# counters consumed by the training loop.
online_network.compile(optimizer=Adam(learning_rate), loss='mse', metrics=[mean_q])

# A single makedirs call creates both <name> and <name>/weights; exist_ok makes
# re-running the cell safe without a separate check-then-create step.
weights_folder = os.path.join(name, 'weights')
os.makedirs(weights_folder, exist_ok=True)


# NOTE(review): csv_logger is created here, but the fit() call in the training
# loop only passes early_stopping -- confirm whether logging was intended.
csv_logger = CSVLogger(os.path.join(name, 'log.csv'), append=True, separator=';')
early_stopping = EarlyStopping(monitor='mean_q', patience=300)


# counters:
step = 0          # training step counter (= epoch counter)
iteration = 0     # frames counter
episodes = 0      # game episodes counter
end_game = True       # indicator that env needs to be reset

episode_scores = []  # collect total scores in this list and log it later


# Main DQN training loop.
#
# Every pass plays one action chosen epsilon-greedily by the online network and
# stores the transition in replay_memory. Once `warmup` iterations have passed,
# every `training_interval`-th iteration also runs one training step on a
# minibatch made of short runs of consecutive transitions.
while step < n_steps:
    if end_game:  # game over, restart it
        obs = env.reset()
        # Clear the flag right away: if the first action after a reset is
        # rejected below, the `continue` must not trigger another reset and
        # another `episodes` increment.
        end_game = False
        score = 0  # reset score for current episode
        state = get_state(obs)
        episodes += 1

    # Online network evaluates what to do
    iteration += 1
    q_values = online_network.predict(np.array([state]))[0]  # calculate q-values using online network

    # select epsilon (which linearly decreases over training steps):
    epsilon = max(eps_min, eps_max - (eps_max-eps_min) * step/eps_decay_steps)
    action = epsilon_greedy(q_values, epsilon, nb_actions)

    # Play:
    try:
        obs = env.make_action(action)
        reward = obs['reward']
        end_game = obs['end_game']
    except AssertionError:
        # Presumably the environment rejected the action -- verify against
        # mini_pacman. Try again with a fresh action on the next pass.
        continue

    score += reward

    if end_game:
        episode_scores.append(score)
    next_state = get_state(obs)
    # Let's memorize what just happened
    replay_memory.append((state, action, reward, next_state, end_game))
    state = next_state

    # Requiring at least 2 * batch_size stored transitions guarantees that the
    # rejection sampling below can always find another non-overlapping run.
    if (iteration >= warmup and iteration % training_interval == 0
            and len(replay_memory) >= 2 * batch_size):
        # learning branch
        step += 1

        # Build the minibatch from runs of `selectcnt` consecutive transitions.
        selectcnt = 4
        # Integer division: a float limit compared with == would never match
        # when batch_size is not a multiple of selectcnt.
        n_runs = batch_size // selectcnt
        max_start = len(replay_memory) - selectcnt
        run_starts = []
        while len(run_starts) < n_runs:
            cstart = random.randint(0, max_start)
            # Two runs overlap when their starts are closer than selectcnt in
            # EITHER direction, so compare the absolute distance.
            if any(abs(cstart - prev) < selectcnt for prev in run_starts):
                continue
            run_starts.append(cstart)

        minibatch = [replay_memory[idx]
                     for start in run_starts
                     for idx in range(start, start + selectcnt)]

        replay_state = np.array([x[0] for x in minibatch])
        replay_action = np.array([x[1] for x in minibatch])
        replay_rewards = np.array([x[2] for x in minibatch])
        replay_next_state = np.array([x[3] for x in minibatch])
        replay_end_game = np.array([x[4] for x in minibatch], dtype=int)
        # Row indices for fancy indexing; taken from the actual minibatch size.
        batch_rows = np.arange(len(minibatch))

        # calculate targets: r + gamma * Q_target(next_state, a*) for
        # non-terminal transitions, r alone for terminal ones
        if not double_dqn:
            # DQN: a* maximises the target network's own estimate
            target_for_action = replay_rewards + (1-replay_end_game) * gamma * \
                                    np.amax(target_network.predict(replay_next_state), axis=1)
        else:
            # Double DQN: the online network picks a*, the target network scores it.
            # argmax already yields 0-based output indices, so it is used as-is
            # (only stored actions are 1-based and need the -1 shift).
            best_actions = np.argmax(online_network.predict(replay_next_state), axis=1)
            target_for_action = replay_rewards + (1-replay_end_game) * gamma * \
                                    target_network.predict(replay_next_state)[batch_rows, best_actions]

        target = online_network.predict(replay_state)  # targets coincide with predictions ...
        target[batch_rows, replay_action-1] = target_for_action  #...except for targets with actions from replay

        # Train online network (exactly one epoch per call; epochs/initial_epoch
        # only make the Keras log show the running step number)
        online_network.fit(replay_state, target, epochs=step, verbose=2, initial_epoch=step-1,
                           callbacks=[early_stopping])

        # Periodically copy online network weights to target network
        if step % copy_steps == 0:
            target_network.set_weights(online_network.get_weights())
        # And save weights
        if step % save_steps == 0:
            online_network.save_weights(os.path.join(weights_folder, 'weights_{}.h5f'.format(step)))
            gc.collect()  # also clean the garbage




Epoch 1/1
 - 1s - loss: 0.5120 - mean_q: 0.8490
Epoch 2/2
 - 0s - loss: 0.1647 - mean_q: 0.8648
Epoch 3/3
 - 0s - loss: 0.3589 - mean_q: 0.9551
Epoch 4/4
 - 0s - loss: 0.1424 - mean_q: 1.0911
Epoch 5/5
 - 0s - loss: 0.3175 - mean_q: 1.1760
Epoch 6/6
 - 0s - loss: 0.2934 - mean_q: 1.2155
Epoch 7/7
 - 0s - loss: 0.3433 - mean_q: 1.3264
Epoch 8/8
 - 0s - loss: 0.0563 - mean_q: 1.4284
Epoch 9/9
 - 0s - loss: 0.0573 - mean_q: 1.4627
Epoch 10/10
 - 0s - loss: 0.2074 - mean_q: 1.6626
Epoch 11/11
 - 0s - loss: 0.2000 - mean_q: 1.7569
Epoch 12/12
 - 0s - loss: 0.1956 - mean_q: 1.8000
Epoch 13/13
 - 0s - loss: 0.1725 - mean_q: 1.8271
Epoch 14/14
 - 0s - loss: 0.0505 - mean_q: 1.9096
Epoch 15/15
 - 0s - loss: 0.2208 - mean_q: 1.9786
Epoch 16/16
 - 0s - loss: 0.0252 - mean_q: 1.9387
Epoch 17/17
 - 0s - loss: 0.2100 - mean_q: 2.0550
Epoch 18/18
 - 0s - loss: 0.0358 - mean_q: 1.8550
Epoch 19/19
 - 0s - loss: 0.2001 - mean_q: 1.9049
Epoch 20/20
 - 0s - loss: 0.3421 - mean_q: 1.9214
Epoch 21/21
 - 0s 

KeyboardInterrupt: 

In [38]:
# Debug probe left over from development. In this notebook `best_actions` is
# only assigned inside the Double-DQN branch of the training loop, so this
# cell raises NameError whenever that branch has not run.
best_actions

NameError: name 'best_actions' is not defined

In [131]:
import numpy as np
import random
import time
import os
import gc

from collections import deque
from mini_pacman import PacmanGame, test

from keras.models import Sequential, clone_model
from keras.layers import Dense, Flatten, Conv2D, InputLayer, Dropout, BatchNormalization
from keras.callbacks import CSVLogger, TensorBoard
from keras.optimizers import Adam
import keras.backend as K

from keras.models import load_model

def get_state(obs):
    """Turn an observation dict into a flat coordinate list.

    Order of the output: the player's x, y first, followed by the x, y of
    each monster, each diamond and each wall as listed in ``obs``.
    """
    entities = (
        [obs['player']]
        + list(obs['monsters'])
        + list(obs['diamonds'])
        + list(obs['walls'])
    )
    # Unpack each pair and emit its two coordinates in order.
    return [coord for x, y in entities for coord in (x, y)]

def create_dqn_model(input_shape, nb_actions, dense_layers, dense_units):
    """Assemble the dense Q-network whose saved weights are loaded for evaluation.

    Args:
        input_shape: shape tuple for the ``InputLayer``.
        nb_actions: size of the linear output layer.
        dense_layers: number of hidden ``Dense`` layers.
        dense_units: units per hidden layer.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    q_network = Sequential()
    q_network.add(InputLayer(input_shape=input_shape))

    # Plain ReLU hidden layers; no batch normalisation or dropout is applied.
    for _ in range(dense_layers):
        relu_layer = Dense(units=dense_units, activation='relu')
        q_network.add(relu_layer)

    q_head = Dense(nb_actions, activation='linear')
    q_network.add(q_head)
    return q_network

def epsilon_greedy(q_values, epsilon, n_outputs):
    """Choose a 1-based action: random with probability ``epsilon``, else greedy.

    Args:
        q_values: sequence of ``n_outputs`` Q-values; index ``i`` corresponds
            to action ``i + 1``.
        epsilon: exploration probability.
        n_outputs: number of available actions.

    Returns:
        int: an action in ``1..n_outputs`` (inclusive).
    """
    explore = random.random() < epsilon
    if explore:
        # The stop value of randrange is exclusive; n_outputs + 1 keeps the
        # last action reachable.
        return random.randrange(1, n_outputs + 1)  # random action
    # argmax is 0-based, action ids are 1-based.
    return 1 + int(np.argmax(q_values))  # q-optimal action


# Rebuild the network with the same architecture settings used for training
# and load the saved weights into it.
input_shape = (32,) #obs.shape
nb_actions = 9 #env.action_space.n  # 9
dense_layers = 5
dense_units = 256

dqn_model = create_dqn_model(input_shape, nb_actions, dense_layers, dense_units)

#dqn_model = load_model('dqn_model.h5', compile=False)
# Weights file written by the training loop at step 5000; the layer shapes
# above must match the network that produced it.
dqn_model.load_weights('MiniPacman_DQN/weights/weights_5000.h5f')

def dqn_strategy(obs):
    """Strategy callback: choose an action for ``obs`` with the loaded DQN.

    The observation is flattened with ``get_state``, scored by ``dqn_model``,
    and the action is picked epsilon-greedily with a fixed exploration rate
    of 0.05.
    """
    eps = 0.05
    features = np.array([get_state(obs)])  # batch containing a single state
    q_values = dqn_model.predict(features)[0]
    return epsilon_greedy(q_values, eps, nb_actions)

# Evaluate the strategy with the mini_pacman test harness; the results are
# written to the given JSON log file.
test(strategy=dqn_strategy, log_file='test_pacman_log.json')

Your average score is 45.754, median is 18.0, saved log to 'test_pacman_log.json'. Do not forget to upload it for submission!


18.0