In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import vizdoom
import os
import time
import keras
import random
from keras.layers import Input, Conv2D, Dense, Flatten, MaxPooling2D, Lambda
from keras.models import Model
from keras import backend as K
from keras.optimizers import SGD
from collections import deque
import numpy as np
from skimage import transform
from IPython.display import display, clear_output

Using TensorFlow backend.


In [2]:
game = vizdoom.DoomGame()
game.load_config("doom/basic.cfg")
#game.set_window_visible(True)
game.set_doom_scenario_path("doom/basic.wad")

In [3]:
num_episodes       = 500
num_steps          = 100
replay_buffer_size = 1000000
learning_rate      = 0.0002
discount_factor    = 0.95
batch_size         = 64

In [4]:
input_images = Input(shape=(84, 84, 4), dtype="float32")
input_action = Input(shape=(3,), dtype="float32") # one-hot vector
x = Conv2D(32, (3, 3), activation='elu', padding="valid", input_shape=(84, 84, 4))(input_images)
x = MaxPooling2D(pool_size=(2, 2))(x)
x = Conv2D(64, (3, 3), activation='elu', padding="valid")(x)
x = MaxPooling2D(pool_size=(2, 2))(x)
x = Conv2D(128, (3, 3), activation='elu', padding="valid")(x)
x = Flatten()(x)
x = Dense(512, activation='elu')(x)
x = Dense(128, activation='elu')(x)
Q_actions = Dense(game.get_available_buttons_size(), activation=None)(x)
Q_input_action = Lambda(lambda x: K.expand_dims(K.sum(x[0] * x[1], axis=1), axis=-1))([Q_actions, input_action]) # Get Q-predicted for input_action

training_model = Model(inputs=[input_images, input_action], outputs=[Q_input_action])
training_model.compile(loss="mse", optimizer=SGD(lr=learning_rate))
training_model.summary()
prediction_model = Model(inputs=[input_images], outputs=[Q_actions])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 84, 84, 4)    0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 82, 82, 32)   1184        input_1[0][0]                    
__________________________________________________________________________________________________
max_pooling2d_1 (MaxPooling2D)  (None, 41, 41, 32)   0           conv2d_1[0][0]                   
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 39, 39, 64)   18496       max_pooling2d_1[0][0]            
__________________________________________________________________________________________________
max_poolin

In [5]:
def preprocess_frame(frame):
    # Crop the screen (remove the roof because it contains no information)
    cropped_frame = frame[30:-10, 30:-30]
    
    # Normalize Pixel Values
    normalized_frame = cropped_frame / 255.0
    
    # Resize
    preprocessed_frame = transform.resize(normalized_frame, [84, 84])
    
    return preprocessed_frame

In [6]:
replay_buffer = deque(maxlen=replay_buffer_size)

game.close()

game.set_episode_timeout(num_steps)
game.init()

for episode in range(num_episodes):
    game.new_episode()
    
    # Init frame stack
    frame_stack = deque(maxlen=4)
    initial_frame = preprocess_frame(game.get_state().screen_buffer)
    for _ in range(4):
        frame_stack.append(initial_frame)
    state = np.stack(frame_stack, axis=2)
    
    done = False
    for step in range(num_steps):
        if not done:
            # Get action with highest Q-value for current state
            action = np.argmax(prediction_model.predict_on_batch(np.expand_dims(state, axis=0)))
            action_one_hot = [False] * 3
            action_one_hot[action] = True

            # Take action and get reward
            reward = game.make_action(action_one_hot)
            done = game.is_episode_finished()

            if not done:
                # Get new state
                frame_stack.append(preprocess_frame(game.get_state().screen_buffer))
                new_state = np.stack(frame_stack, axis=2)
            else:
                new_state = None

            # Store experience
            replay_buffer.append((state, action_one_hot, reward, new_state))
            state = new_state
        
        # Train network on expreiences
        loss = 0
        if len(replay_buffer) >= batch_size:
            # Get batch
            replay_batch      = random.sample(replay_buffer, batch_size)
            replay_state      = [r[0] for r in replay_batch]
            replay_action     = [r[1] for r in replay_batch]
            replay_reward     = [r[2] for r in replay_batch]
            replay_next_state = [r[3] for r in replay_batch]
            
            # Q_target = reward + gamma * max_a' Q(s')
            Q_target = []
            for i in range(batch_size):
                if replay_next_state[i] is not None:
                    Q_next_state = prediction_model.predict_on_batch(np.expand_dims(replay_next_state[i], axis=0))[0]
                    Q_next_max   = np.max(Q_next_state)
                    Q_target.append(replay_reward[i] + discount_factor * Q_next_max)
                else:
                    Q_target.append(replay_reward[i])
            loss += training_model.train_on_batch([replay_state, replay_action], Q_target)
            
    clear_output(wait=True)
    display("-- Episode {}/{} --".format(episode, num_episodes))
    display("Loss: {}".format(loss))
        
print("Done!")
game.close()

'-- Episode 499/500 --'

'Loss: 11.742753982543945'

Done!


In [7]:
prediction_model.save("dqn-predict-v1.h5")
training_model.save("dqn-train-v1.h5")

In [9]:
game.close()
game.set_episode_timeout(num_steps)
game.init()

for episode in range(100):
    game.new_episode()
    
    # Init frame stack
    frame_stack = deque(maxlen=4)
    initial_frame = preprocess_frame(game.get_state().screen_buffer)
    for _ in range(4):
        frame_stack.append(initial_frame)
    state = np.stack(frame_stack, axis=2)
    
    for step in range(num_steps):
        # Get action with highest Q-value for current state
        action = np.argmax(prediction_model.predict_on_batch(np.expand_dims(state, axis=0)))
        action_one_hot = [False] * 3
        action_one_hot[action] = True

        # Take action and get reward
        reward = game.make_action(action_one_hot)
        done = game.is_episode_finished()

        if done:
            break
            
        frame_stack.append(preprocess_frame(game.get_state().screen_buffer))
        new_state = np.stack(frame_stack, axis=2)
        state = new_state
        time.sleep(0.016)
print("Done!")
game.close()

Done!
