In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import vizdoom
import os
import time
import random
import keras
import keras.models
from keras.layers import *
from keras.optimizers import *
from keras import backend as K
from collections import deque
import numpy as np
from skimage import transform
from IPython.display import display, clear_output

Using TensorFlow backend.


In [2]:
# Run-wide hyperparameters, collected in one place so they are easy to tune.
num_episodes = 500
num_steps = 100
replay_buffer_size = 10 ** 6
learning_rate = 2e-4
discount_factor = 0.95
batch_size = 4  # 64 was noted by the author as the alternative value

In [11]:
def preprocess_frame(frame):
    """Crop, rescale and resize one screen buffer for the network.

    The frame loses 30 rows at the top, 10 rows at the bottom and 30 columns
    on each side, is divided by 255.0, and is then passed to
    ``transform.resize`` with a target size of 84 x 84.
    """
    # Crop and scale in one expression; the division happens before resizing,
    # exactly as in the step-by-step form.
    scaled_crop = frame[30:-10, 30:-30] / 255.0
    return transform.resize(scaled_crop, [84, 84])
        
class DoomInstance():
    """One ViZDoom game configured from ``doom/my_way_home.cfg``.

    Attributes:
        game: the initialised ``vizdoom.DoomGame``.
        num_actions: number of available buttons reported by the game.
    """

    def __init__(self):
        # Setup DoomGame
        self.game = vizdoom.DoomGame()
        self.game.load_config("doom/my_way_home.cfg")

        # Visualize the game (set to False to train faster)
        self.game.set_window_visible(False)

        # Set screen format to greyscale, improves training time
        self.game.set_screen_format(vizdoom.ScreenFormat.GRAY8)

        # Make the game end after 2100 ticks (set to 0 to disable)
        self.game.set_episode_timeout(2100)

        # Init game
        self.game.init()
        self.num_actions = self.game.get_available_buttons_size()

    def run_episode(self, a2c_model):
        """Play one episode with actions sampled from the policy.

        Args:
            a2c_model: object exposing ``predict_policy(inputs)``, called with
                a list holding one batch of stacked frames; the result is
                flattened into one probability per action.

        Returns:
            A list of ``(state, action_one_hot, reward, new_state)`` tuples,
            one for every action after which the episode was still running.
        """
        self.game.new_episode()

        # Init frame stack: the first frame is repeated to fill all 4 slots
        frame_stack = deque(maxlen=4)
        initial_frame = preprocess_frame(self.game.get_state().screen_buffer)
        for _ in range(4):
            frame_stack.append(initial_frame)
        state = np.stack(frame_stack, axis=2)

        batch = []
        while not self.game.is_episode_finished():

            # Predict action given state pi(a_t|s_t)
            policy_output = a2c_model.predict_policy([np.expand_dims(state, axis=0)])
            action_prob = np.asarray(policy_output, dtype=np.float64).reshape(-1)
            # Single-precision softmax output may miss summing to 1 by more
            # than np.random.choice tolerates, so renormalise in float64.
            action_prob = action_prob / action_prob.sum()
            action = np.random.choice(self.num_actions, p=action_prob)
            action_one_hot = [False] * self.num_actions
            action_one_hot[action] = True

            # Take the action
            reward = self.game.make_action(action_one_hot)

            # If not done
            # NOTE(review): the reward of the action that ends the episode is
            # never stored -- confirm this is intended.
            if not self.game.is_episode_finished():
                # Store the experience
                frame_stack.append(preprocess_frame(self.game.get_state().screen_buffer))
                new_state = np.stack(frame_stack, axis=2)
                batch.append((state, action_one_hot, reward, new_state))

                # Advance, so the next prediction and the next stored
                # transition start from the frames just observed.
                state = new_state

        return batch
    
def run_episodes(game_instances, policy_model, value_model=None):
    """Run one episode on every game instance using a thread pool.

    Args:
        game_instances: iterable of objects exposing ``run_episode(model)``.
        policy_model: forwarded to each ``run_episode`` call as its single
            model argument.
        value_model: not used; ``run_episode`` accepts only one model. Kept so
            existing three-argument calls continue to work.

    Returns:
        The ``run_episode`` results, in the same order as ``game_instances``.
        An exception raised inside a worker is re-raised here.
    """
    # Imported locally because the notebook's import cell does not provide it.
    from multiprocessing.pool import ThreadPool

    pool = ThreadPool()
    try:
        pending = [
            pool.apply_async(instance.run_episode, args=(policy_model,))
            for instance in game_instances
        ]
        pool.close()
        pool.join()
        return [result.get() for result in pending]
    finally:
        # Harmless after a clean join; stops the workers if anything above raised.
        pool.terminate()

In [34]:
class A2C():
    """Actor-critic network with a shared convolutional trunk.

    Builds a policy head (softmax over actions) and a value head (one linear
    unit) on top of the same convolutional features, and exposes Keras backend
    functions to predict with and train each head.
    """

    def __init__(self, num_actions, optimizer=None):
        """Build both models and their predict/train functions.

        Args:
            num_actions: size of the softmax policy output.
            optimizer: Keras optimizer used to build both update ops. When
                omitted, a fresh ``Adam()`` is created for this instance.
        """
        # A default of Adam() in the signature is evaluated once, so every
        # A2C instance would share one optimizer object.
        if optimizer is None:
            optimizer = Adam()

        # Construct model
        input_states = Input(shape=(84, 84, 4), dtype="float32")
        conv1 = Conv2D(32, (3, 3), activation="elu", padding="valid")(input_states)
        pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)
        conv2 = Conv2D(64, (3, 3), activation="elu", padding="valid")(pool1)
        pool2 = MaxPooling2D(pool_size=(2, 2))(conv2)
        conv3 = Conv2D(128, (3, 3), activation="elu", padding="valid")(pool2)
        shared_features = Flatten()(conv3)

        # Policy branch
        dense1       = Dense(16, activation="relu")(shared_features)
        action_prob  = Dense(num_actions, activation="softmax")(dense1)

        # Value branch
        dense2 = Dense(512, activation="elu")(shared_features)
        dense3 = Dense(128, activation="elu")(dense2)
        value  = Dense(1, activation=None)(dense3)

        # Policy model
        self.policy_model = keras.models.Model(input_states, action_prob)

        # Value model
        self.value_model = keras.models.Model(input_states, value)

        # Create prediction function
        self.predict_policy_fn = K.function(inputs=[self.policy_model.input],
                                            outputs=[self.policy_model.output],
                                            updates=[])

        # Placeholders fed to both train functions
        action_onehot_placeholder   = K.placeholder(shape=(None, num_actions))
        discount_reward_placeholder = K.placeholder(shape=(None,))

        # Probability of the action actually taken, clipped away from zero so
        # the log below stays finite.
        taken_action_prob = K.sum(self.policy_model.output * action_onehot_placeholder, axis=1)
        log_action_prob = K.log(K.clip(taken_action_prob, K.epsilon(), 1.0))

        # Policy loss: negative log likelihood of the taken actions, weighted
        # by the per-sample values fed through discount_reward_placeholder.
        policy_loss = K.mean(discount_reward_placeholder * -log_action_prob)
        policy_updates = optimizer.get_updates(params=self.policy_model.trainable_weights,
                                               loss=policy_loss)
        self.train_policy_fn = K.function(inputs=[input_states, action_onehot_placeholder, discount_reward_placeholder],
                                          outputs=[],
                                          updates=policy_updates)

        self.predict_value_fn = K.function(inputs=[self.value_model.input],
                                           outputs=[self.value_model.output],
                                           updates=[])

        # Value loss: mean squared error between the fed targets and the
        # critic output. The output has shape (batch, 1), so its last axis is
        # dropped to line up with the (batch,) targets.
        value_error = discount_reward_placeholder - K.squeeze(self.value_model.output, axis=-1)
        value_loss = K.mean(K.square(value_error))
        value_updates = optimizer.get_updates(params=self.value_model.trainable_weights,
                                              loss=value_loss)
        # The action placeholder does not enter the value loss; it stays in the
        # input list so the three-element call shape is unchanged.
        self.train_value_fn = K.function(inputs=[self.value_model.input, action_onehot_placeholder, discount_reward_placeholder],
                                         outputs=[],
                                         updates=value_updates)

    def train_policy(self, inputs):
        """Apply one policy update; ``inputs`` is ``[states, actions_one_hot, weights]``."""
        self.train_policy_fn(inputs)

    def predict_policy(self, inputs):
        """Return the policy function's outputs (a one-element list) for ``inputs == [states]``."""
        return self.predict_policy_fn(inputs)

    def train_value(self, inputs):
        """Apply one value update; ``inputs`` is ``[states, actions_one_hot, targets]``."""
        self.train_value_fn(inputs)

    def predict_value(self, inputs):
        """Return the value function's outputs (a one-element list) for ``inputs == [states]``."""
        return self.predict_value_fn(inputs)

In [35]:
# Create one game instance, build an actor-critic model with one policy output
# per available game button, and collect a single episode of transitions.
doom_instance = DoomInstance()
a2c_model = A2C(num_actions=doom_instance.game.get_available_buttons_size())
trajectory = doom_instance.run_episode(a2c_model)

In [None]:
# One actor-critic update from the collected trajectory, fed as a single batch.
replay_batch = trajectory

if not replay_batch:
    print("Trajectory is empty; skipping the update.")
else:
    replay_state      = np.stack([r[0] for r in replay_batch], axis=0).astype("float32")
    replay_action     = np.asarray([r[1] for r in replay_batch], dtype="float32")
    replay_reward     = np.asarray([r[2] for r in replay_batch], dtype="float32")
    replay_next_state = [r[3] for r in replay_batch]

    # Bootstrap from the critic's estimate of the state that follows the last
    # stored transition. run_episode only stores transitions after which the
    # episode was still running, so that state is not terminal.
    # predict_value returns a one-element list, hence the squeeze to a scalar.
    last_reward = float(np.squeeze(
        a2c_model.predict_value([np.expand_dims(replay_next_state[-1], axis=0)])
    ))

    # Discounted return for every step, accumulated backwards from the
    # bootstrap value: G_t = r_t + discount_factor * G_{t+1}.
    Q_target = np.zeros(len(replay_batch), dtype="float32")
    running_return = last_reward
    for i in reversed(range(len(replay_batch))):
        running_return = replay_reward[i] + discount_factor * running_return
        Q_target[i] = running_return

    # Advantage: how much better the observed return was than the critic's
    # estimate for the same state.
    state_values = np.reshape(a2c_model.predict_value([replay_state])[0], -1)
    discounted_rewards = Q_target - state_values

    a2c_model.train_policy([replay_state, replay_action, discounted_rewards])
    a2c_model.train_value([replay_state, replay_action, Q_target])
    
