In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import vizdoom
import os
import time
import random
import numpy as np
from collections import deque
from skimage import transform
from IPython.display import display, clear_output
import tensorflow as tf

In [2]:
num_episodes       = 500
learning_rate      = 0.0002
discount_factor    = 0.99
num_envs           = 16
num_actions        = 5

t_max            = 5
I_Update         = 5
frame_stack_size = 4

In [3]:
class A2C():
    def __init__(self, num_actions, optimizer=tf.train.RMSPropOptimizer(0.0001)):
        tf.reset_default_graph()
        
        # Construct model
        self.input_states = tf.placeholder(shape=(None, 84, 84, 4), dtype=tf.float32)
        conv1 = tf.keras.layers.Conv2D(32, (3, 3), activation="elu", padding="valid", input_shape=(84, 84, 4))(self.input_states)
        pool1 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(conv1)
        conv2 = tf.keras.layers.Conv2D(64, (3, 3), activation="elu", padding="valid")(pool1)
        pool2 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(conv2)
        conv3 = tf.keras.layers.Conv2D(128, (3, 3), activation="elu", padding="valid")(pool2)
        shared_features = tf.keras.layers.Flatten()(conv3)
        
        # Policy branch
        dense1            = tf.keras.layers.Dense(16, activation="relu")(shared_features)
        self.action_prob  = tf.keras.layers.Dense(num_actions, activation="softmax")(dense1)
        
        # Baseline value branch
        dense2              = tf.keras.layers.Dense(512, activation="elu")(shared_features)
        dense3              = tf.keras.layers.Dense(128, activation="elu")(dense2)
        self.baseline_value = tf.keras.layers.Dense(1, activation=None)(dense3) # V(s_t; θ_v)
        
        # Create policy gradient train function
        self.actions_onehot_placeholder = tf.placeholder(shape=(None, num_actions), dtype=tf.float32)
        self.returns_placeholder        = tf.placeholder(shape=(None,), dtype=tf.float32) # R_t
        
        # Get probabilities of taken actions: log π(a_t | s_t; θ)
        log_action_prob = tf.log(tf.reduce_sum(self.action_prob * self.actions_onehot_placeholder, axis=1))
        
        # Loss = ∇_θ log π(a_t | s_t; θ)(R_t − V(s_t; θ_v))
        # Negative log likelihood of the taken actions,
        # weighted by the discounted and normalized rewards
        self.loss = -tf.reduce_mean(log_action_prob * (self.returns_placeholder - self.baseline_value)) # + TODO: entropy
        
        # Create gradient accumulator
        tvs = tf.trainable_variables()
        accum_vars = [tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False) for tv in tvs]
        gvs = optimizer.compute_gradients(self.loss, tvs)
        
        self.accum_grads = [accum_vars[i].assign_add(gv[0]) for i, gv in enumerate(gvs)]
        self.reset_grads = [tv.assign(tf.zeros_like(tv)) for tv in accum_vars]
        self.train_step = optimizer.apply_gradients([(accum_vars[i], gv[1]) for i, gv in enumerate(gvs)])
        
        # Create session
        self.sess = tf.Session()

        # Run the initializer
        self.sess.run(tf.global_variables_initializer())
        
    def apply_gradients(self):
        return self.sess.run([self.train_step])
        
    def reset_gradients(self):
        return self.sess.run([self.reset_grads], feed_dict={})
    
    def accumulate_gradients(self, input_states, actions_onehot, returns):
        return self.sess.run([self.accum_grads, self.loss],
                             feed_dict={self.input_states: input_states,
                                        self.actions_onehot_placeholder: actions_onehot,
                                        self.returns_placeholder: returns})[1]
        
    def predict_action(self, input_states):
        return self.sess.run(self.action_prob, feed_dict={self.input_states: input_states})
    
    def predict_value(self, input_states):
        return self.sess.run(self.baseline_value, feed_dict={self.input_states: input_states})
    
    
a2c_model = A2C(num_actions=num_actions)

In [None]:
class DoomEnv():
    def __init__(self, show_window=False):
        # Setup DoomGame
        self.game = vizdoom.DoomGame()
        self.game.load_config("doom/my_way_home.cfg")

        # Visualize the game (set to False to train faster)
        self.game.set_window_visible(show_window)

        # Set screen format to greyscale, improves training time
        self.game.set_screen_format(vizdoom.ScreenFormat.GRAY8)

        # Make the game end after 2100 ticks (set to 0 to disable)
        self.game.set_episode_timeout(2100)

        # Init game
        self.game.init()
        
    def reset(self):
        self.game.new_episode()
        
        # Setup initial state
        self.frame_stack = deque(maxlen=4)
        initial_frame = preprocess_frame(self.game.get_state().screen_buffer)
        for _ in range(4):
            self.frame_stack.append(initial_frame)
        self.state = np.stack(self.frame_stack, axis=2)
        
envs = [DoomEnv(i == 0) for i in range(num_envs)]

In [None]:
def preprocess_frame(frame):
    cropped_frame = frame[30:-10, 30:-30] # Crop the screen
    normalized_frame = cropped_frame / 255.0 # Normalize Pixel Values
    preprocessed_frame = transform.resize(normalized_frame, [84, 84]) # Resize
    return preprocessed_frame

episode_loss = float("nan")
for episode in range(num_episodes):
    clear_output(wait=True)
    print("-- Episode {}/{} --".format(episode, num_episodes))
    print("Episode loss", episode_loss)
    for env in envs:
        env.reset()

    episode_loss = 0
    while True:
        for env in envs:
            if env.game.is_episode_finished():
                break
                    
            a2c_model.reset_gradients()
            rewards = []
            states  = []
            actions = []
            for _ in range(t_max):
                # Predict action given state: π(a_t | s_t; θ)
                action_prob = np.squeeze(a2c_model.predict_action(np.expand_dims(env.state, axis=0)))
                action = np.random.choice(np.arange(0, num_actions), p=action_prob) # Sample action stochastically
                action_one_hot = [False] * num_actions
                action_one_hot[action] = True

                states.append(env.state)
                actions.append(action_one_hot)

                #print("Taking action ",action)

                # Take the action
                rewards.append(env.game.make_action(action_one_hot))

                if env.game.is_episode_finished():
                    break

                # Get new state
                env.frame_stack.append(preprocess_frame(env.game.get_state().screen_buffer))
                env.state = np.stack(env.frame_stack, axis=2)


            R = 0 if env.game.is_episode_finished() else a2c_model.predict_value(np.expand_dims(env.state, axis=0))[0][0]
            discounted_rewards = [R]
            #print("Accumulating gradients")
            for i in range(len(rewards)-2, -1, -1):
                R = rewards[i] + discount_factor * R
                discounted_rewards.append(R)

            #print(np.array(states).shape, np.array(actions).shape, np.array(discounted_rewards).shape)
            episode_loss += a2c_model.accumulate_gradients(states, actions, discounted_rewards)

            #print("Train step")
            a2c_model.apply_gradients()

        # If all environments are done, break
        if sum([env.game.is_episode_finished() for env in envs]) == num_envs:
            break

print("Done!")

-- Episode 4/500 --
Episode loss 0.0


In [None]:
greedy = True
for episode in range(1):
    env = envs[0]
    env.reset()
    while not env.game.is_episode_finished():
        # Predict action given state: π(a_t | s_t; θ)
        action_prob = np.squeeze(a2c_model.predict_action(np.expand_dims(env.state, axis=0)))
        if greedy:
            action = np.argmax(action_prob)
        else:
            action = np.random.choice(np.arange(0, num_actions), p=action_prob) # Sample action stochastically
        action_one_hot = [False] * num_actions
        action_one_hot[action] = True

        # Take the action
        env.game.make_action(action_one_hot)
        
        if not env.game.is_episode_finished():
            # Get new state
            env.frame_stack.append(preprocess_frame(env.game.get_state().screen_buffer))
            env.state = np.stack(env.frame_stack, axis=2)