In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import vizdoom
import os
import time
import random
import numpy as np
from collections import deque
from skimage import transform
from IPython.display import display, clear_output
import tensorflow as tf

In [2]:
class Scheduler:
    """Step-wise multiplicative decay of a scalar value.

    `get_value` returns the current value unchanged for `interval` consecutive
    calls; the call after that multiplies the value by `decay_factor`, restarts
    the countdown and returns the decayed value.
    """

    def __init__(self, initial_value, interval, decay_factor):
        """Store the starting value, the countdown length and the decay multiplier."""
        self.value = initial_value
        self.decay_factor = decay_factor
        self.interval = interval
        self.counter = interval
        self.value_factor = 1  # kept as an attribute; not read by this class

    def get_value(self):
        """Advance the countdown by one call and return the (possibly decayed) value."""
        if self.counter >= 1:
            # Countdown still running: consume one tick, value unchanged
            self.counter -= 1
            return self.value

        # Countdown exhausted: restart it and apply one decay step
        self.counter = self.interval
        self.value *= self.decay_factor
        return self.value


# Learning-rate schedule: starts at 1e-2 and is scaled by 0.75 on each decay step
lr_scheduler = Scheduler(initial_value=1e-2, interval=20, decay_factor=0.75)

In [3]:
# Hyperparameters shared by the cells below.
num_episodes     = 500   # number of iterations of the outer training loop
discount_factor  = 0.99  # gamma handed to calculate_expected_return
num_envs         = 16    # how many DoomEnv instances are created
num_actions      = 5     # size of the policy output and of the one-hot action list
t_max            = 5     # max steps simulated per environment between two training updates
frame_stack_size = 4     # frames kept per state; NOTE: A2C's input placeholder hard-codes 4 channels

In [5]:
def entropy(logits):
    """Entropy of the softmax distribution defined by `logits`, along the last axis.

    The softmax is computed inline with the per-row maximum subtracted first,
    so the exponentials cannot overflow. The last axis is reduced away.
    """
    # Shift so the largest logit in each row is 0
    shifted = logits - tf.reduce_max(logits, axis=-1, keepdims=True)
    exp_shifted = tf.exp(shifted)
    partition = tf.reduce_sum(exp_shifted, axis=-1, keepdims=True)

    probs = exp_shifted / partition
    log_probs = shifted - tf.log(partition)

    # H(p) = -sum_i p_i * log p_i
    return -tf.reduce_sum(probs * log_probs, axis=-1)

class A2C():
    """Advantage actor-critic model built as a TensorFlow 1.x graph.

    A shared convolutional trunk feeds two heads: a policy head that outputs a
    distribution over `num_actions` actions and a value head that outputs a
    scalar state value. The constructor resets the default graph, builds the
    model and losses, opens a session and a TensorBoard writer under
    ./logs/run<N> (first unused N).
    """

    def __init__(self, num_actions, optimizer, value_scale=0.5, entropy_scale=0.001,
                 learning_rate=1e-4):
        """Build the graph, losses, training op, session and summary writer.

        Args:
            num_actions: number of discrete actions (width of the policy head).
            optimizer: optimizer class/factory called as `optimizer(learning_rate=...)`.
            value_scale: weight of the value loss in the total loss.
            entropy_scale: weight of the entropy bonus in the total loss.
            learning_rate: value fed to the learning-rate placeholder on each
                `train` call (stored in `self.learning_rate`, may be changed later).
        """
        tf.reset_default_graph()

        # Construct model
        self.input_states = tf.placeholder(shape=(None, 84, 84, 4), dtype=tf.float32)
        self.conv1 = tf.keras.layers.Conv2D(32, (3, 3), activation="elu", padding="valid")(self.input_states)
        self.pool1 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(self.conv1)
        self.conv2 = tf.keras.layers.Conv2D(64, (3, 3), activation="elu", padding="valid")(self.pool1)
        self.pool2 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(self.conv2)
        self.conv3 = tf.keras.layers.Conv2D(128, (3, 3), activation="elu", padding="valid")(self.pool2)
        self.shared_features = tf.keras.layers.Flatten()(self.conv3)

        # Policy branch. The unnormalized logits are kept separately because the
        # cross-entropy and entropy ops below expect logits, not probabilities.
        self.dense1        = tf.keras.layers.Dense(16, activation="relu")(self.shared_features)
        self.action_logits = tf.keras.layers.Dense(num_actions, activation=None)(self.dense1)
        self.action_prob   = tf.nn.softmax(self.action_logits)  # π(a | s_t; θ)

        # Baseline value branch
        self.dense2 = tf.keras.layers.Dense(512, activation="elu")(self.shared_features)
        self.dense3 = tf.keras.layers.Dense(128, activation="elu")(self.dense2)
        self.value  = tf.keras.layers.Dense(1, activation=None)(self.dense3) # V(s_t; θ_v)

        # (N, 1) -> (N,). Squeezing only axis 1 keeps the batch axis even when N == 1
        # and makes the value line up element-wise with the (N,) returns.
        value_flat = tf.squeeze(self.value, axis=1)

        # Training inputs
        self.actions_placeholder = tf.placeholder(shape=(None,), dtype=tf.int32)
        self.returns_placeholder = tf.placeholder(shape=(None,), dtype=tf.float32)
        self.lr_placeholder      = tf.placeholder(shape=(), dtype=tf.float32)

        # log π(a_t | s_t; θ) of the taken actions.
        # The sparse cross-entropy op returns -log softmax(logits)[label], hence the negation.
        self.log_action_prob = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.action_logits,
            labels=self.actions_placeholder)

        # Advantage A_t = R_t − V(s_t; θ_v). It is treated as a constant in the
        # policy loss so that the policy gradient does not flow into the value head;
        # the value head is trained by value_loss only.
        advantage = tf.stop_gradient(self.returns_placeholder - value_flat)

        # Policy gradient loss: −E[ log π(a_t | s_t; θ) · A_t ]
        self.policy_loss = -tf.reduce_mean(advantage * self.log_action_prob)

        # Value loss: MSE(V(s_t), R_t)
        self.value_loss = tf.reduce_mean(tf.squared_difference(value_flat, self.returns_placeholder))

        # Mean policy entropy (entropy() is defined earlier in the notebook and takes logits)
        self.entropy_loss = tf.reduce_mean(entropy(self.action_logits))

        # Total loss; the entropy term is subtracted so that minimizing the loss
        # favors higher-entropy policies.
        self.loss = self.policy_loss + self.value_loss * value_scale - self.entropy_loss * entropy_scale

        # Minimize loss
        self.optimizer = optimizer(learning_rate=self.lr_placeholder)
        self.learning_rate = learning_rate
        self.train_step = self.optimizer.minimize(self.loss)

        # Create session
        self.sess = tf.Session()

        # Run the initializer
        self.sess.run(tf.global_variables_initializer())

        # TensorBoard scalars
        tf.summary.scalar("policy_loss", self.policy_loss)
        tf.summary.scalar("value_loss", self.value_loss)
        tf.summary.scalar("entropy_loss", self.entropy_loss)
        tf.summary.scalar("loss", self.loss)
        self.summary_merged = tf.summary.merge_all()

        # Write each run into the first ./logs/run<N> directory that does not exist yet
        run_idx = 0
        while os.path.isdir("./logs/run{}".format(run_idx)):
            run_idx += 1
        self.train_writer = tf.summary.FileWriter("./logs/run{}".format(run_idx), self.sess.graph)
        self.step_idx = 0

    def train(self, input_states, actions, returns, values=None):
        """Run one optimization step and log the loss summaries.

        Args:
            input_states: batch of states fed to the (None, 84, 84, 4) input placeholder.
            actions: integer action index per state.
            returns: return target R_t per state.
            values: unused; accepted only so existing four-argument calls keep working.

        Returns:
            [loss, policy_loss, value_loss, entropy_loss] for this batch.
        """
        r = self.sess.run([self.summary_merged, self.train_step, self.loss, self.policy_loss, self.value_loss, self.entropy_loss],
                          feed_dict={self.input_states: input_states,
                                     self.actions_placeholder: actions,
                                     self.returns_placeholder: returns,
                                     self.lr_placeholder: self.learning_rate})
        self.train_writer.add_summary(r[0], self.step_idx)
        self.step_idx += 1
        # r[0] is the serialized summary and r[1] the train op result; callers only get the losses
        return r[2:]

    def predict(self, input_states):
        """Return [action probabilities (N, num_actions), state values (N, 1)] for a batch of states."""
        return self.sess.run([self.action_prob, self.value], feed_dict={self.input_states: input_states})

a2c_model = A2C(num_actions=num_actions, optimizer=tf.train.AdamOptimizer)

In [6]:
class DoomEnv():
    """One ViZDoom game instance together with its stacked-frame observation.

    After `reset()` the instance exposes `frame_stack` (a deque of the most
    recent preprocessed frames) and `state` (those frames stacked along axis 2).
    """

    def __init__(self, show_window=False):
        """Create and initialize the game; `show_window` toggles the game window."""
        game = vizdoom.DoomGame()

        # Scenario to play (other configs tried earlier:
        # doom/my_way_home.cfg, doom/defend_the_center.cfg)
        game.load_config("doom/basic.cfg")

        # Hiding the window trains faster
        game.set_window_visible(show_window)

        # Greyscale screen buffer, improves training time
        game.set_screen_format(vizdoom.ScreenFormat.GRAY8)

        # No explicit episode timeout is set here (set_episode_timeout(2100) is disabled)
        game.init()
        self.game = game

    def reset(self):
        """Start a new episode and rebuild the frame stack from its first frame."""
        self.game.new_episode()

        # Fill the whole stack with copies of the first frame so the state
        # has the full depth right from the first step
        first_frame = preprocess_frame(self.game.get_state().screen_buffer)
        self.frame_stack = deque([first_frame] * frame_stack_size, maxlen=frame_stack_size)
        self.state = np.stack(self.frame_stack, axis=2)

# Only the first environment opens a visible window
envs = [DoomEnv(env_idx == 0) for env_idx in range(num_envs)]

In [None]:
def preprocess_frame(frame):
    """Crop a raw screen buffer, divide its pixel values by 255 and resize it to 84x84."""
    # Drop 30 rows at the top, 10 at the bottom and 30 columns on each side
    region = frame[30:-10, 30:-30]
    # Scale the pixel values first, then resize to the network's input size
    return transform.resize(region / 255.0, [84, 84])

def calculate_expected_return(rewards, gamma):
    """Discounted return for every step of a reward sequence.

    Element i of the result is rewards[i] + gamma * rewards[i+1] + gamma^2 * rewards[i+2] + ...
    The result is a new list with the same length as `rewards`.
    """
    discounted = [None] * len(rewards)
    running = 0
    # Walk backwards so each return reuses the one computed for the next step
    for idx in range(len(rewards) - 1, -1, -1):
        running = rewards[idx] + gamma * running
        discounted[idx] = running
    return discounted

# Losses shown in the progress header; NaN until the first episode has been trained
episode_loss = episode_policy_loss = episode_value_loss = episode_entropy_loss = float("nan")
for episode in range(num_episodes):
    # Progress header: the loss figures are the sums accumulated during the previous episode
    clear_output(wait=True)
    print("-- Episode {}/{} --".format(episode, num_episodes))
    print("Learning rate:", a2c_model.learning_rate)
    print("Episode policy loss:", episode_policy_loss)
    print("Episode value loss:", episode_value_loss)
    print("Episode entropy loss:", episode_entropy_loss)
    print("Episode loss:", episode_loss)
    for env in envs:
        env.reset()

    # Keep collecting rollouts while at least one environment is still running
    episode_loss = episode_policy_loss = episode_value_loss = episode_entropy_loss = 0
    while not all(env.game.is_episode_finished() for env in envs):
        states, actions, returns = [], [], []

        # For every environment
        for env in envs:
            # An environment whose episode already ended has nothing left to simulate.
            # Stepping it anyway would add transitions built from its stale last state
            # to the training batch, so it is skipped until the next reset.
            if env.game.is_episode_finished():
                continue

            # Simulate game for at most t_max steps
            rewards = []
            for _ in range(t_max):
                # Action distribution for the current state: π(a_t | s_t; θ)
                action_prob = np.squeeze(a2c_model.predict(np.expand_dims(env.state, axis=0))[0])

                # Take action stochastically
                action = np.random.choice(np.arange(0, num_actions), p=action_prob)
                action_one_hot = [False] * num_actions
                action_one_hot[action] = True
                reward = env.game.make_action(action_one_hot)

                # Store state, action and reward
                states.append(env.state)
                actions.append(action)
                rewards.append(reward)

                if env.game.is_episode_finished():
                    break

                # Get new state
                env.frame_stack.append(preprocess_frame(env.game.get_state().screen_buffer))
                env.state = np.stack(env.frame_stack, axis=2)

            # Bootstrap the return from V(s_last) unless the episode ended, in which
            # case there is no future reward. The bootstrap value is appended as a
            # pseudo-reward and its own entry is dropped again with [:-1].
            last_value = 0 if env.game.is_episode_finished() else \
                         a2c_model.predict(np.expand_dims(env.state, axis=0))[1][0][0]
            returns.extend(calculate_expected_return(rewards+[last_value], discount_factor)[:-1])

        # The while condition guarantees at least one running environment,
        # so the batch holds at least one transition here.
        eploss, pgloss, vloss, entloss = a2c_model.train(states, actions, returns, None)
        episode_loss         += eploss
        episode_policy_loss  += pgloss
        episode_value_loss   += vloss
        episode_entropy_loss += entloss

print("Done!")

-- Episode 3/500 --
Learning rate: 0.0001
Episode policy loss: -143.98721891641617
Episode value loss: 5713.306781768799
Episode entropy loss: 96.56625628471375
Episode loss: 2712.5696226358414


In [None]:
# Play 10 evaluation episodes in the first environment (the one created with a visible window).
# greedy=True always picks the most probable action; False samples from the policy instead.
greedy = True
for episode in range(10):
    env = envs[0]
    env.reset()
    while not env.game.is_episode_finished():
        # Predict action given state: π(a_t | s_t; θ)
        # A2C exposes `predict` (returning [action_prob, value]); there is no `predict_action`.
        action_prob = np.squeeze(a2c_model.predict(np.expand_dims(env.state, axis=0))[0])
        if greedy:
            action = np.argmax(action_prob)
        else:
            action = np.random.choice(np.arange(0, num_actions), p=action_prob) # Sample action stochastically
        action_one_hot = [False] * num_actions
        action_one_hot[action] = True

        # Take the action
        env.game.make_action(action_one_hot)

        if not env.game.is_episode_finished():
            # Get new state
            env.frame_stack.append(preprocess_frame(env.game.get_state().screen_buffer))
            env.state = np.stack(env.frame_stack, axis=2)