# Exercise 1: Multi-Armed Bandit

In [None]:
import json
import numpy as np
import os
import tensorflow as tf

from unityagents import UnityEnvironment

### Hyperparameters

In [None]:
# Total number of episodes to run the environment for.
total_episodes = 4000

# Number of episodes between training progress reports.
summary_freq = 50

# Number of episodes collected between model updates.
update_frequency = 20

# Directory where summary statistics are written.
summary_path = './summaries/bandit'

# Learning rate used by the agent's optimizer.
learning_rate = 0.01

### Load the Unity Environment

In [None]:
# Create the Unity environment from "./envs/Bandit".
# NOTE(review): worker_id=0 presumably selects the communication port/instance
# -- confirm against the unityagents documentation.
env = UnityEnvironment("./envs/Bandit", worker_id=0)
# Use the first brain name reported by the environment for all later lookups.
default_brain = env.brain_names[0]

### Examine the state space

In [None]:
# Reset the environment; the returned object is indexed by brain name below.
brains = env.reset()
# Pick out the entry for our default brain and print its vector observations.
bandit_brain = brains[default_brain]
print(bandit_brain.vector_observations)

The environment is stateless, so our observation will always be `[0]`.

### The Multi-Armed Bandit Agent

In [None]:
class Agent(object):
    """Bandit agent that learns one value estimate per action.

    Attributes:
        value_estimates: Trainable vector of length ``num_actions``,
            initialized with ones.
        policy: Softmax over ``value_estimates`` (action probabilities).
        sampled_action: One action index drawn from ``policy``, shape [1, 1].
        reward_holder: float32 placeholder for a batch of observed rewards.
        action_holder: int32 placeholder for a batch of chosen action indices.
        action: One-hot encoding of ``action_holder``.
        responsible_value: For each batch entry, the value estimate of the
            action that was taken.
        loss: Mean squared difference between ``responsible_value`` and
            ``reward_holder``.
        update: Op that applies one Adam step minimizing ``loss``.
    """

    def __init__(self, learning_rate, num_actions):
        """Build the TensorFlow graph for the agent.

        Args:
            learning_rate: Learning rate passed to the Adam optimizer.
            num_actions: Number of available actions (bandit arms).
        """
        # --- Acting -------------------------------------------------------
        # Our value estimates are stored as a vector initialized with ones.
        self.value_estimates = tf.Variable(tf.ones([num_actions]))
        # In order to get probabilities for acting, we use the softmax function.
        self.policy = tf.nn.softmax(self.value_estimates)
        # tf.multinomial expects unnormalized log-probabilities (logits), not
        # probabilities. The value estimates are exactly the logits of
        # `policy`, so sampling from them draws actions distributed according
        # to `policy`. Passing the softmax output instead would apply softmax
        # twice and flatten the distribution.
        logits = tf.reshape(self.value_estimates, [-1, num_actions])
        self.sampled_action = tf.multinomial(logits, 1)

        # --- Training -----------------------------------------------------
        # We feed the rewards and chosen actions into the network
        # to compute the loss, and use it to update the value estimates.
        self.reward_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[None], dtype=tf.int32)
        self.action = tf.one_hot(self.action_holder, num_actions)

        # The one-hot mask zeroes every estimate except the one belonging to
        # the taken action; summing over axis 1 leaves one value per batch entry.
        self.responsible_value = tf.reduce_sum(
            tf.multiply(self.value_estimates, self.action), axis=1)

        # The loss is the mean squared difference between the empirical reward
        # and the value estimate of the action that produced it.
        self.loss = tf.reduce_mean(
            tf.squared_difference(self.responsible_value, self.reward_holder))
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        self.update = optimizer.minimize(self.loss)

### Training the Agent

In [None]:
# Make sure the directory for summary statistics exists.
if not os.path.exists(summary_path):
    os.makedirs(summary_path)

# Create our tensorflow agent on a fresh default graph.
tf.reset_default_graph()
action_size = env.brains[default_brain].vector_action_space_size
agent = Agent(learning_rate, action_size)

# Start an interactive TensorFlow session and initialize the agent's variables.
sess = tf.InteractiveSession()
init = tf.global_variables_initializer()
sess.run(init)
summary_writer = tf.summary.FileWriter(summary_path)

# Create variables which will be used throughout training.
rewards = []  # Reward of every episode so far.
losses = []   # Loss of every model update so far.

# The environment is closed in `finally` so that it is released even when
# training is interrupted or an error is raised part-way through.
try:
    # Restart Environment.
    env.reset()

    # Start training loop.
    ep_history = []  # [action, reward] pairs collected since the last update.
    for i in range(total_episodes):
        values = sess.run(agent.value_estimates)

        if i < total_episodes / 2:
            # First half of training: pick action randomly - explore.
            action = np.random.randint(0, action_size)
        else:
            # Second half of training: pick action greedily - exploit.
            actions = sess.run(agent.policy)
            action = np.argmax(actions)

        # Act in the environment. The current value estimates are passed
        # along as a JSON-encoded text action.
        brains = env.step(vector_action=action,
                          text_action=json.dumps(values.tolist()))

        # Collect the reward for picking one of the actions.
        bandit_brain = brains[default_brain]
        reward = bandit_brain.rewards[0]
        rewards.append(reward)
        ep_history.append([action, reward])

        # Episode 0 never triggers an update, so the first batch holds
        # update_frequency + 1 episodes and every later one update_frequency.
        if i % update_frequency == 0 and i != 0:
            # Update the agent using the outcomes of the collected actions.
            # Actions and rewards are split into separate sequences so the
            # integer action indices are not cast to float on their way to
            # the int32 placeholder.
            batch_actions = [step[0] for step in ep_history]
            batch_rewards = [step[1] for step in ep_history]
            _, value_loss = sess.run([agent.update, agent.loss],
                                     feed_dict={agent.reward_holder: batch_rewards,
                                                agent.action_holder: batch_actions})

            losses.append(value_loss)
            ep_history = []

        # Update our running tally of scores and save information to Tensorboard.
        if i % summary_freq == 0 and i > 0:
            mean_reward = np.mean(rewards[-summary_freq:])
            # Unary minus binds tighter than //, so the window size is
            # summary_freq / update_frequency rounded up (never zero).
            recent_losses = losses[-summary_freq//update_frequency:]

            summary = tf.Summary()
            summary.value.add(tag='Info/Mean Reward', simple_value=float(mean_reward))
            # No update may have happened yet (e.g. update_frequency >=
            # summary_freq); skip the loss rather than log the NaN that
            # np.mean returns for an empty list.
            if recent_losses:
                summary.value.add(tag='Info/Value Loss',
                                  simple_value=float(np.mean(recent_losses)))
            summary_writer.add_summary(summary, i)
            summary_writer.flush()
            print("Trial: {}, Mean Reward: {}".format(str(i), str(round(mean_reward, 3))))
finally:
    env.close()

In [None]:
# Close the Unity environment. The training cell already calls env.close()
# once its loop finishes, so this cell matters only when that call was not
# reached (for example, training was interrupted).
env.close()