# Exercise 3: Q-Learning with TD(0) Updates

In [None]:
from __future__ import division

import json
import numpy as np
import random
import os
import tensorflow as tf

from unityagents import UnityEnvironment

### Hyperparameters

In [None]:
# Q-learning settings.
y = 0.9  # Discount rate applied to the bootstrapped next-state value
# Epsilon-greedy exploration schedule: epsilon is stepped down linearly from
# start_e towards end_e over annealing_steps environment steps.
start_e = 1
end_e = 0.1
annealing_steps = 10000
# Run length and logging.
num_episodes = 1000  # Total number of episodes to run the environment for
summary_freq = 50  # Episodes between summary writes (also the averaging window)
summary_path = './summaries/q-td'  # Directory that receives the summary statistics

### Load the environment

In [None]:
# Create the Unity environment from the build at ./envs/Tabular.
# NOTE(review): worker_id=2 presumably offsets the communication port so
# several environments can run side by side — confirm against the
# unityagents API.
env = UnityEnvironment("./envs/Tabular", worker_id=2)
# Use the first brain name the environment reports for all later lookups.
default_brain = env.brain_names[0]

### Examine the state space

In [None]:
# Reset the environment and keep the entry belonging to the default brain.
brain = env.reset()[default_brain]
# Print the observations returned by that reset.
print(brain.vector_observations)

The state (s) is a single integer that identifies one of the environment's discrete states.

## The Q-Learning Agent

In [None]:
class QAgent(object):
    """Q-value estimator built from a one-hot lookup and one bias-free dense layer.

    Because the input is a one-hot vector and the layer has no bias or
    activation, the layer's kernel behaves as a table with one row per state
    and one column per action.

    Attributes set by the constructor:
        state_input: int32 placeholder of shape [1] holding the state index.
        q_out: Q-value estimates for the fed state, one per action.
        predict: index of the largest entry of q_out along axis 1.
        q_next: float32 placeholder of shape [1, num_actions] for the targets.
        loss: sum of squared differences between q_next and q_out.
        update: Adam training op that minimizes loss.
    """

    def __init__(self, num_states, num_actions, lr):
        """Build the inference and training ops in the default TensorFlow graph.

        Args:
            num_states: depth of the one-hot encoding of the state index.
            num_actions: number of Q-values produced per state.
            lr: learning rate handed to the Adam optimizer.
        """
        # These lines establish the feed-forward part of the network used to
        # estimate Q(s, a).

        # The network takes an integer and uses it to index a row of the matrix.
        self.state_input = tf.placeholder(shape=[1], dtype=tf.int32, name="state_input")
        state = tf.contrib.layers.one_hot_encoding(self.state_input, num_states)
        # Every Q-value starts at 1 (ones initializer); no bias term is used.
        self.q_out = tf.layers.dense(state, num_actions, use_bias=False,
                                     kernel_initializer=tf.ones_initializer(),
                                     activation=None)

        # Selected action is largest Q value for current state.
        self.predict = tf.argmax(self.q_out, 1)

        # Below we obtain the loss by taking the sum of squares difference
        # between the target and prediction Q values. The target placeholder
        # is sized from num_actions so that it always matches q_out.
        self.q_next = tf.placeholder(shape=[1, num_actions], dtype=tf.float32, name="q_input")
        self.loss = tf.reduce_sum(tf.squared_difference(self.q_next, self.q_out))
        trainer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update = trainer.minimize(self.loss)

### Training the network

In [None]:
# Train the Q-learning agent with one-step TD updates, writing summary
# statistics to `summary_path` every `summary_freq` episodes.
if not os.path.exists(summary_path):
    os.makedirs(summary_path)

# Create our tensorflow agent
tf.reset_default_graph()
space_size = env.brains[default_brain].vector_observation_space_size
action_size = env.brains[default_brain].vector_action_space_size
agent = QAgent(space_size, action_size, 1e-2)

# Start an interactive TensorFlow session.
sess = tf.InteractiveSession()
init = tf.global_variables_initializer()
sess.run(init)
summary_writer = tf.summary.FileWriter(summary_path)

# The graph was just reset and the agent's dense layer has no bias, so the
# first trainable variable is that layer's kernel, i.e. the Q-table.
# Looked up once here instead of on every training step.
q_weights = tf.trainable_variables()[0]

# Create variables which will be used throughout training.
e_drop = (start_e - end_e) / annealing_steps
e = start_e
value_table = np.zeros([space_size])
episode_list = []
reward_list = []
loss_list = []

# Reset the environment before training.
brains = env.reset()
state = brains[default_brain].vector_observations[0]

# Start training loop.
for i in range(num_episodes):
    # Per-episode counters.
    # NOTE(review): the environment is not reset here; `state` carries over
    # from the last step of the previous episode. Presumably the Unity side
    # restarts the agent once it reports done — confirm.
    total_reward = 0
    done = False
    steps = 0
    while not done:
        steps += 1
        # Choose an action greedily from the Q-network (with probability e
        # of taking a random action instead).
        action, Q = sess.run([agent.predict, agent.q_out],
                             feed_dict={agent.state_input: state})
        action = action[0]
        if np.random.rand() < e:
            action = np.random.randint(0, action_size)

        # Get new state and reward from environment. The current per-state
        # value estimates are sent along as JSON in the text action.
        brains = env.step(vector_action=action,
                          text_action=json.dumps(value_table.tolist()))
        tabular_brain = brains[default_brain]
        state_1 = tabular_brain.vector_observations[0]
        reward = tabular_brain.rewards[0]
        done = tabular_brain.local_done[0]

        # Build the training target from a copy of the current estimates so
        # that only the entry for the chosen action differs from the
        # prediction (and the fetched Q array itself is left untouched).
        targetQ = Q.copy()
        if done:
            # Terminal step: no future value to bootstrap from.
            targetQ[0, action] = reward
        else:
            # Obtain the Q' values by feeding the new state through our network
            Q1 = sess.run(agent.q_out, feed_dict={agent.state_input: state_1})
            # Obtain maxQ' and set our target value for chosen action.
            maxQ1 = np.max(Q1)
            targetQ[0, action] = reward + y * maxQ1

        # Train our network using target and estimated Q values
        _, q_table, v_loss = sess.run([agent.update, q_weights, agent.loss],
                                      feed_dict={agent.state_input: state,
                                                 agent.q_next: targetQ})
        total_reward += reward
        state = state_1
        # Per-state value estimate: mean of that state's Q-values over actions.
        value_table = np.mean(q_table, axis=1)
        # Anneal epsilon by one step until it reaches its floor.
        if e > end_e:
            e -= e_drop
        loss_list.append(v_loss)

    episode_list.append(steps)
    reward_list.append(total_reward)

    # Update our running tally of scores and save information to Tensorboard.
    if i % summary_freq == 0 and i != 0:
        mean_reward = np.mean(reward_list[-summary_freq:])
        summary = tf.Summary()
        summary.value.add(tag='Info/Reward', simple_value=float(mean_reward))
        # NOTE: loss_list holds one entry per step (not per episode), so this
        # averages the last `summary_freq` training steps.
        summary.value.add(tag='Info/Value Loss', simple_value=float(np.mean(loss_list[-summary_freq:])))
        summary.value.add(tag='Info/Epsilon', simple_value=float(e))
        summary.value.add(tag='Info/Q Estimate', simple_value=float(np.mean(value_table)))
        summary.value.add(tag='Info/Episode Length', simple_value=float(np.mean(episode_list[-summary_freq:])))
        summary_writer.add_summary(summary, i)
        summary_writer.flush()
        print("Episode: {}, Epsilon: {}, Mean Reward: {}".format(str(i), str(e), str(round(mean_reward, 3))))

# Release the event file and the Unity environment once training is finished.
summary_writer.close()
env.close()

In [None]:
# Close the Unity environment. The training cell also calls env.close() when
# it runs to completion.
# NOTE(review): confirm that closing an already-closed environment is harmless.
env.close()