# Policy Based Agents

This notebook will describe how we get from a simple agent to one that is
capable of taking in an observation/state of the world, and taking actions
which provide the optimal reward not just in the present, but over the long
run. With this, we can have a full reinforcement learning agent.

Environments which pose the entire problem to an agent are referred to as
Markov Decision Processes (MDPs). In an MDP, rewards are not
only tied to the actions the agent takes, but also to the state of the
environment. So now, it's evident that the agent must make good decisions
in the present to get more rewards in the future.

To define an MDP, we say it consists of a set of all possible states `S`, from which our agent experiences a state `s` at any time, and a set of possible actions `A`, from which our agent at any time takes an action `a`. Given a state-action pair `(s, a)`, the transition probability to a new state `s'` is defined by `T(s, a)` and the reward `r` is given by `R(s, a)`.

## Cart-pole problem

In [19]:
import tensorflow as tf
import numpy as np
import gym
import matplotlib.pyplot as plt
%matplotlib inline

def reset_graph(seed=42):
    """Reset the default TensorFlow graph and reseed TensorFlow and NumPy.

    Used to make this notebook's output stable across runs.

    Args:
        seed: integer passed to both ``tf.set_random_seed`` and
            ``np.random.seed``.
    """
    # NumPy's generator is independent of the TensorFlow graph, so it can be
    # seeded before the graph is reset.
    np.random.seed(seed)
    # The TensorFlow seed is set after the reset so that it is applied to the
    # fresh default graph.
    tf.reset_default_graph()
    tf.set_random_seed(seed)

# Keep the name `xrange` available on both Python 2 and Python 3: on
# Python 3 the builtin no longer exists, so the lookup raises NameError and
# we alias it to `range` (which is lazy there, like Python 2's xrange).
try:
    xrange = xrange
except NameError:
    xrange = range

In [20]:
# Create the CartPole-v0 environment that the training cell below resets
# and steps through.
env = gym.make("CartPole-v0")

[33mWARN: gym.spaces.Box autodetected dtype as <type 'numpy.float32'>. Please provide explicit dtype.[0m


### Policy-Based Agent

In [30]:
gamma = 0.99  # discount factor applied once per time step


def discount_rewards(r):
    """Return the discounted cumulative reward for every step of an episode.

    Element ``t`` of the result equals
    ``r[t] + gamma * r[t+1] + gamma**2 * r[t+2] + ...``, computed with a
    single backward pass over the rewards.

    Args:
        r: 1D array (or plain sequence) of per-step rewards.

    Returns:
        A NumPy array with the same length as ``r``. Floating-point inputs
        keep their dtype; any other dtype (integer, object, ...) is
        returned as float64 so fractional discounted values are not
        truncated.
    """
    r = np.asarray(r)
    # np.zeros_like(r) would inherit an integer dtype and silently truncate
    # the discounted sums, so pick a floating-point output dtype explicitly.
    out_dtype = r.dtype if np.issubdtype(r.dtype, np.floating) else np.float64
    discounted_r = np.zeros(r.shape, dtype=out_dtype)
    running_sum = 0.0
    # Walk backwards so each step's return reuses the already-accumulated
    # return of the following step.
    for t in reversed(range(r.size)):
        running_sum = running_sum * gamma + r[t]
        discounted_r[t] = running_sum
    return discounted_r

In [31]:
class agent():
    """
        Policy network with one hidden layer, plus the ops needed to train
        it from externally accumulated gradients.

        The constructor only builds TensorFlow graph ops; running them is
        left to the caller's session.

        lr:     float - learning_rate
        s_size: int   - State/input size
        a_size: int   - Action/output size
        h_size: int   - number of neurons in the hidden layer

        Attributes created:
        input:               float32 placeholder of shape [None, s_size]
        output:              softmax over the a_size actions
        chosen_action:       argmax of `output` along axis 1
        r_holder, a_holder:  placeholders for rewards (float32) and
                             actions (int32), both of shape [None]
        loss:                -mean(log(prob of the fed action) * reward)
        gradients:           gradients of `loss` w.r.t. the trainable vars
        g_holder:            one placeholder per trainable variable, used
                             to feed gradients back in
        update_batch:        op applying the fed gradients with Adam
    """
    def __init__(self, lr, s_size, a_size, h_size):
        # Feed-forward part: state -> ReLU hidden layer -> softmax over actions.
        self.input = tf.placeholder(shape=[None, s_size], dtype=tf.float32)
        # NOTE(review): bias_initializer=None presumably does not disable the
        # bias term in tf.layers.dense (use_bias controls that) - confirm
        # whether a bias-free layer was intended here and below.
        hidden = tf.layers.dense(self.input, 
                                 h_size, 
                                 bias_initializer=None,
                                 activation=tf.nn.relu)
        self.output = tf.layers.dense(hidden, 
                                      a_size,
                                      bias_initializer=None,
                                      activation=tf.nn.softmax)
        # Index of the highest-probability action for each input row.
        self.chosen_action = tf.argmax(self.output, 1)
        
        # We use these variables to get the reward and action into the
        # network to compute the loss, and use it to update the network.
        self.r_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.a_holder = tf.placeholder(shape=[None], dtype=tf.int32)
        
        # Flat index (row * num_columns + action) of the fed action's
        # probability inside the flattened output tensor.
        self.indices = tf.range(0, tf.shape(self.output)[0]) * \
                       tf.shape(self.output)[1] + \
                       self.a_holder
        # Probability the network assigned to each fed action.
        self.responsible_outputs = tf.gather(tf.reshape(self.output, [-1]), self.indices)
        # Negative mean of log-probability weighted by the fed reward.
        self.loss = -tf.reduce_mean(tf.log(self.responsible_outputs) * self.r_holder)
        # NOTE(review): this picks up every trainable variable returned by
        # tf.trainable_variables(), not only the two dense layers built
        # above - confirm no other variables exist when this is constructed.
        tvars = tf.trainable_variables()
        # One placeholder per trainable variable so gradients computed (and
        # accumulated) outside the graph can be fed back in for the update.
        self.g_holder = []
        for idx, var in enumerate(tvars):
            placeholder = tf.placeholder(tf.float32, name=str(idx)+'_holder')
            self.g_holder.append(placeholder)
            
        # Gradients of the loss w.r.t. each trainable variable; fetched by
        # the caller rather than applied directly.
        self.gradients = tf.gradients(self.loss, tvars)
        
        # Applies whatever is fed into g_holder to the matching variables.
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update_batch = optimizer.apply_gradients(zip(self.g_holder, tvars))

### Training the Agent

In [None]:
# Train the policy agent on the CartPole environment: play episodes with
# actions sampled from the network's output distribution, compute the
# loss gradients once per finished episode, accumulate them, and apply the
# accumulated gradients every `update_freq` episodes.
reset_graph()

myAgent = agent(lr=1e-2, s_size=4, a_size=2, h_size=8)

total_episodes = 5000  # number of episodes to play
max_ep = 999           # upper bound on the number of steps in one episode
update_freq = 5        # apply accumulated gradients every this many episodes

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    total_reward = []
    total_length = []

    # One zero-filled accumulator per trainable variable, shaped like it.
    grad_buffer = [np.zeros_like(var)
                   for var in sess.run(tf.trainable_variables())]

    for i in range(total_episodes):
        s = env.reset()
        running_reward = 0
        # Per-step history kept as separate lists: stacking states, ints and
        # floats into a single np.array would build a ragged object array.
        states, actions, rewards = [], [], []
        for j in range(max_ep):
            # Take probabilities of actions from network
            a_dist = sess.run(myAgent.output, feed_dict={myAgent.input: [s]})
            # Sample the action *index* directly. Sampling a probability
            # value and searching for it afterwards always yields the first
            # action when two actions have equal probability.
            a = np.random.choice(a_dist.shape[1], p=a_dist[0])

            # Do the action and get the reward, next state and done flag.
            s1, r, d, _ = env.step(a)
            states.append(s)
            actions.append(a)
            rewards.append(r)
            s = s1
            running_reward += r

            if d:
                # Episode finished: compute gradients for the whole episode
                # using discounted rewards and add them to the buffer.
                feed_dict = {
                    myAgent.r_holder: discount_rewards(
                        np.array(rewards, dtype=np.float64)),
                    myAgent.a_holder: actions,
                    myAgent.input: np.vstack(states),
                }
                grads = sess.run(myAgent.gradients, feed_dict=feed_dict)
                for idx, grad in enumerate(grads):
                    grad_buffer[idx] += grad

                if i % update_freq == 0 and i != 0:
                    # Apply the accumulated gradients, then clear the buffer.
                    feed_dict = dict(zip(myAgent.g_holder, grad_buffer))
                    sess.run(myAgent.update_batch, feed_dict=feed_dict)
                    grad_buffer = [np.zeros_like(grad) for grad in grad_buffer]

                total_reward.append(running_reward)
                # NOTE: j is the zero-based index of the final step.
                total_length.append(j)
                break

        if i % 100 == 0:
            # Mean total reward over (up to) the last 100 finished episodes.
            # Call syntax prints identically on Python 2 and Python 3.
            print(np.mean(total_reward[-100:]))

12.0
15.93
18.44
38.02
44.26
57.02
67.29
49.97
118.43
172.32
179.96
197.44
193.62
196.91
200.0
199.0
198.28
200.0
