- Tutorial

[An intro to Advantage Actor Critic methods: let’s play Sonic the Hedgehog!
](https://medium.freecodecamp.org/an-intro-to-advantage-actor-critic-methods-lets-play-sonic-the-hedgehog-86d6240171d)

# Introducing Actor Critic

The Actor Critic model is a better score function. Instead of waiting until the end of the episode as we do in Monte Carlo REINFORCE, we make an update at each step (TD Learning).

In [5]:
# AC Algorithm for continuous action_space
# Source: https://github.com/allanbreyes/bipedal-walker/blob/master/writeup.md

# randomly initialize critic network Q and actor μ
# initialize target network Q' and μ'
# initialize replay buffer R
# for episode = 1, M do:
#   initialize a random process N for action exploration
#   receive initial observation state s1
#   for t = 1, T do:
#     select action a_t = μ + N_t according to current policy
#     execute action a_t and observe reward r_t and new state s_t+1
#     store experience in replay buffer
#     sample a random minibatch of N transitions from R
#     update target values according to discount, γ
#     update the actor policy using the sampled policy gradient
#     update the target networks

In [None]:
"""
Actor-Critic with continuous action using TD-error as the Advantage, Reinforcement Learning.
The Pendulum example (based on https://github.com/dennybritz/reinforcement-learning/blob/master/PolicyGradient/Continuous%20MountainCar%20Actor%20Critic%20Solution.ipynb)
Cannot converge!!! oscillate!!!
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
tensorflow r1.3
gym 0.8.0
"""

import tensorflow as tf
import numpy as np
import gym

np.random.seed(2)
tf.set_random_seed(2)  # reproducible


class Actor(object):
    def __init__(self, sess, n_features, action_bound, lr=0.0001):
        self.sess = sess

        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.a = tf.placeholder(tf.float32, None, name="act")
        self.td_error = tf.placeholder(tf.float32, None, name="td_error")  # TD_error

        l1 = tf.layers.dense(
            inputs=self.s,
            units=30,  # number of hidden units
            activation=tf.nn.relu,
            kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
            bias_initializer=tf.constant_initializer(0.1),  # biases
            name='l1'
        )
        
        # average value for action
        mu = tf.layers.dense(
            inputs=l1,
            units=1,  # number of hidden units
            activation=tf.nn.tanh, # use tanh to scale output to [-1,1]
            kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
            bias_initializer=tf.constant_initializer(0.1),  # biases
            name='mu'
        )
        
        # deviation value for action
        sigma = tf.layers.dense(
            inputs=l1,
            units=1,  # output units
            activation=tf.nn.softplus,  # get action probabilities
            kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
            bias_initializer=tf.constant_initializer(1.),  # biases
            name='sigma'
        )
        
        global_step = tf.Variable(0, trainable=False)
        # self.e = epsilon = tf.train.exponential_decay(2., global_step, 1000, 0.9)
        self.mu, self.sigma = tf.squeeze(mu*2), tf.squeeze(sigma+0.1)
        self.normal_dist = tf.distributions.Normal(self.mu, self.sigma)
        
        # key: sample from the normal districution (mu,sigma) as the action to be taken
        self.action = tf.clip_by_value(self.normal_dist.sample(1), action_bound[0], action_bound[1]) # ensure no action value is invalid

        with tf.name_scope('exp_v'):
            log_prob = self.normal_dist.log_prob(self.a)  # loss without advantage
            self.exp_v = log_prob * self.td_error  # advantage (TD_error) guided loss
            # Add cross entropy cost to encourage exploration
            self.exp_v += 0.01*self.normal_dist.entropy()

        with tf.name_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v, global_step)    # min(v) = max(-v)

    def learn(self, s, a, td):
        s = s[np.newaxis, :]
        feed_dict = {self.s: s, self.a: a, self.td_error: td}
        _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict)
        return exp_v

    def choose_action(self, s):
        s = s[np.newaxis, :]
        return self.sess.run(self.action, {self.s: s})  # get probabilities for all actions


class Critic(object):
    def __init__(self, sess, n_features, lr=0.01):
        self.sess = sess
        with tf.name_scope('inputs'):
            self.s = tf.placeholder(tf.float32, [1, n_features], "state")
            self.v_ = tf.placeholder(tf.float32, [1, 1], name="v_next")
            self.r = tf.placeholder(tf.float32, name='r')

        with tf.variable_scope('Critic'):
            l1 = tf.layers.dense(
                inputs=self.s,
                units=30,  # number of hidden units
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='l1'
            )

            self.v = tf.layers.dense(
                inputs=l1,
                units=1,  # output units
                activation=None,
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='V'
            )

        with tf.variable_scope('squared_TD_error'):
            self.td_error = tf.reduce_mean(self.r + GAMMA * self.v_ - self.v)
            self.loss = tf.square(self.td_error)    # TD_error = (r+gamma*V_next) - V_eval
        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)

    def learn(self, s, r, s_):
        s, s_ = s[np.newaxis, :], s_[np.newaxis, :]

        v_ = self.sess.run(self.v, {self.s: s_})
        td_error, _ = self.sess.run([self.td_error, self.train_op],
                                          {self.s: s, self.v_: v_, self.r: r})
        return td_error


OUTPUT_GRAPH = False
MAX_EPISODE = 1000
MAX_EP_STEPS = 200
DISPLAY_REWARD_THRESHOLD = 100  # renders environment if total episode reward is greater then this threshold
RENDER = False  # rendering wastes time
GAMMA = 0.9
LR_A = 0.001    # learning rate for actor
LR_C = 0.01     # learning rate for critic

env = gym.make('Pendulum-v0')
env.seed(1)  # reproducible
env = env.unwrapped

N_S = env.observation_space.shape[0]
A_BOUND = env.action_space.high

sess = tf.Session()

actor = Actor(sess, n_features=N_S, lr=LR_A, action_bound=[-A_BOUND, A_BOUND])
critic = Critic(sess, n_features=N_S, lr=LR_C)

sess.run(tf.global_variables_initializer())

if OUTPUT_GRAPH:
    tf.summary.FileWriter("logs/", sess.graph)

for i_episode in range(MAX_EPISODE):
    s = env.reset()
    t = 0
    ep_rs = []
    while True:
        # if RENDER:
        env.render()
        a = actor.choose_action(s)

        s_, r, done, info = env.step(a)
        r /= 10

        td_error = critic.learn(s, r, s_)  # gradient = grad[r + gamma * V(s_) - V(s)]
        actor.learn(s, a, td_error)  # true_gradient = grad[logPi(s,a) * td_error]

        s = s_
        t += 1
        ep_rs.append(r)
        if t > MAX_EP_STEPS:
            ep_rs_sum = sum(ep_rs)
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.9 + ep_rs_sum * 0.1
            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
            print("episode:", i_episode, "  reward:", int(running_reward))
            break

episode: 0   reward: -105
episode: 1   reward: -110
episode: 2   reward: -114
episode: 3   reward: -120
episode: 4   reward: -122
episode: 5   reward: -128
episode: 6   reward: -131
episode: 7   reward: -135
episode: 8   reward: -135
episode: 9   reward: -138
episode: 10   reward: -139


In [1]:
"""
Actor-Critic with continuous action using TD-error as the Advantage, Reinforcement Learning.
The Pendulum example (based on https://github.com/dennybritz/reinforcement-learning/blob/master/PolicyGradient/Continuous%20MountainCar%20Actor%20Critic%20Solution.ipynb)
Cannot converge!!! oscillate!!!
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
tensorflow r1.3
gym 0.8.0
"""

import tensorflow as tf
import numpy as np
import gym

np.random.seed(2)
tf.set_random_seed(2)  # reproducible


class Actor(object):
    def __init__(self, sess, n_features, action_bound, lr=0.0001):
        self.sess = sess

        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.a = tf.placeholder(tf.float32, None, name="act")
        self.td_error = tf.placeholder(tf.float32, None, name="td_error")  # TD_error

        l1 = tf.layers.dense(
            inputs=self.s,
            units=30,  # number of hidden units
            activation=tf.nn.relu,
            kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
            bias_initializer=tf.constant_initializer(0.1),  # biases
            name='l1'
        )
        
        # average value for action
        mu = tf.layers.dense(
            inputs=l1,
            units=1,  # number of hidden units
            activation=tf.nn.tanh, # use tanh to scale output to [-1,1]
            kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
            bias_initializer=tf.constant_initializer(0.1),  # biases
            name='mu'
        )
        
        # deviation value for action
        sigma = tf.layers.dense(
            inputs=l1,
            units=1,  # output units
            activation=tf.nn.softplus,  # get action probabilities
            kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
            bias_initializer=tf.constant_initializer(1.),  # biases
            name='sigma'
        )
        
        global_step = tf.Variable(0, trainable=False)
        # self.e = epsilon = tf.train.exponential_decay(2., global_step, 1000, 0.9)
        self.mu, self.sigma = tf.squeeze(mu*2), tf.squeeze(sigma+0.1)
        self.normal_dist = tf.distributions.Normal(self.mu, self.sigma)
        
        # key: sample from the normal districution (mu,sigma) as the action to be taken
        self.action = tf.clip_by_value(self.normal_dist.sample(1), action_bound[0], action_bound[1]) # ensure no action value is invalid

        with tf.name_scope('exp_v'):
            log_prob = self.normal_dist.log_prob(self.a)  # loss without advantage
            self.exp_v = log_prob * self.td_error  # advantage (TD_error) guided loss
            # Add cross entropy cost to encourage exploration
            self.exp_v += 0.01*self.normal_dist.entropy()

        with tf.name_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v, global_step)    # min(v) = max(-v)

    def learn(self, s, a, td):
        s = s[np.newaxis, :]
        feed_dict = {self.s: s, self.a: a, self.td_error: td}
        _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict)
        return exp_v

    def choose_action(self, s):
        s = s[np.newaxis, :]
        return self.sess.run(self.action, {self.s: s})  # get probabilities for all actions


class Critic(object):
    def __init__(self, sess, n_features, lr=0.01):
        self.sess = sess
        with tf.name_scope('inputs'):
            self.s = tf.placeholder(tf.float32, [1, n_features], "state")
            self.v_ = tf.placeholder(tf.float32, [1, 1], name="v_next")
            self.r = tf.placeholder(tf.float32, name='r')

        with tf.variable_scope('Critic'):
            l1 = tf.layers.dense(
                inputs=self.s,
                units=30,  # number of hidden units
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='l1'
            )

            self.v = tf.layers.dense(
                inputs=l1,
                units=1,  # output units
                activation=None,
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='V'
            )

        with tf.variable_scope('squared_TD_error'):
            self.td_error = tf.reduce_mean(self.r + GAMMA * self.v_ - self.v)
            self.loss = tf.square(self.td_error)    # TD_error = (r+gamma*V_next) - V_eval
        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)

    def learn(self, s, r, s_):
        s, s_ = s[np.newaxis, :], s_[np.newaxis, :]

        v_ = self.sess.run(self.v, {self.s: s_})
        td_error, _ = self.sess.run([self.td_error, self.train_op],
                                          {self.s: s, self.v_: v_, self.r: r})
        return td_error


OUTPUT_GRAPH = False
MAX_EPISODE = 1000
MAX_EP_STEPS = 200
DISPLAY_REWARD_THRESHOLD = 100  # renders environment if total episode reward is greater then this threshold
RENDER = False  # rendering wastes time
GAMMA = 0.9
LR_A = 0.001    # learning rate for actor
LR_C = 0.01     # learning rate for critic

env = gym.make('Pendulum-v0')
env.seed(1)  # reproducible
env = env.unwrapped

N_S = env.observation_space.shape[0]
A_BOUND = env.action_space.high

sess = tf.Session()

actor = Actor(sess, n_features=N_S, lr=LR_A, action_bound=[-A_BOUND, A_BOUND])
critic = Critic(sess, n_features=N_S, lr=LR_C)

sess.run(tf.global_variables_initializer())

if OUTPUT_GRAPH:
    tf.summary.FileWriter("logs/", sess.graph)

for i_episode in range(MAX_EPISODE):
    s = env.reset()
    t = 0
    ep_rs = []
    while True:
        if RENDER:
            env.render()
        a = actor.choose_action(s)

        s_, r, done, info = env.step(a)
        r /= 10

        td_error = critic.learn(s, r, s_)  # gradient = grad[r + gamma * V(s_) - V(s)]
        actor.learn(s, a, td_error)  # true_gradient = grad[logPi(s,a) * td_error]

        s = s_
        t += 1
        ep_rs.append(r)
        if t > MAX_EP_STEPS:
            ep_rs_sum = sum(ep_rs)
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.9 + ep_rs_sum * 0.1
            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
            print("episode:", i_episode, "  reward:", int(running_reward))
            break

episode: 0   reward: -105
episode: 1   reward: -110
episode: 2   reward: -114
episode: 3   reward: -120
episode: 4   reward: -122
episode: 5   reward: -128
episode: 6   reward: -131
episode: 7   reward: -135
episode: 8   reward: -135
episode: 9   reward: -138
episode: 10   reward: -139
episode: 11   reward: -142
episode: 12   reward: -145
episode: 13   reward: -146
episode: 14   reward: -146
episode: 15   reward: -148
episode: 16   reward: -149
episode: 17   reward: -148
episode: 18   reward: -147
episode: 19   reward: -147
episode: 20   reward: -148
episode: 21   reward: -147
episode: 22   reward: -143
episode: 23   reward: -144
episode: 24   reward: -144
episode: 25   reward: -144
episode: 26   reward: -143
episode: 27   reward: -143
episode: 28   reward: -141
episode: 29   reward: -139
episode: 30   reward: -135
episode: 31   reward: -135
episode: 32   reward: -132
episode: 33   reward: -132
episode: 34   reward: -131
episode: 35   reward: -133
episode: 36   reward: -130
episode: 37

episode: 303   reward: -116
episode: 304   reward: -119
episode: 305   reward: -118
episode: 306   reward: -114
episode: 307   reward: -112
episode: 308   reward: -112
episode: 309   reward: -107
episode: 310   reward: -107
episode: 311   reward: -100
episode: 312   reward: -102
episode: 313   reward: -102
episode: 314   reward: -98
episode: 315   reward: -100
episode: 316   reward: -94
episode: 317   reward: -90
episode: 318   reward: -89
episode: 319   reward: -81
episode: 320   reward: -74
episode: 321   reward: -74
episode: 322   reward: -72
episode: 323   reward: -74
episode: 324   reward: -77
episode: 325   reward: -78
episode: 326   reward: -73
episode: 327   reward: -66
episode: 328   reward: -60
episode: 329   reward: -56
episode: 330   reward: -59
episode: 331   reward: -61
episode: 332   reward: -62
episode: 333   reward: -60
episode: 334   reward: -63
episode: 335   reward: -70
episode: 336   reward: -68
episode: 337   reward: -70
episode: 338   reward: -77
episode: 339   r

KeyboardInterrupt: 

In [10]:
"""
Actor-Critic with continuous action using TD-error as the Advantage, Reinforcement Learning.
The Pendulum example (based on https://github.com/dennybritz/reinforcement-learning/blob/master/PolicyGradient/Continuous%20MountainCar%20Actor%20Critic%20Solution.ipynb)
Cannot converge!!! oscillate!!!
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
tensorflow r1.3
gym 0.8.0
"""

import tensorflow as tf
import numpy as np
import gym

np.random.seed(2)
tf.set_random_seed(2)  # reproducible

tf.reset_default_graph() # reset

class Actor(object):
    def __init__(self, sess, n_features, action_bound, lr=0.0001):
        self.sess = sess

        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.a = tf.placeholder(tf.float32, None, name="act")
        self.td_error = tf.placeholder(tf.float32, None, name="td_error")  # TD_error

        l1 = tf.layers.dense(
            inputs=self.s,
            units=30,  # number of hidden units
            activation=tf.nn.relu,
            kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
            bias_initializer=tf.constant_initializer(0.1),  # biases
            name='l1'
        )
        
        # average value for action
        mu = tf.layers.dense(
            inputs=l1,
            units=N_A,  # number of hidden units
            activation=tf.nn.tanh, # use tanh to scale output to [-1,1]
            kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
            bias_initializer=tf.constant_initializer(0.1),  # biases
            name='mu'
        )
        
        # deviation value for action
        sigma = tf.layers.dense(
            inputs=l1,
            units=N_A,  # output units
            activation=tf.nn.softplus,  # get action probabilities
            kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
            bias_initializer=tf.constant_initializer(1.),  # biases
            name='sigma'
        )
        
        global_step = tf.Variable(0, trainable=False)
        # self.e = epsilon = tf.train.exponential_decay(2., global_step, 1000, 0.9)
        self.mu, self.sigma = tf.squeeze(mu*2), tf.squeeze(sigma+0.1)
        self.normal_dist = tf.distributions.Normal(self.mu, self.sigma)
        
        # key: sample from the normal districution (mu,sigma) as the action to be taken
        self.action = tf.clip_by_value(self.normal_dist.sample(1), np.reshape(action_bound[0],(1,4)), np.reshape(action_bound[1],(1,4))) # ensure no action value is invalid

        with tf.name_scope('exp_v'):
            log_prob = self.normal_dist.log_prob(self.a)  # loss without advantage
            self.exp_v = log_prob * self.td_error  # advantage (TD_error) guided loss
            # Add cross entropy cost to encourage exploration
            self.exp_v += 0.01*self.normal_dist.entropy()

        with tf.name_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v, global_step)    # min(v) = max(-v)

    def learn(self, s, a, td):
        s = s[np.newaxis, :]
        feed_dict = {self.s: s, self.a: a, self.td_error: td}
        _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict)
        return exp_v

    def choose_action(self, s):
        s = s[np.newaxis, :]
        return self.sess.run(self.action, {self.s: s})  # get probabilities for all actions


class Critic(object):
    def __init__(self, sess, n_features, lr=0.01):
        self.sess = sess
        with tf.name_scope('inputs'):
            self.s = tf.placeholder(tf.float32, [1, n_features], "state")
            self.v_ = tf.placeholder(tf.float32, [1, 1], name="v_next")
            self.r = tf.placeholder(tf.float32, name='r')

        with tf.variable_scope('Critic'):
            l1 = tf.layers.dense(
                inputs=self.s,
                units=30,  # number of hidden units
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='l1'
            )

            self.v = tf.layers.dense(
                inputs=l1,
                units=1,  # output units
                activation=None,
                kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='V'
            )

        with tf.variable_scope('squared_TD_error'):
            self.td_error = tf.reduce_mean(self.r + GAMMA * self.v_ - self.v)
            self.loss = tf.square(self.td_error)    # TD_error = (r+gamma*V_next) - V_eval
        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)

    def learn(self, s, r, s_):
        s, s_ = s[np.newaxis, :], s_[np.newaxis, :]

        v_ = self.sess.run(self.v, {self.s: s_})
        td_error, _ = self.sess.run([self.td_error, self.train_op],
                                          {self.s: s, self.v_: v_, self.r: r})
        return td_error


OUTPUT_GRAPH = False
MAX_EPISODE = 1000
MAX_EP_STEPS = 200
DISPLAY_REWARD_THRESHOLD = 100  # renders environment if total episode reward is greater then this threshold
RENDER = False  # rendering wastes time
GAMMA = 0.9
LR_A = 0.001    # learning rate for actor
LR_C = 0.01     # learning rate for critic

env = gym.make('BipedalWalker-v2')
env.seed(1)  # reproducible
env = env.unwrapped

N_S = env.observation_space.shape[0]
N_A=env.action_space.shape[0]
A_BOUND = env.action_space.high

sess = tf.Session()

actor = Actor(sess, n_features=N_S, lr=LR_A, action_bound=[-A_BOUND, A_BOUND])
critic = Critic(sess, n_features=N_S, lr=LR_C)

sess.run(tf.global_variables_initializer())

if OUTPUT_GRAPH:
    tf.summary.FileWriter("logs/", sess.graph)

for i_episode in range(MAX_EPISODE):
    s = env.reset()
    t = 0
    ep_rs = []
    while True:
        if RENDER:
            env.render()
        a = actor.choose_action(s)
        a=a[0]
        
        s_, r, done, info = env.step(a)
        r /= 10

        td_error = critic.learn(s, r, s_)  # gradient = grad[r + gamma * V(s_) - V(s)]
        actor.learn(s, a, td_error)  # true_gradient = grad[logPi(s,a) * td_error]

        s = s_
        t += 1
        ep_rs.append(r)
        if t > MAX_EP_STEPS:
            ep_rs_sum = sum(ep_rs)
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.9 + ep_rs_sum * 0.1
            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
            print("episode:", i_episode, "  reward:", int(running_reward))
            break

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
episode: 0   reward: -2388
episode: 1   reward: -4088
episode: 2   reward: -5615
episode: 3   reward: -7007
episode: 4   reward: -8263
episode: 5   reward: -7945
episode: 6   reward: -9108
episode: 7   reward: -10111
episode: 8   reward: -9102
episode: 9   reward: -10125
episode: 10   reward: -9115
episode: 11   reward: -8206
episode: 12   reward: -9339
episode: 13   reward: -10362
episode: 14   reward: -9328
episode: 15   reward: -9442
episode: 16   reward: -10420
episode: 17   reward: -11325
episode: 18   reward: -12144
episode: 19   reward: -12884
episode: 20   reward: -13537
episode: 21   reward: -13160
episode: 22   reward: -13787
episode: 23   reward: -14355
episode: 24   reward: -14867
episode: 25   reward: -15299
episode: 26   reward: -15674
episode: 27   reward: -1

KeyboardInterrupt: 

In [25]:
a=[[1,2,3],[2,3,4],[3,4,5]]
a=np.array(a)
print(a[:,1])

[2 3 4]


In [2]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np
import gym
import matplotlib.pyplot as plt
%matplotlib inline

try:
    xrange = xrange
except:
    xrange = range

In [3]:
env = gym.make('CartPole-v0')

In [4]:
gamma = 0.99

def discount_rewards(r):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(xrange(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r

In [5]:
class agent():
    def __init__(self, lr, s_size,a_size,h_size):
        #These lines established the feed-forward part of the network. The agent takes a state and produces an action.
        self.state_in= tf.placeholder(shape=[None,s_size],dtype=tf.float32)
        hidden = slim.fully_connected(self.state_in,h_size,biases_initializer=None,activation_fn=tf.nn.relu)
        self.output = slim.fully_connected(hidden,a_size,activation_fn=tf.nn.softmax,biases_initializer=None)
        self.chosen_action = tf.argmax(self.output,1)

        #The next six lines establish the training proceedure. We feed the reward and chosen action into the network
        #to compute the loss, and use it to update the network.
        self.reward_holder = tf.placeholder(shape=[None],dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[None],dtype=tf.int32)
        
        self.indexes = tf.range(0, tf.shape(self.output)[0]) * tf.shape(self.output)[1] + self.action_holder
        self.responsible_outputs = tf.gather(tf.reshape(self.output, [-1]), self.indexes)

        self.loss = -tf.reduce_mean(tf.log(self.responsible_outputs)*self.reward_holder)
        
        tvars = tf.trainable_variables()
        self.gradient_holders = []
        for idx,var in enumerate(tvars):
            placeholder = tf.placeholder(tf.float32,name=str(idx)+'_holder')
            self.gradient_holders.append(placeholder)
        
        self.gradients = tf.gradients(self.loss,tvars)
        
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update_batch = optimizer.apply_gradients(zip(self.gradient_holders,tvars))

In [28]:
tf.reset_default_graph() #Clear the Tensorflow graph.

myAgent = agent(lr=1e-2,s_size=4,a_size=2,h_size=8) #Load the agent.

total_episodes = 5000 #Set total number of episodes to train agent on.
max_ep = 999
update_frequency = 5

init = tf.global_variables_initializer()

# Launch the tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    i = 0
    total_reward = []
    total_lenght = []
        
    gradBuffer = sess.run(tf.trainable_variables())
    for ix,grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0
        
    while i < total_episodes:
        s = env.reset()
        running_reward = 0
        ep_history = []
        for j in range(max_ep):
            #Probabilistically pick an action given our network outputs.
            a_dist = sess.run(myAgent.output,feed_dict={myAgent.state_in:[s]})
            
            
            a = np.random.choice(a_dist[0],p=a_dist[0])

            a = np.argmax(a_dist == a)

            s1,r,d,_ = env.step(a) #Get our reward for taking an action given a bandit.
            ep_history.append([s,a,r,s1])
            s = s1
            running_reward += r
            if d == True:
                #Update the network.

                ep_history = np.array(ep_history)
                ep_history[:,2] = discount_rewards(ep_history[:,2])
                
                    
                feed_dict={myAgent.reward_holder:ep_history[:,2],
                        myAgent.action_holder:ep_history[:,1],myAgent.state_in:np.vstack(ep_history[:,0])}
                

#                 print('tf.range(0, tf.shape(self.output)[0]) * tf.shape(self.output)[1]:',(tf.range(0, tf.shape(myAgent.output)[0]) * tf.shape(myAgent.output)[1]).eval(feed_dict=feed_dict))
#                 print('output.shape:',tf.shape(myAgent.output).eval(feed_dict=feed_dict))
#                 print('action_holder:',ep_history[:,1])
#                 eles=sess.run([myAgent.indexes,myAgent.responsible_outputs,myAgent.loss],feed_dict=feed_dict)
#                 print('indexes:',eles[0])
#                 print('reshape(output):',tf.reshape(myAgent.output, [-1]).eval(feed_dict=feed_dict))
#                 print('responsible_outputs:',eles[1])
#                 print('loss:',eles[2])
#                 print('reward_holder:',ep_history[:,2])
                
                grads = sess.run(myAgent.gradients, feed_dict=feed_dict)
        
                for idx,grad in enumerate(grads):
                    gradBuffer[idx] += grad

                if i % update_frequency == 0 and i != 0:
                    feed_dict= dictionary = dict(zip(myAgent.gradient_holders, gradBuffer))
                    _ = sess.run(myAgent.update_batch, feed_dict=feed_dict)
                    for ix,grad in enumerate(gradBuffer):
                        gradBuffer[ix] = grad * 0
                
                total_reward.append(running_reward)
                total_lenght.append(j)
                break

        
            #Update our running tally of scores.
        if i % 100 == 0:
            print(np.mean(total_reward[-100:]))
        i += 1

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


[array([[  1.47445947e-02,  -4.52420302e-03,   1.28968270e-03,
         -4.42169607e-03,  -1.67561751e-02,   2.91425060e-03,
         -1.94829277e-04,  -1.38024706e-03],
       [  5.01107387e-02,  -2.87446320e-01,   2.72752834e-03,
          2.07848474e-01,  -1.00883707e-01,   1.85157761e-01,
          9.15826485e-03,  -8.31003767e-03],
       [ -7.28943944e-03,   2.20602304e-02,  -6.00731524e-04,
         -1.13972705e-02,   6.99976319e-03,  -1.42100379e-02,
         -5.02189621e-04,   5.76587569e-04],
       [ -8.23226497e-02,   4.17846978e-01,  -4.49230010e-03,
         -2.94986606e-01,   1.53697044e-01,  -2.69154996e-01,
         -1.29977651e-02,   1.26604009e-02]], dtype=float32), array([[-0.05834411,  0.05834411],
       [-0.17819208,  0.17819202],
       [-0.0459293 ,  0.04592931],
       [-0.09173708,  0.09173704],
       [-0.01741329,  0.0174133 ],
       [-0.37915605,  0.37915593],
       [-0.09761829,  0.09761826],
       [-0.07454276,  0.07454277]], dtype=float32)]
15.0
[arr

[array([[ 0.03512461,  0.07239474,  0.0010833 , -0.04675518, -0.05658679,
        -0.03076056,  0.00094203, -0.00833462],
       [ 0.129859  , -0.11247241,  0.00400507,  0.07263885, -0.2965177 ,
         0.06178579, -0.00146354, -0.03081385],
       [-0.06253977, -0.12197431, -0.00192883,  0.07877547,  0.09852508,
         0.0497652 , -0.00158717,  0.01483988],
       [-0.24783522, -0.05423759, -0.00764365,  0.03502857,  0.51698542,
         0.01231128, -0.00070575,  0.05880807]], dtype=float32), array([[-0.14623745,  0.14623745],
       [-0.03937835,  0.03937832],
       [-0.15786871,  0.1578687 ],
       [-0.01037876,  0.01037876],
       [-0.05048144,  0.05048144],
       [ 0.00263484, -0.0026349 ],
       [-0.09595071,  0.09595069],
       [-0.23470207,  0.23470207]], dtype=float32)]
[array([[ -4.19094674e-02,   2.23931715e-01,  -5.61202352e-04,
         -1.39108151e-01,   7.01168925e-02,  -1.40848875e-01,
          3.14829755e-03,   6.79315766e-03],
       [  2.97372401e-01,   1.1

KeyboardInterrupt: 

In [17]:
tf.range(0,8)

<tf.Tensor 'range_1:0' shape=(8,) dtype=int32>