In [1]:
import tensorflow as tf
import numpy as np
import gym
from collections import deque
import random

In [2]:
# Environment under study.
env_name = 'CartPole-v0'
env = gym.make(env_name)

# Session shared by the cells below, plus the optimizer that the
# policy-gradient cells use via `optimizer.compute_gradients(...)`.
# `writer` is not referenced again in the visible cells.
sess = tf.Session()
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.9)
writer = tf.train.SummaryWriter("/tmp/{}-experiment-10".format(env_name))

# Problem dimensions read from the environment, and loss hyper-parameters.
state_dim   = env.observation_space.shape[0]
num_actions = env.action_space.n
discount_factor = 1  # used both for return-to-go and for the bootstrapped target
tf_reg_param=0.001   # weight of the L2 term in the policy-gradient loss

# Separate optimizer for the temporal-difference (dqn_*) cells.
# `dqn_summary_writer` is not referenced again in the visible cells.
dqn_optimizer      = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.9)
dqn_summary_writer = tf.train.SummaryWriter("/tmp/{}-experiment-1".format(env_name))

# Rollout sizes. NOTE(review): NUM_ITR is defined here but the rollout loop
# below iterates over xrange(1), not xrange(NUM_ITR) - confirm which is intended.
NUM_ITR = 1000
BATCH_SIZE = 1
MAX_STEPS    = 200

# Roll out BATCH_SIZE episodes with uniformly random actions and keep, for
# every step, the transition and the return-to-go from that step onwards.
episode_history = deque(maxlen=100)
for i_itr in xrange(1):
  episodes = []
  for i_batch in xrange(BATCH_SIZE):
    # fresh episode
    state = env.reset()
    total_rewards = 0
    states, actions, rewards, next_states = [], [], [], []
    for t in xrange(MAX_STEPS):
      env.render()
      action = env.action_space.sample()
      next_state, reward, done, _ = env.step(action)
      # reshape the reward: large penalty on the terminating step,
      # small bonus on every other step
      if done:
        reward = -10
      else:
        reward = 0.1
      # record the transition
      states.append(state)
      actions.append(action)
      rewards.append(reward)
      next_states.append(next_state)
      total_rewards += reward

      state = next_state
      if done:
        break

    # accumulate returns from the last step backwards, then flip the list
    # so that returns[i] lines up with states[i]
    returns = []
    return_so_far = 0
    for reward in reversed(rewards):
      return_so_far = reward + discount_factor * return_so_far
      returns.append(return_so_far)
    returns.reverse()

    episodes.append(dict(
      states=states,
      actions=actions,
      rewards=rewards,
      returns=returns,
      next_states=next_states,
    ))

    
# Flatten the per-episode lists into batch arrays with one row per time step.
states = np.concatenate([p['states'] for p in episodes])
next_states = np.concatenate([p['next_states'] for p in episodes])
actions = np.concatenate([p['actions'] for p in episodes])
# `rewards` is flattened like the other fields so the training feed does not
# depend on the last episode's list leaking out of the rollout loop.
rewards = np.concatenate([p['rewards'] for p in episodes])
returns = np.concatenate([p['returns'] for p in episodes])

# One-hot encoding of the action taken at each step; the width follows the
# environment's action count instead of a hard-coded 2.
action_mask = np.zeros((actions.shape[0], num_actions))
action_mask[np.arange(actions.shape[0]), actions] = 1

# 1 where the next state may be bootstrapped from, 0 on the final transition
# of EVERY episode (not only the last row of the batch).
episode_ends = np.cumsum([len(p['states']) for p in episodes]) - 1
next_state_mask = np.ones(states.shape[0])
next_state_mask[episode_ends] = 0

[2016-09-06 14:31:05,227] Making new env: CartPole-v0


In [3]:
class ReplayBuffer(object):
  """Fixed-capacity FIFO store of (state, action, reward, next_state, done) tuples.

  Once `buffer_size` experiences are held, adding a new one evicts the oldest.
  """

  def __init__(self, buffer_size):
    """Create an empty buffer that keeps at most `buffer_size` experiences."""
    self.buffer_size = buffer_size
    self.num_experiences = 0
    # a bounded deque drops the oldest entry automatically when full
    self.buffer = deque(maxlen=buffer_size)

  def getBatch(self, batch_size):
    """Return a list of distinct experiences drawn uniformly at random.

    At most `batch_size` experiences are returned; when fewer are stored,
    all of them are returned (in random order) instead of raising.
    """
    sample_size = min(batch_size, len(self.buffer))
    return random.sample(self.buffer, sample_size)

  def size(self):
    """Return the capacity of the buffer (not the number of stored items)."""
    return self.buffer_size

  def add(self, state, action, reward, next_state, done):
    """Append one transition, evicting the oldest one if the buffer is full."""
    self.buffer.append((state, action, reward, next_state, done))
    self.num_experiences = len(self.buffer)

  def count(self):
    """Return the number of experiences currently stored."""
    return self.num_experiences

  def erase(self):
    """Drop every stored experience."""
    self.buffer = deque(maxlen=self.buffer_size)
    self.num_experiences = 0


In [4]:
def storeExperience(state, action, reward, next_state, done):
  """Record a transition in `dqn_replay_buffer`, sub-sampling non-terminal steps.

  Every `dqn_store_replay_every`-th call is stored; transitions with
  `done` set are always stored. The call counter advances on every call.
  """
  # the counter lives at notebook level and is rebound here
  global dqn_store_experience_cnt
  # always store end states
  if dqn_store_experience_cnt % dqn_store_replay_every == 0 or done:
    dqn_replay_buffer.add(state, action, reward, next_state, done)
  # count every call, stored or not; otherwise the modulo test can never
  # succeed again after the first stored step
  dqn_store_experience_cnt += 1

In [5]:
def observation_to_action(states, num_hidden=20):
  """Build a one-hidden-layer network mapping states to one score per action.

  Variables are created with `tf.get_variable`, so they land in whatever
  variable scope is active at call time.

  Args:
    states: tensor multiplied against a [state_dim, num_hidden] weight matrix.
    num_hidden: width of the ReLU hidden layer (defaults to 20).

  Returns:
    The output of the final linear layer, `num_actions` values per input row.
  """
  # hidden layer: ReLU(states . W1 + b1)
  W1 = tf.get_variable("W1", [state_dim, num_hidden],
                       initializer=tf.random_normal_initializer())
  b1 = tf.get_variable("b1", [num_hidden],
                       initializer=tf.constant_initializer(0))
  h1 = tf.nn.relu(tf.matmul(states, W1) + b1)
  # linear output layer: one value per action
  W2 = tf.get_variable("W2", [num_hidden, num_actions],
                       initializer=tf.random_normal_initializer())
  b2 = tf.get_variable("b2", [num_actions],
                       initializer=tf.constant_initializer(0))
  q = tf.matmul(h1, W2) + b2
  return q

In [6]:
def policy_network(states, num_hidden=20):
  """Build a one-hidden-layer policy network returning unnormalised action scores.

  Variables are created with `tf.get_variable`, so they land in whatever
  variable scope is active at call time (and can be shared with `reuse=True`).

  Args:
    states: tensor multiplied against a [state_dim, num_hidden] weight matrix.
    num_hidden: width of the tanh hidden layer (defaults to 20).

  Returns:
    The output of the final linear layer, `num_actions` values per input row.
    No softmax is applied here.
  """
  # hidden layer: tanh(states . W1 + b1)
  W1 = tf.get_variable("W1", [state_dim, num_hidden],
                       initializer=tf.random_normal_initializer())
  b1 = tf.get_variable("b1", [num_hidden],
                       initializer=tf.constant_initializer(0))
  h1 = tf.nn.tanh(tf.matmul(states, W1) + b1)
  # linear output layer, initialised with a smaller spread than the hidden layer
  W2 = tf.get_variable("W2", [num_hidden, num_actions],
                       initializer=tf.random_normal_initializer(stddev=0.1))
  b2 = tf.get_variable("b2", [num_actions],
                       initializer=tf.constant_initializer(0))
  p = tf.matmul(h1, W2) + b2
  return p

In [7]:
# dqn model components
dqn_q_network     = observation_to_action   # builder for the Q / target networks
dqn_target_policy = policy_network          # builder for the policy network
dqn_replay_buffer = ReplayBuffer(buffer_size=10000)
dqn_batch_size = 32

# training parameters
dqn_max_gradient = 5.     # per-gradient norm clip threshold
dqn_reg_param    = 0.01   # weight of the L2 penalty on the Q-network

# counters (each initialised exactly once)
dqn_store_replay_every   = 5   # store every n-th non-terminal transition
dqn_store_experience_cnt = 0
dqn_train_iteration      = 0


In [8]:
# Q-network forward pass on the current states.
# The action-selection rule a* = argmax_a Q(s_t, a) is NOT built here; this
# cell only exposes the per-action scores as `dqn_q_outputs`.
with tf.name_scope("dqn_predict_actions"):
  # raw state representation, one row of `state_dim` features per sample
  dqn_states = tf.placeholder(tf.float32, (None, state_dim), name="states")
  # create the Q-network variables under the "dqn_q_network" variable scope
  with tf.variable_scope("dqn_q_network"):
    dqn_q_outputs = dqn_q_network(dqn_states)
  # (disabled) named alias for the Q-network outputs:
  # dqn_action_scores = tf.identity(dqn_q_outputs, name="action_scores")

In [9]:
# Policy-network forward pass on the same `dqn_states` placeholder that feeds
# the Q-network.
with tf.name_scope("policy_predict_actions"):
  # create the policy variables under the "policy_network" variable scope
  with tf.variable_scope("policy_network"):
    dqn_policy_outputs = policy_network(dqn_states)
  # (disabled) named alias for the policy outputs:
  # dqn_action_scores = tf.identity(dqn_policy_outputs, name="action_scores")

In [10]:
# estimate rewards using the next state: r(s_t,a_t) + expectation_a Q(s_{t+1}, a)
with tf.name_scope("estimate_future_rewards"):
  dqn_next_states = tf.placeholder(tf.float32, (None, state_dim), name="next_states")
  # 1 for transitions whose next state may be bootstrapped from, 0 for terminal ones
  dqn_next_state_mask = tf.placeholder(tf.float32, (None,), name="next_state_masks")
  # separate copy of the Q-network, with its own variables, evaluated on s_{t+1}
  with tf.variable_scope("target_networks"):
    dqn_target_outputs = dqn_q_network(dqn_next_states)
  # compute future rewards
  # Reduce over the action axis WITHOUT keep_dims so the result is one value
  # per sample; a trailing axis of size 1 would broadcast against the 1-D
  # `dqn_rewards` below and yield a batch x batch matrix.
  # NOTE(review): `dqn_policy_outputs` are the policy network's raw outputs
  # on the CURRENT states, not action probabilities at the next states, so
  # this is not yet the expectation named in the header comment - confirm
  # the intended weighting.
  next_state_rewards = tf.reduce_sum(dqn_policy_outputs * dqn_target_outputs,
                                     reduction_indices=1)
  # no gradient flows into the target; terminal transitions contribute nothing
  next_state_rewards = tf.stop_gradient(next_state_rewards) * dqn_next_state_mask
  dqn_rewards = tf.placeholder(tf.float32, (None,), name="rewards")
  dqn_future_rewards = dqn_rewards + discount_factor * next_state_rewards


In [11]:
for x in tf.get_collection("variables"):
    print x.name

dqn_q_network/W1:0
dqn_q_network/b1:0
dqn_q_network/W2:0
dqn_q_network/b2:0
policy_network/W1:0
policy_network/b1:0
policy_network/W2:0
policy_network/b2:0
target_networks/W1:0
target_networks/b1:0
target_networks/W2:0
target_networks/b2:0


In [12]:
# compute loss and gradients
with tf.name_scope("compute_temporal_differences"):
  # compute temporal difference loss
  dqn_action_mask = tf.placeholder(tf.float32, (None, num_actions), name="action_mask")
  # the mask zeroes every score except the one of the action actually taken
  dqn_masked_action_scores = tf.reduce_sum(dqn_q_outputs * dqn_action_mask, reduction_indices=[1,])
  dqn_temp_diff = dqn_masked_action_scores - dqn_future_rewards
  dqn_td_loss = tf.reduce_mean(tf.square(dqn_temp_diff))
  # regularization loss
  # The scope filter is matched against the START of each variable name, and
  # the Q-network variables are named "dqn_q_network/...", so the scope must
  # be spelled in full; "q_network" selects nothing and the penalty is zero.
  q_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="dqn_q_network")
  dqn_reg_loss = dqn_reg_param * tf.reduce_sum([tf.reduce_sum(tf.square(x)) for x in q_network_variables])
  # compute total loss and gradients
  dqn_loss = dqn_td_loss + dqn_reg_loss
  gradients = dqn_optimizer.compute_gradients(dqn_loss)
  # clip gradients by norm; variables the loss does not depend on have a
  # None gradient and are left untouched
  for i, (grad, var) in enumerate(gradients):
    if grad is not None:
      gradients[i] = (tf.clip_by_norm(grad, dqn_max_gradient), var)
  # add histograms for gradients.
#   for grad, var in gradients:
#     tf.histogram_summary(var.name, var)
#     if grad is not None:
#       tf.histogram_summary(var.name + '/gradients', grad)
  dqn_train_op = dqn_optimizer.apply_gradients(gradients)


In [13]:
for x in gradients:
    try:
        print x[0], x[1].name
    except:
        print x[1].name

Tensor("compute_temporal_differences/clip_by_norm:0", shape=(4, 20), dtype=float32) dqn_q_network/W1:0
Tensor("compute_temporal_differences/clip_by_norm_1:0", shape=(20,), dtype=float32) dqn_q_network/b1:0
Tensor("compute_temporal_differences/clip_by_norm_2:0", shape=(20, 2), dtype=float32) dqn_q_network/W2:0
Tensor("compute_temporal_differences/clip_by_norm_3:0", shape=(2,), dtype=float32) dqn_q_network/b2:0
None policy_network/W1:0
None policy_network/b1:0
None policy_network/W2:0
None policy_network/b2:0
None target_networks/W1:0
None target_networks/b1:0
None target_networks/W2:0
None target_networks/b2:0


In [15]:
# Build the op that initialises every variable created so far, and run it in
# the shared session before any network is evaluated.
init_all = tf.initialize_all_variables()
sess.run(init_all)

In [17]:
# one gradient step of the temporal-difference objective on the rollout batch
sess.run(dqn_train_op,
         feed_dict={dqn_states: states,
                    dqn_next_states: next_states,
                    dqn_next_state_mask: next_state_mask,
                    dqn_action_mask: action_mask,
                    dqn_rewards: rewards})

In [None]:
# Display the most recent training loss.
# NOTE(review): no cell above assigns `cost`; it is only bound by the
# replay-training cell that follows, so this raises NameError until that
# cell has performed an update.
cost

In [None]:
# One training update from replayed experience, written against the
# notebook-level names (there is no `self` outside of a class).
# Nothing happens until the buffer holds at least one full batch.
if dqn_replay_buffer.count() >= dqn_batch_size:
  batch           = dqn_replay_buffer.getBatch(dqn_batch_size)
  states          = np.zeros((dqn_batch_size, state_dim))
  rewards         = np.zeros((dqn_batch_size,))
  action_mask     = np.zeros((dqn_batch_size, num_actions))
  next_states     = np.zeros((dqn_batch_size, state_dim))
  next_state_mask = np.zeros((dqn_batch_size,))

  for k, (s0, a, r, s1, done) in enumerate(batch):
    states[k] = s0
    rewards[k] = r
    action_mask[k][a] = 1
    # terminal transitions keep an all-zero next state and a zero mask
    if not done:
      next_states[k] = s1
      next_state_mask[k] = 1

  # perform one update of training
  # (no summary op exists for the dqn_* graph in this notebook, so only the
  # loss and the train op are fetched)
  cost, _ = sess.run([
    dqn_loss,
    dqn_train_op
  ], {
    dqn_states:          states,
    dqn_next_states:     next_states,
    dqn_next_state_mask: next_state_mask,
    dqn_action_mask:     action_mask,
    dqn_rewards:         rewards
  })


In [None]:
from collections import deque

In [None]:
# Rebind `sess` to a brand-new session. Variables have to be initialised
# again in this session (see the following cells) before they can be used.
sess = tf.Session()

In [None]:
# Op that initialises every variable currently defined in the graph.
init_var = tf.initialize_all_variables()

In [None]:
# Run the initialiser in the current session.
sess.run(init_var)

In [None]:
# Evaluate the Q-network's per-action scores for the collected states.
sess.run(dqn_q_outputs, 
         feed_dict={dqn_states : states})

# TensorFlow

In [None]:
# Input placeholder for the policy-gradient graph built in the cells below.
with tf.name_scope("model_inputs"):
  # raw state representation, one row of `state_dim` features per sample
  tf_states = tf.placeholder(tf.float32, (None, state_dim), name="states")


In [None]:
# Policy-network forward pass on `tf_states`.
# NOTE(review): this opens variable scope "policy_network" without
# reuse=True. If the earlier cell that builds `dqn_policy_outputs` under the
# same scope has already run in this graph, tf.get_variable is expected to
# reject re-creating W1/b1/W2/b2 - confirm whether the two sections are meant
# to share a graph.
with tf.name_scope("predict_actions"):
  # create the policy network variables
  with tf.variable_scope("policy_network"):
    tf_policy_outputs = policy_network(tf_states)

  # named alias of the raw policy outputs
  tf_action_scores = tf.identity(tf_policy_outputs, name="action_scores")

In [None]:
# Trainable variables under the "policy_network" scope; the policy-gradient
# cell below sums their squares to form the regularisation loss.
policy_network_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="policy_network")

In [None]:
# compute loss and gradients
with tf.name_scope("compute_pg_gradients"):
  # actions taken during the rollout and the return that followed each of them
  tf_taken_actions = tf.placeholder(tf.int32, (None), name="taken_actions")
  tf_discounted_rewards = tf.placeholder(tf.float32, (None), name="discounted_rewards")

  # second forward pass that shares (reuse=True) the existing policy variables
  with tf.variable_scope("policy_network", reuse=True):
    tf_logprobs = policy_network(tf_states)

  # compute policy loss and regularization loss
  # cross-entropy of the taken action, weighted per step by its return
  tf_likelihood_loss = (tf.nn.sparse_softmax_cross_entropy_with_logits(tf_logprobs, tf_taken_actions)
                         * tf_discounted_rewards)
  tf_pg_loss            = tf.reduce_mean(tf_likelihood_loss)
  tf_reg_loss           = tf.reduce_sum([tf.reduce_sum(tf.square(x)) for x in policy_network_variables])
  tf_loss               = tf_pg_loss + tf_reg_param * tf_reg_loss

  # compute gradients
  # Restricted to the policy variables: the loss does not depend on any
  # other network in the graph, and asking for those gradients only yields
  # (None, var) pairs that cannot be fetched with sess.run.
  tf_gradients = optimizer.compute_gradients(tf_loss, var_list=policy_network_variables)

  # histogram summaries of each policy variable and its gradient
  for grad, var in tf_gradients:
    tf.histogram_summary(var.name, var)
    if grad is not None:
      tf.histogram_summary(var.name + '/gradients', grad)

In [None]:
# emit scalar summaries for the three loss terms of the policy-gradient graph
tf.scalar_summary("policy_loss", tf_pg_loss)
tf.scalar_summary("reg_loss", tf_reg_loss)
tf.scalar_summary("total_loss", tf_loss)


In [None]:
# training update
with tf.name_scope("train_policy_network"):
  # apply gradients to update policy network
  tf_train_op = optimizer.apply_gradients(tf_gradients)

# single op that evaluates every summary registered so far, plus a
# do-nothing op (presumably fetched in its place when summaries are skipped)
tf_summarize = tf.merge_all_summaries()
tf_no_op = tf.no_op()

In [None]:
# gradient tensors to evaluate; variables the loss does not depend on have a
# None gradient, which cannot be passed to sess.run as a fetch, so skip them
grad_evals = [grad for grad, var in tf_gradients if grad is not None]

In [None]:
# inspect the list of gradient tensors (symbolic, not yet evaluated)
grad_evals

In [None]:
# (Re-)initialise every variable in the graph, including those created by the
# policy-gradient cells above, in the current session.
init_all = tf.initialize_all_variables()
sess.run(init_all)

In [None]:
# evaluate the policy-gradient tensors on the collected rollout
sess.run(grad_evals,
         feed_dict={tf_states: states,
                    tf_taken_actions: actions,
                    tf_discounted_rewards: returns})

In [None]:
# names of the variables included in the policy regularisation term
[x.name for x in policy_network_variables]

In [None]:



    # NOTE(review): this cell is an indented fragment that refers to `self`,
    # but no enclosing class or method is visible in the notebook; it looks
    # like a reference copy of the class-based version of the
    # policy-gradient cells above - confirm before running it.
    # compute loss and gradients
    with tf.name_scope("compute_pg_gradients"):
      # gradients for selecting action from policy network
      # NOTE(review): these placeholders are declared as (None, 1) whereas the
      # notebook-level version above leaves the shape unspecified - verify
      # which shape the cross-entropy op expects for its labels.
      self.taken_actions = tf.placeholder(tf.int32, (None, 1), name="taken_actions")
      self.discounted_rewards = tf.placeholder(tf.float32, (None, 1), name="discounted_rewards")

      # forward pass that shares the existing "policy_network" variables
      with tf.variable_scope("policy_network", reuse=True):
        self.logprobs = self.policy_network(self.states)

      # compute policy loss and regularization loss
      self.likelihood_loss = (tf.nn.sparse_softmax_cross_entropy_with_logits(self.logprobs, self.taken_actions)
                             * self.discounted_rewards)
      self.pg_loss            = tf.reduce_mean(self.likelihood_loss)
      self.reg_loss           = tf.reduce_sum([tf.reduce_sum(tf.square(x)) for x in policy_network_variables])
      self.loss               = self.pg_loss + self.reg_param * self.reg_loss

      # compute gradients
      self.gradients = self.optimizer.compute_gradients(self.loss)

      # compute policy gradients
      # (as written this loop stores each pair back unchanged)
      for i, (grad, var) in enumerate(self.gradients):
        if grad is not None:
          self.gradients[i] = (grad, var)

      # histogram summaries of every variable and, where present, its gradient
      for grad, var in self.gradients:
        tf.histogram_summary(var.name, var)
        if grad is not None:
          tf.histogram_summary(var.name + '/gradients', grad)

    
    # training update
    with tf.name_scope("train_policy_network"):
      # apply gradients to update policy network
      self.train_op = self.optimizer.apply_gradients(self.gradients)

    # merged summary op and a do-nothing op
    self.summarize = tf.merge_all_summaries()
    self.no_op = tf.no_op()
