In [1]:
import tensorflow as tf
import numpy as np
import random
import gym
import math
import matplotlib.pyplot as plt

In [8]:
from collections import deque
import random

class ReplayBuffer(object):

  def __init__(self, buffer_size):

    self.buffer_size = buffer_size
    self.num_experiences = 0
    self.buffer = deque()

  def getBatch(self, batch_size):
    # random draw N
    return random.sample(self.buffer, batch_size)

  def size(self):
    return self.buffer_size

  def add(self, episode):
    new_experience = episode
    if self.num_experiences < self.buffer_size:
      self.buffer.append(new_experience)
      self.num_experiences += 1
    else:
      self.buffer.popleft()
      self.buffer.append(new_experience)

  def count(self):
    # if buffer is full, return buffer size
    # otherwise, return experience counter
    return self.num_experiences

  def erase(self):
    self.buffer = deque()
    self.num_experiences = 0

In [None]:
def action_value(behavior_policy, target_policy, episodes):
    

In [2]:
def policy_gradient():
    with tf.variable_scope("policy"):
        params = tf.get_variable("policy_parameters",[4,2])
        state = tf.placeholder("float",[None,4])
        actions = tf.placeholder("float",[None,2])
        advantages = tf.placeholder("float",[None,1])
        linear = tf.matmul(state,params)
        probabilities = tf.nn.softmax(linear)
        good_probabilities = tf.reduce_sum(tf.mul(probabilities, actions),reduction_indices=[1])
        eligibility = tf.log(good_probabilities) * advantages
        loss = -tf.reduce_sum(eligibility)
        optimizer = tf.train.AdamOptimizer(0.01).minimize(loss)
        return probabilities, state, actions, advantages, optimizer

In [7]:
def action_value_gradient():
    with tf.variable_scope("action_value"):
        state = tf.placeholder("float",[None,4])
        newvals = tf.placeholder("float",[None,1])
        w1 = tf.get_variable("w1",[4,10])
        b1 = tf.get_variable("b1",[10])
        h1 = tf.nn.relu(tf.matmul(state,w1) + b1)
        w2 = tf.get_variable("w2",[10,2])
        b2 = tf.get_variable("b2",[2])
        calculated = tf.matmul(h1,w2) + b2
        diffs = calculated - newvals
        loss = tf.nn.l2_loss(diffs)
        optimizer = tf.train.AdamOptimizer(0.1).minimize(loss)
        return calculated, state, newvals, optimizer, loss

In [None]:
def calculate_policy_value(policy_grad, episdoes):
    
    

In [4]:
def run_episode(env, policy_grad, value_grad, sess):
    pl_calculated, pl_state, pl_actions, pl_advantages, pl_optimizer = policy_grad
    vl_calculated, vl_state, vl_newvals, vl_optimizer, vl_loss = value_grad
    observation = env.reset()
    totalreward = 0
    states = []
    actions = []
    advantages = []
    transitions = []
    update_vals = []


    for _ in xrange(200):
        # calculate policy
        obs_vector = np.expand_dims(observation, axis=0)
        probs = sess.run(pl_calculated,feed_dict={pl_state: obs_vector})
        action = env.action_space.sample()  ### random behavior policy
        # record the transition
        states.append(observation)
        actionblank = np.zeros(2)
        actionblank[action] = 1
        actions.append(actionblank)
        # take the action in the environment
        old_observation = observation 
        observation, reward, done, info = env.step(action)
        transitions.append((old_observation, action, reward, observation))
        totalreward += reward

        if done:
            break
    policy_value(,transitions)
    discount = 0.97
    for index, trans in enumerate(transitions):
        obs, action, reward, next_obs = trans

        # calculate advantage r_t + gamma*V_{t+1} - V_t
        
        
        obs_vector = np.expand_dims(obs, axis=0)
        next_obs_vector = np.expand_dims(next_obs, axis=0)
        
        currentval = sess.run(vl_calculated,feed_dict={vl_state: obs_vector})[0][0]
        nextval = sess.run(vl_calculated, feed_dict={vl_state: next_obs_vector})[0][0]
        
        # advantage: how much better was this action than normal
        if index == len(transitions) -1:
            nextval = 0
            
        advantages.append(reward + discount * nextval - currentval)

        # update the value function towards new return
        # update_vals.append(future_reward)

    # update value function
    # update_vals_vector = np.expand_dims(update_vals, axis=1)
    # sess.run(vl_optimizer, feed_dict={vl_state: states, vl_newvals: update_vals_vector})
    # real_vl_loss = sess.run(vl_loss, feed_dict={vl_state: states, vl_newvals: update_vals_vector})

    advantages_vector = np.expand_dims(advantages, axis=1)
    sess.run(pl_optimizer, feed_dict={pl_state: states, pl_advantages: advantages_vector, pl_actions: actions})

    return totalreward




In [5]:
env = gym.make('CartPole-v0')
env.monitor.start('cartpole-hill/', force=True)
policy_grad = policy_gradient()
value_grad = value_gradient()
sess = tf.InteractiveSession()
sess.run(tf.initialize_all_variables())
for i in xrange(20):
    reward = run_episode(env, policy_grad, value_grad, sess)
    if reward == 200:
        print "reward 200"
        print i
        break
t = 0
for _ in xrange(10):
    reward = run_episode(env, policy_grad, value_grad, sess)
    t += reward
print t / 1000
env.monitor.close()

[2016-09-02 16:21:22,507] Making new env: CartPole-v0
[2016-09-02 16:21:22,522] Clearing 5 monitor files from previous run (because force=True was provided)
[2016-09-02 16:21:22,781] Starting new video recorder writing to /home/drl/DRL/Starter Exercise/notebooks/cartpole-hill/openaigym.video.0.33787.video000000.mp4
[2016-09-02 16:21:23,599] Starting new video recorder writing to /home/drl/DRL/Starter Exercise/notebooks/cartpole-hill/openaigym.video.0.33787.video000001.mp4
[2016-09-02 16:21:24,003] Starting new video recorder writing to /home/drl/DRL/Starter Exercise/notebooks/cartpole-hill/openaigym.video.0.33787.video000008.mp4
[2016-09-02 16:21:24,671] Starting new video recorder writing to /home/drl/DRL/Starter Exercise/notebooks/cartpole-hill/openaigym.video.0.33787.video000027.mp4
[2016-09-02 16:21:25,638] Finished writing results. You can upload them to the scoreboard via gym.upload('/home/drl/DRL/Starter Exercise/notebooks/cartpole-hill')


0.188
