In [1]:
import tensorflow as tf
import numpy as np
import random
import gym
import math
import matplotlib.pyplot as plt

In [None]:
class Retrace(object):
    def __init__(self, session, q_network, target_policy, behavior_policy, lambda_, gamma, action_n):
        self.session = session
        self.target_policy = target_policy
        self.behavior_policy = behavior_policy
        self.lambda_ = lambda_
        self.gamma = gamma
        self.q_network = q_network
        self.action_n = action_n
        
    def __str__(self):
        return "Retrace lambda with lambda = {0.2f}".format(self.lambda_)
    
    def compute_traces(self, episdoe):
        
        states, actions = episdoe["states"], episdoe["actions"] 
        traces = []

        for state, action in zip(states, actions):
            importace_sampling = self.target_policy(state)[action]/self.behavior_policy(state)[action]
            trace = self.lambda_*min(importace_sampling, 1)
            traces.append(trace)

        return traces
    
    def compute_targets(self, episode):
        states, actions, rewards = episode["states"], episode["actions"], episode["rewards"]
        q = self.q_network
        deltas = []
        for state, next_state, action, reward in zip(states[:-1], states[1:], actions[:-1], rewards[:-1]):
            future_reward = self.target_policy(next_state).dot(q(next_state))
            delta = reward + self.gamma * future_reward - q(state)[action]
            deltas.append(delta)
        delta = rewards[-1] - q(states[-1])[actions[-1]]
        deltas.append(delta)
        return deltas
    
    def compute_one_n_step(self, targets, traces):
        discount = 1
        trace = 1
        delta = 0
        traces = traces + [1]
        for i, target in enumerate(targets):
            delta += discount*trace*target
            discount *= self.gamma
            trace *= traces[i]
        return delta
    
    def compute_n_step_target(self, episode):
        states, actions, rewards = episode["states"], episode["actions"], episode["rewards"]
        traces = self.compute_traces(episode)
        targets = self.compute_targets(episode)
        n_step_targets = []
        for i in xrange(len(targets)):
            target = self.compute_one_n_step(gamma, targets[i:i+step_n], traces[i+1:i+step_n])
            n_step_targets.append(target)
        return n_step_targets

In [2]:
def q_network(states):
    # define policy neural network
    W1 = tf.get_variable("W1", [state_dim, 20],
                       initializer=tf.random_normal_initializer())
    b1 = tf.get_variable("b1", [20],
                       initializer=tf.constant_initializer(0))
    h1 = tf.nn.relu(tf.matmul(states, W1) + b1)
    W2 = tf.get_variable("W2", [20, num_actions],
                       initializer=tf.random_normal_initializer())
    b2 = tf.get_variable("b2", [num_actions],
                       initializer=tf.constant_initializer(0))
    q = tf.matmul(h1, W2) + b2
    return q

In [None]:
def retrace(target_policy, behavior_policy, env, lambda_, step_n, alpha, gamma, max_episodes):
    q = defaultdict(int)
    action_n = env.action_space.n
    
    for itr in xrange(max_episodes):
        done = False
        state = env.reset()
        states = []
        actions = []
        rewards = []
        returns = []
        
        while not done:
            states.append(state)
            
            action = np.random.multinomial(1, behavior_policy[state]).argmax()
            next_state, reward, done, _ = env.step(action)
            state = next_state
            
            actions.append(action)
            rewards.append(reward)
            
        n_step_corrections = compute_n_step_target(deepcopy(q), states, actions, rewards, gamma, lambda_, target_policy, 
                                                   behavior_policy, action_n, step_n) 
                
        for state, action, correction in zip(states, actions, n_step_corrections):
            q[(state, action)] += alpha*correction
            
        #alpha /= (itr +1 ) ** 0.8
        
    return q

In [12]:
def policy_gradient(observation_space, action_space):
    with tf.variable_scope("policy"):
        params = tf.get_variable("policy_parameters", [observation_space, action_spce])
        state = tf.placeholder("float", [None, state_space])
        actions = tf.placeholder("float", [None, action_space])
        advantages = tf.placeholder("float", [None, 1])
        linear = tf.matmul(state, params)
        probabilities = tf.nn.softmax(linear)
        good_probabilities = tf.reduce_sum(tf.mul(probabilities, actions), reduction_indices=[1])
        eligibility = tf.log(good_probabilities) * advantages
        loss = -tf.reduce_sum(eligibility)
        optimizer = tf.train.AdamOptimizer(0.01).minimize(loss)
        return probabilities, state, actions, advantages, optimizer

In [13]:
def value_gradient(observation_space, action_space):
    with tf.variable_scope("value"):
        state = tf.placeholder("float",[None, observation_space])
        newvals = tf.placeholder("float",[None,1])
        w1 = tf.get_variable("w1",[observation_space,10])
        b1 = tf.get_variable("b1",[10])
        h1 = tf.nn.relu(tf.matmul(state, w1) + b1)
        w2 = tf.get_variable("w2",[10,1])
        b2 = tf.get_variable("b2",[1])
        calculated = tf.matmul(h1, w2) + b2
        diffs = calculated - newvals
        loss = tf.nn.l2_loss(diffs)
        optimizer = tf.train.AdamOptimizer(0.1).minimize(loss)
        return calculated, state, newvals, optimizer, loss

In [10]:
def run_episode(env, policy_grad, value_grad, sess):
    pl_calculated, pl_state, pl_actions, pl_advantages, pl_optimizer = policy_grad
    vl_calculated, vl_state, vl_newvals, vl_optimizer, vl_loss = value_grad
    observation = env.reset()
    totalreward = 0
    states = []
    actions = []
    advantages = []
    transitions = []
    update_vals = []
    
    for _ in xrange(200):
        # calculate policy
        obs_vector = np.expand_dims(observation, axis=0)
        probs = sess.run(pl_calculated, feed_dict={pl_state: obs_vector})
        action = 0 if random.uniform(0,1) < probs[0][0] else 1
        # record the transition
        states.append(observation)
        actionblank = np.zeros(2)
        actionblank[action] = 1
        actions.append(actionblank)
        # take the action in the environment
        old_observation = observation
        observation, reward, done, info = env.step(action)
        transitions.append((old_observation, action, reward))
        totalreward += reward

        if done:
            break
            
    for index, trans in enumerate(transitions):
        obs, action, reward = trans

        # calculate discounted monte-carlo return
        future_reward = 0
        future_transitions = len(transitions) - index
        decrease = 1
        for index2 in xrange(future_transitions):
            future_reward += transitions[(index2) + index][2] * decrease
            decrease = decrease * 0.97
        obs_vector = np.expand_dims(obs, axis=0)
        currentval = sess.run(vl_calculated, feed_dict={vl_state: obs_vector})[0][0]

        # advantage: how much better was this action than normal
        advantages.append(future_reward - currentval)

        # update the value function towards new return
        update_vals.append(future_reward)
        
    # update value function
    update_vals_vector = np.expand_dims(update_vals, axis=1)
    sess.run(vl_optimizer, feed_dict={vl_state: states, vl_newvals: update_vals_vector})
    # real_vl_loss = sess.run(vl_loss, feed_dict={vl_state: states, vl_newvals: update_vals_vector})

    advantages_vector = np.expand_dims(advantages, axis=1)
    sess.run(pl_optimizer, feed_dict={pl_state: states, pl_advantages: advantages_vector, pl_actions: actions})

    return totalreward

In [11]:
env = gym.make('CartPole-v0')
env.monitor.start('cartpole-hill/', force=True)
policy_grad = policy_gradient()
value_grad = value_gradient()
sess = tf.InteractiveSession()
sess.run(tf.initialize_all_variables())
for i in xrange(2000):
    reward = run_episode(env, policy_grad, value_grad, sess)
    if reward == 200:
        print "reward 200"
        print i
        break
t = 0
for _ in xrange(1000):
    reward = run_episode(env, policy_grad, value_grad, sess)
    t += reward
print t / 1000
env.monitor.close()

[2016-08-30 11:38:52,886] Making new env: CartPole-v0
[2016-08-30 11:38:52,897] Creating monitor directory cartpole-hill/
[2016-08-30 11:38:53,182] Starting new video recorder writing to /home/drl/DRL/Starter Exercise/notebooks/cartpole-hill/openaigym.video.0.116047.video000000.mp4
[2016-08-30 11:38:54,087] Starting new video recorder writing to /home/drl/DRL/Starter Exercise/notebooks/cartpole-hill/openaigym.video.0.116047.video000001.mp4
[2016-08-30 11:38:55,207] Starting new video recorder writing to /home/drl/DRL/Starter Exercise/notebooks/cartpole-hill/openaigym.video.0.116047.video000008.mp4
[2016-08-30 11:38:56,109] Starting new video recorder writing to /home/drl/DRL/Starter Exercise/notebooks/cartpole-hill/openaigym.video.0.116047.video000027.mp4
[2016-08-30 11:38:57,532] Starting new video recorder writing to /home/drl/DRL/Starter Exercise/notebooks/cartpole-hill/openaigym.video.0.116047.video000064.mp4
[2016-08-30 11:39:00,457] Starting new video recorder writing to /home/dr

reward 200
248


[2016-08-30 11:39:10,245] Starting new video recorder writing to /home/drl/DRL/Starter Exercise/notebooks/cartpole-hill/openaigym.video.0.116047.video000343.mp4
[2016-08-30 11:39:18,082] Starting new video recorder writing to /home/drl/DRL/Starter Exercise/notebooks/cartpole-hill/openaigym.video.0.116047.video000512.mp4
[2016-08-30 11:39:41,161] Starting new video recorder writing to /home/drl/DRL/Starter Exercise/notebooks/cartpole-hill/openaigym.video.0.116047.video000729.mp4
[2016-08-30 11:40:17,820] Starting new video recorder writing to /home/drl/DRL/Starter Exercise/notebooks/cartpole-hill/openaigym.video.0.116047.video001000.mp4
[2016-08-30 11:40:52,847] Finished writing results. You can upload them to the scoreboard via gym.upload('/home/drl/DRL/Starter Exercise/notebooks/cartpole-hill')


137.164
