In [1]:
import numpy as np
import gym
from gym.spaces import Discrete, Box
%tensorflow_version 1.x
import tensorflow as tf
import numpy as np
import random
import gym
import math
import matplotlib.pyplot as plt
%matplotlib inline

TensorFlow 1.x selected.


In [2]:
class DeterministicDiscreteActionLinearPolicy(object):

    def __init__(self, theta, ob_space, ac_space):
        """
        dim_ob: dimension of observations
        n_actions: number of actions
        theta: flat vector of parameters
        """
        dim_ob = ob_space.shape[0]
        n_actions = ac_space.n
        assert len(theta) == (dim_ob + 1) * n_actions
        self.W = theta[0 : dim_ob * n_actions].reshape(dim_ob, n_actions)
        self.b = theta[dim_ob * n_actions : None].reshape(1, n_actions)

    def act(self, ob):
        """
        """
        y = ob.dot(self.W) + self.b
        a = y.argmax()
        return a

In [3]:
def do_episode(policy, env, num_steps, discount=1.0, render=False):
    disc_total_rew = 0
    ob = env.reset()
    for t in range(num_steps):
        a = policy.act(ob)
        (ob, reward, done, _info) = env.step(a)
        disc_total_rew += reward * discount**t
        if render and t%3==0:
            env.render()
        if done: break
    return disc_total_rew

def noisy_evaluation(theta, discount=1.0):
    policy = make_policy(theta)
    reward = do_episode(policy, env, num_steps, discount)
    return reward

def make_policy(theta):
    return DeterministicDiscreteActionLinearPolicy(theta, env.observation_space, env.action_space)

In [4]:
# Task settings:
env = gym.make('CartPole-v0') # Change as needed
num_steps = 200 # maximum length of episode

# Alg settings:
n_iter = 20 # number of iterations of CEM
batch_size = 25 # number of samples per batch
elite_frac = 0.2 # fraction of samples used as elite set
n_elite = int(batch_size * elite_frac)
extra_std = 2.0
extra_decay_time = 10
dim_theta = (env.observation_space.shape[0]+1) * env.action_space.n

# Initialize mean and standard deviation
theta_mean = np.zeros(dim_theta)
theta_std = np.ones(dim_theta)

## Cross Entropy

#### Train the agent using cross entropy policy optimization

In [5]:
for itr in range(n_iter):
    # Sample parameter vectors
    extra_cov = max(1.0 - itr / extra_decay_time, 0) * extra_std**2
    thetas = np.random.multivariate_normal(mean=theta_mean, 
                                           cov=np.diag(np.array(theta_std**2) + extra_cov), 
                                           size=batch_size)
    rewards = np.array([noisy_evaluation(theta) for theta in thetas])

    # Get elite parameters
    elite_inds = rewards.argsort()[-n_elite:]
    elite_thetas = thetas[elite_inds]

    # Update theta_mean, theta_std
    theta_mean = elite_thetas.mean(axis=0)
    theta_std = elite_thetas.std(axis=0)
    print("iteration {} mean {} max {}".format(itr, np.mean(rewards), np.max(rewards)))

iteration 0 mean 11.08 max 28.0
iteration 1 mean 19.2 max 59.0
iteration 2 mean 52.48 max 200.0
iteration 3 mean 50.16 max 200.0
iteration 4 mean 76.2 max 200.0
iteration 5 mean 74.88 max 200.0
iteration 6 mean 125.48 max 200.0
iteration 7 mean 143.4 max 200.0
iteration 8 mean 126.44 max 200.0
iteration 9 mean 140.44 max 200.0
iteration 10 mean 195.08 max 200.0
iteration 11 mean 200.0 max 200.0
iteration 12 mean 200.0 max 200.0
iteration 13 mean 200.0 max 200.0
iteration 14 mean 200.0 max 200.0
iteration 15 mean 200.0 max 200.0
iteration 16 mean 200.0 max 200.0
iteration 17 mean 200.0 max 200.0
iteration 18 mean 200.0 max 200.0
iteration 19 mean 200.0 max 200.0


#### Testing the final cross entropy agent

In [6]:
print("reward {}".format(do_episode(make_policy(theta_mean), env, num_steps, render=False)))

reward 200.0


## Random Shooting


#### Training and testing the random shooting agent

In [7]:
t = 0
n_iter = 100
for i in range(n_iter):
    thetas = np.random.multivariate_normal(mean=np.zeros(dim_theta), 
                                               cov=np.diag(np.ones(dim_theta)), 
                                               size=batch_size)
    rewards = np.array([noisy_evaluation(theta) for theta in thetas])
    elite_ind = rewards.argmax()
    elite_theta = thetas[elite_ind]
    t += do_episode(make_policy(elite_theta), env, num_steps, render=False)
print("average reward {}".format(t / n_iter))

average reward 91.77


## Gradient Descent with Policy Network

In [8]:
def policy_gradient():
    with tf.variable_scope("policy", reuse=tf.AUTO_REUSE):
        params = tf.get_variable("policy_parameters",[4,2])
        state = tf.placeholder("float",[None,4])
        actions = tf.placeholder("float",[None,2])
        advantages = tf.placeholder("float",[None,1])
        linear = tf.matmul(state,params)
        probabilities = tf.nn.softmax(linear)
        good_probabilities = tf.reduce_sum(tf.multiply(probabilities, actions),reduction_indices=[1])
        eligibility = tf.log(good_probabilities) * advantages
        loss = -tf.reduce_sum(eligibility)
        optimizer = tf.train.AdamOptimizer(0.01).minimize(loss)
        return probabilities, state, actions, advantages, optimizer

def value_gradient():
    with tf.variable_scope("value", reuse=tf.AUTO_REUSE):
        state = tf.placeholder("float",[None,4])
        newvals = tf.placeholder("float",[None,1])
        w1 = tf.get_variable("w1",[4,10])
        b1 = tf.get_variable("b1",[10])
        h1 = tf.nn.relu(tf.matmul(state,w1) + b1)
        w2 = tf.get_variable("w2",[10,1])
        b2 = tf.get_variable("b2",[1])
        calculated = tf.matmul(h1,w2) + b2
        diffs = calculated - newvals
        loss = tf.nn.l2_loss(diffs)
        optimizer = tf.train.AdamOptimizer(0.1).minimize(loss)
        return calculated, state, newvals, optimizer, loss

def run_episode(env, policy_grad, value_grad, sess, render=False, iter=None, threshold=None):
    pl_calculated, pl_state, pl_actions, pl_advantages, pl_optimizer = policy_grad
    vl_calculated, vl_state, vl_newvals, vl_optimizer, vl_loss = value_grad
    observation = env.reset()
    totalreward = 0
    states = []
    actions = []
    advantages = []
    transitions = []
    update_vals = []

    for t in range(200):
        # calculate policy
        obs_vector = np.expand_dims(observation, axis=0)
        probs = sess.run(pl_calculated,feed_dict={pl_state: obs_vector})
        action = 0 if random.uniform(0,1) < probs[0][0] else 1
        # record the transition
        states.append(observation)
        actionblank = np.zeros(2)
        actionblank[action] = 1
        actions.append(actionblank)
        # take the action in the environment
        old_observation = observation
        observation, reward, done, info = env.step(action)
        transitions.append((old_observation, action, reward))
        totalreward += reward
        
        if t % 3 == 0 and render:
            env.render()

        if done:
            break
            
    if threshold and threshold(totalreward, iter):
        return totalreward, 1
    
    for index, trans in enumerate(transitions):
        obs, action, reward = trans

        # calculate discounted monte-carlo return
        future_reward = 0
        future_transitions = len(transitions) - index
        decrease = 1
        for index2 in range(future_transitions):
            future_reward += transitions[(index2) + index][2] * decrease
            decrease = decrease * 0.97
        obs_vector = np.expand_dims(obs, axis=0)
        currentval = sess.run(vl_calculated,feed_dict={vl_state: obs_vector})[0][0]

        # advantage: how much better was this action than normal
        advantages.append(future_reward - currentval)

        # update the value function towards new return
        update_vals.append(future_reward)

    # update value function
    update_vals_vector = np.expand_dims(update_vals, axis=1)
    sess.run(vl_optimizer, feed_dict={vl_state: states, vl_newvals: update_vals_vector})

    advantages_vector = np.expand_dims(advantages, axis=1)
    sess.run(pl_optimizer, feed_dict={pl_state: states, pl_advantages: advantages_vector, pl_actions: actions})

    return totalreward, 0

#### Training the policy network agent using gradient descent

In [9]:
env = gym.make('CartPole-v0')
policy_grad = policy_gradient()
value_grad = value_gradient()
sess = tf.InteractiveSession()
sess.run(tf.initialize_all_variables())
t = 0
for i in range(2000):
    reward, _ = run_episode(env, policy_grad, value_grad, sess)
    t += reward
    if i % 100 == 0:
        print("iteration {} mean {}".format(i, t / (i + 1)))
print("final mean {}".format(t / 2000))

Instructions for updating:
Use `tf.global_variables_initializer` instead.
iteration 0 mean 10.0
iteration 100 mean 22.237623762376238
iteration 200 mean 28.0
iteration 300 mean 44.77076411960133
iteration 400 mean 67.71820448877806
iteration 500 mean 84.24550898203593
iteration 600 mean 99.50415973377704
iteration 700 mean 110.02853067047076
iteration 800 mean 118.3632958801498
iteration 900 mean 123.80910099889012
iteration 1000 mean 129.44455544455545
iteration 1100 mean 134.5068119891008
iteration 1200 mean 138.4338051623647
iteration 1300 mean 142.06764027671022
iteration 1400 mean 144.1163454675232
iteration 1500 mean 145.58760826115923
iteration 1600 mean 146.99625234228608
iteration 1700 mean 149.16519694297472
iteration 1800 mean 150.91504719600223
iteration 1900 mean 152.74960547080485
final mean 154.406


#### Testing the policy network agent

In [10]:
t = 0
for i in range(100):
  reward, _ = run_episode(env, policy_grad, value_grad, sess)
  t += reward
print("test reward {}".format(t / 100))

test reward 181.57


## Policy Network with Cross Entropy

#### Training the policy network agent with cross entropy using gradient descent

In [12]:
def threshold_check(totalreward, iter):    
    return totalreward < min(200, iter * 0.1)

env = gym.make('CartPole-v0')
policy_grad = policy_gradient()
value_grad = value_gradient()
sess = tf.InteractiveSession()
sess.run(tf.initialize_all_variables())
t = 0
skipped = 0
for i in range(2000):
    reward, skip = run_episode(env, policy_grad, value_grad, sess, render=False, iter=i, threshold=threshold_check)
    t += reward
    skipped += skip
    if i % 100 == 0:
        print("iteration {} mean {} skipped {}".format(i, t / (i + 1), skipped))
print("final mean {}".format(t / 2000))
print("iterations skipped {}".format(skipped))



iteration 0 mean 42.0 skipped 0
iteration 100 mean 24.673267326732674 skipped 0
iteration 200 mean 31.407960199004975 skipped 10
iteration 300 mean 36.56810631229236 skipped 33
iteration 400 mean 46.119700748129674 skipped 53
iteration 500 mean 60.08582834331337 skipped 66
iteration 600 mean 74.89351081530782 skipped 78
iteration 700 mean 88.15549215406563 skipped 84
iteration 800 mean 95.83770287141074 skipped 95
iteration 900 mean 103.0033296337403 skipped 103
iteration 1000 mean 110.86413586413586 skipped 111
iteration 1100 mean 117.79019073569482 skipped 114
iteration 1200 mean 122.81931723563697 skipped 119
iteration 1300 mean 125.82244427363567 skipped 133
iteration 1400 mean 128.8372591006424 skipped 154
iteration 1500 mean 131.76882078614258 skipped 175
iteration 1600 mean 134.0624609618988 skipped 202
iteration 1700 mean 135.885949441505 skipped 236
iteration 1800 mean 137.25763464741812 skipped 277
iteration 1900 mean 138.4902682798527 skipped 322
final mean 139.8055
iteratio

#### Testing the cross entropy policy network

In [13]:
t = 0
for i in range(100):
  reward, _ = run_episode(env, policy_grad, value_grad, sess)
  t += reward
print("test reward {}".format(t / 100))

test reward 168.81
