In [1]:
import numpy as np
import tensorflow as tf
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
import gym
env = gym.make("CartPole-v0")

[2016-12-26 20:27:22,448] Making new env: CartPole-v0


In [3]:
num_hidden = 10
learning_rate = 1e-2
gamma = 0.99 # Discount factor for rewards
decay_rate = 0.99 # Decay factor for RMSProp leaky sum of grad**2
resume = False

model_batch_size = 3
real_batch_size = 3

dimen = 4

## Helper Functions

In [4]:
def reset_buffer(buffer):
    return np.array([np.zeros_like(var) for var in buffer])

def discount(r, gamma=0.99, standardize=False):
    """Takes 1d float array of rewards and computes discounted reward
    e.g. f([1, 1, 1], 0.99) -> [1, 0.99, 0.9801]
    """
    discounted = np.array([val * (gamma ** i) for i, val in enumerate(r)])
    if standardize:
        discounted -= np.mean(discounted)
        discounted /= np.std(discounted)
    return discounted

def step_model(sess, xs, action):
    """ Uses our model to produce a new state given a previous state and action """
    # Last state
    x = xs[-1].reshape(1,-1)
    
    # Append action
    x = np.hstack([x, [[action]]])
    
    # Predict output
    output_y = sess.run(predicted_state_m, feed_dict={input_x_m: x})
    
    # predicted_state_m == [state_0, state_1, state_2, state_3, reward, done]
    output_next_state = output_y[:,:4]
    output_reward = output_y[:,4]
    output_done = output_y[:,5]
    
    # First and third env outputs are limited to +/- 2.4 and +/- 0.4
    output_next_state[:,0] = np.clip(output_next_state[:,0],-2.4,2.4)
    
    output_next_state[:,2] = np.clip(output_next_state[:,2],-0.4,0.4)
    
    # Threshold for being done is likliehood being > 0.1
    output_done = True if output_done > 0.01 or len(xs) > 500 else False
    
    return output_next_state, output_reward, output_done
    

## Policy Network
Here we define a neural network to determine policy

In [5]:
tf.reset_default_graph()
input_x_p = tf.placeholder(tf.float32, [None,4], name="input_x")

# First layer
W1_p = tf.get_variable("W1", shape=[dimen,num_hidden], 
                     initializer=tf.contrib.layers.xavier_initializer())
layer1_p = tf.nn.relu(tf.matmul(input_x_p, W1_p))

# Second layer
W2_p = tf.get_variable("W2", shape=[num_hidden,1], 
                     initializer=tf.contrib.layers.xavier_initializer())
output_p = tf.nn.sigmoid(tf.matmul(layer1_p, W2_p))

trainable_vars_p = [W1_p, W2_p]
input_y_p = tf.placeholder(tf.float32, shape=[None, 1], name="input_y")
advantages_p = tf.placeholder(tf.float32, shape=[None,1], name="reward_signal")

# Gradients
W1_grad_p = tf.placeholder(tf.float32,name="W1_grad")
W2_grad_p = tf.placeholder(tf.float32,name="W2_grad")
batch_grad_p = [W1_grad_p, W2_grad_p]

# Loss function
log_lik_p = tf.log(input_y_p * (input_y_p - output_p) + 
                 (1 - input_y_p) * (input_y_p + output_p))
loss_p = -tf.reduce_mean(log_lik_p * advantages_p)

# Gradients
grads_p = tf.gradients(loss_p, trainable_vars_p)

# Optimizer
adam_p = tf.train.AdamOptimizer(learning_rate=learning_rate)

# Update function
update_grads_p = adam_p.apply_gradients(
    zip(batch_grad_p, [W1_p, W2_p]))

## Model Network
Here we implement a multi-layer neural network that predicts the next observation, reward and done state from a current state and action

In [6]:
num_hidden_m = 256

# Dimensions of the previous state plus 1 for the action
dimen_m = dimen + 1

input_x_m = tf.placeholder(tf.float32, [None, dimen_m])

# previous_state = tf.placeholder(tf.float32, [None, dimen + 1], 
#                                 name="previous_state")

# First layer
W1_m = tf.get_variable("W1_m", shape=[dimen_m, num_hidden_m],
                     initializer=tf.contrib.layers.xavier_initializer())
B1_m = tf.Variable(tf.zeros([num_hidden_m]), name="B1M")
layer1_m = tf.nn.relu(tf.matmul(input_x_m, W1_m) + B1_m)

# Second layer
W2_m = tf.get_variable("W2_m", shape=[num_hidden_m, num_hidden_m],
                     initializer=tf.contrib.layers.xavier_initializer())
B2_m = tf.Variable(tf.zeros([num_hidden_m]), name="B2_m")
layer2_m = tf.nn.relu(tf.matmul(layer1_m, W2_m) + B2_m)

# Third (output) layers
W_obs_m = tf.get_variable("W_obs_m", shape=[num_hidden_m, 4],
                     initializer=tf.contrib.layers.xavier_initializer())
B_obs_m = tf.Variable(tf.zeros([4]), name="B_obs_m")

W_reward_m = tf.get_variable("W_reward_m", shape=[num_hidden_m, 1],
                     initializer=tf.contrib.layers.xavier_initializer())
B_reward_m = tf.Variable(tf.zeros([1]), name="B_reward_m")

W_done_m = tf.get_variable("W_done_m", shape=[num_hidden_m,1],
                     initializer=tf.contrib.layers.xavier_initializer())
B_done_m = tf.Variable(tf.zeros([1]), name="B_done_m")

output_obs_m = tf.matmul(layer2_m, W_obs_m) + B_obs_m
output_reward_m = tf.matmul(layer2_m, W_reward_m) + B_reward_m
output_done_m = tf.sigmoid(tf.matmul(layer2_m, W_done_m) + B_done_m)

# Placeholders for inputs
actual_obs_m = tf.placeholder(tf.float32, [None, dimen_m], name="actual_obs")
actual_reward_m = tf.placeholder(tf.float32, [None, 1], name="actual_reward")
actual_done_m = tf.placeholder(tf.float32, [None, 1], name="actual_done")

predicted_state_m = tf.concat(1,[output_obs_m, output_reward_m, output_done_m])

# Loss functions
loss_obs_m = tf.square(actual_reward_m - output_reward_m)
loss_reward_m = tf.square(actual_reward_m - output_reward_m)
loss_done_m = -tf.log(actual_done_m * actual_done_m + 
                (1 - actual_done_m) * (1 - output_done_m))

loss_m = tf.reduce_mean(loss_obs_m + loss_reward_m + loss_done_m)

adam_m = tf.train.AdamOptimizer(learning_rate=learning_rate)
update_m = adam_m.minimize(loss_m)

In [7]:
init = tf.global_variables_initializer()

num_episode = 0
num_episodes = 5000

batch_size = real_batch_size

# Setup array to keep track of preivous states and outputs
observations = np.empty(0).reshape(0,dimen)
actions = np.empty(0).reshape(0,1)
rewards = np.empty(0).reshape(0,1)
dones = np.empty(0).reshape(0,1)
grads = np.array([np.zeros(var.get_shape().as_list()) for var in trainable_vars_p])

sess = tf.Session()

sess.run(init)
observation = env.reset()

while num_episode < num_episodes:
    observation = np.reshape(observation, (1,-1))
    
    # Chose a random action
    action = np.random.choice(range(env.action_space.n))
    
    # Append observation and actions
    observations = np.vstack([observations, observation])
    actions = np.vstack([actions, action])
    
    observation, reward, done, _ = env.step(action)
    
    rewards = np.vstack([rewards, reward])
    dones = np.vstack([dones, done])
    
    # If game is done
    if done or len(observations) > 300:
        
        # Count the episode
        num_episode += 1
        

        # Previous state and actions for training model
        states = np.hstack([observations, actions])
        prev_states = states[:-1,:]
        next_states = states[1:, :]
        next_rewards = rewards[1:, :]
        next_dones = dones[1:, :]

        # Putting it all together
        next_state_all = np.hstack([next_states, next_rewards, next_dones])

        feed_dict = {input_x_m: prev_states.astype(np.float32), 
                     actual_obs_m: next_states.astype(np.float32),
                    actual_done_m: next_dones.astype(np.float32),
                    actual_reward_m: next_rewards.astype(np.float32)}
        loss, output_state, _ = sess.run([loss_m, predicted_state_m, update_m], 
                                         feed_dict=feed_dict)
        
        observation = env.reset()
        
        # If the batch is full
        if num_episode % batch_size == 0:
            
            if (num_episode % (100 * batch_size) == 0):
                print("Episode: {} Training model loss: {}".format(num_episode, loss))

            if loss < 1e-6:
                print("Training done! Episode: {} Model loss: {}".format(num_episode, loss))
                break
                            
            # Reset everything
            observations = np.empty(0).reshape(0,dimen)
            actions = np.empty(0).reshape(0,1)
            rewards = np.empty(0).reshape(0,1)
            dones = np.empty(0).reshape(0,1)
            grads = np.array([np.zeros(var.get_shape().as_list()) for var in trainable_vars_p])
            
        


Episode: 1500 Training model loss: 0.00048257794696837664
Episode: 3000 Training model loss: 0.04406725987792015
Episode: 4500 Training model loss: 2.730608503043186e-05


In [22]:
reward_sum = 0
reward_total = []
batch_size = 3

# Tracks the score on the real (non-simulated) environment to determine when to stop
reward_real = 0
episode_real_count = 0
num_episodes = 5000

train_from_model = False

observations = np.empty(0).reshape(0,dimen)
rewards = np.empty(0).reshape(0,1)
actions = np.empty(0).reshape(0,1)
grads = np.array([np.zeros(var.get_shape().as_list()) for var in trainable_vars_p])

num_episode = 0

observation = env.reset()

while num_episode < num_episodes:
    observation = observation.reshape(1,-1)
    policy = sess.run(output_p, feed_dict={input_x_p: observation})
    action = 0 if policy > np.random.uniform() else 1

    observations = np.vstack([observations, observation])
    actions = np.vstack([actions, action])
    
    if train_from_model:
        observation, reward, done = step_model(sess, observations, action)
    else:
        observation, reward, done, _ = env.step(action)
        reward_real += reward
    
    reward_sum += reward
    rewards = np.vstack([rewards, reward])
    
    if done or len(observations) > 300:
        
        if not train_from_model:
            episode_real_count += 1
        
        reward_total.append(reward_sum)
        disc_rewards = discount(rewards, standardize=True)
        grads += sess.run(grads_p, feed_dict={input_x_p: observations,
                                            input_y_p: actions,
                                            advantages_p: disc_rewards})
        
        if num_episode % batch_size == 0:
            sess.run(update_grads_p, feed_dict={W1_grad_p: grads[0], W2_grad_p: grads[1]})
            grads = np.array([np.zeros(var.get_shape().as_list()) for var in trainable_vars_p])
            
            if (num_episode % (100 * batch_size) == 0):
                print("Episode {} rewards: {}".format(num_episode, reward_sum/batch_size))
            
            if episode_real_count > 0:
                if (reward_real/episode_real_count >= 200):
                    print("Episode {} Training complete with total score of: {}".format(
                            num_episode, reward_real/episode_real_count))
                    break
            
            reward_sum = 0
            
        num_episode += 1
        episode_real_count = 0
        
        observation = env.reset()
        
        observations = np.empty(0).reshape(0,dimen)
        rewards = np.empty(0).reshape(0,1)
        reward_real = 0
        actions = np.empty(0).reshape(0,1)
        
        train_from_model = not train_from_model 
        

Episode 0 rewards: 10.0
Episode 300 rewards: [ 133.38696289]
Episode 600 rewards: [ 135.93412781]
Episode 900 rewards: [ 141.17945862]
Episode 1200 rewards: [ 179.2141571]
Episode 1500 rewards: [ 191.54997253]
Episode 1800 rewards: [ 175.94616699]
Episode 2100 rewards: [ 172.42500305]
Episode 2142 Training complete with total score of: 230.0


In [23]:
# See our trained bot in action

observation = env.reset()
reward_sum = 0

while True:
    env.render()
    
    observation = np.reshape(observation, [1, -1])
    policy = sess.run(output_p, feed_dict={input_x_p: observation})
    action = 0 if policy > 0.5 else 1
    observation, reward, done, _ = env.step(action)
    reward_sum += reward
    if done:
        print("Total score: {}".format(reward_sum))
        break


Total score: 166.0
