In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import gym

In [2]:
env = gym.make('CartPole-v0')

In [222]:
class cartpole:
    def __init__(self, gamma):
        self.gamma = gamma
        
    def decaysum(self, rewards):
        """Calculate sum of future rewards with decay factor gamma"""
        length = rewards.shape[0]
        weights = (self.gamma * np.ones(length)) ** np.arange(length)
        weightedsum = np.dot(rewards, weights)
        return weightedsum
    
    

In [223]:
myCP = cartpole(0.9)
myCP.decaysum(np.ones(10))

6.5132155990000005

In [62]:
def action_simple(observation):
    """Move left if pole is leaning left and vice versa"""
    if observation[2] < 0:
        return 0
    else:
        return 1

In [104]:
def action_sigmoid(observation, theta):
    """Move left or right based on a sigmoid function"""
    
    sigmoid = 1 / (1 + np.exp(-np.dot(observation, theta)))
    
    # Go right (1) with probability sigmoid, else go left (0)
    rand_cf = np.random.rand(1)
    if rand_cf < sigmoid:
        return 1
    else: 
        return 0  

In [176]:
def update_theta(theta, actions, states, fwd_rewards):
    num_steps = actions.shape[0]
    update = np.zeros(4)
    for step in range(num_steps):
        factor1 = int(actions[step]==0) - int(actions[step]==1)
        factor2 = 1 - 1/(1+np.exp(-np.dot(observation, theta)))
        update = update + factor1 * factor2 * states[step]
        
    return update

In [208]:
num_episodes = 1000
episode_length = 250
train_param = 0.1     # Training parameter
gamma = 0.9    # Decay rate

theta = np.array([-0.8, 3.7, 0.7, 6.8])
update_all = np.zeros(4)

# Shows the number of steps until the pole falls. If the
# pole does not fall, we want it to give the episode_length
success_array = episode_length * np.ones(num_episodes)

for episode in range(num_episodes):
    observation = env.reset()
    
    # Keep a log of the states, actions rewards and future 
    # weighted decay rewards
    states = np.zeros(shape=(episode_length, 4))
    actions = np.zeros(shape=(episode_length))
    rewards = np.zeros(shape=(episode_length))
    
    # Do 100 steps, take a step and record all relevant data
    for step in range(episode_length):
        states[step] = observation
        action = action_sigmoid(observation, theta)
        actions[step] = action
        observation, reward, done, info = env.step(action)
        rewards[step] = reward
        
        # Finish the loop if the pole has tilted too far
        if done:
            success_array[episode] = step + 1
            break
            
    # Input the total (decaying) future rewards
    fwd_rewards = np.zeros(shape=(episode_length))
    for step in range(episode_length):
        fwd_rewards[step] = decaysum(rewards[step:], gamma)
        
    update = update_theta(theta, actions, states, fwd_rewards)
    update_all = update_all + update

print(np.mean(success_array))
print(update_all)
print(theta + train_param * update_all)

159.524
[  1628.80107485   7247.35513575  -1060.15505595 -11596.47467608]
[  162.08010749   728.43551358  -105.31550559 -1152.84746761]


In [None]:
"""Results over 10000 episodes:

Action                     Mean time until done
constant (0 or 1)          9.35
action_simple              42.17

"""

In [None]:
theta = tf.placeholder(tf.float32, shape=(4,))
state = tf.placeholder(tf.float32, shape=(4,))
dot = tf.tensordot(theta, state, axes=1)
sigmoid = tf.nn.sigmoid(dot)

In [None]:
with tf.Session() as sess:
    feed_dict = {theta: np.array([1, 1, 1, 1]), \
                 state: np.array([1, 1, 1, 1])}
    out = sess.run(sigmoid, feed_dict=feed_dict)
    print(out)