# CartPole-v0

Solving the [CartPole-v0](https://gym.openai.com/envs/CartPole-v0) environment using reinforcement learning.

In [1]:
import numpy as np
import gym

## Create environment

In [2]:
# Instantiate the CartPole-v0 environment; the resulting `env` is the object
# that the cells below reset, step and render.
env = gym.make('CartPole-v0')

[2016-11-21 17:44:14,132] Making new env: CartPole-v0


## 1. Using K-nearest neighbours.
Based on [Andrej Karpathy's](https://gym.openai.com/evaluations/eval_lEi8I8v2QLqEgzBxcvRIaA) code.

In [5]:
# Nearest-neighbor based agent
class EpisodicAgent(object):
    """Nearest-neighbour episodic agent for discrete action spaces.

    - At training time it remembers all tuples of (state, action, reward).
    - After each episode it computes the empirical discounted value of every
      state visited in that episode from the recorded rewards.
    - At test time it looks up the k nearest stored states and takes the
      action whose neighbours have the highest average value.

    ``act`` is meant to be called once per environment step with the reward
    and ``done`` flag produced by the *previous* step (the reward is stored
    against the previous memory slot).
    """

    def __init__(self, action_space):
        """Store the action space and set up options and empty memory.

        Args:
            action_space: a ``gym.spaces.discrete.Discrete`` instance; any
                other type fails the assertion below.
        """
        self.action_space = action_space
        assert isinstance(action_space, gym.spaces.discrete.Discrete), 'unsupported action space for now.'

        # options
        self.epsilon = 1.0  # probability of choosing a random action
        self.epsilon_decay = 0.98  # decay of epsilon per episode
        self.epsilon_min = 0
        self.nnfind = 500  # how many nearest neighbors to consider in the policy?
        self.mem_needed = 500  # amount of data to have before we can start exploiting
        self.mem_size = 50000  # maximum size of memory
        self.gamma = 0.95  # discount factor

        # internal vars
        self.iter = 0
        self.mem_pointer = 0  # next memory slot to write
        self.max_pointer = 0  # number of slots whose values have been computed
        self.db = None  # large array of states seen (allocated lazily)
        self.dba = {}  # actions taken
        self.dbr = {}  # rewards obtained at all steps
        self.dbv = {}  # value function at all steps, computed retrospectively
        self.ep_start_pointer = 0

    def _greedy_action(self, observation):
        """Return the action with the best mean value among the nearest stored states."""
        # squared L2 distance to every state whose value is already known
        ds = np.sum((self.db[:self.max_pointer] - observation) ** 2, axis=1)
        # ascending by distance, cropped to at most `nnfind` neighbours
        ix = np.argsort(ds)[:self.nnfind]

        # vote among actions: accumulate value totals and counts per action
        value_sum = {}
        count = {}
        for i in ix:
            aa = self.dba[i]
            value_sum[aa] = value_sum.get(aa, 0) + self.dbv[i]
            count[aa] = count.get(aa, 0) + 1

        # highest mean value first; ties fall to the larger action id
        ranked = sorted(((value_sum[aa] / count[aa], aa) for aa in value_sum),
                        reverse=True)
        return ranked[0][1]

    def act(self, observation, reward, done):
        """Record the latest transition and choose the next action.

        Args:
            observation: 1-D ``np.ndarray`` holding the current state.
            reward: reward obtained by the previously returned action.
            done: truthy when the previous step ended an episode; triggers
                the value computation and the epsilon decay.

        Returns:
            The chosen action: either ``action_space.sample()`` (explore) or
            the best-valued action among the nearest neighbours (exploit).
        """
        assert isinstance(observation, np.ndarray) and observation.ndim == 1, 'unsupported observation type for now.'

        if self.db is None:
            # lazy initialization of memory
            self.db = np.zeros((self.mem_size, observation.size))
            self.mem_pointer = 0
            self.ep_start_pointer = 0

        # Exploit only when we have enough data, the epsilon draw says so, and
        # at least one episode has finished (so values were computed and there
        # is a non-empty set of neighbours to vote over).
        if (self.iter > self.mem_needed
                and np.random.rand() > self.epsilon
                and self.dbv
                and self.max_pointer > 0):
            a = self._greedy_action(observation)
        else:
            # explore: do something random
            a = self.action_space.sample()

        # record move to database (silently stops storing once memory is full)
        if self.mem_pointer < self.mem_size:
            self.db[self.mem_pointer] = observation  # save the state
            self.dba[self.mem_pointer] = a  # and the action we took
            self.dbr[self.mem_pointer - 1] = reward  # reward belongs to the previous step
            self.dbv[self.mem_pointer - 1] = 0
        self.mem_pointer += 1
        self.iter += 1

        if done:  # episode ended
            # compute the estimate of the value function based on this rollout,
            # walking backwards so each step sees the discounted future return
            v = 0
            for t in reversed(range(self.ep_start_pointer, self.mem_pointer)):
                v = self.gamma * v + self.dbr.get(t, 0)
                self.dbv[t] = v

            self.ep_start_pointer = self.mem_pointer
            self.max_pointer = min(max(self.max_pointer, self.mem_pointer), self.mem_size)

            # decay exploration probability
            self.epsilon *= self.epsilon_decay
            self.epsilon = max(self.epsilon, self.epsilon_min)  # floor at epsilon_min

        return a

In [6]:
# Train the nearest-neighbour agent for a fixed number of episodes and report
# an exponentially smoothed episode reward every 100 episodes.
agent = EpisodicAgent(env.action_space)

episode_count = 500
max_steps = 200
reward = 0
done = False
sum_reward_running = 0

for i in range(episode_count):
    ob = env.reset()
    sum_reward = 0

    for j in range(max_steps):
        # The agent is handed the reward/done pair produced by the previous
        # step; they deliberately carry over from one episode to the next so
        # the agent sees the terminal `done` on its next call.
        action = agent.act(ob, reward, done)
        ob, reward, done, _ = env.step(action)
        sum_reward += reward
        if done:
            break

    # exponential moving average of the per-episode reward
    sum_reward_running = sum_reward_running * 0.95 + sum_reward * 0.05
    if i % 100 == 0:
        # single-argument print(...) behaves the same on Python 2 and 3
        print('%d running reward: %f' % (i, sum_reward_running))

env.render(close=True)

[2016-11-20 17:34:12,922] Making new env: CartPole-v0


0 running reward: 0.650000
100 running reward: 140.665248
200 running reward: 196.045064
300 running reward: 198.917342
400 running reward: 197.071849


### Visualize performance on new episode

In [13]:
# Roll out one rendered episode with the trained agent and print its total
# reward. The agent's act() is still the one used during training, so it keeps
# recording transitions while being visualised.
reward = 0
sum_reward = 0
done = False
try:
    for i_episode in range(1):
        observation = env.reset()
        for t in range(1000):
            env.render()
            action = agent.act(observation, reward, done)
            observation, reward, done, _ = env.step(action)
            sum_reward += reward

            if done:
                print(sum_reward)
                break
finally:
    # Always close the render window, even if the rollout raises or is
    # interrupted part-way through.
    env.render(close=True)

300.0


## 2. Using neural networks

Based on [Arthur Juliani's blog](https://medium.com/@awjuliani/super-simple-reinforcement-learning-tutorial-part-2-ded33892c724#.722zqqrr2).

In [3]:
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
import math

In [4]:
# Hyperparameters for the policy-gradient network and its training loop
H = 10               # Number of hidden units (W1 has shape [D, H])
batch_size = 50      # Episodes whose gradients are accumulated before each parameter update
learning_rate = 1e-2 # Learning rate handed to the Adam optimizer
gamma = 0.99         # Discount factor applied per time step by discount_rewards
D = 4                # Input dimensionality (env.observation_space size)

In [5]:
tf.reset_default_graph()

# Policy network: observation -> ReLU hidden layer (no bias) -> one sigmoid
# unit. The training loop samples action 1 with this probability and action 0
# otherwise, so `probability` is P(action == 1 | observation).
observations = tf.placeholder(tf.float32, [None, D], name='input_x')
W1 = tf.get_variable('W1', shape=[D, H],
                     initializer=tf.contrib.layers.xavier_initializer())
layer1 = tf.nn.relu(tf.matmul(observations, W1))
W2 = tf.get_variable('W2', shape=[H, 1],
                     initializer=tf.contrib.layers.xavier_initializer())
score = tf.matmul(layer1, W2)
probability = tf.nn.sigmoid(score)

# Must be collected after W1/W2 exist; the order here ([W1, W2]) is the order
# the gradient placeholders below are paired with.
tvars = tf.trainable_variables()
# "Fake label": the training loop feeds 1 when action 0 was taken, else 0.
input_y = tf.placeholder(tf.float32, [None, 1], name='input_y')
advantages = tf.placeholder(tf.float32, name='reward_signal')

# Loss function: try to make good actions more likely.
# The expression reduces to log(1 - p) when input_y == 1 (action 0 taken) and
# to log(p) when input_y == 0 (action 1 taken), i.e. the log-likelihood of the
# action that was actually sampled.
loglik = tf.log(input_y * (input_y - probability) + (1 - input_y) * (input_y + probability))
loss = -tf.reduce_mean(loglik * advantages)
newGrads = tf.gradients(loss, tvars)

# We apply the gradients once we have collected them from multiple episodes (stability)
adam = tf.train.AdamOptimizer(learning_rate=learning_rate)

# Placeholders to send the final (accumulated) gradients when we update
W1Grad = tf.placeholder(tf.float32, name='batch_grad1')
W2Grad = tf.placeholder(tf.float32, name='batch_grad2')
batchGrad = [W1Grad, W2Grad]
# Materialise the pairs so they are a concrete list on Python 3 as well.
updateGrads = adam.apply_gradients(list(zip(batchGrad, tvars)))

### Advantage function
The advantage function weights the rewards the agent receives. Actions that lead to the pole falling get a decreased or negative weight, while actions that keep the pole in the air get a large weight. Actions near the end of an episode are treated as negative because they likely led to failure; early actions are treated as positive because they were not responsible for the failure.

In [6]:
# Takes an array of per-step rewards and returns the discounted returns
def discount_rewards(r, discount_factor=None):
    """Return the discounted cumulative reward for every time step of ``r``.

    Element ``t`` of the result is ``r[t] + d*r[t+1] + d**2*r[t+2] + ...``
    where ``d`` is the discount factor.

    Args:
        r: rewards indexed by time along the first axis (a 1-D array, or a
            column such as the ``(T, 1)`` array built with ``np.vstack``).
        discount_factor: per-step discount; when omitted the module-level
            ``gamma`` hyperparameter is used.

    Returns:
        A floating-point array with the same shape as ``r``.
    """
    if discount_factor is None:
        discount_factor = gamma  # module-level hyperparameter

    r = np.asarray(r)
    if not np.issubdtype(r.dtype, np.inexact):
        # An integer buffer would silently truncate the fractional returns.
        r = r.astype(np.float64)

    discounted_r = np.zeros_like(r)
    running_add = 0
    # Walk backwards so each step can reuse the return of the step after it.
    for t in reversed(range(len(r))):
        running_add = running_add * discount_factor + r[t]
        discounted_r[t] = running_add
    return discounted_r

### Running the agent

In [8]:
# Policy-gradient training: play episodes with the current network, accumulate
# the per-episode gradients, and apply them every `batch_size` episodes.
xs, drs, ys = [], [], []   # per-episode observations, rewards, fake labels
running_reward = None
reward_sum = 0
episode_number = 1
total_episodes = 10000
# NOTE(review): later TensorFlow 1.x releases deprecate this in favour of
# tf.global_variables_initializer(); kept as-is for the TF version in use.
init = tf.initialize_all_variables()

with tf.Session() as sess:
    # rendering = False
    sess.run(init)
    observation = env.reset()     # Resets and obtains initial observation

    # Build a zeroed gradient buffer with the same shapes as the variables
    gradBuffer = sess.run(tvars)
    for ix, grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0

    # NOTE(review): episode_number starts at 1 and is incremented before the
    # modulo test below, so the first update fires after batch_size - 1
    # episodes while the printed average still divides by batch_size.
    while episode_number <= total_episodes:
        # Only render once we achieve a good performance (disabled):
        # if reward_sum/batch_size > 100 or rendering == True:
        #     env.render()
        #     rendering = True

        x = np.reshape(observation, [1, D])

        # Run the network and sample an action: 1 with probability tfprob
        tfprob = sess.run(probability, feed_dict={observations: x})
        action = 1 if np.random.uniform() < tfprob else 0

        xs.append(x)  # Observation
        y = 1 if action == 0 else 0  # a "fake" label
        ys.append(y)

        # Execute action
        observation, reward, done, _ = env.step(action)
        reward_sum += reward

        drs.append(reward)  # Store reward

        if done:
            episode_number += 1
            # Stack together all inputs, action labels and rewards of this episode
            epx = np.vstack(xs)
            epy = np.vstack(ys)
            epr = np.vstack(drs)
            xs, drs, ys = [], [], []   # Reset array memory

            # Compute the discounted reward backwards through time
            discounted_epr = discount_rewards(epr)
            # Size the rewards to be unit normal (helps control the gradients)
            discounted_epr -= np.mean(discounted_epr)
            epr_std = np.std(discounted_epr)
            if epr_std > 0:
                # Skip the division for a zero spread (e.g. a one-step
                # episode): the centred values are already all zero and
                # dividing would put NaNs into the gradient buffer.
                discounted_epr /= epr_std

            # Get the gradient for this episode and save it in gradBuffer
            tGrad = sess.run(newGrads, feed_dict={observations: epx, input_y: epy, advantages: discounted_epr})
            for ix, grad in enumerate(tGrad):
                gradBuffer[ix] += grad

            # If we have completed enough episodes, then update the policy with the gradients
            if episode_number % batch_size == 0:
                sess.run(updateGrads, feed_dict={W1Grad: gradBuffer[0], W2Grad: gradBuffer[1]})
                for ix, grad in enumerate(gradBuffer):
                    gradBuffer[ix] = grad * 0

                # Give a performance summary
                running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                print('Average reward for episode %f. Total average reward %f.' % (reward_sum / batch_size, running_reward / batch_size))

                if reward_sum / batch_size > 200:
                    # One pre-formatted string prints identically on Python 2 and 3
                    print('Task solved in %d episodes!' % episode_number)
                    break

                reward_sum = 0

            observation = env.reset()

print('%d Episodes completed' % episode_number)

Average reward for episode 17.260000. Total average reward 17.260000.
Average reward for episode 16.700000. Total average reward 17.254400.
Average reward for episode 19.340000. Total average reward 17.275256.
Average reward for episode 19.820000. Total average reward 17.300703.
Average reward for episode 19.160000. Total average reward 17.319296.
Average reward for episode 18.620000. Total average reward 17.332303.
Average reward for episode 20.500000. Total average reward 17.363980.
Average reward for episode 19.140000. Total average reward 17.381741.
Average reward for episode 19.700000. Total average reward 17.404923.
Average reward for episode 17.920000. Total average reward 17.410074.
Average reward for episode 20.740000. Total average reward 17.443373.
Average reward for episode 20.620000. Total average reward 17.475139.
Average reward for episode 18.620000. Total average reward 17.486588.
Average reward for episode 23.140000. Total average reward 17.543122.
Average reward for e