# Cart pole

In [None]:
import gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

# Single CartPole environment instance reused by the cells below.
env = gym.make('CartPole-v0')

The toy example from [gym](https://gym.openai.com/docs/#installation):

In [None]:
# Play 20 episodes of at most 100 steps each, choosing every action at random.
for i_episode in range(20):
    observation = env.reset()
    for t in range(100):
        # Draw the current frame, then log the observation the agent sees.
        env.render()
        print(observation)
        # No policy yet: the action comes straight from the action space sampler.
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        if not done:
            continue
        print(f"Episode finished after {t+1} timesteps")
        break

The examples from [kvfrans.com](http://kvfrans.com/simple-algoritms-for-solving-cartpole/):

In [None]:
def run_episode(env, parameters, max_steps=200):
    """Play one episode with a linear threshold policy and return its total reward.

    At every step the action is 1 when the product of ``parameters`` and the
    current observation is non-negative, and 0 otherwise.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``; ``step``
            must return an ``(observation, reward, done, info)`` 4-tuple.
        parameters: Weight vector with one entry per observation component.
        max_steps: Upper bound on the number of steps played. Defaults to 200,
            the value that used to be hard-coded in the loop.

    Returns:
        The sum of the rewards collected until ``done`` is reported or
        ``max_steps`` steps have been played.
    """
    observation = env.reset()
    totalreward = 0
    for _ in range(max_steps):
        action = 0 if np.matmul(parameters, observation) < 0 else 1
        observation, reward, done, _info = env.step(action)
        totalreward += reward
        if done:
            break
    return totalreward

## Random search

We are going to run several trials, each with a different random weight initialization, and keep the weights that achieve the highest total reward.

In [None]:
# Random search: remember the best-scoring weight vector over up to 10000 draws.
bestparams, bestreward = None, 0
for i in range(10000):
    # One random weight per observation component (4 for CartPole), in [-1, 1)
    parameters = np.random.rand(4) * 2 - 1
    reward = run_episode(env, parameters)
    if not reward > bestreward:
        continue
    bestreward, bestparams = reward, parameters
    # A score of 200 ends the search early
    if reward == 200:
        print(f"Stopped at iteration {i}")
        break
print(f"Best reward: {bestreward} with best parameters: {bestparams}")

## Hill climbing

This technique uses a trick to avoid testing completely random weights every time. Here you initialize the weights randomly, then repeatedly add some noise to the current best weights and keep the result only if it scores higher. The reward never decreases, but the search can get stuck without finding anything better.

In [None]:
# Hill climbing: perturb the current best weights and keep a candidate only
# when it scores strictly higher than anything seen so far.
noise_scaling = 0.1
parameters = np.random.rand(4) * 2 - 1
bestreward = 0
for i in range(10000):
    # Per-component noise in [-noise_scaling, noise_scaling) around the best weights
    newparams = parameters + (np.random.rand(4) * 2 - 1) * noise_scaling
    reward = run_episode(env, newparams)
    if reward > bestreward:
        bestreward = reward
        parameters = newparams
        # A score of 200 ends the search early
        if reward == 200:
            print(f"Stopped at iteration {i}")
            break
print(f"Best reward: {bestreward} with best parameters: {parameters}")

## Policy gradient

In [None]:
def policy_gradient():
    """Build the TensorFlow graph for a linear softmax policy over two actions.

    The graph maps a 4-component state to two action probabilities and defines
    an Adam step that maximizes the log probability of the actions fed through
    the ``actions`` placeholder.

    Returns:
        A ``(probabilities, state)`` tuple: the ``[None, 2]`` tensor of action
        probabilities and the ``[None, 4]`` placeholder used to feed states.
    """
    params = tf.get_variable("policy_parameters", [4, 2])
    state = tf.placeholder("float", [None, 4])
    # One row per fed state; presumably a one-hot vector marking the action taken
    actions = tf.placeholder("float", [None, 2])
    linear = tf.matmul(state, params)
    probabilities = tf.nn.softmax(linear)
    # Element-wise product with the action mask keeps, per row, only the
    # probability of the selected action. A matrix product of two [None, 2]
    # tensors would be a shape error (or mix rows when the batch size is 2).
    good_probabilities = tf.reduce_sum(tf.multiply(probabilities, actions), axis=1)
    # Maximize the log probability
    log_probabilities = tf.log(good_probabilities)
    loss = -tf.reduce_sum(log_probabilities)
    # NOTE(review): the training op is added to the graph but not returned;
    # extend the return tuple once a training loop needs it.
    optimizer = tf.train.AdamOptimizer(0.01).minimize(loss)
    return probabilities, state

In [None]:
def value_gradient():
    """Build the TensorFlow graph of a small network that estimates state values.

    The network has one hidden ReLU layer of 10 units and a single linear
    output. It is trained with an L2 loss between its output and the target
    values fed through ``newvals``.

    Returns:
        A ``(calculated, state, newvals, optimizer, loss)`` tuple:
        ``calculated`` is the ``[None, 1]`` value estimate, ``state`` the
        ``[None, 4]`` input placeholder, ``newvals`` the ``[None, 1]`` target
        placeholder, ``optimizer`` the Adam training op and ``loss`` the
        scalar L2 loss.
    """
    # sess.run(calculated) to calculate value of state
    state = tf.placeholder("float", [None, 4])
    w1 = tf.get_variable("w1", [4, 10])
    b1 = tf.get_variable("b1", [10])
    h1 = tf.nn.relu(tf.matmul(state, w1) + b1)
    w2 = tf.get_variable("w2", [10, 1])
    b2 = tf.get_variable("b2", [1])
    calculated = tf.matmul(h1, w2) + b2

    # sess.run(optimizer) to update the value of a state
    newvals = tf.placeholder("float", [None, 1])
    diffs = calculated - newvals
    loss = tf.nn.l2_loss(diffs)
    optimizer = tf.train.AdamOptimizer(0.1).minimize(loss)
    # Hand the graph handles back; without them nothing built here can be run.
    return calculated, state, newvals, optimizer, loss

In [None]:
# Roll out one episode (at most 200 steps), sampling each action from the
# probabilities produced by the policy graph.
# tensorflow operations to compute probabilities for each action, given a state
pl_probabilities, pl_state = policy_gradient()
# The policy variables are created by the call above, so they must be
# initialized in a session before the first sess.run.
sess = tf.Session()
sess.run(tf.global_variables_initializer())
observation = env.reset()
totalreward = 0
states = []
actions = []
transitions = []
for _ in range(200):
    # calculate policy
    obs_vector = np.expand_dims(observation, axis=0)
    probs = sess.run(pl_probabilities, feed_dict={pl_state: obs_vector})
    # Pick action 0 with probability probs[0][0], otherwise action 1
    action = 0 if np.random.uniform(0, 1) < probs[0][0] else 1
    # record the transition
    states.append(observation)
    actionblank = np.zeros(2)
    actionblank[action] = 1
    actions.append(actionblank)
    # take the action in the environment
    old_observation = observation
    observation, reward, done, info = env.step(action)
    transitions.append((old_observation, action, reward))
    totalreward += reward

    if done:
        break