# Cart pole

In [None]:
import gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

# Create the CartPole-v0 environment that the cells below reuse.
env = gym.make('CartPole-v0')

The toy example from [gym](https://gym.openai.com/docs/#installation):

In [None]:
# Play 20 episodes of at most 100 steps each, choosing every action with
# env.action_space.sample() and printing each observation along the way.
for i_episode in range(20):
    observation = env.reset()
    for t in range(100):
        env.render()  # display the environment
        print(observation)
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        if not done:
            continue
        # The environment reported the end of the episode: log it and move on.
        print(f"Episode finished after {t+1} timesteps")
        break

The examples from [kvfrans.com](http://kvfrans.com/simple-algoritms-for-solving-cartpole/):

In [None]:
def run_episode(env, parameters, max_steps=200):
    """Run one episode with a linear threshold policy and return its total reward.

    At every step the action is 0 when the product of ``parameters`` and the
    current observation is negative, and 1 otherwise.

    Args:
        env: Environment exposing ``reset()`` (returning the first
            observation) and ``step(action)`` (returning
            ``(observation, reward, done, info)``).
        parameters: Weight vector, one weight per observation component.
        max_steps: Upper bound on the number of steps taken. Defaults to 200,
            the value that was previously hard-coded.

    Returns:
        The sum of the rewards collected until the environment reported
        ``done`` or ``max_steps`` steps were taken, whichever came first.
    """
    observation = env.reset()
    totalreward = 0
    for _ in range(max_steps):
        action = 0 if np.matmul(parameters, observation) < 0 else 1
        # The info value returned by the environment is not needed here.
        observation, reward, done, _info = env.step(action)
        totalreward += reward
        if done:
            break
    return totalreward

## Random search

We are going to run many episodes, each with a different random weight initialization, and keep the weights that achieve the highest total reward.

In [None]:
# Random search: draw a fresh weight vector on every iteration and remember
# the one whose single episode scored the highest reward so far.
bestreward = 0
bestparams = None
for i in range(10000):
    # One random weight per observation component (4 for CartPole),
    # each drawn uniformly from [-1, 1).
    parameters = 2 * np.random.rand(4) - 1
    reward = run_episode(env, parameters)
    if reward > bestreward:
        bestparams, bestreward = parameters, reward
        # A reward of 200 ends the search early.
        if reward == 200:
            print(f"Stopped at iteration {i}")
            break
print(f"Best reward: {bestreward} with best parameters: {bestparams}")

## Hill climbing

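This technique uses a trick to avoid testing entirely random weights every time. Here you initialize the weights randomly, then repeatedly add a small amount of noise to the best weights found so far, keeping the new weights only when they score a higher reward. The best reward never decreases, but the search can get stuck without finding a solution.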

In [None]:
# Hill climbing: start from random weights and keep a perturbed candidate
# only when its episode scores a strictly higher reward than the best so far.
noise_scaling = 0.1
parameters = np.random.rand(4) * 2 - 1
bestreward = 0
for i in range(10000):
    # Candidate = current best weights plus uniform noise scaled by noise_scaling.
    newparams = parameters + (np.random.rand(4) * 2 - 1) * noise_scaling
    reward = run_episode(env, newparams)
    if reward > bestreward:
        bestreward = reward
        parameters = newparams
        # A reward of 200 ends the search early (presumably the maximum
        # score for an episode -- TODO confirm).
        if reward == 200:
            print(f"Stopped at iteration {i}")
            break
print(f"Best reward: {bestreward} with best parameters: {parameters}")