# Reinforcement Learning



In [1]:
import gym
import PIL
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Create the CartPole-v0 environment; the cells below reset, step and render this `env` object.
env = gym.make('CartPole-v0')



The toy example from [gym](https://gym.openai.com/docs/#installation):

In [2]:
# Play 20 episodes of at most 100 steps each, choosing every action at random.
for i_episode in range(20):
    observation = env.reset()
    for t in range(100):
        # Grab the current frame as an RGB array and build a resized PIL image from it.
        # NOTE(review): the resulting image is neither assigned nor passed to a
        # display call, so nothing is shown from inside this loop — confirm intent.
        frame = env.render(mode='rgb_array')
        PIL.Image.fromarray(frame).resize((320, 420))
        print(observation)
        # Sample an action from the environment's action space and apply it.
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        if not done:
            continue
        print(f"Episode finished after {t+1} timesteps")
        break

[ 0.02401657  0.00289357 -0.03394907  0.02128966]
[ 0.02407445 -0.19172549 -0.03352328  0.30307092]
[ 2.02399361e-02  3.85779439e-03 -2.74618630e-02  6.82921199e-06]
[ 0.02031709  0.19936261 -0.02746173 -0.30121259]
[ 0.02430434  0.00464261 -0.03348598 -0.01731545]
[ 0.0243972   0.20022839 -0.03383229 -0.32037269]
[ 0.02840176  0.39581541 -0.04023974 -0.62353013]
[ 0.03631807  0.59147541 -0.05271034 -0.92860984]
[ 0.04814758  0.78726781 -0.07128254 -1.23738021]
[ 0.06389294  0.98322948 -0.09603014 -1.55151556]
[ 0.08355753  1.17936307 -0.12706046 -1.87254996]
[ 0.10714479  1.37562362 -0.16451146 -2.20182293]
[ 0.13465726  1.18242322 -0.20854791 -1.96408358]
Episode finished after 13 timesteps
[ 0.04412734  0.00657703 -0.0437545  -0.03917243]
[ 0.04425888  0.20229822 -0.04453795 -0.34533296]
[ 0.04830484  0.00783719 -0.05144461 -0.0670205 ]
[ 0.04846159 -0.1865109  -0.05278502  0.20899791]
[ 0.04473137 -0.38083991 -0.04860506  0.48457384]
[ 0.03711457 -0.57524344 -0.03891358  0.76155049

[ 0.00796511  0.60065661 -0.12431153 -1.08638325]
[ 0.01997824  0.40737382 -0.1460392  -0.83514986]
[ 0.02812572  0.6041566  -0.16274219 -1.16996504]
[ 0.04020885  0.41148135 -0.18614149 -0.93240626]
[ 0.04843848  0.21929238 -0.20478962 -0.70351284]
Episode finished after 16 timesteps
[-0.03542926 -0.00362482  0.03439658  0.02814626]
[-0.03550176 -0.19922273  0.0349595   0.33148017]
[-0.03948621 -0.00461539  0.04158911  0.05002353]
[-0.03957852  0.1898863   0.04258958 -0.22925306]
[-0.0357808  -0.00581757  0.03800451  0.0765541 ]
[-0.03589715 -0.20146315  0.0395356   0.3809812 ]
[-0.03992641 -0.00692425  0.04715522  0.10102132]
[-0.04006489 -0.20268918  0.04917565  0.40820078]
[-0.04411868 -0.00829772  0.05733966  0.1314179 ]
[-0.04428463 -0.20419216  0.05996802  0.44162527]
[-0.04836848 -0.00996782  0.06880053  0.16843249]
[-0.04856783 -0.20600367  0.07216918  0.48200198]
[-0.05268791 -0.01197061  0.08180922  0.21290934]
[-0.05292732 -0.20816112  0.0860674   0.53023595]
[-0.05709054 -

[ 0.01586509  0.57098896 -0.03218796 -0.92390034]
[ 0.02728487  0.7665306  -0.05066597 -1.22652249]
[ 0.04261548  0.96226687 -0.07519642 -1.53463955]
[ 0.06186082  1.15820965 -0.10588921 -1.84981006]
[ 0.08502501  1.35432575 -0.14288541 -2.17340982]
[ 0.11211153  1.55052176 -0.18635361 -2.50657026]
Episode finished after 9 timesteps
[-0.04145083  0.03194637 -0.04579782  0.0286279 ]
[-0.04081191 -0.16248991 -0.04522526  0.30651666]
[-0.04406171 -0.35693921 -0.03909493  0.58460069]
[-0.05120049 -0.55149234 -0.02740291  0.86471641]
[-0.06223034 -0.35600831 -0.01010858  0.56354501]
[-0.0693505  -0.16074599  0.00116232  0.26769462]
[-0.07256542  0.03435935  0.00651621 -0.02462148]
[-0.07187824 -0.16085544  0.00602378  0.27011024]
[-0.07509534  0.03418004  0.01142598 -0.02066668]
[-0.07441174  0.22913628  0.01101265 -0.30972279]
[-0.06982902  0.42409961  0.0048182  -0.59891239]
[-0.06134703  0.61915381 -0.00716005 -0.89007376]
[-0.04896395  0.81437218 -0.02496153 -1.18499884]
[-0.03267651  1

The examples from [kvfrans.com](http://kvfrans.com/simple-algoritms-for-solving-cartpole/):

In [None]:
def run_episode(env, parameters, max_steps=200):
    """Run one episode with a linear threshold policy and return its total reward.

    At every step the action is 0 when the product of ``parameters`` and the
    current observation is negative, and 1 otherwise.

    Args:
        env: Environment exposing ``reset()`` (returns the first observation)
            and ``step(action)`` (returns ``(observation, reward, done, info)``).
        parameters: Weight vector with one entry per observation component.
        max_steps: Upper bound on the number of steps taken in the episode.
            Defaults to 200, the value that used to be hard-coded.

    Returns:
        The sum of the rewards collected until the environment reports ``done``
        or ``max_steps`` steps have been taken, whichever comes first.
    """
    observation = env.reset()
    totalreward = 0
    for _ in range(max_steps):
        action = 0 if np.matmul(parameters, observation) < 0 else 1
        # The fourth element returned by step() is not used by this policy.
        observation, reward, done, _info = env.step(action)
        totalreward += reward
        if done:
            break
    return totalreward

## Random search

We are going to run several tests with different random weight initializations and pick the one with the highest total reward.

In [None]:
# Best weight vector found so far and the reward it obtained.
bestparams = None
bestreward = 0
for i in range(10000):
    # Draw a fresh candidate: one weight per observation component
    # (4 for CartPole), each in [-1, 1).
    parameters = np.random.rand(4) * 2 - 1
    reward = run_episode(env, parameters)
    # Candidates that do not beat the current best are discarded.
    if not reward > bestreward:
        continue
    bestparams, bestreward = parameters, reward
    # A reward of exactly 200 ends the search early.
    if reward == 200:
        print(f"Stopped at iteration {i}")
        break
print(f"Best reward: {bestreward} with best parameters: {bestparams}")

## Hill climbing

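This technique uses a trick to avoid testing completely random weights every time. Here you initialize the weights randomly, then repeatedly add some noise to the current (best) weights and keep the new weights only when they score a higher reward. The best reward never decreases, but the search can get stuck without ever finding a solution.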

In [None]:
# Scale applied to the random perturbation added to the current weights.
noise_scaling = 0.1
# Random starting point: 4 weights, each in [-1, 1).
parameters = np.random.rand(4) * 2 - 1
bestreward = 0
for i in range(10000):
    # Candidate = current weights plus noise in [-noise_scaling, noise_scaling).
    newparams = parameters + (np.random.rand(4) * 2 - 1) * noise_scaling
    reward = run_episode(env, newparams)
    # Move to the candidate only when it strictly improves on the best reward.
    if reward > bestreward:
        bestreward = reward
        parameters = newparams
        # A reward of exactly 200 ends the search early.
        if reward == 200:
            print(f"Stopped at iteration {i}")
            break
print(f"Best reward: {bestreward} with best parameters: {parameters}")