# Reinforcement Learning (Cross Entropy Method)

In [None]:
import numpy as np
import gym

# Create the CartPole-v0 environment that the later cells pass around as `env`.
env = gym.make('CartPole-v0')

Empezaremos ejecutando algunos ejemplos de interacción con el entorno eligiendo acciones al azar.

In [None]:
def run_random(env, episodes=20, max_steps=100, render=True):
    """Play a few episodes choosing every action with ``env.action_space.sample()``.

    Parameters
    ----------
    env : object
        Environment exposing ``reset()``, ``render()``, ``action_space.sample()``
        and ``step(action)`` returning ``(observation, reward, done, info)``.
    episodes : int, optional
        Number of episodes to play (default 20).
    max_steps : int, optional
        Upper bound on the steps taken in each episode (default 100).
    render : bool, optional
        When true (default), ``env.render()`` is called before every step.
        Pass ``False`` to run without a display.

    Returns
    -------
    None
        A line is printed for each episode that ends (``done``) before
        ``max_steps`` is reached; episodes that hit the cap print nothing.
    """
    for i_episode in range(episodes):
        observation = env.reset()
        for t in range(max_steps):
            if render:
                env.render()
            # print(observation)
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(t+1))
                break
# Run the random-action demo on the shared environment.
run_random(env)

Puedes descomentar la impresión de las observaciones para ver qué sale. Corresponden a la posición del coche, la velocidad del coche, el ángulo del péndulo y la velocidad de rotación del péndulo.

In [3]:
# Starting sampling distribution for the policy weights: one mean and one
# standard deviation per weight.
mu = [0.0] * 4
sigma = [1.0] * 4

# Candidate weight vectors evaluated per iteration (one episode each) and the
# number of refit iterations. The spelling `episodies` is kept because the
# training loop refers to this name.
episodies, iterations = 100, 10

Definimos la función que ejecutará cada episodio (desde el momento inicial hasta que llegamos a los `max_reward` pasos o hasta que el péndulo se nos caiga).

In [4]:
def run_episode(env, parameters, render=False, max_reward=200):
    """Run one episode with a linear threshold policy and return its total reward.

    Parameters
    ----------
    env : object
        Environment exposing ``reset()``, ``render()`` and ``step(action)``
        returning ``(observation, reward, done, info)``.
    parameters : sequence of float
        Policy weights; must be multipliable with an observation via
        ``np.matmul`` to give a scalar.
    render : bool, optional
        When true, ``env.render()`` is called before every step.
    max_reward : int, optional
        Maximum number of steps to take. Despite its name this caps the step
        count, not the reward itself.

    Returns
    -------
    number
        Sum of the rewards returned by ``env.step`` until ``done`` was
        reported or ``max_reward`` steps were taken (``0`` if no step ran).
    """
    observation = env.reset()
    totalreward = 0
    # `range` rather than `xrange` so the function runs on Python 2 and 3.
    for _ in range(max_reward):
        if render:
            env.render()
        # The agent: action 0 when the weighted sum of the observation is
        # negative, action 1 otherwise.
        action = 0 if np.matmul(parameters, observation) < 0 else 1
        observation, reward, done, info = env.step(action)
        totalreward += reward
        if done:
            break
    return totalreward

Comenzamos las iteraciones del método de entropía cruzada.

In [None]:
# Cross-entropy method: sample candidate policy weights from a per-weight
# normal distribution, score each candidate with one episode, then refit the
# distribution to the best-scoring candidates.
n_elite = 10     # number of top-scoring candidates used to refit mu / sigma
max_steps = 500  # step cap per training episode; hitting it counts as a "goal"
# NOTE(review): some gym versions end CartPole-v0 episodes after 200 steps, in
# which case the goal counter below would stay at zero; confirm against the
# installed gym version.

# Written with range / print(...) / no zip subscripting so the cell runs on
# both Python 2 and Python 3.
for iteration in range(iterations):

    # One normal draw of `episodies` values per weight, transposed so that
    # each row of `parameters` is one candidate weight vector.
    parameters = np.transpose(
        [np.random.normal(mu[i], sigma[i], episodies) for i in range(len(mu))]
    )

    rewards = []
    number_of_goals = 0

    for i in range(episodies):
        r = run_episode(env, parameters[i], max_reward=max_steps)
        rewards.append(r)
        if r == max_steps:
            number_of_goals += 1

    # Pair every candidate with its reward and sort ascending by reward.
    # Sorting by key avoids comparing the parameter arrays themselves.
    ranked = sorted(zip(parameters, rewards), key=lambda pair: pair[1])
    # The last `n_elite` entries have the highest rewards; keep only their
    # parameter vectors.
    elite = [params for params, _ in ranked[-n_elite:]]

    # Refit the sampling distribution to the elite set, per weight.
    mu = np.mean(elite, 0)
    sigma = np.std(elite, 0)

    print("------------")
    print("Iteration: {}".format(iteration))
    print("Mean: {}".format(mu))
    print("Standard deviation: {}".format(sigma))
    print("# goals: {}".format(number_of_goals))

Ejecutamos un episodio de hasta 1500 pasos para ver qué tal va.

In [6]:
# Replay the learned mean weights `mu` with rendering on, capped at 1500 steps.
run_episode(env, mu, render=True, max_reward=1500)

1500.0