Esta notebook contiene bloques de código útiles para realizar Q-learning en el entorno "Pendulum"

In [None]:
import numpy as np
from pendulum_env_extended import PendulumEnvExtended
import random 

In [None]:
# Create the environment, passing render_mode='rgb_array' to its constructor.
env = PendulumEnvExtended(render_mode='rgb_array')

Discretización de los estados

In [None]:
# Evenly spaced grid edges used to discretize each observation component.
# x and y share the range [-1, 1] with 10 edges; vel uses [-8, 8] with 100 edges.
x_space = np.linspace(start=-1, stop=1, num=10)
y_space = np.linspace(start=-1, stop=1, num=10)
vel_space = np.linspace(start=-8, stop=8, num=100)
x_space

Obtener el estado a partir de la observación

In [None]:
def get_state(obs):
    """Convert a continuous observation into a tuple of discrete bin indices.

    Args:
        obs: Iterable of exactly three values, unpacked as ``(x, y, vel)``.

    Returns:
        Tuple ``(x_bin, y_bin, vel_bin)`` where each entry is the result of
        ``np.digitize`` against the module-level grids ``x_space``,
        ``y_space`` and ``vel_space`` respectively.
    """
    x, y, vel = obs
    # Pair every component with its own grid and bin them in the same order.
    grids = (x_space, y_space, vel_space)
    return tuple(
        np.digitize(component, grid)
        for component, grid in zip((x, y, vel), grids)
    )

In [None]:
# Try get_state on a hand-written (x, y, vel) observation and display the resulting bins.
state = get_state(np.array([-0.4, 0.2, 0.3]))
state

Discretización de las acciones

In [None]:
# Discrete action set: 10 evenly spaced values covering [-2, 2], kept as a plain list.
actions = [value for value in np.linspace(start=-2, stop=2, num=10)]
actions

In [None]:
def get_sample_action():
    """Pick one entry of the module-level ``actions`` list at random.

    Returns:
        A one-element ``np.float32`` array holding the chosen value.
    """
    picked = random.choice(actions)
    return np.array([picked], dtype=np.float32)

Inicialización de la tabla Q

In [None]:
# Zero-initialized Q-table. np.digitize can return any index from 0 to len(grid),
# so every state axis needs len(grid) + 1 slots; the last axis has one slot per action.
Q = np.zeros(
    tuple(len(grid) + 1 for grid in (x_space, y_space, vel_space)) + (len(actions),)
)
Q

Obtención de la acción a partir de la tabla Q

In [None]:
def optimal_policy(state, Q):
    """Return the index of the highest-valued action for ``state``.

    Args:
        state: Index (e.g. a tuple of bin indices) selecting one entry of ``Q``.
        Q: Array of action values whose trailing axis enumerates the actions.

    Returns:
        The position of the maximum of ``Q[state]`` (first one on ties).
    """
    action_values = np.asarray(Q[state])
    return action_values.argmax()

Epsilon-Greedy Policy

In [None]:
def epsilon_greedy_policy(state, Q, epsilon=0.1, action_values=None):
    """Choose an action value with an epsilon-greedy rule.

    With probability ``epsilon`` a random entry of the action list is
    returned (exploration); otherwise the entry whose Q-value is largest for
    ``state`` is returned (exploitation). Both branches return the action
    *value*, never its index, so the result always has the same meaning and
    dtype.

    Args:
        state: Index (tuple of bin indices) selecting one row of ``Q``.
        Q: Q-table whose last axis has one entry per discrete action.
        epsilon: Probability of exploring, in ``[0, 1]``.
        action_values: Optional sequence of action values aligned with the
            last axis of ``Q``. Defaults to the module-level ``actions``.

    Returns:
        A one-element ``np.float32`` array holding the selected action value.
    """
    candidates = actions if action_values is None else action_values

    explore = np.random.binomial(1, epsilon)
    if explore:
        chosen = random.choice(candidates)
        print('explore')
    else:
        # Q's last axis is indexed by action position, so translate the
        # argmax index into the action value it stands for; returning the raw
        # index would be a different quantity from the exploration branch.
        chosen = candidates[int(np.argmax(Q[state]))]
        print('exploit')

    return np.array([chosen], dtype=np.float32)

Ejemplo de episodio 

In [None]:
# Run a single episode with an epsilon-greedy policy (epsilon = 0.5),
# printing every transition and the accumulated reward at the end.
obs, _ = env.reset()
print(obs)
done = False
total_reward = 0
while not done:
    state = get_state(obs)
    action = epsilon_greedy_policy(state, Q, 0.5)
    # step() returns five values; this assumes the Gymnasium layout
    # (obs, reward, terminated, truncated, info) -- TODO confirm against
    # PendulumEnvExtended. Stop on either end-of-episode flag so an episode
    # that ends only by truncation cannot loop forever.
    obs, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    total_reward += reward
    print('->', state, action, reward, obs, done)
    env.render()
print('total_reward', total_reward)