Esta notebook contiene bloques de código útiles para realizar Q-learning en el entorno "Pendulum"

In [None]:
import numpy as np
from pendulum_env_extended import PendulumEnvExtended
import random 

In [None]:
# Create the environment instance shared by the rest of the notebook.
# render_mode='rgb_array' presumably makes env.render() return an image array
# (imgRender passes its result to plt.imshow) -- confirm against PendulumEnvExtended.
env = PendulumEnvExtended(render_mode='rgb_array')

Discretización de los estados

In [None]:
# Bin edges used by np.digitize to discretize each observation component:
# 10 evenly spaced edges over [-1, 1] for x and for y, and 100 over [-8, 8]
# for the velocity.
x_space = np.linspace(start=-1, stop=1, num=10)
y_space = np.linspace(start=-1, stop=1, num=10)
vel_space = np.linspace(start=-8, stop=8, num=100)
x_space

Obtener el estado a partir de la observación

In [None]:
def get_state(obs):
    """Discretize an observation into a tuple of bin indices.

    Args:
        obs: Iterable that unpacks into exactly three values ``(x, y, vel)``.

    Returns:
        A 3-tuple ``(x_bin, y_bin, vel_bin)`` with the ``np.digitize`` index
        of each component against ``x_space``, ``y_space`` and ``vel_space``
        respectively.
    """
    x, y, vel = obs
    return (
        np.digitize(x, x_space),
        np.digitize(y, y_space),
        np.digitize(vel, vel_space),
    )

In [None]:
# Sanity check: discretize a hand-written (x, y, vel) observation and display
# the resulting tuple of bin indices.
state = get_state(np.array([-0.4, 0.2, 0.3]))
state

Discretización de las acciones

In [None]:
# Discrete action set: 10 evenly spaced values in [-2, 2], stored as a list so
# that an action's position (its column in the Q-table) can be looked up with
# actions.index(...).
actions = list(np.linspace(start=-2, stop=2, num=10))
actions

In [None]:
def get_sample_action():
    """Return one element of ``actions`` picked uniformly at random."""
    sampled = random.choice(actions)
    return sampled

Inicialización de la tabla Q

In [None]:
# Q-table initialised to zero: one axis per discretized observation component
# plus a final axis over the actions. np.digitize can return any index from 0
# to len(edges) inclusive, hence the "+ 1" on each state axis.
Q = np.zeros(
    (
        len(x_space) + 1,
        len(y_space) + 1,
        len(vel_space) + 1,
        len(actions),
    )
)
Q

Obtención de la acción a partir de la tabla Q

In [None]:
def optimal_policy(state, Q):
    """Return the greedy action for ``state`` according to ``Q``.

    Args:
        state: Index into the state axes of ``Q`` (the tuple produced by
            ``get_state``).
        Q: Q-table whose last axis is aligned with ``actions``.

    Returns:
        The element of ``actions`` whose Q-value is highest for ``state``
        (the first one on ties, as given by ``np.argmax``).
    """
    best_idx = np.argmax(Q[state])
    return actions[best_idx]

Epsilon-Greedy Policy

In [None]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    """Choose an action with an epsilon-greedy rule.

    A single Bernoulli(``epsilon``) draw decides the branch: on success a
    random action is sampled, otherwise the greedy action for ``state`` is
    used. The chosen branch is printed ('explore' / 'exploit').

    Args:
        state: Discretized state used to index ``Q``.
        Q: Q-table consulted when exploiting.
        epsilon: Probability of exploring.

    Returns:
        The selected element of ``actions``.
    """
    if np.random.binomial(1, epsilon):
        # Explore: ignore Q and draw a random action.
        chosen = get_sample_action()
        print('explore')
        return chosen

    # Exploit: follow the current greedy policy.
    chosen = optimal_policy(state, Q)
    print('exploit')
    return chosen

In [None]:
import wandb

# Authenticate with Weights & Biases; the run itself is started later with wandb.init().
wandb.login()

Función de entrenamiento (Q-learning a lo largo de varios episodios)

In [None]:
def train(env, Q, alpha=0.1, gamma=0.9, epsilon=0.1, episodes=100):
    """Train the Q-table ``Q`` in place with epsilon-greedy tabular Q-learning.

    Args:
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step(action)`` returns a 5-tuple. The 3rd and 4th items are
            assumed to be the Gymnasium-style ``terminated`` / ``truncated``
            flags -- confirm against ``PendulumEnvExtended``.
        Q: Array indexed as ``Q[state][action_idx]``; it is mutated in place.
        alpha: Learning rate of the Q update.
        gamma: Discount factor applied to the best next-state value.
        epsilon: Exploration probability passed to ``epsilon_greedy_policy``.
        episodes: Number of episodes to run.

    Side effects:
        Logs the running episode return to wandb under ``'total_reward'`` on
        every step and prints per-step and per-episode traces.
    """
    for _episode in range(episodes):
        obs, _ = env.reset()
        print(obs)
        done = False
        total_reward = 0
        state = get_state(obs)
        while not done:
            # Action chosen by the agent
            action = epsilon_greedy_policy(state, Q, epsilon)

            # Column of that action in Q
            action_idx = actions.index(action)

            # Action in the array form passed to the environment
            real_action = np.array([action])

            obs, reward, terminated, truncated, _ = env.step(real_action)
            # The episode ends on either flag; looking only at `terminated`
            # would never stop an episode that is cut off by a time limit.
            done = terminated or truncated
            next_state = get_state(obs)

            # A terminal state has no future return, so bootstrap from the
            # next state only when the episode did not genuinely terminate
            # (a truncated episode still bootstraps).
            future = 0.0 if terminated else np.max(Q[next_state])
            Q[state][action_idx] += alpha * (reward + gamma * future - Q[state][action_idx])

            total_reward += reward
            wandb.log({'total_reward': total_reward})
            print('->', state, action, reward, obs, done)

            # Advance to the new state; without this every step would keep
            # acting on, and updating, the episode's initial state.
            state = next_state
        # Reported once per episode (and never reached when episodes == 0).
        print('total_reward', total_reward)

In [None]:
import matplotlib.pyplot as plt

def imgRender():
    """Render the current frame of the global ``env`` and show it without axes."""
    plt.imshow(env.render())
    plt.axis('off')
    plt.show()

In [None]:
# Evaluate the agent by following the greedy policy for one episode.
# NOTE: Q is initialised to zeros, so run this cell after training; on an
# untrained table np.argmax always returns 0 and the first action is chosen
# at every step.
obs, _ = env.reset()
done = False
while not done:
    state = obs  # raw observation before the step, kept only for the trace below
    action = optimal_policy(get_state(obs), Q)
    obs, reward, terminated, truncated, _ = env.step(np.array([action]))
    # Stop on either flag (assumes the Gymnasium 5-tuple step API -- confirm
    # against PendulumEnvExtended); `terminated` alone would not end an
    # episode that is cut off by a time limit.
    done = terminated or truncated
    # imgRender()  # uncomment to display each frame
    print('->', state, action, reward, obs, done)

In [None]:
# Start the run
wandb.init(
    # Project name
    project="pendulum",

    # Hyper-parameters recorded with the run
    config={
    "alpha": 0.3,
    "gamma": 0.9,
    "epsilon": 0.5,
    "epochs": 1000,
    }
)

try:
    # Train the agent; the "epochs" config entry is the number of episodes.
    train(
        env,
        Q,
        alpha=wandb.config.alpha,
        gamma=wandb.config.gamma,
        epsilon=wandb.config.epsilon,
        episodes=wandb.config.epochs,
    )
finally:
    # Close the run even if training raises or is interrupted.
    wandb.finish()