Esta notebook contiene bloques de código útiles para realizar Q-learning en el entorno "Descent Env"

In [None]:
import numpy as np
from descent_env import DescentEnv
import random 
from q_learning_agent import QLearningAgent

In [None]:
# Build the environment with render_mode='rgb_array', the mode to use for training/testing.
from env_recorder_wrapper import VideoRecorderWrapper 
import wandb
env = DescentEnv(render_mode='rgb_array')

Observation Space

In [None]:
# Show the environment's observation space (bare expression, rendered as the cell output).
env.observation_space

Action Space

In [None]:
# Show the environment's action space (bare expression, rendered as the cell output).
env.action_space

Discretización de los estados

**Nota:** es importante que chequeen el espacio de observación y el espacio de acción del entorno. Los números usados son ejemplos y pueden no ser correctos.

In [None]:
# Discretization grids: one evenly spaced set of points per observation component.
# The *_start / *_end / *_num values stay as individual module-level names
# because later cells report them as run configuration.
# NOTE(review): these ranges are example values; confirm them against
# env.observation_space before relying on the bins.

# Altitude grid
altitude_space_start, altitude_space_end, altitude_space_num = 0, 2.5, 30
altitude_space = np.linspace(
    altitude_space_start, altitude_space_end, num=altitude_space_num
)

# Vertical-velocity grid
vertical_velocity_space_start, vertical_velocity_space_end = -3, 3
vertical_velocity_space_num = 20
vertical_velocity_space = np.linspace(
    vertical_velocity_space_start,
    vertical_velocity_space_end,
    num=vertical_velocity_space_num,
)

# Target-altitude grid
target_altitude_space_start, target_altitude_space_end = 0, 1
target_altitude_space_num = 10
target_altitude_space = np.linspace(
    target_altitude_space_start,
    target_altitude_space_end,
    num=target_altitude_space_num,
)

# Runway-distance grid
runway_distance_space_start, runway_distance_space_end = -1, 1
runway_distance_space_num = 20
runway_distance_space = np.linspace(
    runway_distance_space_start,
    runway_distance_space_end,
    num=runway_distance_space_num,
)


#print("altitude_space:", altitude_space)
#print("vertical_velocity_space:", vertical_velocity_space)
#print("target_altitude_space:", target_altitude_space)
#print("runway_distance_space:", runway_distance_space)


#import gymnasium as gym
#from descent_env import DescentEnv
#import numpy as np
#import random
#
#env = DescentEnv(render_mode="human") 
#obs, info = env.reset()
#
#num_episodes = 5 
#max_steps_per_episode = 500 
#
#all_altitudes = []
#all_vertical_velocities = []
#all_target_altitudes = []
#all_runway_distances = []
#
#for episode in range(num_episodes):
#    obs, info = env.reset()
#    done = False
#    steps = 0
#    print(f"\n--- Episodio {episode + 1} ---")
#    while not done and steps < max_steps_per_episode:
#        # Aquí generas una acción aleatoria válida para tu entorno
#        actions = [-1.0, -0.5, 0.0, 0.5, 1.0] # Acciones discretas sugeridas en el notebook
#        action = np.array([random.choice(actions)])
#
#        obs, reward, done, truncated, info = env.step(action)
#
#        # Recolectar datos
#        all_altitudes.append(obs["altitude"][0])
#        all_vertical_velocities.append(obs["vz"][0])
#        all_target_altitudes.append(obs["target_altitude"][0])
#        all_runway_distances.append(obs["runway_distance"][0])
#        steps += 1
#        
#    env.render()
#    env.close()
#
## Analizar los datos recolectados
#print("\n--- Análisis de rangos observados ---")
#print(f"Altitud: Min={np.min(all_altitudes):.2f}, Max={np.max(all_altitudes):.2f}")
#print(f"Velocidad Vertical: Min={np.min(all_vertical_velocities):.2f}, Max={np.max(all_vertical_velocities):.2f}")
#print(f"Altitud Objetivo: Min={np.min(all_target_altitudes):.2f}, Max={np.max(all_target_altitudes):.2f}")
#print(f"Distancia Pista: Min={np.min(all_runway_distances):.2f}, Max={np.max(all_runway_distances):.2f}")
#"""

In [None]:
# Discrete action set: 10 evenly spaced values covering [-1, 1].
# NOTE(review): with an even number of points the grid does not contain 0.0;
# use an odd count if a neutral action is wanted.
actions = [value for value in np.linspace(-1, 1, num=10)]

# The four state grids are handed to the agent positionally, in this order.
state_grids = (
    altitude_space,
    vertical_velocity_space,
    target_altitude_space,
    runway_distance_space,
)
agent = QLearningAgent(*state_grids, actions=actions, env=env)

In [None]:
# Training hyperparameters, kept as module-level names because the wandb cell
# reports them as run configuration.
# NOTE(review): presumably epsilon is the exploration rate, gamma the discount
# factor and alpha the learning rate; confirm against QLearningAgent.train_agent.
episodes, epsilon = 3000, 0.99
gamma, alpha = 0.8, 0.3

# Run training; the returned value is kept in `rewards` for the analysis cells.
rewards = agent.train_agent(
    env=env,
    episodes=episodes,
    epsilon=epsilon,
    gamma=gamma,
    alpha=alpha,
)

Recompensa promedio durante el entrenamiento

In [None]:
# Mean of the values returned by training, printed as a quick summary.
average_reward = np.mean(rewards)
print(f"Promedio de recompensas: {average_reward}")

In [None]:
import matplotlib.pyplot as plt

# Plot the training rewards against their index using an explicit figure/axes pair.
fig, ax = plt.subplots()
ax.plot(rewards)
ax.set_xlabel('Episodio')
ax.set_ylabel('Recompensa')
ax.set_title('Recompensas por episodio')
plt.show()

In [None]:
# Evaluate the agent via test_agent for 500 episodes and print the mean of the returned rewards.
test_rewards = agent.test_agent(env, episodes=500)
test_average_reward = np.mean(test_rewards)
print(f"Promedio de recompensas en test: {test_average_reward}")


In [None]:
# Weights & Biases experiment tracking.
# SECURITY: the API key must never be hard-coded in the notebook. Called
# without arguments, wandb.login() takes the key from the WANDB_API_KEY
# environment variable or the stored credentials, and prompts if none is found.
# The key that was previously committed in this cell should be revoked and
# rotated.
wandb.login()
wandb.init(project="descent_env_training", name="training_run")

# Record the hyperparameters and the discretization grid as run configuration.
wandb.config.update({
    "epsilon": epsilon,
    "gamma": gamma,
    "alpha": alpha,
    "episodes": episodes,
    "altitude_space_start": altitude_space_start,
    "altitude_space_end": altitude_space_end,
    "altitude_space_num": altitude_space_num,
    "vertical_velocity_space_start": vertical_velocity_space_start,
    "vertical_velocity_space_end": vertical_velocity_space_end,
    "vertical_velocity_space_num": vertical_velocity_space_num,
    "target_altitude_space_start": target_altitude_space_start,
    "target_altitude_space_end": target_altitude_space_end,
    "target_altitude_space_num": target_altitude_space_num,
    "runway_distance_space_start": runway_distance_space_start,
    "runway_distance_space_end": runway_distance_space_end,
    "runway_distance_space_num": runway_distance_space_num
})

# Log the evaluation metric computed by the test cell, then close the run.
wandb.log({
    "test_average_reward": test_average_reward
})
wandb.finish()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Data to plot: the agent's Q-table exactly as stored on `agent.q`.
# NOTE(review): sns.heatmap needs 2-D data; if agent.q has more dimensions,
# slice or reshape it here first. Confirm the table layout.
q_table_slice = agent.q

# Draw the heat map on an explicitly created 10x8 axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(q_table_slice, cmap="viridis", annot=False, ax=ax)
ax.set_xlabel("Acciones")
ax.set_ylabel("Estados")
ax.set_title("Mapa de calor de la tabla Q")
plt.show()

In [None]:
# Visualization setup: wrap the existing env in a VideoRecorderWrapper configured with
# filename='landing_execution.mp4' and fps=3, and create a second env with render_mode='human'.
# NOTE(review): nothing in this cell trains or runs the agent; the objects are only constructed here.
wrapper = VideoRecorderWrapper(env, filename='landing_execution.mp4', fps=3)
human_env = DescentEnv(render_mode='human')


