Esta notebook contiene bloques de código útiles para realizar Q-learning en el entorno "Descent Env"

In [1]:
import numpy as np
from descent_env import DescentEnv
import random 
from q_learning_agent import QLearningAgent

pygame 2.5.2 (SDL 2.28.3, Python 3.10.8)
Hello from the pygame community. https://www.pygame.org/contribute.html
Using Python-based geo functions


In [2]:
# Cambiar render_mode a rgb_array para entrenar/testear
env = DescentEnv(render_mode='rgb_array')

Reading config from /Users/agustinvarela/bluesky/settings.cfg
Reading magnetic variation data
Loading global navigation database...
Reading cache: /Users/agustinvarela/bluesky/cache/navdata.p
Successfully loaded OpenAP performance model
Failed to load BADA performance model
Successfully loaded legacy performance model
Successfully loaded plugin AREA
Successfully loaded plugin DATAFEED


Observation Space

In [3]:
env.observation_space

Dict('altitude': Box(-inf, inf, (1,), float64), 'runway_distance': Box(-inf, inf, (1,), float64), 'target_altitude': Box(-inf, inf, (1,), float64), 'vz': Box(-inf, inf, (1,), float64))

Action Space

In [4]:
env.action_space

Box(-1.0, 1.0, (1,), float64)

Discretización de los estados

**Nota:** es importante que chequeen el espacio de observación y el espacio de acción del entorno. Los números usados son ejemplos y pueden no ser correctos

In [5]:
altitude_space = np.linspace(0, 2.5, 50)  
vertical_velocity_space = np.linspace(-3, 3, 100)
target_altitude_space = np.linspace(0, 1, 50) 
runway_distance_space = np.linspace(-1, 1, 50)


#print("altitude_space:", altitude_space)
#print("vertical_velocity_space:", vertical_velocity_space)
#print("target_altitude_space:", target_altitude_space)
#print("runway_distance_space:", runway_distance_space)


#import gymnasium as gym
#from descent_env import DescentEnv
#import numpy as np
#import random
#
#env = DescentEnv(render_mode="human") 
#obs, info = env.reset()
#
#num_episodes = 5 
#max_steps_per_episode = 500 
#
#all_altitudes = []
#all_vertical_velocities = []
#all_target_altitudes = []
#all_runway_distances = []
#
#for episode in range(num_episodes):
#    obs, info = env.reset()
#    done = False
#    steps = 0
#    print(f"\n--- Episodio {episode + 1} ---")
#    while not done and steps < max_steps_per_episode:
#        # Aquí generas una acción aleatoria válida para tu entorno
#        actions = [-1.0, -0.5, 0.0, 0.5, 1.0] # Acciones discretas sugeridas en el notebook
#        action = np.array([random.choice(actions)])
#
#        obs, reward, done, truncated, info = env.step(action)
#
#        # Recolectar datos
#        all_altitudes.append(obs["altitude"][0])
#        all_vertical_velocities.append(obs["vz"][0])
#        all_target_altitudes.append(obs["target_altitude"][0])
#        all_runway_distances.append(obs["runway_distance"][0])
#        steps += 1
#        
#    env.render()
#    env.close()
#
## Analizar los datos recolectados
#print("\n--- Análisis de rangos observados ---")
#print(f"Altitud: Min={np.min(all_altitudes):.2f}, Max={np.max(all_altitudes):.2f}")
#print(f"Velocidad Vertical: Min={np.min(all_vertical_velocities):.2f}, Max={np.max(all_vertical_velocities):.2f}")
#print(f"Altitud Objetivo: Min={np.min(all_target_altitudes):.2f}, Max={np.max(all_target_altitudes):.2f}")
#print(f"Distancia Pista: Min={np.min(all_runway_distances):.2f}, Max={np.max(all_runway_distances):.2f}")
#"""

In [6]:
actions = list(np.linspace(-1, 1, 10))
agent = QLearningAgent(
    altitude_space, 
    vertical_velocity_space, 
    target_altitude_space,
    runway_distance_space,  
    actions=actions,
    env=env
)

In [None]:
rewards = agent.train_agent(env=env, episodes=25000, epsilon=0.99, gamma=0.99, alpha=0.1)

Episode: 0, Reward: -195.14, Epsilon: 0.99
Episode: 1, Reward: -175.89, Epsilon: 0.99
Episode: 2, Reward: -109.61, Epsilon: 0.99
Episode: 3, Reward: -90.38, Epsilon: 0.99
Episode: 4, Reward: -94.33, Epsilon: 0.99
Episode: 5, Reward: -149.62, Epsilon: 0.99
Episode: 6, Reward: -80.97, Epsilon: 0.99
Episode: 7, Reward: -125.52, Epsilon: 0.99
Episode: 8, Reward: -70.61, Epsilon: 0.99
Episode: 9, Reward: -96.40, Epsilon: 0.99
Episode: 10, Reward: -97.41, Epsilon: 0.99
Episode: 11, Reward: -46.82, Epsilon: 0.99
Episode: 12, Reward: -89.89, Epsilon: 0.99
Episode: 13, Reward: -61.93, Epsilon: 0.99
Episode: 14, Reward: -54.15, Epsilon: 0.99
Episode: 15, Reward: -116.05, Epsilon: 0.99
Episode: 16, Reward: -54.63, Epsilon: 0.99
Episode: 17, Reward: -65.43, Epsilon: 0.99
Episode: 18, Reward: -150.90, Epsilon: 0.99
Episode: 19, Reward: -165.59, Epsilon: 0.99
Episode: 20, Reward: -58.61, Epsilon: 0.99
Episode: 21, Reward: -92.73, Epsilon: 0.99
Episode: 22, Reward: -69.28, Epsilon: 0.99
Episode: 23, 

Obtener el estado a partir de la observación

In [None]:
average_reward = np.mean(rewards)
print(f"Promedio de recompensas: {average_reward}")

In [None]:
import matplotlib.pyplot as plt
plt.plot(rewards)
plt.xlabel('Episodio')
plt.ylabel('Recompensa')
plt.title('Recompensas por episodio')
plt.show()

In [None]:
def get_state(obs):
    alt = obs['altitude'][0]
    alt
    vz = obs['vz'][0]
    target_alt = obs['target_altitude'][0]
    runway_dist = obs['runway_distance'][0]
    alt_idx = np.digitize(alt, altitude_space)
    vz_idx = np.digitize(vz, vertical_velocity_space)
    target_alt_idx = np.digitize(target_alt, target_altitude_space)
    runway_dist_idx = np.digitize(runway_dist, runway_distance_space)
    return alt_idx, vz_idx, target_alt_idx, runway_dist_idx

In [None]:
obs = env.observation_space.sample()
print(obs)
state = get_state(obs) # Ejemplo de obs
state

OrderedDict([('altitude', array([-0.88461465])), ('runway_distance', array([-1.12342297])), ('target_altitude', array([1.11719687])), ('vz', array([-0.99368627]))])


(0, 34, 50, 0)

Discretización de las acciones

In [None]:
actions = list(np.linspace(-1, 1, 10))
actions

[-1.0,
 -0.7777777777777778,
 -0.5555555555555556,
 -0.33333333333333337,
 -0.11111111111111116,
 0.11111111111111116,
 0.33333333333333326,
 0.5555555555555554,
 0.7777777777777777,
 1.0]

In [None]:
def get_sample_action():
    return random.choice(actions)

Inicilización de la tabla Q

In [None]:
Q = np.zeros((len(altitude_space), len(vertical_velocity_space), len(target_altitude_space), len(runway_distance_space), len(actions)))
Q

array([[[[[0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          ...,
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.]],

         [[0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          ...,
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.]],

         [[0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          ...,
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.]],

         ...,

         [[0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          ...,
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0.

Obtención de la acción a partir de la tabla Q

In [None]:
def optimal_policy(state, Q):
    action = actions[np.argmax(Q[state])]
    return action