# LunarLander con Stable-Baselines3 (DQN)

Este notebook entrena y evalúa un agente DQN de Stable-Baselines3 para el ambiente discreto `LunarLander-v3` de Gymnasium.

- Método: DQN de SB3 con red MLP, repetición de experiencias (replay buffer), actualizaciones periódicas de la red objetivo y exploración ε-greedy.
- Ambiente: `LunarLander-v3` (acciones: 0=No-op, 1=Motor Izq., 2=Motor Principal, 3=Motor Der.).
- Objetivo: Aterrizar en la zona plana entre banderas (centro), robusto a variaciones del terreno.
- Hardware: CPU.
- Logs: se silencian pasando `verbose=0` al construir el modelo DQN.

Incluye entrenamiento con `EvalCallback` y checkpoints, evaluación con y sin viento (para comprobar la robustez), gráficos de aprendizaje (curvas en TensorBoard) y grabación de video.

In [None]:
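# Instalación de dependencias. Descomentar:
# NOTA: este notebook usa `LunarLander-v3`, que se registra a partir de Gymnasium 1.0;
# con gymnasium==0.29.1 el id disponible es `LunarLander-v2`. Verificar que las
# versiones fijadas abajo sean compatibles con el id del ambiente antes de instalar.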
# %pip install -q gymnasium[box2d]==0.29.1 stable-baselines3==2.3.2 shimmy==1.3.0 swig==4.2.1 tensorboard
# %pip install -q matplotlib==3.8.4 tqdm==4.66.4 imageio==2.34.1 imageio-ffmpeg==0.4.9

import sys, platform


In [None]:
import gymnasium as gym
import numpy as np
import random
import os
import matplotlib.pyplot as plt
from tqdm.auto import trange, tqdm

from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor

# Single seed shared by the environment factory, the DQN model and the
# global RNGs below, so that runs are repeatable.
SEED = 42

# Seed NumPy's and Python's global generators (independent of each other).
np.random.seed(SEED)
random.seed(SEED)

print('Usando Stable-Baselines3 DQN')


In [None]:
# Definir el ambiente
def make_env(seed: int = SEED, render_mode=None, enable_wind: bool = False, **kwargs):
    """Create a seeded, Monitor-wrapped ``LunarLander-v3`` environment.

    Args:
        seed: Seed applied to the first reset and to both the action and
            observation spaces.
        render_mode: Forwarded to ``gym.make`` (e.g. ``'rgb_array'``).
        enable_wind: Forwarded to ``gym.make``.
        **kwargs: Any extra keyword arguments forwarded to ``gym.make``.

    Returns:
        The environment wrapped in ``Monitor``, already reset once with ``seed``.
    """
    monitored = Monitor(
        gym.make(
            'LunarLander-v3',
            render_mode=render_mode,
            enable_wind=enable_wind,
            **kwargs,
        )
    )
    # Seed the episode RNG first, then the two sampling spaces.
    monitored.reset(seed=seed)
    for space in (monitored.action_space, monitored.observation_space):
        space.seed(seed)
    return monitored

# Sanity check: build a throwaway environment, report its observation shape
# and number of discrete actions, then release it.
_env = make_env()
_obs_shape = _env.observation_space.shape
_n_actions = _env.action_space.n
print('obs_shape', _obs_shape, 'n_actions', _n_actions)
_env.close()


In [None]:
# Hyperparameters. Each constant is passed to the DQN constructor / learn()
# call under the keyword of the same (lower-case) name.

# Training budget, adjusted to the time available.
TIMESTEPS = 300_000

# Network and optimisation.
POLICY_KWARGS = {'net_arch': [256, 256]}
LEARNING_RATE = 0.001
GAMMA = 0.99
BATCH_SIZE = 64

# Replay buffer and update schedule.
BUFFER_SIZE = 100_000
LEARNING_STARTS = 5_000
TRAIN_FREQ = 4

# Target network. NOTE(review): tau=1.0 presumably means a hard copy every
# TARGET_UPDATE_INTERVAL steps -- confirm against the SB3 DQN docs.
TARGET_UPDATE_INTERVAL = 1_000
TAU = 1.0

# Exploration schedule.
EXPLORATION_FRACTION = 0.5
EXPLORATION_FINAL_EPS = 0.05


In [None]:
# SB3 utilities: evaluation and checkpoint callbacks.
# Everything (evaluation logs, best model, checkpoints) lives under `log_dir`.
log_dir = 'logs'
os.makedirs(log_dir, exist_ok=True)

# Dedicated evaluation environment, seeded differently from the training one
# (SEED + 100) and without wind.
eval_env = make_env(enable_wind=False, seed=SEED + 100)

# Periodic evaluation: 10 deterministic episodes every 10_000 callback calls;
# the best model is written to <log_dir>/best_model.
eval_callback = EvalCallback(
    eval_env,
    n_eval_episodes=10,
    eval_freq=10_000,
    deterministic=True,
    render=False,
    log_path=log_dir,
    best_model_save_path=os.path.join(log_dir, 'best_model'),
)

# Periodic snapshots under <log_dir>/checkpoints, replay buffer included.
checkpoint_callback = CheckpointCallback(
    name_prefix='dqn_lunar',
    save_path=os.path.join(log_dir, 'checkpoints'),
    save_freq=50_000,
    save_replay_buffer=True,
    save_vecnormalize=False,
)


In [None]:
# Build and train the SB3 DQN model (silent run: verbose=0).
train_env = make_env(enable_wind=False, seed=SEED)

model = DQN(
    'MlpPolicy',
    train_env,
    # Network / optimisation
    policy_kwargs=POLICY_KWARGS,
    learning_rate=LEARNING_RATE,
    gamma=GAMMA,
    batch_size=BATCH_SIZE,
    # Replay buffer and update schedule
    buffer_size=BUFFER_SIZE,
    learning_starts=LEARNING_STARTS,
    train_freq=TRAIN_FREQ,
    # Target network
    target_update_interval=TARGET_UPDATE_INTERVAL,
    tau=TAU,
    # Exploration schedule
    exploration_fraction=EXPLORATION_FRACTION,
    exploration_final_eps=EXPLORATION_FINAL_EPS,
    # Logging / reproducibility
    tensorboard_log=log_dir,
    verbose=0,
    seed=SEED,
)

# Train with the evaluation and checkpoint callbacks attached.
model.learn(
    total_timesteps=TIMESTEPS,
    callback=[eval_callback, checkpoint_callback],
    progress_bar=False,
)

# Persist the final model; the output directory is derived from the path.
model_path = 'models/dqn_lunarlander_sb3'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)
print('Modelo guardado en', model_path)


In [None]:
# Build and train the SB3 DQN model (verbose=1 variant).
# NOTE(review): this cell repeats the previous training cell with verbose=1.
# Running both trains a second model from scratch, rebinds `train_env` and
# `model`, reuses the same `eval_callback` / `checkpoint_callback` objects and
# overwrites 'models/dqn_lunarlander_sb3'. EvalCallback presumably keeps its
# best-score state between the two runs -- confirm whether both cells are
# meant to be executed.
train_env = make_env(enable_wind=False, seed=SEED)

model = DQN(
    'MlpPolicy',
    train_env,
    # Network / optimisation
    policy_kwargs=POLICY_KWARGS,
    learning_rate=LEARNING_RATE,
    gamma=GAMMA,
    batch_size=BATCH_SIZE,
    # Replay buffer and update schedule
    buffer_size=BUFFER_SIZE,
    learning_starts=LEARNING_STARTS,
    train_freq=TRAIN_FREQ,
    # Target network
    target_update_interval=TARGET_UPDATE_INTERVAL,
    tau=TAU,
    # Exploration schedule
    exploration_fraction=EXPLORATION_FRACTION,
    exploration_final_eps=EXPLORATION_FINAL_EPS,
    # Logging / reproducibility
    tensorboard_log=log_dir,
    verbose=1,
    seed=SEED,
)

# Train with the evaluation and checkpoint callbacks attached.
model.learn(
    total_timesteps=TIMESTEPS,
    callback=[eval_callback, checkpoint_callback],
    progress_bar=False,
)

# Persist the final model; the output directory is derived from the path.
model_path = 'models/dqn_lunarlander_sb3'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)
print('Modelo guardado en', model_path)


In [None]:
# Pick the model used for evaluation: the best one saved during training if
# its file exists on disk, otherwise the model that was just trained.
best_model_dir = os.path.join(log_dir, 'best_model')
best_model_path = os.path.join(best_model_dir, 'best_model.zip')

if not os.path.exists(best_model_path):
    # No best-model file was found; fall back to the in-memory model.
    print('No hay mejor modelo aún; se usará el modelo actual')
    best_model = model
else:
    print('Cargando mejor modelo desde', best_model_path)
    best_model = DQN.load(best_model_path)


In [None]:
# TensorBoard: you can launch a server to view the curves (integrated in Colab).
# Locally: from a terminal, run: tensorboard --logdir logs | cat
# The directory printed below is the one passed as `tensorboard_log` to DQN.
print('Logs en:', log_dir)

In [None]:
# Evaluation with SB3: 100 deterministic episodes without wind, on a fresh
# environment seeded independently of training (SEED + 1000).
eval_env_no_wind = make_env(enable_wind=False, seed=SEED + 1000)
mean_reward, std_reward = evaluate_policy(
    best_model,
    eval_env_no_wind,
    deterministic=True,
    n_eval_episodes=100,
)
print(f"Evaluación sin viento (100 eps): media={mean_reward:.2f} ± {std_reward:.2f}")

# Release the evaluation environment once the summary has been printed.
eval_env_no_wind.close()

In [None]:
# Grabación de video opcional (SB3)
# Si estás en Colab, el video se guardará en /content. Aquí se guarda en ./videos

import imageio

def record_video_sb3(model: DQN, filename: str = 'videos/lunar_dqn_episode.mp4', fps: int = 30, seed: int = SEED+999, max_steps: int = 1000):
    """Roll out one deterministic episode and write it to a video file.

    Args:
        model: Trained agent; only its ``predict`` method is used.
        filename: Output path handed to ``imageio.mimwrite``. Its parent
            directory is created when the path has one.
        fps: Frames per second of the written video.
        seed: Seed used both to build and to reset the environment.
        max_steps: Upper bound on the number of environment steps recorded
            (the episode may end earlier).
    """
    out_dir = os.path.dirname(filename)
    if out_dir:
        # dirname is '' for a bare file name, and os.makedirs('') raises
        # FileNotFoundError, so only create a directory when there is one.
        os.makedirs(out_dir, exist_ok=True)

    env = make_env(seed=seed, render_mode='rgb_array')
    frames = []
    try:
        obs, info = env.reset(seed=seed)
        for _ in range(max_steps):
            # Capture the state the agent is about to act on.
            frames.append(env.render())
            action, _state = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = env.step(int(action))
            if terminated or truncated:
                # Also capture the final frame after the last step.
                frames.append(env.render())
                break
    finally:
        # Always release the environment, even if the rollout fails.
        env.close()

    imageio.mimwrite(filename, frames, fps=fps, quality=8)
    print('Video guardado en:', filename)

# Record a video with `best_model` (the best saved model if one was loaded,
# otherwise the current model). Comment this call out to skip recording.
record_video_sb3(best_model, filename='videos/lunar_dqn_episode.mp4', fps=30)


In [None]:
# Evaluation with wind enabled: 100 deterministic episodes on an environment
# built with wind_power=10.0 and turbulence_power=1.5, seeded with SEED + 2000.
eval_env_wind = make_env(
    seed=SEED + 2000,
    enable_wind=True,
    turbulence_power=1.5,
    wind_power=10.0,
)
mean_reward_wind, std_reward_wind = evaluate_policy(
    best_model,
    eval_env_wind,
    deterministic=True,
    n_eval_episodes=100,
)
print(f"Evaluación con viento (100 eps): media={mean_reward_wind:.2f} ± {std_reward_wind:.2f}")

# Release the evaluation environment once the summary has been printed.
eval_env_wind.close()
