# librerías

In [19]:
import os
import json
import plotly
import torch
from torch import nn
import torch.nn.functional as F
import random
from collections import deque, namedtuple
import numpy as np
import gymnasium as gym
from typing import Optional

import mlflow
from mlflow.models.signature import infer_signature
from mlflow.tracking import MlflowClient
from mlflow.exceptions import MlflowException

import optuna
import optuna.visualization as vis
from optuna.visualization import plot_parallel_coordinate


# init_components

In [None]:
# Environment
ENV_NAME                 = "LunarLander-v3"
SEED                     = 42

MEM_LENGTH               = 2048
TARGET_UPDATE            = 1000       # how many steps between target-network synchronisations
EPSILON_START            = 1.0
EPSILON_END              = 0.01
EPS_DECAY_STEPS          = 300000        # steps needed to reach EPSILON_END

# Training
MAX_EPISODES             = 5000
MAX_STEPS_PER_EPISODE    = 1000

TRAINING_RATIO           = 4          # run one optimisation step every N environment steps
EMA_RATIO                = 0.01       # weight of the newest value in the running (EMA) rewards
TAU                      = 0.125      # soft-update factor used when blending online weights into the target net
EP                       = 2          # how many episodes between progress printouts
STOPPING_REWARD_CRITERIA = 200        # training stops once the running eval reward exceeds this
# Backward-compatible alias: the constant used to be defined under this misspelled
# name while the training loop read STOPPING_REWARD_CRITERIA, which raised NameError.
STOPPING_REWARD_CRITERI  = STOPPING_REWARD_CRITERIA

# Optuna search space
OPTUNA_TRIALS            = 8
GAMMAS                   = [0.9, 0.99]
LEARNING_RATES           = [3e-4, 5e-3]
BATCH_SIZES              = [16, 32, 64, 128]
N_LAYERS                 = [1,2,3,4]
N_NEURONS                = [16,32,64,128]

# Tracking
MLFLOW_TRACKING_URI      = None   # set to a remote server if desired, e.g. "http://localhost:5000"
MLFLOW_EXPERIMENT        = "LunarLander_DQN"

# Device
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



Optuna

In [5]:
# Single Gymnasium environment shared by the cells below.
env = gym.make(ENV_NAME)

# Network input/output sizes are read from the environment's spaces
# instead of being hard-coded.
state_dim, action_dim = env.observation_space.shape[0], env.action_space.n



# replay_buffer

In [6]:
# One stored interaction with the environment.
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'reward', 'next_state', 'done'],
)


class ReplayBuffer:
    """Bounded experience store backed by a ``deque(maxlen=capacity)``.

    Once ``capacity`` transitions are held, appending a new one discards the
    oldest entry (standard ``deque`` behaviour with ``maxlen``).
    """

    def __init__(self, capacity: int = MEM_LENGTH):
        self.capacity = capacity
        self.memory = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Wrap the five values in a ``Transition`` and append it to the buffer."""
        experience = Transition(
            state=state,
            action=action,
            reward=reward,
            next_state=next_state,
            done=done,
        )
        self.memory.append(experience)

    def sample(self, batch_size: int):
        """Return ``batch_size`` distinct transitions chosen uniformly at random.

        ``random.sample`` raises ``ValueError`` when fewer than ``batch_size``
        transitions are stored.
        """
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

# neural_network

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class DQN(nn.Module):
    """Fully connected Q-network whose hidden widths halve layer after layer.

    The first hidden layer has ``n_neurons`` units; each following hidden layer
    has half the units of the previous one (never fewer than 1). A final linear
    layer maps the last hidden width to ``output_dim`` with no activation.

    Args:
        input_dim: size of the input vector.
        output_dim: number of outputs produced by the final layer.
        n_layers: number of hidden (ReLU) layers.
        n_neurons: width of the first hidden layer.
    """

    def __init__(self,
                 input_dim: int,
                 output_dim: int,
                 n_layers: int,
                 n_neurons: int):
        super().__init__()

        # Build the full list of layer widths first: input, hidden..., output.
        widths = [input_dim]
        hidden_width = n_neurons
        for _ in range(n_layers):
            widths.append(hidden_width)
            hidden_width = max(1, hidden_width // 2)  # halve, but keep at least one unit
        widths.append(output_dim)

        # Consecutive width pairs become the Linear layers, in order.
        self.layers = nn.ModuleList(
            [nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:])]
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply ReLU after every layer except the last, which is returned raw."""
        *hidden_layers, output_layer = self.layers
        for hidden in hidden_layers:
            x = F.relu(hidden(x))
        return output_layer(x)

def learning_transfer(Q_model: nn.Module,
                      T_model: nn.Module,
                      tau: float):
    """Blend the weights of ``Q_model`` into ``T_model`` (soft update).

    For every entry of the online state dict the target entry becomes
    ``tau * online + (1 - tau) * target``; the blended state dict is then
    loaded back into ``T_model``. ``Q_model`` is only read.

    Args:
        Q_model: network providing the new weights.
        T_model: network updated in place.
        tau: interpolation factor; 1.0 copies the online weights entirely.
    """
    online_state = Q_model.state_dict()
    target_state = T_model.state_dict()
    for name, online_tensor in online_state.items():
        target_tensor = target_state[name]
        target_tensor.copy_(tau * online_tensor + (1.0 - tau) * target_tensor)
    T_model.load_state_dict(target_state)


# policy

In [8]:
class EpsilonGreedyPolicy:
    """Epsilon-greedy action selection with a linearly decaying epsilon.

    Epsilon goes from ``start`` to ``end`` over ``decay_steps`` calls to
    ``select_action`` and stays at ``end`` afterwards.
    """

    def __init__(self,
                 action_dim: int,
                 q_model: torch.nn.Module,
                 start: float = EPSILON_START,
                 end: float = EPSILON_END,
                 decay_steps: int = EPS_DECAY_STEPS):
        """
        Args:
            action_dim: number of discrete actions to choose from when exploring.
            q_model: network queried for Q-values when exploiting.
            start: epsilon before any action has been selected.
            end: epsilon once ``decay_steps`` actions have been selected.
            decay_steps: number of selections over which epsilon decays.
        """
        self.action_dim = action_dim
        self.q_model = q_model
        self.start = start
        self.end = end
        self.decay_steps = decay_steps
        self.steps_done = 0  # number of select_action calls so far

    def get_epsilon(self) -> float:
        """Current epsilon according to the linear schedule."""
        remaining_steps = max(0, (self.decay_steps - self.steps_done))
        remaining_fraction = remaining_steps / self.decay_steps
        return self.end + (self.start - self.end) * remaining_fraction

    def select_action(self, state: torch.Tensor) -> int:
        """Pick an action for ``state`` and advance the decay counter.

        With probability epsilon a uniformly random action is returned;
        otherwise the argmax over dim 1 of ``q_model(state)`` is returned, so
        ``state`` is expected to carry a leading batch dimension of size 1.
        """
        # Epsilon is read before the counter is advanced.
        epsilon = self.get_epsilon()
        self.steps_done += 1

        if random.random() < epsilon:
            # Explore
            return random.randrange(self.action_dim)

        # Exploit
        with torch.no_grad():
            return self.q_model(state).argmax(dim=1).item()


# optimize_model

In [9]:
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'reward', 'next_state', 'done'],
)

def optimize_model(samples: list,
                   q_model: torch.nn.Module,
                   t_model: torch.nn.Module,
                   optimizer: torch.optim.Optimizer,
                   gamma: float,
                   batch_size: int
                   ):
    """Run one Double-DQN optimisation step on ``q_model``.

    Args:
        samples: transitions exposing ``state``, ``action``, ``reward``,
            ``next_state`` and ``done``; states are tensors that get stacked.
        q_model: online network; selects the next action and is optimised.
        t_model: target network; evaluates the action selected by ``q_model``.
        optimizer: optimiser stepping the parameters of ``q_model``.
        gamma: discount factor applied to the bootstrapped next-state value.
        batch_size: minimum number of samples required to train.

    Returns:
        The MSE loss as a Python float, or ``None`` when fewer than
        ``batch_size`` samples were supplied (no update is performed).
    """
    if len(samples) < batch_size:
        return

    def _stack_states(field: str) -> torch.Tensor:
        # Stack the per-transition tensors and drop the singleton dim 1.
        tensors = [getattr(item, field) for item in samples]
        return torch.stack(tensors, dim=0).squeeze(1).to(DEVICE)

    def _as_column(field: str, dtype: torch.dtype) -> torch.Tensor:
        # Scalars of every transition as a (len(samples), 1) column tensor.
        values = [getattr(item, field) for item in samples]
        return torch.tensor(values, dtype=dtype, device=DEVICE).unsqueeze(1)

    states = _stack_states("state")
    next_states = _stack_states("next_state")
    actions = _as_column("action", torch.int64)
    rewards = _as_column("reward", torch.float32)
    dones = _as_column("done", torch.float32)

    # Q(s_t, a_t) predicted by the online network for the actions actually taken.
    predicted_q = q_model(states).gather(1, actions)

    # Double DQN target: online network picks the action, target network scores it.
    with torch.no_grad():
        greedy_next_actions = q_model(next_states).max(dim=1, keepdim=True).indices
        bootstrap = t_model(next_states).gather(1, greedy_next_actions)
        # Terminal transitions contribute no future value.
        bootstrap = bootstrap * (1 - dones)
        targets = rewards + (gamma * bootstrap)

    loss = F.mse_loss(predicted_q, targets)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item()

# eval_agent

In [10]:
def evaluate(env: gym.Env,
             state_dim: int,
             action_dim: int,
             n_layers: int,
             n_neurons: int,
             q_model: Optional[nn.Module] = None,
             model_path: Optional[str] = None,
             num_episodes: int = 5,
             render: bool = False) -> tuple:
    """Run greedy evaluation episodes and report the rewards obtained.

    Args:
        env: environment to evaluate on. NOTE: it is closed before returning,
            exactly as before; callers that reuse the same env afterwards rely
            on the env tolerating ``reset()`` after ``close()``.
        state_dim: input size, used only when the model is built from ``model_path``.
        action_dim: output size, used only when the model is built from ``model_path``.
        n_layers: hidden layer count, used only when the model is built from ``model_path``.
        n_neurons: first hidden width, used only when the model is built from ``model_path``.
        q_model: network to evaluate. When ``None``, a ``DQN`` is built and its
            weights are loaded from ``model_path``.
        model_path: state-dict file to load when ``q_model`` is ``None``.
        num_episodes: number of evaluation episodes.
        render: call ``env.render()`` after each non-final step.

    Returns:
        ``(reward_avg, rewards_por_episodio)``: the mean total reward across
        episodes and the list with the total reward of each episode.

    Raises:
        AssertionError: if neither ``q_model`` nor ``model_path`` is given.
    """
    if q_model is None:
        assert model_path is not None, "O debes pasar q_model, o un model_path"
        q_model = DQN(input_dim = state_dim, output_dim = action_dim, n_layers = n_layers, n_neurons = n_neurons)
        q_model.load_state_dict(torch.load(model_path, map_location=DEVICE))

    # Remember the caller's mode so evaluation does not leave a model that is
    # still being trained stuck in eval mode.
    was_training = q_model.training
    q_model.to(DEVICE).eval()

    rewards_por_episodio = []  # total reward of every evaluation episode

    try:
        for _episode in range(num_episodes):
            state, _ = env.reset()
            total_reward = 0.0  # reward accumulated in the current episode

            # Allow the same number of steps as the training loop
            # (the previous range(1, MAX) stopped one step short).
            for _time_step in range(MAX_STEPS_PER_EPISODE):
                # Greedy action: argmax_a Q(s, a)
                state_tensor = torch.tensor(state, dtype=torch.float32, device=DEVICE).unsqueeze(0)
                with torch.no_grad():
                    q_values = q_model(state_tensor)
                action = torch.argmax(q_values, dim=1).item()

                next_state, reward, done, truncated, _ = env.step(action)
                total_reward += reward

                if done or truncated:
                    break

                if render:
                    env.render()
                state = next_state
            rewards_por_episodio.append(total_reward)
    finally:
        if was_training:
            q_model.train()

    env.close()
    reward_avg = np.mean(rewards_por_episodio)
    return reward_avg, rewards_por_episodio


# build_agent

In [11]:
def build_agent(state_dim: int,
                action_dim: int,
                n_layers: int,
                n_neurons: int,
                learning_rate: float) -> dict:
    """
    Build every component of a DQN agent and return them in a dict.

    Args:
        state_dim: input size of both networks.
        action_dim: output size of both networks and number of actions of the policy.
        n_layers: number of hidden layers of both networks.
        n_neurons: width of the first hidden layer of both networks.
        learning_rate: step size passed to the Adam optimiser.

    Returns:
        A dict with the keys:
          - 'q_online'   : main Q-network, moved to DEVICE
          - 'q_target'   : target network, initialised with the weights of
                           q_online and switched to eval mode
          - 'optimizer'  : Adam optimiser over the parameters of q_online
          - 'buffer'     : ReplayBuffer() with its default capacity
          - 'policy'     : EpsilonGreedyPolicy(action_dim, q_online, start=EPSILON_START,
                           end=EPSILON_END, decay_steps=EPS_DECAY_STEPS)
          - 'total_steps': global step counter, initialised to 0
    """
    network_kwargs = dict(input_dim=state_dim,
                          output_dim=action_dim,
                          n_layers=n_layers,
                          n_neurons=n_neurons)

    # The online network is created first; the target starts as an exact copy of it.
    q_online = DQN(**network_kwargs).to(DEVICE)
    q_target = DQN(**network_kwargs).to(DEVICE)
    q_target.load_state_dict(q_online.state_dict())
    q_target.eval()

    optimizer = torch.optim.Adam(q_online.parameters(), lr=learning_rate)
    buffer = ReplayBuffer()
    policy = EpsilonGreedyPolicy(q_model = q_online,
                                 action_dim = action_dim,
                                 start = EPSILON_START,
                                 end = EPSILON_END,
                                 decay_steps = EPS_DECAY_STEPS)
    return {
        "q_online": q_online,
        "q_target": q_target,
        "optimizer": optimizer,
        "buffer": buffer,
        "policy": policy,
        "total_steps": 0
    }

# agent = build_agent(state_dim=8,
#                     action_dim=2)
# print(agent)
# print(isinstance(agent["optimizer"]), torch.optim.Optimizer)



# mlflow

In [14]:

def get_next_trial_id(experiment_name: str = MLFLOW_EXPERIMENT) -> int:
    """Return the number to assign to the next run of an MLflow experiment.

    The experiment is created when it does not exist yet. The id is the number
    of runs found in the experiment (any view type, up to 50000 results) plus one.
    """
    tracking_client = MlflowClient()
    experiment = tracking_client.get_experiment_by_name(experiment_name)

    # Reuse the existing experiment, or create it on first use.
    experiment_id = (
        experiment.experiment_id
        if experiment is not None
        else tracking_client.create_experiment(experiment_name)
    )

    existing_runs = tracking_client.search_runs(
        experiment_ids=[experiment_id],
        run_view_type=mlflow.tracking.client.ViewType.ALL,
        max_results=50000,
    )
    return 1 + len(existing_runs)

# train

In [None]:

def train(
        env: gym.Env,
        n_layers: int,
        hidden_size: int,
        q_online: torch.nn.Module,
        q_target: torch.nn.Module,
        optimizer: torch.optim.Optimizer,
        buffer: ReplayBuffer,
        policy: EpsilonGreedyPolicy,
        total_steps: int,
        learning_rate: float,
        gamma: float,
        batch_size: int,
        experiment_name: str = MLFLOW_EXPERIMENT):
    """
    Run the complete DQN training loop for one trial, tracked as an MLflow run.

    Every episode interacts with ``env`` through ``policy`` and stores the
    transitions in ``buffer``. Once the buffer holds at least ``2 * batch_size``
    transitions, every ``TRAINING_RATIO`` environment steps one optimisation
    step is run on ``q_online`` followed by a soft update of ``q_target`` with
    factor ``TAU``. After each episode the online network is evaluated for 5
    episodes; whenever the mean evaluation reward improves, the weights are
    saved locally and logged/registered in MLflow. Training stops early once
    the EMA-smoothed evaluation reward exceeds ``STOPPING_REWARD_CRITERIA``.

    Args:
        env: environment used for both training and evaluation; closed on return.
        n_layers: hidden layer count (logged and forwarded to ``evaluate``).
        hidden_size: first hidden width (logged and forwarded to ``evaluate``).
        q_online: network being optimised.
        q_target: target network receiving the soft updates.
        optimizer: optimiser for ``q_online``.
        buffer: replay buffer that receives and provides transitions.
        policy: action-selection policy.
        total_steps: initial value of the global step counter.
        learning_rate: only logged as a run parameter here.
        gamma: discount factor.
        batch_size: number of transitions sampled per optimisation step.
        experiment_name: MLflow experiment that receives the run.

    Returns:
        ``(q_online, best_eval_reward, total_steps)``.
    """
    # Best-effort cleanup of a run left open by a previous (possibly failed) call.
    try:
        if mlflow.active_run() is not None:
            mlflow.end_run()
    except MlflowException:
        pass

    mlflow.set_experiment(experiment_name)
    trial_id = get_next_trial_id(experiment_name)
    run_name = f"trial-{trial_id}"

    # Output files; their directories are created up front so torch.save
    # does not fail on a fresh checkout.
    ckpt_path = f"models/checkpoints/checkpoint_trial{trial_id}.pth"
    best_model_path = "models/best_model/best_model.pt"
    for output_path in (ckpt_path, best_model_path):
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

    with mlflow.start_run(run_name=run_name):
        # Hyperparameters of this trial
        mlflow.log_param("batch_size", batch_size)
        mlflow.log_param("gamma", gamma)
        mlflow.log_param("n_layers", n_layers)
        mlflow.log_param("hidden_size", hidden_size)
        mlflow.log_param("learning_rate", learning_rate)

        mlflow.log_param("training_ratio", TRAINING_RATIO)
        mlflow.log_param("tau", TAU)
        mlflow.log_param("max_episodes", MAX_EPISODES)
        mlflow.log_param("max_steps_per_ep", MAX_STEPS_PER_EPISODE)

        running_reward_train = None  # EMA of the training episode rewards
        running_reward_eval = None   # EMA of the mean evaluation rewards
        best_eval_reward = -float('inf')
        loss = None                  # most recent optimisation loss; None until the first update

        for episode in range(1, MAX_EPISODES + 1):
            state, _ = env.reset()
            state = torch.tensor(state, dtype=torch.float32, device=DEVICE).unsqueeze(0)

            episode_reward = 0.0  # sum of the rewards of the current episode

            for time_step in range(1, MAX_STEPS_PER_EPISODE + 1):
                action = policy.select_action(state)

                next_state, reward, done, truncated, _ = env.step(action)
                next_state = torch.tensor(next_state, dtype=torch.float32, device=DEVICE).unsqueeze(0)

                episode_reward += reward

                # Only true termination is stored as `done`, so a truncated
                # transition still bootstraps from its next state.
                buffer.push(state, action, reward, next_state, done)

                # Train once enough experience is available, every TRAINING_RATIO steps.
                if len(buffer) >= 2 * batch_size and time_step % TRAINING_RATIO == 0:
                    samples = buffer.sample(batch_size=batch_size)

                    step_loss = optimize_model(samples=samples,
                                               q_model=q_online,
                                               t_model=q_target,
                                               optimizer=optimizer,
                                               gamma = gamma,
                                               batch_size = batch_size)
                    if step_loss is not None:
                        loss = step_loss

                    # Soft update: blend a TAU fraction of the online weights into the target.
                    learning_transfer(q_online, q_target, TAU)

                state = next_state
                total_steps += 1

                # Stop on termination and on truncation: stepping an env after
                # either signal without a reset is not valid.
                if done or truncated:
                    break

            mean_eval_rewards, list_eval_rewards = evaluate(
                env = env,
                state_dim = env.observation_space.shape[0],
                action_dim = env.action_space.n,
                n_layers = n_layers,
                n_neurons = hidden_size,
                q_model = q_online,
                model_path = None,
                num_episodes = 5,
                render = False
            )

            # Exponential moving averages of evaluation and training rewards
            running_reward_eval = (
                mean_eval_rewards
                if running_reward_eval is None
                else EMA_RATIO * mean_eval_rewards + (1 - EMA_RATIO) * running_reward_eval
            )
            running_reward_train = (
                episode_reward
                if running_reward_train is None
                else EMA_RATIO * episode_reward + (1 - EMA_RATIO) * running_reward_train
            )

            if mean_eval_rewards > best_eval_reward:
                best_eval_reward = mean_eval_rewards
                torch.save(q_online.state_dict(), ckpt_path)
                mlflow.log_artifact(ckpt_path, artifact_path="checkpoints")
                mlflow.pytorch.log_model(q_online, artifact_path = "best_model", registered_model_name=f"lunar-lander-trial-{trial_id}")

                # Local copy of the best weights seen by this call.
                torch.save(q_online.state_dict(), best_model_path)

            # Metrics
            mlflow.log_metric("episode_count", episode, step=episode)
            mlflow.log_metric("train_reward", episode_reward, step=episode)
            mlflow.log_metric("running_train_reward", running_reward_train, step=episode)
            mlflow.log_metric("eval_reward", mean_eval_rewards, step=episode)
            mlflow.log_metric("running_eval_reward", running_reward_eval, step=episode)
            mlflow.log_metric("epsilon", policy.get_epsilon(), step=episode)
            if loss is not None:
                mlflow.log_metric("loss", loss, step=episode)

            # Console report every EP episodes, once at least one update has happened
            if episode % EP == 0 and loss is not None:
                template = (
                    "[Train] Episode {}/{MAX}  |  "
                    "Running Eval Reward: {:.2f}  |  "
                    "ε: {:.3f}  |  "
                    "Loss: {:.2f} | "
                    "Steps: {:.2f} | "
                ).replace("{MAX}", str(MAX_EPISODES))
                print(template.format(
                    episode,
                    running_reward_eval,
                    policy.get_epsilon(),
                    loss,
                    time_step,

                ))
                for idx, r in enumerate(list_eval_rewards, start=1):
                    print(f"[Eval] Episodio {idx}/5 → Recompensa: {r:.2f}")
                print(f"[Eval] Recompensa media en 5 episodios: {mean_eval_rewards:.2f}\n")

            # Early stop once the smoothed evaluation reward clears the threshold
            if running_reward_eval > STOPPING_REWARD_CRITERIA:
                print("Solved at episode {}!".format(episode))
                break


    env.close()

    return q_online, best_eval_reward, total_steps

In [None]:
def objective(trial):
    """Optuna objective: train one agent with sampled hyperparameters.

    Samples learning rate (log scale), gamma, batch size, layer count and
    first hidden width, builds a fresh agent, trains it on the shared ``env``
    and returns the best evaluation reward reported by ``train``.
    """
    # Hyperparameters (sampled in a fixed order)
    params = {
        "learning_rate": trial.suggest_float("learning_rate", LEARNING_RATES[0], LEARNING_RATES[1], log = True),
        "gamma": trial.suggest_float("gamma", GAMMAS[0], GAMMAS[1]),
        "batch_size": trial.suggest_categorical("batch_size", BATCH_SIZES),
        "n_layers": trial.suggest_categorical("n_layers", N_LAYERS),
        "hidden_size": trial.suggest_categorical("hidden_size", N_NEURONS),
    }

    # Fresh networks, optimiser, buffer and policy for this trial
    agent = build_agent(
        state_dim = state_dim,
        action_dim = action_dim,
        n_layers = params["n_layers"],
        n_neurons = params["hidden_size"],
        learning_rate = params["learning_rate"],
    )

    # Training; only the best evaluation reward is needed here
    _, eval_best_reward, _ = train(
        env = env,
        n_layers = params["n_layers"],
        hidden_size = params["hidden_size"],
        q_online = agent["q_online"],
        q_target = agent["q_target"],
        optimizer = agent["optimizer"],
        buffer = agent["buffer"],
        policy = agent["policy"],
        total_steps = agent["total_steps"],
        learning_rate = params["learning_rate"],
        gamma = params["gamma"],
        batch_size = params["batch_size"],
        experiment_name = MLFLOW_EXPERIMENT,
    )

    return eval_best_reward


In [None]:
# Hyperparameter search. The TPE sampler is seeded with SEED so the sequence of
# suggested hyperparameters is repeatable across runs of this cell.
# NOTE: `objective` never reports intermediate values to the trial, so the
# MedianPruner below has nothing to act on and no trial is pruned early.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
    pruner=optuna.pruners.MedianPruner()
)
study.optimize(objective, n_trials = OPTUNA_TRIALS)
print("Mejores params:", study.best_params)

# Make sure the output directory exists before writing the results.
os.makedirs("models/best_model", exist_ok=True)

with open("models/best_model/best_params.json", "w") as f:
    json.dump(study.best_params, f, indent=4)

fig = vis.plot_parallel_coordinate(study, params=list(study.best_params.keys()))
fig.write_image("models/best_model/parallel_coordinate.png")

[I 2025-06-11 18:48:09,387] A new study created in memory with name: no-name-2558d286-e7d1-41b7-a137-f29489c1112e
2025/06/11 18:48:10 INFO mlflow.tracking.fluent: Experiment with name 'LunarLander_DQN' does not exist. Creating a new experiment.
Successfully registered model 'lunar-lander-trial-1'.
Created version '1' of model 'lunar-lander-trial-1'.


[Train] Episode 2/10  |  Running Eval Reward: -448.36  |  ε: 0.999  |  Loss: 1.68 | Steps: 86.00 | 
[Eval] Episodio 1/5 → Recompensa: -555.81
[Eval] Episodio 2/5 → Recompensa: -669.01
[Eval] Episodio 3/5 → Recompensa: -615.87
[Eval] Episodio 4/5 → Recompensa: -675.18
[Eval] Episodio 5/5 → Recompensa: -600.44
[Eval] Recompensa media en 5 episodios: -623.26

[Train] Episode 4/10  |  Running Eval Reward: -457.92  |  ε: 0.999  |  Loss: 7.09 | Steps: 91.00 | 
[Eval] Episodio 1/5 → Recompensa: -707.10
[Eval] Episodio 2/5 → Recompensa: -652.74
[Eval] Episodio 3/5 → Recompensa: -391.91
[Eval] Episodio 4/5 → Recompensa: -863.16
[Eval] Episodio 5/5 → Recompensa: -3382.06
[Eval] Recompensa media en 5 episodios: -1199.39

[Train] Episode 6/10  |  Running Eval Reward: -464.66  |  ε: 0.998  |  Loss: 4.21 | Steps: 127.00 | 
[Eval] Episodio 1/5 → Recompensa: -1674.28
[Eval] Episodio 2/5 → Recompensa: -674.94
[Eval] Episodio 3/5 → Recompensa: -574.98
[Eval] Episodio 4/5 → Recompensa: -1453.93
[Eval] Ep

[I 2025-06-11 18:48:17,138] Trial 0 finished with value: -446.59731647711925 and parameters: {'learning_rate': 0.0006076509717845676, 'gamma': 0.9507895883722522, 'batch_size': 16, 'n_layers': 3, 'hidden_size': 128}. Best is trial 0 with value: -446.59731647711925.


[Train] Episode 10/10  |  Running Eval Reward: -475.63  |  ε: 0.997  |  Loss: 610.16 | Steps: 72.00 | 
[Eval] Episodio 1/5 → Recompensa: -928.03
[Eval] Episodio 2/5 → Recompensa: -550.88
[Eval] Episodio 3/5 → Recompensa: -656.90
[Eval] Episodio 4/5 → Recompensa: -492.59
[Eval] Episodio 5/5 → Recompensa: -626.72
[Eval] Recompensa media en 5 episodios: -651.02



Registered model 'lunar-lander-trial-2' already exists. Creating a new version of this model...
Created version '3' of model 'lunar-lander-trial-2'.
Registered model 'lunar-lander-trial-2' already exists. Creating a new version of this model...
Created version '4' of model 'lunar-lander-trial-2'.
Registered model 'lunar-lander-trial-2' already exists. Creating a new version of this model...
Created version '5' of model 'lunar-lander-trial-2'.


[Train] Episode 4/10  |  Running Eval Reward: -427.23  |  ε: 0.999  |  Loss: 83.72 | Steps: 129.00 | 
[Eval] Episodio 1/5 → Recompensa: -547.91
[Eval] Episodio 2/5 → Recompensa: -430.36
[Eval] Episodio 3/5 → Recompensa: -652.58
[Eval] Episodio 4/5 → Recompensa: -552.91
[Eval] Episodio 5/5 → Recompensa: -435.61
[Eval] Recompensa media en 5 episodios: -523.88

[Train] Episode 6/10  |  Running Eval Reward: -431.41  |  ε: 0.998  |  Loss: 80.80 | Steps: 103.00 | 
[Eval] Episodio 1/5 → Recompensa: -691.25
[Eval] Episodio 2/5 → Recompensa: -412.93
[Eval] Episodio 3/5 → Recompensa: -761.95
[Eval] Episodio 4/5 → Recompensa: -476.65
[Eval] Episodio 5/5 → Recompensa: -443.02
[Eval] Recompensa media en 5 episodios: -557.16

[Train] Episode 8/10  |  Running Eval Reward: -438.54  |  ε: 0.998  |  Loss: 170.02 | Steps: 77.00 | 
[Eval] Episodio 1/5 → Recompensa: -438.87
[Eval] Episodio 2/5 → Recompensa: -681.49
[Eval] Episodio 3/5 → Recompensa: -1162.81
[Eval] Episodio 4/5 → Recompensa: -886.89
[Eval] 

[I 2025-06-11 18:48:29,573] Trial 1 finished with value: -338.77427987885375 and parameters: {'learning_rate': 0.00041844637764903357, 'gamma': 0.9183833332238363, 'batch_size': 128, 'n_layers': 1, 'hidden_size': 64}. Best is trial 1 with value: -338.77427987885375.


[Train] Episode 10/10  |  Running Eval Reward: -445.16  |  ε: 0.997  |  Loss: 7.29 | Steps: 121.00 | 
[Eval] Episodio 1/5 → Recompensa: -1154.51
[Eval] Episodio 2/5 → Recompensa: -552.07
[Eval] Episodio 3/5 → Recompensa: -773.91
[Eval] Episodio 4/5 → Recompensa: -1478.88
[Eval] Episodio 5/5 → Recompensa: -827.01
[Eval] Recompensa media en 5 episodios: -957.28



Registered model 'lunar-lander-trial-3' already exists. Creating a new version of this model...
Created version '5' of model 'lunar-lander-trial-3'.
Registered model 'lunar-lander-trial-3' already exists. Creating a new version of this model...
Created version '6' of model 'lunar-lander-trial-3'.
Registered model 'lunar-lander-trial-3' already exists. Creating a new version of this model...
Created version '7' of model 'lunar-lander-trial-3'.


[Train] Episode 4/10  |  Running Eval Reward: -904.59  |  ε: 0.999  |  Loss: 8.67 | Steps: 73.00 | 
[Eval] Episodio 1/5 → Recompensa: -1119.46
[Eval] Episodio 2/5 → Recompensa: -663.82
[Eval] Episodio 3/5 → Recompensa: -839.41
[Eval] Episodio 4/5 → Recompensa: -1620.10
[Eval] Episodio 5/5 → Recompensa: -640.96
[Eval] Recompensa media en 5 episodios: -976.75



Registered model 'lunar-lander-trial-3' already exists. Creating a new version of this model...
Created version '8' of model 'lunar-lander-trial-3'.


[Train] Episode 6/10  |  Running Eval Reward: -898.35  |  ε: 0.998  |  Loss: 139.00 | Steps: 63.00 | 
[Eval] Episodio 1/5 → Recompensa: -754.51
[Eval] Episodio 2/5 → Recompensa: -648.18
[Eval] Episodio 3/5 → Recompensa: -485.71
[Eval] Episodio 4/5 → Recompensa: -624.28
[Eval] Episodio 5/5 → Recompensa: -771.70
[Eval] Recompensa media en 5 episodios: -656.88



Registered model 'lunar-lander-trial-3' already exists. Creating a new version of this model...
Created version '9' of model 'lunar-lander-trial-3'.


[Train] Episode 8/10  |  Running Eval Reward: -889.02  |  ε: 0.998  |  Loss: 101.18 | Steps: 117.00 | 
[Eval] Episodio 1/5 → Recompensa: -691.72
[Eval] Episodio 2/5 → Recompensa: -612.85
[Eval] Episodio 3/5 → Recompensa: -633.63
[Eval] Episodio 4/5 → Recompensa: -630.81
[Eval] Episodio 5/5 → Recompensa: -300.60
[Eval] Recompensa media en 5 episodios: -573.92



[I 2025-06-11 18:48:49,585] Trial 2 finished with value: -284.0742240334569 and parameters: {'learning_rate': 0.0043973237088562375, 'gamma': 0.984575961988907, 'batch_size': 128, 'n_layers': 2, 'hidden_size': 128}. Best is trial 2 with value: -284.0742240334569.


[Train] Episode 10/10  |  Running Eval Reward: -883.40  |  ε: 0.997  |  Loss: 32.48 | Steps: 68.00 | 
[Eval] Episodio 1/5 → Recompensa: -495.23
[Eval] Episodio 2/5 → Recompensa: -595.18
[Eval] Episodio 3/5 → Recompensa: -634.67
[Eval] Episodio 4/5 → Recompensa: -681.16
[Eval] Episodio 5/5 → Recompensa: -614.73
[Eval] Recompensa media en 5 episodios: -604.19



Registered model 'lunar-lander-trial-4' already exists. Creating a new version of this model...
Created version '6' of model 'lunar-lander-trial-4'.
Registered model 'lunar-lander-trial-4' already exists. Creating a new version of this model...
Created version '7' of model 'lunar-lander-trial-4'.


[Train] Episode 2/10  |  Running Eval Reward: -134.39  |  ε: 0.999  |  Loss: 2.25 | Steps: 132.00 | 
[Eval] Episodio 1/5 → Recompensa: -167.13
[Eval] Episodio 2/5 → Recompensa: 13.44
[Eval] Episodio 3/5 → Recompensa: -36.78
[Eval] Episodio 4/5 → Recompensa: -190.27
[Eval] Episodio 5/5 → Recompensa: -123.77
[Eval] Recompensa media en 5 episodios: -100.90

[Train] Episode 4/10  |  Running Eval Reward: -143.15  |  ε: 0.999  |  Loss: 5.41 | Steps: 57.00 | 
[Eval] Episodio 1/5 → Recompensa: -862.89
[Eval] Episodio 2/5 → Recompensa: -483.58
[Eval] Episodio 3/5 → Recompensa: -760.34
[Eval] Episodio 4/5 → Recompensa: -339.63
[Eval] Episodio 5/5 → Recompensa: -423.77
[Eval] Recompensa media en 5 episodios: -574.04

[Train] Episode 6/10  |  Running Eval Reward: -150.78  |  ε: 0.998  |  Loss: 2.15 | Steps: 104.00 | 
[Eval] Episodio 1/5 → Recompensa: -643.85
[Eval] Episodio 2/5 → Recompensa: -598.55
[Eval] Episodio 3/5 → Recompensa: -423.37
[Eval] Episodio 4/5 → Recompensa: -490.86
[Eval] Episodio

[I 2025-06-11 18:48:58,811] Trial 3 finished with value: -100.9025930578072 and parameters: {'learning_rate': 0.004248729132550595, 'gamma': 0.9822112865288861, 'batch_size': 16, 'n_layers': 3, 'hidden_size': 16}. Best is trial 3 with value: -100.9025930578072.


[Train] Episode 10/10  |  Running Eval Reward: -168.93  |  ε: 0.997  |  Loss: 6.78 | Steps: 91.00 | 
[Eval] Episodio 1/5 → Recompensa: -1020.33
[Eval] Episodio 2/5 → Recompensa: -512.34
[Eval] Episodio 3/5 → Recompensa: -516.38
[Eval] Episodio 4/5 → Recompensa: -660.24
[Eval] Episodio 5/5 → Recompensa: -484.05
[Eval] Recompensa media en 5 episodios: -638.67

Mejores params: {'learning_rate': 0.004248729132550595, 'gamma': 0.9822112865288861, 'batch_size': 16, 'n_layers': 3, 'hidden_size': 16}
