In [2]:
import numpy as np
import random
from collections import deque
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import gymnasium as gym
import matplotlib.pyplot as plt
import cv2

In [3]:
physical_devices = tf.config.list_physical_devices("GPU")
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

## Creacion del modelo


In [4]:
def build_model(state_dim, action_dim):
    model = Sequential(
        [
            Dense(128, input_dim=state_dim, activation="relu"),
            Dense(128, activation="relu"),
            Dense(action_dim, activation="linear"),
        ]
    )
    model.compile(optimizer=Adam(learning_rate=0.001), loss="mse", metrics=["mae"])
    return model

## Clase DQNAgent con historial de pérdidas


In [5]:
class DQNAgent:
    def __init__(
        self,
        state_dim,
        action_dim,
        replay_size=10000,
        batch_size=64,
        gamma=0.99,
        epsilon=1.0,
        epsilon_decay=0.995,
        epsilon_min=0.01,
        discretization_bins=10,  # Nuevas variables para la discretización
    ):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.replay_buffer = deque(maxlen=replay_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.loss_history = []  # Para almacenar la pérdida promedio por episodio
        self.mae_history = []  # Para almacenar el MAE promedio por episodio
        self.discretization_bins = (
            discretization_bins  # Número de intervalos para discretizar cada dimensión
        )

        # Redes Q y Target
        self.q_network = build_model(state_dim, action_dim)
        self.target_network = build_model(state_dim, action_dim)
        self.update_target_network()

        self.bins = [
            np.linspace(-1, 1, discretization_bins),  # Cos(theta1)
            np.linspace(-1, 1, discretization_bins),  # Sin(theta1)
            np.linspace(-1, 1, discretization_bins),  # Cos(theta2)
            np.linspace(-1, 1, discretization_bins),  # Sin(theta2)
            np.linspace(-12.567, 12.567, discretization_bins),  # Velocidad theta1
            np.linspace(-28.274, 28.274, discretization_bins),  # Velocidad theta2
        ]

    def update_target_network(self):
        self.target_network.set_weights(self.q_network.get_weights())

    def select_action(self, state):
        """Selecciona una acción usando ε-greedy."""
        if np.random.rand() < self.epsilon:
            return random.randint(0, self.action_dim - 1)
        state = np.expand_dims(state, axis=0)
        q_values = self.q_network.predict(state, verbose=0)
        return np.argmax(q_values[0])

    def store_experience(self, state, action, reward, next_state, terminated):
        """Guarda la experiencia discretizando los estados."""
        discretized_state = self.discretize_state(state)
        discretized_next_state = self.discretize_state(next_state)
        self.replay_buffer.append(
            (discretized_state, action, reward, discretized_next_state, terminated)
        )

    def train(self):
        """Entrena la red con experiencias almacenadas."""
        if len(self.replay_buffer) < self.batch_size:
            return

        batch = random.sample(self.replay_buffer, self.batch_size)
        states, actions, rewards, next_states, terminateds = zip(*batch)

        states = np.array(states)
        next_states = np.array(next_states)

        q_values = self.q_network.predict(states, verbose=0)
        q_values_next = self.target_network.predict(next_states, verbose=0)

        for i in range(self.batch_size):
            if terminateds[i]:
                q_values[i, actions[i]] = rewards[i]
            else:
                q_values[i, actions[i]] = rewards[i] + self.gamma * np.max(
                    q_values_next[i]
                )

        history = self.q_network.fit(states, q_values, epochs=1, verbose=0)
        self.loss_history.append(history.history["loss"][0])
        self.mae_history.append(history.history["mae"][0])

    def discretize_state(self, state):
        """Convierte un estado continuo a uno discreto."""
        discretized = []
        for i, feature in enumerate(state):
            discretized.append(np.digitize(feature, self.bins[i]) - 1)
        return tuple(discretized)

In [6]:
def plot_scores(scores):
    plt.figure(figsize=(10, 6))
    plt.plot(scores, label="Puntaje por episodio")
    plt.xlabel("Episodio")
    plt.ylabel("Puntaje")
    plt.title("Evolución del Puntaje")
    plt.legend()
    plt.grid()
    plt.show()


def plot_loss(loss_history):
    plt.figure(figsize=(10, 6))
    plt.plot(loss_history, label="Pérdida por batch")
    plt.xlabel("Batch")
    plt.ylabel("Pérdida (Loss)")
    plt.title("Evolución de la Pérdida")
    plt.legend()
    plt.grid()
    plt.show()


def plot_mae(mae_history):
    plt.figure(figsize=(10, 6))
    plt.plot(mae_history, label="MAE por batch")
    plt.xlabel("Batch")
    plt.ylabel("Error Absoluto Medio (MAE)")
    plt.title("Evolución del MAE")
    plt.legend()
    plt.grid()
    plt.show()

## Entrenamiento del DQN con métricas y video


In [7]:
def train_dqn(
    env_name="Acrobot-v1",
    episodes=10,
    target_update=10,
    video_filename="dqn_training_video.mp4",
):
    env = gym.make(env_name, render_mode="rgb_array")
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n

    agent = DQNAgent(state_dim, action_dim)
    scores = []

    env.reset()
    sample_frame = env.render()[0]
    height, width, channels = (
        sample_frame.shape
        if len(sample_frame.shape) == 3
        else (sample_frame.shape[0], sample_frame.shape[1], 1)
    )

    obj_video = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(
        video_filename, obj_video, 30, (width, height), isColor=(channels == 3)
    )

    for episode in range(episodes):
        state = env.reset()[0]
        state = np.array(state, dtype=np.float32)
        episode_score = 0

        while True:
            action = agent.select_action(state)
            next_state, reward, terminated, truncated, info = env.step(action)
            next_state = np.array(next_state, dtype=np.float32)
            episode_score += reward

            target_height = -np.cos(next_state[0]) - np.cos(
                next_state[2] + next_state[0]
            )
            reward += target_height * 0.5

            if terminated and episode_score < -100:
                reward = -10

            agent.store_experience(state, action, reward, next_state, terminated)
            agent.train()

            state = next_state

            frame = env.render()[0]
            frame_bgr = (
                cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
                if len(frame.shape) == 3
                else cv2.cvtColor(frame, cv2.COLOR_GRAY2BGR)
            )
            out.write(frame_bgr)

            if terminated:
                break

        agent.epsilon = max(agent.epsilon * agent.epsilon_decay, agent.epsilon_min)
        scores.append(episode_score)

        if episode % target_update == 0:
            agent.update_target_network()

        print(
            f"Episode {episode}, Score: {episode_score}, Epsilon: {agent.epsilon:.2f}"
        )

    out.release()
    env.close()

    plot_scores(scores)
    plot_loss(agent.loss_history)
    plot_mae(agent.mae_history)

    return scores, agent.loss_history, agent.mae_history


## Entrenar y graficar resultados


In [None]:
scores, loss_history, mae_history = train_dqn(episodes=10)


Episode 0, Score: -1274.0, Epsilon: 0.99
Episode 1, Score: -1282.0, Epsilon: 0.99
Episode 2, Score: -1536.0, Epsilon: 0.99
Episode 3, Score: -1299.0, Epsilon: 0.98
Episode 4, Score: -3649.0, Epsilon: 0.98
Episode 5, Score: -919.0, Epsilon: 0.97
Episode 6, Score: -6288.0, Epsilon: 0.97
