# DoubleDunk con REINFORCE (GPU: CUDA o MPS)

Entrenamos un agente REINFORCE con baseline (actor-crítico) para `ALE/DoubleDunk-v5` (Atari) usando Gymnasium + ALE.

- Algoritmo: REINFORCE con baseline, entropía y grad clipping.
- Observaciones: preprocesamiento a 84x84 en escala de grises + apilado de 4 frames.
- Hardware: usa automáticamente CUDA (NVIDIA), MPS (Apple Silicon) o CPU.
- Checkpoints: guardado periódico y mejor modelo.
- Video: grabación de episodio con el agente y con el mejor modelo.

Referencia: [ALE DoubleDunk](https://ale.farama.org/environments/double_dunk/).


## Mejoras para el Rendimiento

DoubleDunk es un juego complejo donde inicialmente obtienes recompensas negativas. Esto es **NORMAL** - no significa que haya recompensas 0, sino que el agente está aprendiendo desde un estado donde comete errores.

**Correcciones aplicadas:**
1. **Shape mismatch corregido**: Los valores ahora tienen forma consistente con los returns 
2. **Entropía balanceada**: Decae gradualmente para mantener exploración inicial
3. **Learning rate ajustado**: 3e-4 para convergencia más estable
4. **Más episodios**: 8000 para dar tiempo al agente de aprender el juego

El agente mejorará gradualmente de recompensas muy negativas (-20) hacia positivas conforme aprenda a jugar basketball.


In [8]:
# Instalación (si es necesario)
# !pip install -q gymnasium[atari] ale-py torch torchvision imageio

import sys, platform
print({'python': sys.version.split()[0], 'platform': platform.platform()})


{'python': '3.13.5', 'platform': 'macOS-15.5-arm64-arm-64bit-Mach-O'}


In [9]:
import gymnasium as gym
import gymnasium
import ale_py
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
from torch.optim import Adam
from dataclasses import dataclass
from typing import List
from collections import deque
import random
import os
import cv2
import imageio
import warnings

warnings.filterwarnings('ignore')

# Registrar entornos ALE
gymnasium.register_envs(ale_py)

SEED = 123
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Device: CUDA -> MPS -> CPU
device = (
    torch.device('cuda') if torch.cuda.is_available() else
    (torch.device('mps') if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available() else torch.device('cpu'))
)
print('Using device:', device)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


Using device: mps


In [10]:
# Wrappers: preprocesamiento y frame stacking
class SimpleFrameStack(gym.Wrapper):
    def __init__(self, env, k: int = 4):
        super().__init__(env)
        self.k = k
        self.frames = deque(maxlen=k)
        obs_space = env.observation_space
        h, w = obs_space.shape[0], obs_space.shape[1]
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(h, w, k), dtype=np.uint8)

    def reset(self, **kwargs):
        obs, info = self.env.reset(**kwargs)
        self.frames.clear()
        for _ in range(self.k):
            self.frames.append(obs)
        return self._get_ob(), info

    def step(self, action):
        obs, reward, terminated, truncated, info = self.env.step(action)
        self.frames.append(obs)
        return self._get_ob(), reward, terminated, truncated, info

    def _get_ob(self):
        return np.stack(list(self.frames), axis=-1)


def make_env(seed: int = SEED, render_mode=None):
    env = gym.make('ALE/DoubleDunk-v5', render_mode=render_mode)
    class GrayResizeWrapper(gym.ObservationWrapper):
        def __init__(self, env):
            super().__init__(env)
            h, w = 84, 84
            self.observation_space = gym.spaces.Box(low=0, high=255, shape=(h, w), dtype=np.uint8)
        def observation(self, obs):
            gray = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
            resized = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
            return resized
    env = GrayResizeWrapper(env)
    env = SimpleFrameStack(env, 4)
    env.reset(seed=seed)
    env.action_space.seed(seed)
    env.observation_space.seed(seed)
    return env

# to tensor CHW [0,1]
def obs_to_tensor(obs) -> torch.Tensor:
    arr = obs if isinstance(obs, np.ndarray) else np.array(obs)
    if arr.ndim == 3 and arr.shape[-1] == 4:
        arr = np.transpose(arr, (2, 0, 1))
    elif arr.ndim == 2:
        arr = np.stack([arr]*4, axis=0)
    tensor = torch.from_numpy(arr).float() / 255.0
    return tensor.unsqueeze(0).to(device)


In [11]:
# Modelo CNN actor-crítico
class AtariActorCritic(nn.Module):
    def __init__(self, in_channels: int, n_actions: int):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(),
        )
        self.flatten = nn.Flatten()
        self.fc = nn.Sequential(
            nn.Linear(64 * 7 * 7, 512), nn.ReLU(),
        )
        self.policy_head = nn.Linear(512, n_actions)
        self.value_head = nn.Linear(512, 1)

        for m in self.modules():
            if isinstance(m, (nn.Conv2d, nn.Linear)):
                nn.init.kaiming_uniform_(m.weight, nonlinearity='relu')
                nn.init.zeros_(m.bias)

    def forward(self, x: torch.Tensor):
        z = self.features(x)
        z = self.flatten(z)
        z = self.fc(z)
        logits = self.policy_head(z)
        value = self.value_head(z).squeeze(-1)
        return logits, value

    def act(self, x: torch.Tensor):
        logits, value = self.forward(x)
        dist = Categorical(logits=logits)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        entropy = dist.entropy()
        return action.item(), log_prob, entropy, value


In [12]:
@dataclass
class Config:
    total_episodes: int = 8000
    max_steps_per_episode: int = 8000
    gamma: float = 0.99
    learning_rate: float = 3e-4
    entropy_coef: float = 0.01
    value_coef: float = 0.5
    grad_clip_norm: float = 0.5
    checkpoint_dir: str = 'checkpoints_doubledunk'
    checkpoint_every_episodes: int = 50
    eval_every_episodes: int = 100
    eval_episodes: int = 10

cfg = Config()
os.makedirs(cfg.checkpoint_dir, exist_ok=True)
print(cfg)

# Utils

def compute_returns(rewards: List[float], gamma: float) -> torch.Tensor:
    G = 0.0
    returns = []
    for r in reversed(rewards):
        G = r + gamma * G
        returns.append(G)
    returns.reverse()
    ret = torch.tensor(returns, dtype=torch.float32, device=device)
    return ret.view(-1)


def save_checkpoint(model: nn.Module, optimizer: torch.optim.Optimizer, episode: int, path: str):
    torch.save({'episode': episode,
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict()}, path)


def load_checkpoint(model: nn.Module, optimizer: torch.optim.Optimizer, path: str):
    ckpt = torch.load(path, map_location=device)
    model.load_state_dict(ckpt['model'])
    optimizer.load_state_dict(ckpt['optimizer'])
    return ckpt.get('episode', 0)


def evaluate(agent: nn.Module, episodes: int = 10, render: bool = False) -> float:
    env = make_env(seed=SEED + 999, render_mode='human' if render else None)
    agent.eval()
    rewards = []
    with torch.no_grad():
        for ep in range(episodes):
            obs, info = env.reset(seed=SEED + 999 + ep)
            total_r = 0.0
            for t in range(cfg.max_steps_per_episode):
                x = obs_to_tensor(obs)
                logits, _ = agent(x)
                action = torch.argmax(F.softmax(logits, dim=-1), dim=-1).item()
                obs, r, terminated, truncated, info = env.step(action)
                total_r += float(r)
                if terminated or truncated:
                    break
            rewards.append(total_r)
    env.close()
    agent.train()
    return float(np.mean(rewards))


Config(total_episodes=8000, max_steps_per_episode=8000, gamma=0.99, learning_rate=0.0003, entropy_coef=0.01, value_coef=0.5, grad_clip_norm=0.5, checkpoint_dir='checkpoints_doubledunk', checkpoint_every_episodes=50, eval_every_episodes=100, eval_episodes=10)


In [13]:
# Entrenamiento REINFORCE con baseline

env = make_env(seed=SEED, render_mode=None)
n_actions = env.action_space.n
in_channels = 4

agent = AtariActorCritic(in_channels=in_channels, n_actions=n_actions).to(device)
optimizer = Adam(agent.parameters(), lr=cfg.learning_rate)

start_episode = 0
ckpt_path = os.path.join(cfg.checkpoint_dir, 'reinforce_doubledunk.pt')
if os.path.exists(ckpt_path):
    print('Cargando checkpoint desde', ckpt_path)
    start_episode = load_checkpoint(agent, optimizer, ckpt_path)

best_eval = -float('inf')

for ep in range(start_episode, cfg.total_episodes):
    obs, info = env.reset(seed=SEED + ep)
    log_probs, entropies, values, rewards = [], [], [], []
    total_reward = 0.0

    for t in range(cfg.max_steps_per_episode):
        x = obs_to_tensor(obs)
        action, log_prob, entropy, value = agent.act(x)
        obs, reward, terminated, truncated, info = env.step(action)
        log_probs.append(log_prob)
        entropies.append(entropy)
        values.append(value)
        rewards.append(float(reward))
        total_reward += float(reward)
        if terminated or truncated:
            break

    returns = compute_returns(rewards, cfg.gamma)
    values_t = torch.stack(values)
    log_probs_t = torch.stack(log_probs)
    entropies_t = torch.stack(entropies)

    # Asegurar que values_t y returns tengan la misma forma
    if values_t.dim() > 1:
        values_t = values_t.squeeze(-1)
    
    advantages = returns - values_t
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    # Entropy decay: más exploración al inicio
    entropy_coef = cfg.entropy_coef * (0.5 + 0.5 * (1 - (ep / max(1, cfg.total_episodes))))

    policy_loss = -(log_probs_t * advantages.detach()).mean()
    value_loss = F.mse_loss(values_t, returns)
    entropy_bonus = entropies_t.mean()

    loss = policy_loss + cfg.value_coef * value_loss - entropy_coef * entropy_bonus

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    nn.utils.clip_grad_norm_(agent.parameters(), cfg.grad_clip_norm)
    optimizer.step()

    if (ep + 1) % cfg.checkpoint_every_episodes == 0:
        save_checkpoint(agent, optimizer, ep + 1, ckpt_path)

    if (ep + 1) % cfg.eval_every_episodes == 0:
        avg_eval = evaluate(agent, episodes=cfg.eval_episodes, render=False)
        print(f'Ep {ep+1} | Reward entrenamiento: {total_reward:.1f} | Eval-10 media: {avg_eval:.1f}')
        if avg_eval > best_eval:
            best_eval = avg_eval
            torch.save({'model': agent.state_dict(), 'avg_eval': best_eval}, os.path.join(cfg.checkpoint_dir, 'best.pt'))

env.close()
print('Entrenamiento finalizado.')


Cargando checkpoint desde checkpoints_doubledunk/reinforce_doubledunk.pt
Ep 3100 | Reward entrenamiento: -16.0 | Eval-10 media: -2.4
Ep 3200 | Reward entrenamiento: -24.0 | Eval-10 media: -10.0
Ep 3300 | Reward entrenamiento: -20.0 | Eval-10 media: -2.4
Ep 3400 | Reward entrenamiento: -12.0 | Eval-10 media: -2.8
Ep 3500 | Reward entrenamiento: -18.0 | Eval-10 media: -1.4
Ep 3600 | Reward entrenamiento: -22.0 | Eval-10 media: -1.2
Ep 3700 | Reward entrenamiento: -22.0 | Eval-10 media: -1.8
Ep 3800 | Reward entrenamiento: -20.0 | Eval-10 media: -2.8
Ep 3900 | Reward entrenamiento: -14.0 | Eval-10 media: -3.6
Ep 4000 | Reward entrenamiento: -22.0 | Eval-10 media: -2.8
Ep 4100 | Reward entrenamiento: -20.0 | Eval-10 media: -2.6
Ep 4200 | Reward entrenamiento: -22.0 | Eval-10 media: -8.8
Ep 4300 | Reward entrenamiento: -10.0 | Eval-10 media: -7.0
Ep 4400 | Reward entrenamiento: -20.0 | Eval-10 media: -7.2
Ep 4500 | Reward entrenamiento: -22.0 | Eval-10 media: -21.8
Ep 4600 | Reward entrenam

KeyboardInterrupt: 

In [14]:
# Evaluación y video (agente actual y mejor modelo)

# Evaluación 10 episodios
avg10 = evaluate(agent, episodes=10, render=False)
print(f'Recompensa promedio en 10 episodios (agente actual): {avg10:.2f}')

# Función de grabación de video

def record_video(model: nn.Module, filename: str = 'videos/doubledunk_reinforce.mp4', fps: int = 30, seed: int = SEED+2024):
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    env = make_env(seed=seed, render_mode='rgb_array')
    frames = []
    obs, info = env.reset(seed=seed)
    with torch.no_grad():
        for t in range(cfg.max_steps_per_episode):
            x = obs_to_tensor(obs)
            logits, _ = model(x)
            action = torch.argmax(F.softmax(logits, dim=-1), dim=-1).item()
            frame = env.render()
            frames.append(frame)
            obs, r, terminated, truncated, info = env.step(action)
            if terminated or truncated:
                frame = env.render()
                frames.append(frame)
                break
    env.close()
    imageio.mimwrite(filename, frames, fps=fps, quality=8)
    print('Video guardado en:', filename)

# Grabar con agente actual
record_video(agent, filename='videos/doubledunk_reinforce.mp4', fps=30)

# Grabar con mejor modelo si existe
best_path = os.path.join(cfg.checkpoint_dir, 'best.pt')
if os.path.exists(best_path):
    print('Cargando mejor modelo desde', best_path)
    env_tmp = make_env(seed=SEED+3030)
    n_actions_best = env_tmp.action_space.n
    env_tmp.close()
    best_agent = AtariActorCritic(in_channels=4, n_actions=n_actions_best).to(device)
    ckpt_best = torch.load(best_path, map_location=device)
    best_agent.load_state_dict(ckpt_best['model'])
    record_video(best_agent, filename='videos/doubledunk_best.mp4', fps=30, seed=SEED+3030)
else:
    print('No se encontró best.pt; aún no hay mejor modelo guardado')


Recompensa promedio en 10 episodios (agente actual): -14.00




Video guardado en: videos/doubledunk_reinforce.mp4
Cargando mejor modelo desde checkpoints_doubledunk/best.pt




Video guardado en: videos/doubledunk_best.mp4
