# Double Q-Learning – Blackjack

*Proyecto RL 2024-2025*

## 1. Importación de librerías y carga del entorno

In [None]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import utils   # archivo utils.py proporcionado
from collections import defaultdict
import random
%matplotlib inline

env = gym.make("Blackjack-v1", render_mode=None)  # training environment (rendering disabled)
n_actions = env.action_space.n  # number of discrete actions exposed by the environment
print("Acciones:", n_actions)

## 2. Hiperparámetros y cronogramas de ε

In [None]:
# General parameters
episodes_train = 500_000  # training episodes run for each ε schedule
gamma = 1.0          # discount factor (Blackjack episodes are finite)
alpha  = 0.05        # learning rate

# ε exploration schedules
def eps_const(eps):
    """Build a schedule that returns the same ε for every step index.

    Args:
        eps: the exploration rate to return.

    Returns:
        A function ``schedule(t)`` that ignores ``t`` and returns ``eps``.
    """
    def schedule(t):
        return eps

    return schedule

def eps_linear(start, end, decay_steps):
    """Build a schedule that interpolates ε linearly from ``start`` to ``end``.

    Args:
        start: ε returned at t = 0.
        end: value reached once ``t >= decay_steps``; the schedule never
            returns less than this.
        decay_steps: number of steps the interpolation spans. When it is zero
            or negative there is no decay phase, so ``end`` is returned for
            every ``t`` instead of dividing by zero.

    Returns:
        A function ``schedule(t)`` giving ε for step index ``t``.
    """
    def schedule(t):
        if decay_steps <= 0:
            # Degenerate horizon (e.g. episodes_train // 2 == 0): skip decay.
            return end
        frac = min(1.0, t / decay_steps)
        return max(end, start - (start - end) * frac)

    return schedule

def eps_exp(start, end, decay_rate):
    """Build a schedule that decays ε geometrically, floored at ``end``.

    Args:
        start: ε returned at t = 0 (before the floor is applied).
        end: lower bound of the returned ε.
        decay_rate: factor applied once per step index.

    Returns:
        A function ``schedule(t)`` giving ``max(end, start * decay_rate**t)``.
    """
    def schedule(t):
        decayed = start * (decay_rate ** t)
        return max(end, decayed)

    return schedule

# ε schedules to compare, keyed by a descriptive name
schedules = {
    "const_0.1" : eps_const(0.1),  # fixed ε = 0.1
    "linear_1.0→0.05": eps_linear(1.0, 0.05, episodes_train//2),  # linear decay spanning half of episodes_train
    "exp_1.0→0.05": eps_exp(1.0, 0.05, decay_rate=0.99997)  # geometric decay, floored at 0.05
}

## 3. Funciones auxiliares

In [None]:
def dict_Q():
    """Create an empty Q table.

    Returns:
        A ``defaultdict`` whose missing keys are filled with a float32 zero
        vector of length ``n_actions`` (read when the key is first accessed).
    """
    def _zero_action_values():
        return np.zeros(n_actions, dtype=np.float32)

    return defaultdict(_zero_action_values)

def choose_action(state, Q1, Q2, eps):
    """Select an action with an ε-greedy policy over the sum Q1 + Q2.

    Args:
        state: key used to index both Q tables.
        Q1, Q2: tables mapping a state to an array of action values.
        eps: probability of sampling a random action from the environment.

    Returns:
        The sampled action when exploring, otherwise the index (as ``int``)
        of the largest entry of ``Q1[state] + Q2[state]``.
    """
    explore = np.random.rand() < eps
    if explore:
        return env.action_space.sample()

    combined = Q1[state] + Q2[state]
    return int(np.argmax(combined))

def double_q_update(state, action, reward, next_state, done, Q1, Q2):
    """Apply one Double Q-learning update to one of the two tables.

    A fair coin decides which table is updated. For a non-terminal transition
    the greedy next action is selected with the table being updated and its
    value is read from the other table. The step size and discount come from
    the module-level ``alpha`` and ``gamma``.

    Args:
        state: state in which ``action`` was taken.
        action: index of the action taken.
        reward: reward received for the transition.
        next_state: state reached after the transition.
        done: when true the target is just ``reward`` (no bootstrapping).
        Q1, Q2: the two Q tables; exactly one of them is modified in place.
    """
    # Coin flip assigns the roles: one table learns, the other evaluates.
    if random.random() < 0.5:
        learner, evaluator = Q1, Q2
    else:
        learner, evaluator = Q2, Q1

    if done:
        # No bootstrap on a terminal transition. next_state is deliberately not
        # looked up, so auto-creating tables (defaultdict) do not accumulate
        # entries for terminal states.
        target = reward
    else:
        a_max = np.argmax(learner[next_state])
        target = reward + gamma * evaluator[next_state][a_max]

    learner[state][action] += alpha * (target - learner[state][action])

## 4. Entrenamiento de Double Q-learning

In [None]:
results = {}

# Train an independent pair of Q tables for every ε schedule.
for name, eps_schedule in schedules.items():
    Q1, Q2 = dict_Q(), dict_Q()
    returns = []  # undiscounted return of each training episode
    for ep in range(episodes_train):
        # The schedule is indexed by episode, so evaluate it once per episode
        # rather than on every step of the inner loop.
        eps = eps_schedule(ep)
        state, _ = env.reset()
        done = False
        G = 0
        while not done:
            action = choose_action(state, Q1, Q2, eps)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            # Only a genuine terminal state disables bootstrapping; an episode
            # that was merely truncated still bootstraps from next_state.
            double_q_update(state, action, reward, next_state, terminated, Q1, Q2)
            state = next_state
            G += reward
        returns.append(G)
        if (ep+1) % 50_000 == 0:
            print(f"{name}: Episodio {ep+1}/{episodes_train}")
    results[name] = {"Q1": Q1, "Q2": Q2, "returns": returns}

## 5. Curvas de aprendizaje

In [None]:
# Moving average of the episode returns, one curve per ε schedule.
window = 5000
plt.figure()
for name, data in results.items():
    rets = np.array(data['returns'])
    cumsum = np.cumsum(rets)
    # smoothed[i] is the mean of the `window` returns ending at 0-based index
    # i + window (difference of two cumulative sums).
    smoothed = (cumsum[window:] - cumsum[:-window]) / window
    # Plot against the 1-based number of the episode that closes each window,
    # so the x axis shows real episode counts instead of array positions
    # shifted by `window`.
    episodes = np.arange(window + 1, len(rets) + 1)
    plt.plot(episodes, smoothed, label=name)
plt.xlabel('Episodios')
plt.ylabel('Retorno medio (ventana 5k)')
plt.legend()
plt.title('Aprendizaje Double Q-learning con distintos ε')
plt.show()

## 6. Mapa de calor de la política aprendida

In [None]:
import pandas as pd

# Pick the best schedule (highest mean return over its last 5k episodes)
best_name = max(results, key=lambda n: np.mean(results[n]['returns'][-5000:]))
Q1_best, Q2_best = results[best_name]['Q1'], results[best_name]['Q2']

# Greedy action per state, indexed as [player_sum, dealer_card].
# Rows 0-3 and column 0 are never written (the loops below start at player
# sum 4 and dealer card 1); they are sliced away before plotting.
policy_no_ace = np.zeros((22, 11), dtype=int)
policy_ace    = np.zeros((22, 11), dtype=int)

for player in range(4, 22):
    for dealer in range(1, 11):
        for ace in [False, True]:
            s = (player, dealer, ace)
            a = np.argmax(Q1_best[s] + Q2_best[s])
            if ace:
                policy_ace[player, dealer] = a
            else:
                policy_no_ace[player, dealer] = a

fig, axes = plt.subplots(1,2, figsize=(12,5))
# Slice [4:, 1:] keeps player sums 4-21 and dealer cards 1-10 only, giving an
# 18x10 image whose columns match the ten x tick labels. Plotting [4:] alone
# would include the unused column 0 and shift every dealer label by one.
im0 = axes[0].imshow(policy_no_ace[4:, 1:], origin='lower')
axes[0].set_title('Sin As usable')
axes[0].set_xlabel('Carta visible crupier')
axes[0].set_ylabel('Suma jugador')
axes[0].set_xticks(range(10))
axes[0].set_xticklabels(range(1,11))
axes[0].set_yticks(range(0,18))
axes[0].set_yticklabels(range(4,22))

im1 = axes[1].imshow(policy_ace[4:, 1:], origin='lower')
axes[1].set_title('Con As usable')
axes[1].set_xlabel('Carta visible crupier')
axes[1].set_xticks(range(10))
axes[1].set_xticklabels(range(1,11))
axes[1].set_yticks(range(0,18))
axes[1].set_yticklabels(range(4,22))

plt.colorbar(im0, ax=axes[0], ticks=[0,1], label='0=Stick, 1=Hit')
plt.colorbar(im1, ax=axes[1], ticks=[0,1], label='0=Stick, 1=Hit')
plt.tight_layout()
plt.show()

## 7. Evaluación en 10 000 episodios

In [None]:
def evaluate(policy_fn, n_episodes=10_000, eval_env=None):
    """Play ``n_episodes`` episodes with ``policy_fn`` and summarise the results.

    Args:
        policy_fn: callable mapping a state to an action.
        n_episodes: number of episodes to play; must be positive.
        eval_env: environment to play in. Defaults to the module-level ``env``
            when omitted.

    Returns:
        Tuple ``(win_rate, draw_rate, loss_rate, mean_return)``. An episode is
        a win, draw or loss when its return (the sum of all rewards in the
        episode) is positive, zero or negative respectively.

    Raises:
        ValueError: if ``n_episodes`` is not positive.
    """
    if n_episodes <= 0:
        raise ValueError("n_episodes must be a positive integer")

    sim = env if eval_env is None else eval_env
    wins = draws = losses = 0
    total = 0
    for _ in range(n_episodes):
        state, _ = sim.reset()
        done = False
        episode_return = 0
        while not done:
            action = policy_fn(state)
            state, reward, terminated, truncated, _ = sim.step(action)
            done = terminated or truncated
            # Sum every step's reward so the result does not rely on all
            # intermediate rewards being zero.
            episode_return += reward
        total += episode_return
        if episode_return > 0:
            wins += 1
        elif episode_return == 0:
            draws += 1
        else:
            losses += 1
    return wins/n_episodes, draws/n_episodes, losses/n_episodes, total/n_episodes

def policy_fn_best(s):
    """Greedy policy of the selected run: argmax of Q1_best[s] + Q2_best[s]."""
    q_sum = Q1_best[s] + Q2_best[s]
    return int(np.argmax(q_sum))


# Evaluate the greedy policy and report mean return and outcome percentages.
w, d, l, avg = evaluate(policy_fn_best)
print(f"Ganancias medias: {avg:.3f}\nPorcentaje victorias: {w*100:.2f}% | Empates: {d*100:.2f}% | Derrotas: {l*100:.2f}%")

## 8. Demostración visual de la política

In [None]:
# Separate environment instance created with render_mode="rgb_array"
env_vis = gym.make("Blackjack-v1", render_mode="rgb_array")
# NOTE(review): run_and_render_episode lives in utils.py, which is not visible
# here; presumably it plays one episode with the given policy and returns the
# rendered frames plus the episode reward — confirm against utils.py.
frames, total_reward = utils.run_and_render_episode(env_vis, policy_fn_best)
print("Recompensa episodio:", total_reward)

# Show the first 5 frames (fewer if the episode is shorter) as static images
plt.figure(figsize=(10,2))
for i in range(min(5, len(frames))):
    plt.subplot(1,5,i+1)  # single row of five slots; subplot indices are 1-based
    plt.imshow(frames[i])
    plt.axis('off')
plt.suptitle('Primeros estados del episodio')
plt.show()

## 9. Conclusiones

- El cronograma de ε *linear 1.0→0.05* proporcionó la convergencia más estable.
- Double Q-learning mitiga el sesgo de sobreestimación observado en Q-learning estándar.
- La política aprendida replica la tabla óptima tradicional de Blackjack para baraja infinita.
- Próximos pasos: experimentar con α variables y comparar tiempo de convergencia contra Double DQN y A2C.