In [4]:
# --- Celda 0: imports básicos y semilla global ---

import os, random
import torch
import numpy as np

import sys
# Add "src" to the module search path so the project-local imports used in this
# notebook (config, preprocess, utils, model) can be resolved — presumably they live
# under src/. The path is relative, so it is resolved against the working directory
# the kernel was started in.
sys.path.append("src")  

from config import (
    block_size, embed_size, dropout, n_heads, n_layer,
    eval_iters, batch_size, learn_rate, max_iters, eval_interval,
    end_token, unknown_token
)
# Single seed shared by every random number generator used in this notebook.
SEED = 42

# Ask cuDNN for deterministic kernels and switch off its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Seed PyTorch (CPU generator and every CUDA device), NumPy and Python's `random`.
torch.cuda.manual_seed_all(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# NOTE: PYTHONHASHSEED is read when the interpreter starts, so setting it here does
# not change hashing in the running kernel; it is only inherited by child processes.
os.environ["PYTHONHASHSEED"] = str(SEED)

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
DEVICE


device(type='cpu')

In [5]:
from preprocess import make_train_test

# Run the project's preprocessing step. Presumably this is what writes the
# train/valid tensors and the vocabulary under assets/output/ that the next cell
# loads — TODO confirm in preprocess.py.
make_train_test()


The corpus has 142 unique tokens.
SUCCESS


In [6]:
# --- Cell 2: load the train/valid splits and the vocabulary ---

import json
import torch

# Token-index tensors for each split.
# NOTE(review): torch.load unpickles the file, so only load artifacts you trust.
train_data = torch.load("assets/output/train.pt")
valid_data = torch.load("assets/output/valid.pt")

# The vocabulary file holds JSON despite its .txt extension.
with open("assets/output/vocab.txt", "r", encoding="utf-8") as vocab_file:
    vocab = json.load(vocab_file)

vocab_size = len(vocab)

print(f"Tamaño vocabulario: {vocab_size}")
print(f"Tamaño train_data : {train_data.shape}")
print(f"Tamaño valid_data : {valid_data.shape}")

# Peek at the first 50 training indices.
train_data[:50]


Tamaño vocabulario: 142
Tamaño train_data : torch.Size([1165])
Tamaño valid_data : torch.Size([130])


tensor([  7,  70,   2, 140,  39,   5, 126,   6,   4,  12,  53,  24,   2, 139,
         61,   0,   4,  11,  70,  19,   2, 138, 121,  53,  24,   3, 139,  61,
        102,   5,   0,   4,  10,  70,  13, 126,   2, 140, 105, 120,   6,   4,
          8,  88,  84,   2, 114,   5,   3, 140])

In [7]:
# --- Cell 3: batching and encode/decode utilities ---

from utils import get_batch, encode, decode, estimate_loss

# Both values come from config (imported in the first cell).
print("block_size:", block_size)
print("batch_size:", batch_size)

# Quick example: draw one batch from the training split.
xb, yb = get_batch(train_data)
print("Shape xb:", xb.shape)  # (batch_size, block_size)
print("Shape yb:", yb.shape)

# Decode one window of the batch back to tokens as a sanity check.
sample = xb[0]
print("Secuencia índices:", sample[:20])
print("Secuencia tokens decodificada:\n", decode(sample, vocab))


block_size: 32
batch_size: 32
Shape xb: torch.Size([32, 32])
Shape yb: torch.Size([32, 32])
Secuencia índices: tensor([ 21,   0,   4,  10, 139, 132,  13, 113, 129,  59,  40, 112,  57,   0,
          4,   8, 139,  40,   5,   0])
Secuencia tokens decodificada:
 ansias ! <END> Paul: ¡ va a ser un fin de semana fantástico ! <END> Anna: ¡ de <UNK> ! <END> Sandra: ¡ no puedo esperar por la cena y la noche


In [8]:
# --- Cell 4: build the GPT model ---

from model import GPTLanguageModel

# Instantiate the network and place it on the selected device in one step.
model = GPTLanguageModel(vocab_size).to(DEVICE)

# Total number of scalar parameters across every tensor of the model.
n_params = sum(map(torch.Tensor.numel, model.parameters()))

# Show the architecture first, then the parameter count.
print(model)
print(f"\nNúmero total de parámetros: {n_params:,}")


GPTLanguageModel(
  (token_embedding): Embedding(142, 256)
  (pos_embedding): Embedding(32, 256)
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-5): 6 x Head(
            (key): Linear(in_features=256, out_features=42, bias=False)
            (query): Linear(in_features=256, out_features=42, bias=False)
            (value): Linear(in_features=256, out_features=42, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (linear): Linear(in_features=252, out_features=256, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (ffwd): FeedFoward(
        (net): Sequential(
          (0): Linear(in_features=256, out_features=1024, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1024, out_features=256, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (ln1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (ln

In [9]:
# --- Cell 5: optimizer (AdamW) ---

# AdamW over every parameter of the model; the learning rate is the value
# imported from config as learn_rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=learn_rate)
print("Learning rate:", learn_rate)


Learning rate: 0.0003


In [10]:
# --- Celda 6: helper para evaluar pérdidas en train/valid ---

def eval_train_valid(model, train_data, valid_data):
    """Estimate the model's loss on the training and validation splits.

    The model is switched to evaluation mode and gradient tracking is disabled
    while ``estimate_loss`` runs on each split. Afterwards the model is put back
    into whatever mode (train or eval) it was in when this function was called,
    even if ``estimate_loss`` raises.

    Args:
        model: Module exposing ``eval()``, ``train(mode)`` and a ``training`` flag.
        train_data: Training split, passed unchanged to ``estimate_loss``.
        valid_data: Validation split, passed unchanged to ``estimate_loss``.

    Returns:
        dict: ``{"train": ..., "valid": ...}`` holding the value returned by
        ``estimate_loss`` for each split.
    """
    # Remember the caller's mode so evaluation does not silently flip it.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            return {
                "train": estimate_loss(model, train_data),
                "valid": estimate_loss(model, valid_data),
            }
    finally:
        # Runs on success and on error, so the model is never left stuck in eval mode.
        model.train(was_training)

# Quick check on the freshly initialised model. With a 142-token vocabulary a
# uniform guess gives a cross-entropy of ln(142) ≈ 4.96, so values near 5 are expected.
losses0 = eval_train_valid(model, train_data, valid_data)
print("Pérdidas iniciales:", losses0)


Pérdidas iniciales: {'train': tensor(5.0562), 'valid': tensor(5.0316)}


In [11]:
# --- Cell 7: main training loop ---

from utils import current_time

# NOTE(review): the log recorded for this run shows the train loss falling to ~0.11
# while the validation loss climbs from ~4.9 to ~7.2, i.e. the model overfits the
# small corpus long before max_iters. Consider fewer iterations or keeping the
# checkpoint with the best validation loss.
for step in range(max_iters):
    # Every eval_interval steps (including step 0) report both losses with a timestamp.
    if not step % eval_interval:
        losses = eval_train_valid(model, train_data, valid_data)
        t = current_time()
        print(f"{t} | step {step} | "
              f"train loss {losses['train']:.4f} | valid loss {losses['valid']:.4f}")

    # Sample a training batch and move inputs and targets to the model's device.
    xb, yb = (part.to(DEVICE) for part in get_batch(train_data))

    # Drop the gradients of the previous step before computing new ones.
    optimizer.zero_grad(set_to_none=True)

    # Forward pass: the model returns the logits together with the loss.
    logits, loss = model(xb, yb)

    # Backward pass and parameter update.
    loss.backward()
    optimizer.step()

print("Entrenamiento terminado.")


21:04:06 | step 0 | train loss 5.0565 | valid loss 5.0319
21:06:41 | step 500 | train loss 0.1544 | valid loss 4.8930
21:09:38 | step 1000 | train loss 0.1231 | valid loss 5.5764
21:12:29 | step 1500 | train loss 0.1170 | valid loss 5.9692
21:15:29 | step 2000 | train loss 0.1148 | valid loss 6.1883
21:18:19 | step 2500 | train loss 0.1124 | valid loss 6.6059
21:21:08 | step 3000 | train loss 0.1143 | valid loss 6.5490
21:23:57 | step 3500 | train loss 0.1095 | valid loss 6.8801
21:26:47 | step 4000 | train loss 0.1105 | valid loss 7.1285
21:29:37 | step 4500 | train loss 0.1102 | valid loss 7.2325
Entrenamiento terminado.


In [12]:
# --- Cell 8: save the trained model ---
# Create the output directory if it does not exist yet.
os.makedirs("assets/models", exist_ok=True)
model_path = "assets/models/model_es.pt"

# NOTE(review): this pickles the whole model object rather than its state_dict, so
# loading the file requires the GPTLanguageModel class to be importable (and, on
# recent PyTorch versions, torch.load(..., weights_only=False)). Confirm that this
# is the format downstream consumers expect before switching to a state_dict.
torch.save(model, model_path)
print(f"Modelo guardado en: {model_path}")


Modelo guardado en: assets/models/model_es.pt


In [None]:
# --- Cell 9: Spanish text generation (demo) ---

from config import end_token

# Start from a simple prompt: for example a contact's name or a single word.
prompt_tokens = ["Hola", ":"]
# Reshape the encoded prompt to (1, T) and move it to the device the model was
# placed on, mirroring what the training loop does with its batches.
x0 = encode(prompt_tokens, vocab).view(1, -1).to(DEVICE)  # (1, T)

# Training leaves the model in train mode and the network contains Dropout(p=0.2)
# layers, so switch to evaluation mode to keep dropout out of the generation.
model.eval()
with torch.no_grad():
    # generate() is called without extra keyword arguments: passing
    # max_new_tokens raised a TypeError with this model.
    generated = model.generate(x0, vocab)

texto = decode(generated, vocab)
print("PROMPT :", " ".join(prompt_tokens))
print("SALIDA :", texto)

TypeError: GPTLanguageModel.generate() got an unexpected keyword argument 'max_new_tokens'