# LSTM_torch_v3 — LSTM + Random Search (PyTorch)

**Objetivo:** ajuste de hiperparámetros sobre un modelo **LSTM** base.  
**dataset:** `songdb_funcional_v4`

**Estructura:**  
1) Entorno (semillas, GPU/versions).  
2) Configuración.  
3) Carga de CSV + tokenización.  
4) Split train/val/test.  
5) Vocab + codificación.  
6) Dataset + DataLoaders.  
7) Modelo LSTM con `tie_weights` opcional.  
8) Métricas y entrenamiento (Top@k, MRR, PPL).  
9) A/B Test: selección manual de hiperparámetros.  
10) Random Search (espacio de búsqueda, tabla de resultados, checkpoints).  
11) Empaquetar el mejor checkpoint con metadatos y nombre informativo.

## 1) Entorno

In [None]:
#@title  Semillas y determinismo
import random, os, numpy as np, torch
# One seed shared by Python's `random`, NumPy and PyTorch.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
# Seed every CUDA device too when a GPU is available.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
# cuDNN flags: benchmark mode off, deterministic mode on.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


In [None]:
#@title Comprobar GPU/versions
import sys, torch
# Report interpreter / framework versions and whether CUDA can be used.
_has_cuda = torch.cuda.is_available()
print("Python:", sys.version)  # the author noted 3.11.13 here
print("PyTorch:", torch.__version__)  # the author noted 2.6.0+cu124 here
print("CUDA disponible:", _has_cuda)
# Either name the first GPU or remind the user how to enable one.
if not _has_cuda:
    print("⚠️ Activa GPU: Runtime ▶ Change runtime type ▶ GPU")
else:
    print("GPU:", torch.cuda.get_device_name(0))


Python: 3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]
PyTorch: 2.8.0+cu126
CUDA disponible: True
GPU: Tesla T4


## 2) Configuración

In [None]:
from dataclasses import dataclass

@dataclass
class Config:
    """Base configuration for data loading, splitting, training and saving.

    The training fields are defaults only: the random search further down the
    notebook samples its own values for several of them.
    """
    # Path to the CSV.
    # NOTE(review): the recorded cell output and the upload log mention
    # "songdb_funcional_v4_2.csv" — confirm which file is intended.
    data_path: str = "/content/songdb_funcional_v4.csv"
    # Column holding the sequence (switch to "chordprog" if preferred)
    sequence_col: str = "funcional_prog"

    # Splits
    val_size: float = 0.10
    test_size: float = 0.10
    random_state: int = 42

    # Base training settings (may be overridden by the random search)
    seq_len: int = 24
    batch_size: int = 128
    epochs: int = 6
    lr: float = 2e-3
    weight_decay: float = 1e-4
    dropout: float = 0.2
    embedding_dim: int = 128
    hidden_size: int = 256
    num_layers: int = 2
    grad_clip: float = 1.0
    amp: bool = True
    # Scheduler
    scheduler: str = "cosine"       # {"none","onecycle","cosine"}
    pct_start: float = 0.15         # only used when scheduler="onecycle"
    div_factor: float = 10.0        # only used when scheduler="onecycle"
    final_div_factor: float = 1e3   # only used when scheduler="onecycle"

    # Saving
    save_dir: str = "/content/models_rs"
    tokenizer_path: str = "lstm_tokenizer_.json"

    # Filtering: minimum number of tokens a progression must have.
    # Was written as `8.` (the float 8.0), contradicting the `int` annotation.
    min_seq_len: int = 8  # try different values

cfg = Config()
print(cfg)


Config(data_path='/content/songdb_funcional_v4_2.csv', sequence_col='funcional_prog', val_size=0.1, test_size=0.1, random_state=42, seq_len=24, batch_size=128, epochs=6, lr=0.002, weight_decay=0.0001, dropout=0.2, embedding_dim=128, hidden_size=256, num_layers=2, grad_clip=1.0, amp=True, scheduler='cosine', pct_start=0.15, div_factor=10.0, final_div_factor=1000.0, save_dir='/content/models_rs', tokenizer_path='lstm_tokenizer_.json', min_seq_len=8.0)


## 3) Cargar CSV y tokenizar

In [None]:
#@title opción A) Subir archivo
# Option A: upload the CSV through the Colab file picker.
# The step stays best-effort (the notebook may run outside Colab, or the data
# may come from Drive), but the reason for skipping it is now printed instead
# of being discarded, so a later "file not found" is easier to trace.
try:
    from google.colab import files
    uploaded = files.upload()
except Exception as exc:  # deliberately broad: this step is optional
    print(f"[upload] omitido: {type(exc).__name__}: {exc}")

Saving songdb_funcional_v4_2.csv to songdb_funcional_v4_2.csv


In [None]:
#@title opción B) montar desde Google Drive
# Option B: read the data from Google Drive.
# Only the import is attempted here; mounting is left commented out on purpose.
# A failed import is reported rather than silently ignored.
try:
    from google.colab import drive
    # drive.mount('/content/drive') # Uncomment to use Drive
except Exception as exc:  # deliberately broad: this step is optional
    print(f"[drive] omitido: {type(exc).__name__}: {exc}")

In [None]:
#@title Tokenización de las progresiones funcionales
import pandas as pd, ast, re, json

# Load the raw CSV and check that the configured sequence column exists.
df = pd.read_csv(cfg.data_path)
_seq_col = cfg.sequence_col
assert _seq_col in df.columns, f"Columna {_seq_col} no encontrada en el CSV."
print("Filas totales:", len(df))
# Preview the first three rows of the sequence column only.
display(df[[_seq_col]].head(3))

def parse_tokens_simple(s: str):
    """Turn one raw progression cell into a list of string tokens.

    Two input shapes are handled:

    * a string that looks like a Python list literal (``"['ii', 'V7']"``) is
      parsed with ``ast.literal_eval`` and every element is stringified;
    * anything else is converted with ``str()``, bars (``|``) and newlines are
      replaced by spaces, and the text is split on whitespace.

    Missing values (``None`` or a float NaN, which is what pandas yields for
    an empty CSV cell) return an empty list. Previously they became the bogus
    single token ``'None'`` / ``'nan'``.

    Args:
        s: Raw cell value; normally a string.

    Returns:
        List of token strings (possibly empty).
    """
    # NaN is the only float that differs from itself.
    if s is None or (isinstance(s, float) and s != s):
        return []

    if isinstance(s, str):
        stripped = s.strip()
        if stripped.startswith("[") and stripped.endswith("]"):
            try:
                parsed = ast.literal_eval(stripped)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Not a valid literal: fall through to whitespace splitting.
                parsed = None
            if isinstance(parsed, list):
                return [str(t) for t in parsed]

    # Normalise separators and split on whitespace; \S+ never yields empty
    # matches, so no extra filtering is needed.
    text = str(s).replace("|", " ").replace("\n", " ")
    return re.findall(r"\S+", text)

# Tokenise every progression, then keep only the rows that have at least
# cfg.min_seq_len tokens.
df["_tokens_"] = df[cfg.sequence_col].apply(parse_tokens_simple)
_long_enough = df["_tokens_"].apply(len) >= cfg.min_seq_len
df = df.loc[_long_enough].reset_index(drop=True)
print("Filas tras filtro min_seq_len:", len(df))


Filas totales: 2613


Unnamed: 0,funcional_prog
0,vi #ivø V/III V/VI vi IV ii V7 iii vi ii V7 I ...
1,VII VII I vi ii V7 VII VII I vi ii V7 I IV #iv...
2,i VI V/V V7 i VI V/V V7 i VI iiø V7 i VI iiø V...


Filas tras filtro min_seq_len: 2612


## 4) Split train/val/test (simple, por filas)

In [None]:

from sklearn.model_selection import train_test_split
# Two-stage split: first carve out a combined hold-out set (val + test),
# then divide that hold-out into validation and test.
_holdout_frac = cfg.val_size + cfg.test_size
train_df, tmp_df = train_test_split(
    df, test_size=_holdout_frac, random_state=cfg.random_state, shuffle=True
)
# Fraction of the hold-out assigned to test (0.5 when both sizes add up to 0).
rel_test = cfg.test_size / _holdout_frac if _holdout_frac > 0 else 0.5
val_df, test_df = train_test_split(
    tmp_df, test_size=rel_test, random_state=cfg.random_state, shuffle=True
)

# Plain Python lists of token lists, one per split.
train_seqs, val_seqs, test_seqs = (
    part["_tokens_"].tolist() for part in (train_df, val_df, test_df)
)
print(len(train_seqs), len(val_seqs), len(test_seqs))


2089 261 262


## 5) Vocabulario y codificación

In [None]:
from collections import Counter
# Reserved tokens; they always occupy ids 0-3 in that order.
PAD, UNK, BOS, EOS = "<pad>", "<unk>", "<bos>", "<eos>"

def build_vocab(seqs, min_freq=1):
    """Build the vocabulary and both lookup tables from token sequences.

    Args:
        seqs: Iterable of token sequences.
        min_freq: Minimum number of occurrences for a token to be kept.

    Returns:
        ``(vocab, stoi, itos)``: the token list (special tokens first, then
        the kept tokens in first-seen order), the token->id dict and the
        id->token dict.
    """
    specials = (PAD, UNK, BOS, EOS)
    counts = Counter()
    for seq in seqs:
        counts.update(seq)
    kept = [
        tok for tok, freq in counts.items()
        if freq >= min_freq and tok not in specials
    ]
    vocab = list(specials) + kept
    stoi = dict(zip(vocab, range(len(vocab))))      # token -> id
    itos = {idx: tok for tok, idx in stoi.items()}  # id -> token
    return vocab, stoi, itos

# The vocabulary is built from the training split only.
vocab, stoi, itos = build_vocab(train_seqs, min_freq=1)
print("Vocab size:", len(vocab))

# Persist the ordered token list so the id mapping can be rebuilt later.
_tokenizer_payload = {"vocab": list(vocab)}
with open(cfg.tokenizer_path, "w") as f:
    json.dump(_tokenizer_payload, f, ensure_ascii=False, indent=2)

def encode(seq, add_bos=True):
    """Map a token sequence to ids using the module-level ``stoi`` table.

    Tokens missing from ``stoi`` are mapped to the id of ``UNK``. When
    ``add_bos`` is true the id of ``BOS`` is placed first.
    """
    token_ids = [stoi.get(tok, stoi[UNK]) for tok in seq]
    if add_bos:
        return [stoi[BOS]] + token_ids
    return token_ids


Vocab size: 86


## 6) Dataset (context→next) + DataLoaders

In [None]:

import torch
from torch.utils.data import Dataset, DataLoader
# Use the GPU when CUDA is available, otherwise the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

class NextTokenDataset(Dataset):
    """Sliding-window (context -> next token) samples.

    Each sequence is encoded with ``encode(..., add_bos=True)``. For every
    position ``i >= seq_len`` one sample is stored: the ``seq_len`` ids before
    ``i`` as context and the id at ``i`` as target. Sequences whose encoded
    length does not exceed ``seq_len`` contribute no samples.
    """

    def __init__(self, sequences, seq_len):
        self.samples = []
        for tokens in sequences:
            ids = encode(tokens, add_bos=True)
            if len(ids) > seq_len:
                self.samples.extend(
                    (ids[end - seq_len:end], ids[end])
                    for end in range(seq_len, len(ids))
                )

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        context, target = self.samples[idx]
        # Both are returned as int64 tensors.
        return (
            torch.tensor(context, dtype=torch.long),
            torch.tensor(target, dtype=torch.long),
        )

def make_dataloaders(train_seqs, val_seqs, test_seqs, seq_len, batch_size):
    """Build the three DataLoaders for a given window length and batch size.

    Each split is wrapped in a ``NextTokenDataset``. The training loader
    shuffles and drops the last incomplete batch; the validation and test
    loaders keep the sample order and every sample.

    Returns:
        ``(train_loader, val_loader, test_loader, n_test)`` where ``n_test``
        is the number of samples in the test dataset.
    """
    train_data, val_data, test_data = (
        NextTokenDataset(split, seq_len)
        for split in (train_seqs, val_seqs, test_seqs)
    )
    train_loader = DataLoader(
        train_data, batch_size=batch_size, shuffle=True, drop_last=True
    )
    val_loader, test_loader = (
        DataLoader(data, batch_size=batch_size, shuffle=False)
        for data in (val_data, test_data)
    )
    return train_loader, val_loader, test_loader, len(test_data)


## 7) Modelo LSTM (con `tie_weights` opcional)

In [None]:

import torch.nn as nn

class ChordLSTM(nn.Module):
    """Simple LSTM next-token model with optional weight tying.

    With ``tie_weights=True`` the output layer shares its weight matrix with
    the embedding. If ``hidden_size != embedding_dim`` a bias-free projection
    H->E is inserted before that shared decoder; otherwise the projection is
    an identity. With ``tie_weights=False`` a separate linear layer maps the
    hidden state to vocabulary logits.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, dropout, tie_weights=False):
        super().__init__()
        # The LSTM is given dropout only when it has more than one layer.
        lstm_dropout = dropout if num_layers > 1 else 0.0

        self.emb = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(
            embedding_dim,
            hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=lstm_dropout,
        )
        self.dropout = nn.Dropout(dropout)
        self.tie_weights = tie_weights

        if not tie_weights:
            self.fc = nn.Linear(hidden_size, vocab_size)
        else:
            if hidden_size == embedding_dim:
                self.proj = nn.Identity()
            else:
                self.proj = nn.Linear(hidden_size, embedding_dim, bias=False)
            self.decoder = nn.Linear(embedding_dim, vocab_size, bias=False)
            # Share one parameter between the embedding and the decoder.
            self.decoder.weight = self.emb.weight

    def forward(self, x):
        """Return logits (B, V) for the token following each context in ``x`` (B, T)."""
        outputs, _ = self.rnn(self.emb(x))             # (B, T, H)
        last_hidden = self.dropout(outputs[:, -1, :])  # (B, H)
        if not self.tie_weights:
            return self.fc(last_hidden)                # (B, V)
        return self.decoder(self.proj(last_hidden))    # (B, V)


## 8) Métricas y entrenamiento (Top@k, MRR, PPL)

In [None]:

import math, time, os, torch
import torch.nn.functional as F
from torch.amp import GradScaler, autocast

def topk_metrics(logits, targets, ks=(1,3,5)):
    """Compute Top@k accuracy for each ``k`` in ``ks`` plus the MRR.

    Args:
        logits: Score tensor, indexed here as (batch, classes).
        targets: Tensor of gold class ids, one per row of ``logits``.
        ks: Cut-offs for the Top@k hit rates.

    Returns:
        Dict with one ``"Top@{k}"`` entry per cut-off and an ``"MRR"`` entry,
        all Python floats averaged over the batch.
    """
    metrics = {}
    with torch.no_grad():
        gold = targets.unsqueeze(1)
        for k in ks:
            in_top_k = (logits.topk(k, dim=-1).indices == gold).any(dim=1)
            metrics[f"Top@{k}"] = in_top_k.float().mean().item()
        # 1-based position of the gold class in the descending ranking.
        ordering = logits.argsort(dim=-1, descending=True)
        ranks = (ordering == gold).nonzero(as_tuple=False)[:, 1] + 1
        metrics["MRR"] = (1.0 / ranks.float()).mean().item()
    return metrics

def evaluate(model, loader, criterion, device):
    """Evaluate ``model`` on every batch of ``loader``.

    The loss and the Top@1/3/5 and MRR values are averaged over samples
    (each batch is weighted by its size).

    Returns:
        Dict with ``"loss"``, ``"ppl"`` (``exp`` of the mean loss), ``"Top@1"``,
        ``"Top@3"``, ``"Top@5"`` and ``"MRR"``. For an empty loader the
        divisor falls back to 1.
    """
    model.eval()
    metric_names = ("Top@1", "Top@3", "Top@5", "MRR")
    sums = dict.fromkeys(metric_names, 0.0)
    loss_sum, n_seen = 0.0, 0

    with torch.no_grad():
        for x, y in loader:
            x = x.to(device)
            y = y.to(device)
            logits = model(x)
            batch_n = x.size(0)
            loss_sum += criterion(logits, y).item() * batch_n
            n_seen += batch_n
            batch_metrics = topk_metrics(logits, y)
            for name in metric_names:
                sums[name] += batch_metrics[name] * batch_n

    denom = max(1, n_seen)
    mean_loss = loss_sum / denom
    averaged = {name: sums[name] / denom for name in metric_names}
    return {"loss": mean_loss, "ppl": math.exp(mean_loss), **averaged}

def train_once(model, train_loader, val_loader, epochs, lr, weight_decay,
               grad_clip=1.0, amp=True, save_path=None, label_smoothing=0.0,
               patience=2, device=torch.device("cpu"),
               scheduler_type="cosine", pct_start=0.15, div_factor=10.0,
               final_div_factor=1e3):
    """Train ``model`` with AdamW and early stopping on validation MRR.

    Args:
        model: Module mapping a batch ``x`` to logits.
        train_loader: Loader of ``(x, y)`` training batches.
        val_loader: Loader evaluated with ``evaluate`` after every epoch.
        epochs: Maximum number of epochs.
        lr: AdamW learning rate (also ``max_lr`` for the one-cycle scheduler).
        weight_decay: AdamW weight decay.
        grad_clip: Max gradient norm; ``None`` disables clipping.
        amp: Use mixed precision; only takes effect on a CUDA device.
        save_path: If given, the model state is written there every time the
            validation MRR improves.
        label_smoothing: Label smoothing of the cross-entropy used for both
            training and validation.
        patience: Number of consecutive non-improving epochs before stopping.
        device: Device the batches are moved to.
        scheduler_type: ``"cosine"`` (default), ``"onecycle"``; any other
            value disables the scheduler.
        pct_start, div_factor, final_div_factor: Only used when
            ``scheduler_type == "onecycle"``.

    Returns:
        Best validation MRR seen (``-1.0`` if no epoch ran). On return the
        model holds the weights of that best epoch.

    Notes:
        - The scheduler is stepped once per batch
          (``T_max = epochs * steps_per_epoch`` for cosine).
        - The best state is kept in memory on the CPU and restored at the end.
    """
    use_amp = amp and device.type == 'cuda'
    scaler = GradScaler('cuda' if device.type == 'cuda' else 'cpu', enabled=use_amp)
    crit = torch.nn.CrossEntropyLoss(label_smoothing=label_smoothing)
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    # Scheduler
    steps_per_epoch = len(train_loader)
    if scheduler_type == "onecycle":
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            opt, max_lr=lr, steps_per_epoch=steps_per_epoch, epochs=epochs,
            pct_start=pct_start,
            anneal_strategy='cos',
            div_factor=div_factor,
            final_div_factor=final_div_factor
        )
    elif scheduler_type == "cosine":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            opt, T_max=epochs * steps_per_epoch
        )
    else:
        scheduler = None

    best_mrr, best_state = -1.0, None
    no_improve = 0

    for ep in range(1, epochs+1):
        model.train()
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            opt.zero_grad(set_to_none=True)
            with autocast('cuda', enabled=use_amp):
                logits = model(x)
                loss = crit(logits, y)
            scaler.scale(loss).backward()
            if grad_clip is not None:
                # Unscale first so the clipping threshold applies to the
                # true gradient norm.
                scaler.unscale_(opt)
                torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            scaler.step(opt)
            scaler.update()
            if scheduler is not None:
                scheduler.step()

        valm = evaluate(model, val_loader, crit, device)
        print(f"Epoch {ep} | val loss {valm['loss']:.4f} ppl {valm['ppl']:.2f} Top@1 {valm['Top@1']:.3f} Top@3 {valm['Top@3']:.3f} Top@5 {valm['Top@5']:.3f} MRR {valm['MRR']:.3f}")
        if valm["MRR"] > best_mrr:
            best_mrr, no_improve = valm["MRR"], 0
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            if save_path:
                # A bare filename has an empty dirname, and os.makedirs("")
                # raises; only create the directory when there is one.
                ckpt_dir = os.path.dirname(save_path)
                if ckpt_dir:
                    os.makedirs(ckpt_dir, exist_ok=True)
                torch.save({"model_state": model.state_dict()}, save_path)
        else:
            no_improve += 1
            if no_improve >= patience:
                print(f"⏹️ Early stopping (patience={patience})")
                break

    if best_state is not None:
        model.load_state_dict(best_state)
    return best_mrr


## 9) A/B Tests: comparativa de parámetros

In [None]:
#@title A/B helper
import copy, torch.nn as nn, random, numpy as np, torch
from dataclasses import replace

def _reseed(seed: int):
    """Re-seed Python's ``random``, NumPy and PyTorch (plus CUDA when available)."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

def run_single(cfg_local, *, tie_weights=False, label_smoothing=0.05,
               test_label_smoothing=0.0):
    """Run one training + test evaluation with the given config (no random search).

    Args:
        cfg_local: Config object providing the hyperparameters.
        tie_weights: Quick switch for running with / without weight tying.
        label_smoothing: Label smoothing used for training and validation.
        test_label_smoothing: Label smoothing of the test criterion
            (0.0 keeps runs comparable).

    Returns:
        The metrics dict produced by ``evaluate`` on the test loader.
    """
    # Re-seed so that A/B runs are comparable.
    _reseed(cfg_local.random_state)

    # Data loaders (the trailing test-set size is not needed here).
    train_loader_t, val_loader_t, test_loader_t, _ = make_dataloaders(
        train_seqs, val_seqs, test_seqs,
        seq_len=cfg_local.seq_len, batch_size=cfg_local.batch_size
    )

    # Model
    model_kwargs = dict(
        vocab_size=len(vocab),
        embedding_dim=cfg_local.embedding_dim,
        hidden_size=cfg_local.hidden_size,
        num_layers=cfg_local.num_layers,
        dropout=cfg_local.dropout,
        tie_weights=tie_weights,
    )
    model = ChordLSTM(**model_kwargs).to(device)

    # Training (uses the scheduler named in cfg_local); the returned best
    # validation MRR is not used.
    train_kwargs = dict(
        epochs=cfg_local.epochs,
        lr=cfg_local.lr,
        weight_decay=cfg_local.weight_decay,
        grad_clip=cfg_local.grad_clip,
        amp=cfg_local.amp,
        save_path=None,
        label_smoothing=label_smoothing,
        patience=2,
        device=device,
        scheduler_type=cfg_local.scheduler,
        pct_start=cfg_local.pct_start,
        div_factor=cfg_local.div_factor,
        final_div_factor=cfg_local.final_div_factor,
    )
    train_once(model, train_loader_t, val_loader_t, **train_kwargs)

    # Test (choose consistency with training or comparability across runs).
    test_crit = nn.CrossEntropyLoss(label_smoothing=test_label_smoothing)
    return evaluate(model, test_loader_t, test_crit, device)


In [None]:
#@title Mini A/B test function: compara Top@1 y MRR y muestra difs ===
from dataclasses import replace

def compare_ab(cfg_A, cfg_B, *,
               tie_A=False, tie_B=False,
               ls_train=0.05, ls_test=0.0,
               label_A="A", label_B="B"):
    """Run arms A and B and print Top@1 / MRR for each plus the B − A deltas.

    Both arms go through ``run_single`` with the same label-smoothing
    settings; they differ only in what ``cfg_A``/``cfg_B`` and
    ``tie_A``/``tie_B`` specify.

    Args:
        cfg_A, cfg_B: Config for each arm.
        tie_A, tie_B: Enable/disable ``tie_weights`` per arm.
        ls_train: Label smoothing used in training/validation.
        ls_test: Label smoothing of the test criterion (0.0 for comparability).
        label_A, label_B: Names printed for each arm.

    Returns:
        ``(resA, resB)``: the test-metric dicts of the two arms.
    """
    arms = ((label_A, cfg_A, tie_A), (label_B, cfg_B, tie_B))
    outcomes = []
    for label, arm_cfg, tie in arms:
        print(f"\n>>> Run {label}")
        res = run_single(arm_cfg, tie_weights=tie,
                         label_smoothing=ls_train, test_label_smoothing=ls_test)
        print(f"{label}: Top@1={res['Top@1']:.4f}  MRR={res['MRR']:.4f}")
        outcomes.append(res)

    resA, resB = outcomes
    d_top1, d_mrr = (resB[key] - resA[key] for key in ("Top@1", "MRR"))
    print("\nΔ (B − A):  Top@1={:+.4f}  MRR={:+.4f}".format(d_top1, d_mrr))

    return resA, resB


In [None]:
#@title 9.1) A/B test: scheduler 'none' vs 'onecycle' (mismos hparams)
# Same hyperparameters in both arms; only the LR scheduler differs.
cfgA, cfgB = (replace(cfg, scheduler=name) for name in ("none", "onecycle"))
compare_ab(
    cfgA, cfgB,
    label_A="none", label_B="onecycle",
    tie_A=False, tie_B=False,
    ls_train=0.05, ls_test=0.0,
)


>>> Run none
Epoch 1 | val loss 2.4178 ppl 11.22 Top@1 0.433 Top@3 0.670 Top@5 0.759 MRR 0.577
Epoch 2 | val loss 2.3498 ppl 10.48 Top@1 0.445 Top@3 0.685 Top@5 0.772 MRR 0.591
Epoch 3 | val loss 2.3194 ppl 10.17 Top@1 0.457 Top@3 0.691 Top@5 0.775 MRR 0.599
Epoch 4 | val loss 2.3177 ppl 10.15 Top@1 0.460 Top@3 0.691 Top@5 0.776 MRR 0.600
Epoch 5 | val loss 2.3246 ppl 10.22 Top@1 0.465 Top@3 0.694 Top@5 0.775 MRR 0.604
Epoch 6 | val loss 2.3393 ppl 10.37 Top@1 0.460 Top@3 0.690 Top@5 0.767 MRR 0.599
none: Top@1=0.4390  MRR=0.5845

>>> Run onecycle
Epoch 1 | val loss 2.5604 ppl 12.94 Top@1 0.412 Top@3 0.629 Top@5 0.722 MRR 0.550
Epoch 2 | val loss 2.3916 ppl 10.93 Top@1 0.437 Top@3 0.675 Top@5 0.766 MRR 0.581
Epoch 3 | val loss 2.3299 ppl 10.28 Top@1 0.451 Top@3 0.687 Top@5 0.768 MRR 0.594
Epoch 4 | val loss 2.3017 ppl 9.99 Top@1 0.465 Top@3 0.695 Top@5 0.778 MRR 0.605
Epoch 5 | val loss 2.2936 ppl 9.91 Top@1 0.466 Top@3 0.697 Top@5 0.779 MRR 0.606
Epoch 6 | val loss 2.2991 ppl 9.97 To

({'loss': 2.1717863362683265,
  'ppl': 8.77394326332569,
  'Top@1': 0.4390402926212962,
  'Top@3': 0.6824879387644003,
  'Top@5': 0.7615073675639255,
  'MRR': 0.5844942458233353},
 {'loss': 2.132555847358231,
  'ppl': 8.436401434432007,
  'Top@1': 0.44425609648375364,
  'Top@3': 0.6866605818574751,
  'Top@5': 0.7658104057619167,
  'MRR': 0.588845919623868})

In [None]:
#@title 9.2) A/B test: scheduler 'cosine' vs 'onecycle'
# Same hyperparameters in both arms; only the LR scheduler differs.
cfgA, cfgB = (replace(cfg, scheduler=name) for name in ("cosine", "onecycle"))
compare_ab(
    cfgA, cfgB,
    label_A="cosine", label_B="onecycle",
    tie_A=False, tie_B=False,
    ls_train=0.05, ls_test=0.0,
)


>>> Run cosine
Epoch 1 | val loss 2.4219 ppl 11.27 Top@1 0.430 Top@3 0.672 Top@5 0.756 MRR 0.576
Epoch 2 | val loss 2.3421 ppl 10.40 Top@1 0.446 Top@3 0.684 Top@5 0.775 MRR 0.592
Epoch 3 | val loss 2.3156 ppl 10.13 Top@1 0.454 Top@3 0.693 Top@5 0.774 MRR 0.598
Epoch 4 | val loss 2.3064 ppl 10.04 Top@1 0.461 Top@3 0.696 Top@5 0.778 MRR 0.602
Epoch 5 | val loss 2.3017 ppl 9.99 Top@1 0.466 Top@3 0.694 Top@5 0.779 MRR 0.605
Epoch 6 | val loss 2.3053 ppl 10.03 Top@1 0.466 Top@3 0.695 Top@5 0.777 MRR 0.605
cosine: Top@1=0.4463  MRR=0.5903

>>> Run onecycle
Epoch 1 | val loss 2.4679 ppl 11.80 Top@1 0.421 Top@3 0.655 Top@5 0.749 MRR 0.566
Epoch 2 | val loss 2.3614 ppl 10.61 Top@1 0.444 Top@3 0.680 Top@5 0.769 MRR 0.588
Epoch 3 | val loss 2.3139 ppl 10.11 Top@1 0.457 Top@3 0.691 Top@5 0.775 MRR 0.599
Epoch 4 | val loss 2.3001 ppl 9.97 Top@1 0.461 Top@3 0.695 Top@5 0.782 MRR 0.602
Epoch 5 | val loss 2.2974 ppl 9.95 Top@1 0.464 Top@3 0.697 Top@5 0.780 MRR 0.605
Epoch 6 | val loss 2.3019 ppl 9.99

({'loss': 2.139258915076341,
  'ppl': 8.493141157198066,
  'Top@1': 0.4463424180380632,
  'Top@3': 0.687051767141136,
  'Top@5': 0.7686790978913202,
  'MRR': 0.5902759316054919},
 {'loss': 2.1423883281423737,
  'ppl': 8.519761335148631,
  'Top@1': 0.4451688621870805,
  'Top@3': 0.6858782112823811,
  'Top@5': 0.7655496155831724,
  'MRR': 0.5893271848343702})

In [None]:
#@title 9.3) A/B test: tie_weights False vs True (con mismo scheduler)
# One shared config for both arms; only weight tying changes between them.
cfg_fix = replace(cfg, scheduler="onecycle")  # or "none"
compare_ab(
    cfg_fix, cfg_fix,
    label_A="tie=False", label_B="tie=True",
    tie_A=False, tie_B=True,
    ls_train=0.05, ls_test=0.0,
)



>>> Run tie=False
Epoch 1 | val loss 2.4679 ppl 11.80 Top@1 0.421 Top@3 0.655 Top@5 0.749 MRR 0.566
Epoch 2 | val loss 2.3614 ppl 10.61 Top@1 0.444 Top@3 0.680 Top@5 0.769 MRR 0.588
Epoch 3 | val loss 2.3139 ppl 10.11 Top@1 0.457 Top@3 0.691 Top@5 0.775 MRR 0.599
Epoch 4 | val loss 2.3001 ppl 9.97 Top@1 0.461 Top@3 0.695 Top@5 0.782 MRR 0.602
Epoch 5 | val loss 2.2974 ppl 9.95 Top@1 0.464 Top@3 0.697 Top@5 0.780 MRR 0.605
Epoch 6 | val loss 2.3019 ppl 9.99 Top@1 0.465 Top@3 0.696 Top@5 0.780 MRR 0.605
tie=False: Top@1=0.4452  MRR=0.5893

>>> Run tie=True
Epoch 1 | val loss 2.5061 ppl 12.26 Top@1 0.401 Top@3 0.666 Top@5 0.751 MRR 0.556
Epoch 2 | val loss 2.3819 ppl 10.83 Top@1 0.436 Top@3 0.680 Top@5 0.771 MRR 0.583
Epoch 3 | val loss 2.3329 ppl 10.31 Top@1 0.453 Top@3 0.692 Top@5 0.774 MRR 0.596
Epoch 4 | val loss 2.3205 ppl 10.18 Top@1 0.458 Top@3 0.697 Top@5 0.779 MRR 0.601
Epoch 5 | val loss 2.3150 ppl 10.12 Top@1 0.458 Top@3 0.699 Top@5 0.780 MRR 0.602
Epoch 6 | val loss 2.3292 pp

({'loss': 2.1423883281423737,
  'ppl': 8.519761335148631,
  'Top@1': 0.4451688621870805,
  'Top@3': 0.6858782112823811,
  'Top@5': 0.7655496155831724,
  'MRR': 0.5893271848343702},
 {'loss': 2.151621562030328,
  'ppl': 8.598790570479268,
  'Top@1': 0.44855913470506137,
  'Top@3': 0.6884861132019516,
  'Top@5': 0.7707654194378576,
  'MRR': 0.5918436967606867})

Se probaron distintos parámetros de entrenamiento:
- `scheduler`: 'cosine', 'onecycle', 'none'
- `tie_weights`: True, False.

Todas ellas aportaron ligeras mejoras en métricas Top@1/MRR.

Finalmente, se seleccionó la configuración con `scheduler='cosine'` y `tie_weights=True`, aunque las diferencias respecto a 'onecycle' sin tying fueron muy pequeñas (del orden de milésimas en Top@1/MRR) y no pueden considerarse estadísticamente significativas.

## 10) Random Search (espacio de búsqueda, resultados, checkpoints)

In [None]:

import math, time, copy, random, os, gc
import pandas as pd
from pprint import pprint
import torch.nn as nn

# Use the GPU when CUDA is available, otherwise the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

def log_uniform(a,b):
    """Draw a value between ``a`` and ``b`` uniformly in log space."""
    log_low = math.log(a)
    log_high = math.log(b)
    exponent = random.uniform(log_low, log_high)
    return math.exp(exponent)

def sample_cfg(base_cfg_dict):
    """Return a copy of ``base_cfg_dict`` with randomly sampled hyperparameters.

    Architecture, optimiser, data and label-smoothing values are drawn with
    the global ``random`` module. ``tie_weights`` and ``scheduler`` are fixed
    (chosen from the A/B tests). The one-cycle parameters, ``epochs`` and
    ``amp`` are taken from the base config, and ``patience`` is set to 2.
    The input dict is not modified.
    """
    cfg_local = copy.deepcopy(base_cfg_dict)
    pick = random.choice

    # The dict literal is evaluated top to bottom, which fixes the order of
    # the random draws.
    cfg_local.update({
        # Architecture
        "embedding_dim": pick([128,160,192]),
        "hidden_size": pick([256,320,384]),
        "num_layers": pick([1,2]),
        "dropout": pick([0.2,0.3]),
        # Optimiser
        "lr": log_uniform(5e-4, 5e-3),
        "weight_decay": pick([0.0, 1e-4, 5e-4]),
        "grad_clip": pick([0.5, 1.0]),
        # Data and task
        "batch_size": pick([64, 128]),
        "seq_len": pick([16, 24]),
        # Criterion and regularisation
        "label_smoothing": pick([0.03, 0.05]),
        "tie_weights": True,    # fixed by the A/B tests
        "scheduler": "cosine",  # fixed by the A/B tests
    })

    # One-cycle parameters (unused with "cosine"; keys kept for compatibility).
    for key, fallback in (("pct_start", 0.15), ("div_factor", 10.0), ("final_div_factor", 1e3)):
        cfg_local[key] = base_cfg_dict.get(key, fallback)

    # Training
    cfg_local["epochs"] = base_cfg_dict["epochs"]
    cfg_local["patience"] = 2
    cfg_local["amp"] = base_cfg_dict["amp"]
    return cfg_local

BASE = cfg  # dataclass
RESULTS = []
BEST = {"mrr": -1, "path": None, "cfg": None, "test": None}

N_TRIALS = 20  # number of combinations to try

# Sampled hyperparameters echoed for each trial and stored in its result row
# (defined once; the list used to be written out twice).
_TRIAL_KEYS = ["embedding_dim", "hidden_size", "num_layers", "dropout",
               "seq_len", "batch_size", "lr", "weight_decay", "grad_clip",
               "label_smoothing"]

for t in range(1, N_TRIALS+1):
    trial_cfg = sample_cfg(vars(BASE))  # dict
    print(f"\n=== TRIAL {t}/{N_TRIALS} ===")
    # The stray trailing comma after this call was removed.
    print({k: trial_cfg[k] for k in _TRIAL_KEYS})

    # Trial-specific DataLoaders
    train_loader_t, val_loader_t, test_loader_t, n_test = make_dataloaders(
        train_seqs, val_seqs, test_seqs,
        seq_len=trial_cfg["seq_len"], batch_size=trial_cfg["batch_size"]
    )

    # Model for this trial
    model_t = ChordLSTM(vocab_size=len(vocab),
                        embedding_dim=trial_cfg["embedding_dim"],
                        hidden_size=trial_cfg["hidden_size"],
                        num_layers=trial_cfg["num_layers"],
                        dropout=trial_cfg["dropout"],
                        tie_weights=trial_cfg["tie_weights"]).to(device)

    # Training with early stopping on validation MRR
    save_path = os.path.join(BASE.save_dir, f"rs_trial_{t}.pt")
    best_mrr_val = train_once(
        model_t, train_loader_t, val_loader_t,
        epochs=trial_cfg["epochs"], lr=trial_cfg["lr"], weight_decay=trial_cfg["weight_decay"],
        grad_clip=trial_cfg["grad_clip"], amp=trial_cfg["amp"],
        save_path=save_path, label_smoothing=trial_cfg["label_smoothing"],
        patience=trial_cfg["patience"], device=device,
        scheduler_type=trial_cfg["scheduler"],
        pct_start=trial_cfg["pct_start"], div_factor=trial_cfg["div_factor"],
        final_div_factor=trial_cfg["final_div_factor"]
    )

    # Test evaluation (same label smoothing as in training)
    test_crit = nn.CrossEntropyLoss(label_smoothing=trial_cfg["label_smoothing"])
    testm = evaluate(model_t, test_loader_t, test_crit, device)

    # tie_weights / scheduler are read from the trial config instead of being
    # hard-coded, so the stored row always matches what was actually run.
    row = {"trial": t,
           **{k: trial_cfg[k] for k in _TRIAL_KEYS},
           "tie_weights": trial_cfg["tie_weights"],
           "scheduler": trial_cfg["scheduler"],
           "val_best_MRR": best_mrr_val,
           **{k: testm[k] for k in ["loss", "ppl", "Top@1", "Top@3", "Top@5", "MRR"]},
           "n_test": n_test, "ckpt": save_path}
    RESULTS.append(row)

    # Track the best trial by test MRR.
    # NOTE(review): picking the winner on the test split makes the reported
    # test metrics optimistic; consider ranking by "val_best_MRR" instead.
    if testm["MRR"] > BEST["mrr"]:
        BEST = {"mrr": testm["MRR"], "path": save_path, "cfg": trial_cfg, "test": testm}

    # Cleanup between trials
    del model_t
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Rank every trial by test MRR and show the ten best.
df_rs = pd.DataFrame(RESULTS).sort_values("MRR", ascending=False)
print("Top 10 trials por MRR (test):")
display(df_rs.head(10))

# Write all trials to a CSV inside the save directory.
os.makedirs(BASE.save_dir, exist_ok=True)
csv_path = os.path.join(BASE.save_dir, "random_search_results.csv")
df_rs.to_csv(csv_path, index=False)
print("Resultados guardados en:", csv_path)

# Summary of the best trial tracked during the search.
_SUMMARY_KEYS = ("embedding_dim", "hidden_size", "num_layers", "dropout",
                 "seq_len", "batch_size", "lr", "weight_decay", "grad_clip",
                 "label_smoothing", "tie_weights", "scheduler")
_best_cfg = BEST["cfg"]
print("\n=== MEJOR ENSAYO (por MRR en test) ===")
print("Config:", {key: _best_cfg[key] for key in _SUMMARY_KEYS})
print("Metrics (test):", BEST["test"])
print("Checkpoint:", BEST["path"])



=== TRIAL 1/20 ===
{'embedding_dim': 192, 'hidden_size': 256, 'num_layers': 1, 'dropout': 0.3, 'seq_len': 24, 'batch_size': 64, 'lr': 0.0008787429588337066, 'weight_decay': 0.0, 'grad_clip': 0.5, 'label_smoothing': 0.03}
Epoch 1 | val loss 2.4071 ppl 11.10 Top@1 0.419 Top@3 0.657 Top@5 0.753 MRR 0.567
Epoch 2 | val loss 2.3254 ppl 10.23 Top@1 0.429 Top@3 0.674 Top@5 0.769 MRR 0.578
Epoch 3 | val loss 2.3036 ppl 10.01 Top@1 0.438 Top@3 0.681 Top@5 0.769 MRR 0.585
Epoch 4 | val loss 2.2411 ppl 9.40 Top@1 0.454 Top@3 0.686 Top@5 0.777 MRR 0.597
Epoch 5 | val loss 2.2277 ppl 9.28 Top@1 0.458 Top@3 0.690 Top@5 0.778 MRR 0.600
Epoch 6 | val loss 2.2243 ppl 9.25 Top@1 0.459 Top@3 0.692 Top@5 0.780 MRR 0.601

=== TRIAL 2/20 ===
{'embedding_dim': 128, 'hidden_size': 256, 'num_layers': 1, 'dropout': 0.2, 'seq_len': 16, 'batch_size': 128, 'lr': 0.0016007565680120513, 'weight_decay': 0.0, 'grad_clip': 0.5, 'label_smoothing': 0.05}
Epoch 1 | val loss 2.4485 ppl 11.57 Top@1 0.424 Top@3 0.668 Top@5 

Unnamed: 0,trial,embedding_dim,hidden_size,num_layers,dropout,seq_len,batch_size,lr,weight_decay,grad_clip,...,scheduler,val_best_MRR,loss,ppl,Top@1,Top@3,Top@5,MRR,n_test,ckpt
16,17,192,320,2,0.3,16,64,0.001371,0.0005,1.0,...,cosine,0.60837,2.248762,9.476001,0.45359,0.69563,0.775026,0.597564,9610,/content/models_rs/rs_trial_17.pt
11,12,192,320,2,0.2,16,128,0.002395,0.0,1.0,...,cosine,0.605005,2.334087,10.320029,0.450989,0.695525,0.77794,0.596172,9610,/content/models_rs/rs_trial_12.pt
9,10,128,256,2,0.3,16,64,0.003952,0.0001,1.0,...,cosine,0.60414,2.322862,10.204841,0.44974,0.696878,0.779188,0.595479,9610,/content/models_rs/rs_trial_10.pt
17,18,128,320,1,0.2,16,64,0.001938,0.0,0.5,...,cosine,0.604135,2.271196,9.690984,0.449948,0.692716,0.777211,0.595159,9610,/content/models_rs/rs_trial_18.pt
4,5,160,256,2,0.3,16,64,0.001889,0.0005,0.5,...,cosine,0.606909,2.329096,10.268655,0.447451,0.695838,0.776691,0.593974,9610,/content/models_rs/rs_trial_5.pt
12,13,160,256,2,0.2,16,64,0.004496,0.0005,1.0,...,cosine,0.604112,2.351196,10.498117,0.450676,0.688658,0.770656,0.593781,9610,/content/models_rs/rs_trial_13.pt
8,9,160,256,1,0.3,16,128,0.000816,0.0001,1.0,...,cosine,0.603441,2.32675,10.244592,0.444433,0.697919,0.77846,0.593088,9610,/content/models_rs/rs_trial_9.pt
1,2,128,256,1,0.2,16,128,0.001601,0.0,0.5,...,cosine,0.60211,2.347062,10.454803,0.447451,0.691988,0.772529,0.592816,9610,/content/models_rs/rs_trial_2.pt
15,16,192,256,1,0.3,16,128,0.004424,0.0,1.0,...,cosine,0.60159,2.268224,9.662229,0.4436,0.690531,0.77461,0.591012,9610,/content/models_rs/rs_trial_16.pt
2,3,192,320,1,0.2,16,64,0.002495,0.0001,1.0,...,cosine,0.595211,2.370365,10.701294,0.443809,0.688033,0.77232,0.590707,9610,/content/models_rs/rs_trial_3.pt


Resultados guardados en: /content/models_rs/random_search_results.csv

=== MEJOR ENSAYO (por MRR en test) ===
Config: {'embedding_dim': 192, 'hidden_size': 320, 'num_layers': 2, 'dropout': 0.3, 'seq_len': 16, 'batch_size': 64, 'lr': 0.0013711030226214714, 'weight_decay': 0.0005, 'grad_clip': 1.0, 'label_smoothing': 0.03, 'tie_weights': True, 'scheduler': 'cosine'}
Metrics (test): {'loss': 2.2487623936179775, 'ppl': 9.476001016978005, 'Top@1': 0.4535900104120296, 'Top@3': 0.695629552537023, 'Top@5': 0.7750260146053723, 'MRR': 0.5975637307772403}
Checkpoint: /content/models_rs/rs_trial_17.pt


warnings a revisar:
/tmp/ipython-input-1199272555.py:31: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
  scaler = torch.cuda.amp.GradScaler(enabled=(amp and device.type=='cuda'))
/tmp/ipython-input-1199272555.py:43: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
  with torch.cuda.amp.autocast(enabled=(amp and device.type=='cuda')):

## 11) Empaquetar el mejor checkpoint con metadatos y nombre informativo

In [None]:
# === Re-entrenar SOLO el mejor trial del CSV y exportar ===
import os, json, sys, datetime, torch, pandas as pd
import torch.nn as nn

CSV_PATH = "random_search_results.csv"
TOKENIZER_PATH = "lstm_tokenizer_.json"

# The random-search cell writes its CSV under cfg.save_dir. Use that copy
# when no file with this name exists in the working directory.
if not os.path.exists(CSV_PATH):
    CSV_PATH = os.path.join(cfg.save_dir, "random_search_results.csv")

# 1) Read the best row by (test) MRR.
# A dedicated name is used so the dataset DataFrame `df` is not overwritten.
df_results = pd.read_csv(CSV_PATH)
best = df_results.sort_values("MRR", ascending=False).iloc[0]

cfg_best = {
    "embedding_dim":   int(best["embedding_dim"]),
    "hidden_size":     int(best["hidden_size"]),
    "num_layers":      int(best["num_layers"]),
    "dropout":         float(best["dropout"]),
    "seq_len":         int(best["seq_len"]),
    "batch_size":      int(best["batch_size"]),
    "lr":              float(best["lr"]),
    "weight_decay":    float(best["weight_decay"]),
    "grad_clip":       float(best["grad_clip"]),
    "label_smoothing": float(best["label_smoothing"]),
    "tie_weights":     bool(best.get("tie_weights", True)),
    "scheduler":       str(best.get("scheduler", "cosine")),
}

print("Reentrenando config:", cfg_best)

# 2) Dataloaders for the winning config (uses the functions defined above)
train_loader, val_loader, test_loader, n_test = make_dataloaders(
    train_seqs, val_seqs, test_seqs,
    seq_len=cfg_best["seq_len"], batch_size=cfg_best["batch_size"]
)

# 3) Model
model = ChordLSTM(
    vocab_size=len(vocab),
    embedding_dim=cfg_best["embedding_dim"],
    hidden_size=cfg_best["hidden_size"],
    num_layers=cfg_best["num_layers"],
    dropout=cfg_best["dropout"],
    tie_weights=cfg_best["tie_weights"]
).to(device)

# 4) Train with the same scheme (early stopping on validation MRR).
# NOTE(review): this retrains from scratch instead of loading the trial's
# saved checkpoint, so the exported metrics can differ slightly from the CSV.
_ = train_once(
    model, train_loader, val_loader,
    epochs=cfg.epochs, lr=cfg_best["lr"], weight_decay=cfg_best["weight_decay"],
    grad_clip=cfg_best["grad_clip"], amp=cfg.amp,
    save_path=None, label_smoothing=cfg_best["label_smoothing"],
    patience=2, device=device,
    scheduler_type=cfg_best["scheduler"], pct_start=cfg.pct_start,
    div_factor=cfg.div_factor, final_div_factor=cfg.final_div_factor
)

# 5) Test with the same label smoothing as in training
test_crit = nn.CrossEntropyLoss(label_smoothing=cfg_best["label_smoothing"])
testm = evaluate(model, test_loader, test_crit, device)
print("Test:", testm)

# 6) Export with the vocab stored in the tokenizer file (reproducible).
with open(TOKENIZER_PATH, "r") as f:
    tok = json.load(f)
# The model was trained with the in-memory vocab. Exporting a different
# token<->id mapping would silently produce an unusable checkpoint.
if list(tok["vocab"]) != list(vocab):
    raise ValueError(
        f"El vocabulario de {TOKENIZER_PATH} no coincide con el usado para entrenar "
        f"({len(tok['vocab'])} vs {len(vocab)} tokens)."
    )
vocab = tok["vocab"]
stoi = {t: i for i, t in enumerate(vocab)}
itos = {i: t for i, t in enumerate(vocab)}

# Timezone-aware UTC timestamp in the same "...Z" format as before
# (datetime.utcnow() is deprecated since Python 3.12).
created_at = datetime.datetime.now(datetime.timezone.utc).isoformat().replace("+00:00", "Z")

export = {
    "model_state": model.state_dict(),
    "model_class": "ChordLSTM",
    "config": cfg_best,
    "metrics_test": testm,
    "stoi": stoi,
    "itos": itos,
    "vocab_size": len(stoi),
    "created_at": created_at,
    "env": {"python": sys.version, "torch": torch.__version__, "cuda_available": torch.cuda.is_available()},
}

# The same payload is saved twice: under a name carrying the test metrics
# and under a stable name.
out_dir = "."
name_info = f"Top1-{testm['Top@1']:.4f}_MRR-{testm['MRR']:.4f}_ppl-{testm['ppl']:.3f}"
best_named = os.path.join(out_dir, f"lstm_rs_best__{name_info}.pt")
stable_best = os.path.join(out_dir, "lstm_rs_best.pt")

torch.save(export, best_named)
torch.save(export, stable_best)

print("✅ Guardado")
print(" ├ OUT 1:", best_named)
print(" └ OUT 2:", stable_best)


Reentrenando config: {'embedding_dim': 192, 'hidden_size': 320, 'num_layers': 2, 'dropout': 0.3, 'seq_len': 16, 'batch_size': 64, 'lr': 0.0013711030226214, 'weight_decay': 0.0005, 'grad_clip': 1.0, 'label_smoothing': 0.03, 'tie_weights': True, 'scheduler': 'cosine'}
Epoch 1 | val loss 2.3547 ppl 10.53 Top@1 0.426 Top@3 0.666 Top@5 0.755 MRR 0.573
Epoch 2 | val loss 2.2820 ppl 9.80 Top@1 0.442 Top@3 0.680 Top@5 0.767 MRR 0.587
Epoch 3 | val loss 2.2506 ppl 9.49 Top@1 0.448 Top@3 0.692 Top@5 0.773 MRR 0.593
Epoch 4 | val loss 2.2280 ppl 9.28 Top@1 0.459 Top@3 0.690 Top@5 0.776 MRR 0.601
Epoch 5 | val loss 2.2312 ppl 9.31 Top@1 0.462 Top@3 0.696 Top@5 0.777 MRR 0.603
Epoch 6 | val loss 2.2347 ppl 9.34 Top@1 0.463 Top@3 0.692 Top@5 0.776 MRR 0.603
Test: {'loss': 2.252060778257626, 'ppl': 9.50730811622195, 'Top@1': 0.45036420395421434, 'Top@3': 0.6936524453818116, 'Top@5': 0.7761706556043317, 'MRR': 0.5956792587544246}
✅ Guardado
 ├ OUT 1: ./lstm_rs_best__Top1-0.4504_MRR-0.5957_ppl-9.507.pt

  "created_at": datetime.datetime.utcnow().isoformat() + "Z",
