(15p) Using the test dataset, calculate the perplexity of each language model. Report the results obtained. If you experience variable overflow, use probabilities in log space.

El enlace a la carpeta donde están los conjuntos de entrenamiento y de prueba (testing), junto con los modelos entrenados, es:

https://uniandes-my.sharepoint.com/:f:/g/personal/a_mosquerah2_uniandes_edu_co/Em-od1gldI9BnnTjpUXqZXcB8fjXUoybI35zktWPWzyqpw?e=6dT4ng

(Solo se puede abrir con un correo de Uniandes.)

In [2]:
# Folder holding the training and testing sets of the 20news and BAC corpora.
tercer_punto_folder: str = "tercer-punto"
# Folder holding the unigram, bigram and trigram models of each corpus.
cuarto_punto_folder: str = "cuarto-punto"
# Group identifier; the functions below embed it in model and testing file names.
group_code: str = "group-ansada"

In [3]:
# =========================
# Perplejidad con backoff y trigramas en DISCO (SQLite) — Tipado + Docstrings
# =========================
import os
import re
import gzip
import json
import math
import sqlite3
from functools import lru_cache
from typing import Dict, Optional, Tuple, Callable, Iterator


def _json_load_any(path: str) -> Dict[str, float]:
    """
    Carga un diccionario JSON {clave: prob} desde .json o .json.gz sin materializar más de lo necesario.

    Args:
        path: Ruta al archivo .json o .json.gz.

    Returns:
        Dict[str, float]: Mapa clave→probabilidad (por ejemplo, "w1 w2": 0.00123).
    """
    if path.endswith(".gz"):
        with gzip.open(path, "rt", encoding="utf-8") as f:  # type: ignore[name-defined]
            return json.load(f)
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def read_ngram_model_smart(prefix: str, n: int) -> Optional[Dict[str, float]]:
    """
    Find and load the n-gram model of the given corpus.

    File naming convention (looked up under ``<cuarto_punto_folder>/models``):
      n=1 -> *_unigrams.json(.gz)
      n=2 -> *_bigrams.json(.gz)
      n=3 -> *_trigrams.json(.gz)

    The plain ``.json`` file is tried first, then the ``.json.gz`` one. A file
    that exists but fails to load is reported and the next candidate is tried.

    Args:
        prefix: Corpus prefix, "20N" or "BAC".
        n: Model order (1, 2 or 3); any other value raises KeyError.

    Returns:
        The loaded model, or None when no candidate could be loaded.
    """
    order_name: str = {1: "uni", 2: "bi", 3: "tri"}[n]
    plain_path: str = (
        os.path.join(cuarto_punto_folder, "models", f"{prefix}_{group_code}")
        + f"_{order_name}grams.json"
    )
    for candidate in (plain_path, plain_path + ".gz"):
        if not os.path.exists(candidate):
            continue
        try:
            loaded = _json_load_any(candidate)
            print(f"[OK] Modelo {n}-gramas cargado: {os.path.basename(candidate)}")
            return loaded
        except Exception as err:  # pragma: no cover (logging)
            print(f"[WARN] Falló carga de {candidate}: {err}")
    print(f"[ERROR] No se encontró modelo {n}-gramas para {prefix}")
    return None


def _trigram_json_gz_to_sqlite(gz_path: str, sqlite_path: str) -> None:
    """
    Convierte un JSON(.gz) de trigramas {"w1 w2 w3": p, ...} a una tabla SQLite:
        tri(w1 TEXT, w2 TEXT, w3 TEXT, p REAL, PRIMARY KEY(w1,w2,w3))

    Se procesa en streaming para evitar usar RAM alta.

    Args:
        gz_path: Ruta al .json.gz de trigramas.
        sqlite_path: Ruta de salida .sqlite.
    """
    if os.path.exists(sqlite_path):
        return
    os.makedirs(os.path.dirname(sqlite_path), exist_ok=True)
    conn = sqlite3.connect(sqlite_path)
    cur = conn.cursor()
    cur.executescript(
        "PRAGMA journal_mode=OFF; PRAGMA synchronous=OFF; PRAGMA temp_store=MEMORY;"
        "CREATE TABLE tri (w1 TEXT, w2 TEXT, w3 TEXT, p REAL, PRIMARY KEY(w1,w2,w3));"
    )

    pat: re.Pattern[str] = re.compile(r'^\s*"([^"]+)":\s*([0-9eE+\-\.]+)\s*,?\s*$')
    batch: list[tuple[str, str, str, float]] = []

    with gzip.open(gz_path, "rt", encoding="utf-8") as f:  # type: ignore[name-defined]
        _ = f.readline()  # '{'
        for line in f:
            s = line.strip()
            if s == "}":
                break
            m = pat.match(s)
            if not m:
                continue
            key, p_str = m.group(1), m.group(2)
            parts = key.split(" ")
            if len(parts) != 3:
                continue
            batch.append((parts[0], parts[1], parts[2], float(p_str)))
            if len(batch) >= 50_000:
                cur.executemany("INSERT OR REPLACE INTO tri(w1,w2,w3,p) VALUES(?,?,?,?)", batch)
                conn.commit()
                batch.clear()
    if batch:
        cur.executemany("INSERT OR REPLACE INTO tri(w1,w2,w3,p) VALUES(?,?,?,?)", batch)
        conn.commit()
    cur.execute("ANALYZE")
    conn.commit()
    conn.close()
    print(f"[OK] Trigramas indexados en {sqlite_path}")


def _make_tri_getter(prefix: str) -> Tuple[Callable[[str, str, str], Optional[float]], sqlite3.Connection]:
    """
    Set up SQLite-backed trigram lookups for one corpus.

    The corpus' ``*_trigrams.json.gz`` file is indexed into a sibling
    ``*_trigrams.sqlite`` database (when that database does not exist yet) and
    a connection to it is opened. Lookups are memoized with an LRU cache of up
    to 200,000 entries.

    Args:
        prefix: Corpus prefix, "20N" or "BAC".

    Returns:
        (tri_get, conn): ``tri_get(w1, w2, w3)`` returns the stored probability
        or None when the trigram is absent; ``conn`` is the open connection,
        which the caller must close when done.

    Raises:
        RuntimeError: Only the uncompressed ``.json`` trigram file exists.
        FileNotFoundError: Neither the ``.json.gz`` nor the ``.json`` file exists.
    """
    models_dir = os.path.join(cuarto_punto_folder, "models")
    stem = f"{prefix}_{group_code}_trigrams"
    gz = os.path.join(models_dir, stem + ".json.gz")
    db = os.path.join(models_dir, stem + ".sqlite")

    if not os.path.exists(gz):
        gz_alt = gz[:-3]  # same path without the ".gz" extension
        if os.path.exists(gz_alt):
            raise RuntimeError("Convierte el .json de trigramas a .json.gz o ajusta el parser.")
        raise FileNotFoundError(f"No existe {gz} ni {gz_alt}")

    _trigram_json_gz_to_sqlite(gz, db)
    conn = sqlite3.connect(db)
    cur = conn.cursor()
    lookup_sql = "SELECT p FROM tri WHERE w1=? AND w2=? AND w3=?"

    @lru_cache(maxsize=200_000)
    def tri_get(a: str, b: str, c: str) -> Optional[float]:
        hit = cur.execute(lookup_sql, (a, b, c)).fetchone()
        if not hit:
            return None
        return float(hit[0])

    return tri_get, conn


def calculate_perplexity_backoff_sqltri(
    testing_file: str,
    uni: Dict[str, float],
    bi: Dict[str, float],
    tri_get: Callable[[str, str, str], Optional[float]]
) -> Tuple[float, float, float]:
    """
    Compute unigram, bigram and trigram perplexity over a test corpus.

    Each non-blank line is one sentence, split on whitespace. Per token the
    three models score as follows:

    - Unigram: p(w); a word missing from ``uni`` gets a floor of
      1 / (10 * len(uni)).
    - Bigram: the first token uses the unigram score; later tokens use
      ``bi["prev w"]`` and fall back to the unigram score when absent.
    - Trigram: the first token uses the unigram score, the second the bigram
      score (with its fallback); later tokens use ``tri_get`` and fall back to
      bigram, then unigram.

    Log-probabilities are summed (each probability is floored at 1e-300 before
    the logarithm) and perplexity is exp(-sum / total_tokens).

    Args:
        testing_file: Path to the test file (one sentence per line).
        uni: Unigram model {"w": p}.
        bi: Bigram model {"w1 w2": p}.
        tri_get: Callable returning the trigram probability or None.

    Returns:
        (ppl_unigram, ppl_bigram, ppl_trigram); all three are ``inf`` when the
        file contains no tokens.
    """
    oov_floor: float = 1.0 / max(1, 10 * len(uni))  # safe floor for unseen words
    acc_uni: float = 0.0
    acc_bi: float = 0.0
    acc_tri: float = 0.0
    token_count: int = 0

    def _log(p: float) -> float:
        return math.log(p if p > 1e-300 else 1e-300)

    def _uni_p(word: str) -> float:
        return uni.get(word, oov_floor)

    def _bi_p(prev: str, word: str) -> float:
        hit = bi.get(f"{prev} {word}")
        return _uni_p(word) if hit is None else hit

    def _tri_p(prev2: str, prev1: str, word: str) -> float:
        hit = tri_get(prev2, prev1, word)
        return _bi_p(prev1, word) if hit is None else hit

    with open(testing_file, "r", encoding="utf-8") as handle:
        for line in handle:
            words = line.split()
            if not words:
                continue
            token_count += len(words)

            for pos, word in enumerate(words):
                lp_uni = _log(_uni_p(word))
                acc_uni += lp_uni
                if pos == 0:
                    # Sentence start: every model scores it with the unigram.
                    acc_bi += lp_uni
                    acc_tri += lp_uni
                    continue

                lp_bi = _log(_bi_p(words[pos - 1], word))
                acc_bi += lp_bi
                if pos == 1:
                    # Only one word of history: trigram model uses the bigram score.
                    acc_tri += lp_bi
                else:
                    acc_tri += _log(_tri_p(words[pos - 2], words[pos - 1], word))

    if token_count == 0:
        return float("inf"), float("inf"), float("inf")

    return (
        math.exp(-acc_uni / token_count),
        math.exp(-acc_bi / token_count),
        math.exp(-acc_tri / token_count),
    )


def run_perplexity_for(prefix: str) -> Optional[Tuple[float, float, float]]:
    """
    Compute and print the perplexities of one corpus ('20N' or 'BAC').

    Steps:
      - load the unigram and bigram models into memory,
      - open SQLite-backed access to the trigram model,
      - evaluate on the corpus' testing file under ``tercer_punto_folder``.

    Args:
        prefix: Corpus prefix, "20N" or "BAC".

    Returns:
        (ppl_uni, ppl_bi, ppl_tri), or None when the unigram or bigram model
        is missing (or empty).
    """
    testing_file: str = os.path.join(tercer_punto_folder, f"{prefix}_{group_code}_testing.txt")

    uni_raw = read_ngram_model_smart(prefix, 1)
    bi_raw = read_ngram_model_smart(prefix, 2)
    if not uni_raw or not bi_raw:
        print(f"[SKIP] Faltan uni/bi para {prefix}")
        return None

    # Keep the keys exactly as stored on file; only coerce the values to float.
    uni: Dict[str, float] = {key: float(val) for key, val in uni_raw.items()}
    bi: Dict[str, float] = {key: float(val) for key, val in bi_raw.items()}

    tri_get, conn = _make_tri_getter(prefix)
    try:
        pu, pb, pt = calculate_perplexity_backoff_sqltri(testing_file, uni, bi, tri_get)
        print(f"\n=== Perplejidad {prefix} ===")
        for label, value in (("Unigramas ", pu), ("Bigramas  ", pb), ("Trigramas ", pt)):
            print(f"{label}: {value:.4f}")
        return pu, pb, pt
    finally:
        # Always release the SQLite connection, even if evaluation fails.
        conn.close()


def run_all_and_report(save_path: Optional[str] = None) -> Dict[str, Dict[str, float]]:
    """
    Run the perplexity evaluation for both corpora (20N and BAC) and build a report.

    Corpora for which ``run_perplexity_for`` returns None are left out of the
    report.

    Args:
        save_path: JSON file to write the report to; nothing is written when it
            is None or empty. A bare file name (no directory part) is written
            to the current working directory.

    Returns:
        A dict shaped like:
        {
          "20N": {"unigram": ..., "bigram": ..., "trigram": ...},
          "BAC": {"unigram": ..., "bigram": ..., "trigram": ...}
        }
    """
    report: Dict[str, Dict[str, float]] = {}
    for prefix in ("20N", "BAC"):
        res = run_perplexity_for(prefix)
        if res is not None:
            pu, pb, pt = res
            report[prefix] = {"unigram": pu, "bigram": pb, "trigram": pt}
    if save_path:
        out_dir = os.path.dirname(save_path)
        if out_dir:  # os.makedirs("") raises, so skip it for bare file names
            os.makedirs(out_dir, exist_ok=True)
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump(report, f, ensure_ascii=False, indent=2)
        print(f"[OK] Reporte guardado en {save_path}")
    return report


if __name__ == '__main__':
    print("\n--- Calculando perplejidad (RAM<10GB, VRAM≈0, trigram en SQLite) ---")
    # Evaluate both corpora in turn; each call prints its own results.
    for corpus in ("20N", "BAC"):
        run_perplexity_for(corpus)
    # Optional: save a JSON report
    # run_all_and_report(os.path.join(cuarto_punto_folder, "perplexity_report.json"))


--- Calculando perplejidad (RAM<10GB, VRAM≈0, trigram en SQLite) ---
[OK] Modelo 1-gramas cargado: 20N_group-ansada_unigrams.json.gz
[OK] Modelo 2-gramas cargado: 20N_group-ansada_bigrams.json.gz
[OK] Trigramas indexados en cuarto-punto/models/20N_group-ansada_trigrams.sqlite

=== Perplejidad 20N ===
Unigramas : 1105.8665
Bigramas  : 1898.1203
Trigramas : 4892.7925
[OK] Modelo 1-gramas cargado: BAC_group-ansada_unigrams.json.gz
[OK] Modelo 2-gramas cargado: BAC_group-ansada_bigrams.json.gz
[OK] Trigramas indexados en cuarto-punto/models/BAC_group-ansada_trigrams.sqlite

=== Perplejidad BAC ===
Unigramas : 827.3796
Bigramas  : 1038.3964
Trigramas : 5444.9151


(15p) Using your best language model, build a method/function that automatically generates sentences by receiving the first word of a sentence as input. Take different tests and document them.

In [4]:
# ===============================
# Generador de oraciones (RAM<10GB) usando trigramas en SQLite + backoff
# ===============================
import os
import re
import json
import gzip
import math
import sqlite3
import random
from functools import lru_cache
from typing import Dict, List, Tuple, Optional, Callable

# Folder that holds the trained n-gram models (inside its "models" subfolder).
cuarto_punto_folder: str = "cuarto-punto"
# Group identifier embedded in every model file name.
group_code: str = "group-ansada"


# ---------- Utils ----------
def _json_load_any(path: str) -> Dict[str, float]:
    """
    Carga un diccionario JSON {clave: prob} desde .json o .json.gz.

    Args:
        path: Ruta del archivo.

    Returns:
        Dict[str, float]: Mapa de probabilidades.
    """
    if path.endswith(".gz"):
        with gzip.open(path, "rt", encoding="utf-8") as f:
            return json.load(f)
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _path(prefix: str, kind: str) -> str:
    """
    Build the path of a model resource for one corpus.

    For the JSON kinds the ``.json.gz`` file is preferred when it exists on
    disk; otherwise the plain ``.json`` path is returned (whether or not that
    file exists).

    Args:
        prefix: Corpus prefix, "20N" or "BAC".
        kind: One of {"uni", "bi", "tri_json", "tri_db"}.

    Returns:
        str: Path to the requested resource.

    Raises:
        ValueError: ``kind`` is not one of the supported values.
    """
    base = os.path.join(cuarto_punto_folder, "models", f"{prefix}_{group_code}")
    if kind == "tri_db":
        return base + "_trigrams.sqlite"

    json_suffixes = {
        "uni": "_unigrams.json",
        "bi": "_bigrams.json",
        "tri_json": "_trigrams.json",
    }
    if kind not in json_suffixes:
        raise ValueError(f"kind inválido: {kind}")

    plain = base + json_suffixes[kind]
    packed = plain + ".gz"
    return packed if os.path.exists(packed) else plain


# ---------- JSON(.gz) -> SQLite para trigramas (streaming) ----------
def _trigram_json_to_sqlite(tri_json_path: str, tri_db_path: str) -> None:
    """
    Indexa trigramas {"w1 w2 w3": p} en SQLite (tabla tri(w1,w2,w3,p)) sin cargar todo en RAM.

    Args:
        tri_json_path: Ruta a *_trigrams.json(.gz).
        tri_db_path: Ruta de salida *.sqlite.
    """
    if os.path.exists(tri_db_path):
        return
    os.makedirs(os.path.dirname(tri_db_path), exist_ok=True)
    conn = sqlite3.connect(tri_db_path)
    cur = conn.cursor()
    cur.executescript("""
        PRAGMA journal_mode=OFF;
        PRAGMA synchronous=OFF;
        PRAGMA temp_store=MEMORY;
        CREATE TABLE tri (w1 TEXT, w2 TEXT, w3 TEXT, p REAL, PRIMARY KEY(w1,w2,w3));
    """)

    pat: re.Pattern[str] = re.compile(r'^\s*"([^"]+)":\s*([0-9eE+\-\.]+)\s*,?\s*$')
    batch: List[Tuple[str, str, str, float]] = []

    fh = gzip.open(tri_json_path, "rt", encoding="utf-8") if tri_json_path.endswith(".gz") \
         else open(tri_json_path, "r", encoding="utf-8")
    with fh:
        _ = fh.readline()  # '{'
        for line in fh:
            s = line.strip()
            if s == "}":
                break
            m = pat.match(s)
            if not m:
                continue
            key, p_str = m.group(1), m.group(2)
            parts = key.split(" ")
            if len(parts) != 3:
                continue
            batch.append((parts[0], parts[1], parts[2], float(p_str)))
            if len(batch) >= 50_000:
                cur.executemany("INSERT OR REPLACE INTO tri(w1,w2,w3,p) VALUES(?,?,?,?)", batch)
                conn.commit()
                batch.clear()
    if batch:
        cur.executemany("INSERT OR REPLACE INTO tri(w1,w2,w3,p) VALUES(?,?,?,?)", batch)
        conn.commit()

    cur.execute("ANALYZE")
    conn.commit()
    conn.close()
    print(f"[OK] Trigramas indexados: {tri_db_path}")


def _make_tri_access(prefix: str) -> Tuple[Callable[[str, str], List[Tuple[str, float]]], sqlite3.Connection]:
    """
    Set up SQLite-backed access to the trigram followers of one corpus.

    The trigram JSON file is indexed into its SQLite database first (when the
    database does not exist yet), then a connection is opened. Results are
    memoized with an LRU cache of up to 200,000 contexts.

    Args:
        prefix: Corpus prefix, "20N" or "BAC".

    Returns:
        (followers_fn, conn) where ``followers_fn(w1, w2)`` returns
        ``[(w3, p), ...]`` sorted by p in descending order, and ``conn`` is the
        open connection the caller must close.
    """
    json_path, db_path = _path(prefix, "tri_json"), _path(prefix, "tri_db")
    _trigram_json_to_sqlite(json_path, db_path)

    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    followers_sql = "SELECT w3,p FROM tri WHERE w1=? AND w2=?"

    @lru_cache(maxsize=200_000)
    def followers(w1: str, w2: str) -> List[Tuple[str, float]]:
        fetched = cur.execute(followers_sql, (w1, w2)).fetchall()
        ranked = sorted(fetched, key=lambda row: row[1], reverse=True)
        return [(str(word), float(prob)) for word, prob in ranked]

    return followers, conn


# ---------- Muestreador ----------
def _sample(
    scored: List[Tuple[str, float]],
    temperature: float = 0.9,
    top_k: int = 50,
    top_p: float = 0.95,
    rng: Optional[random.Random] = None
) -> str:
    """
    Muestrea un token desde una lista (token, score) con temperatura + top-k + nucleus (top-p).

    Args:
        scored: Candidatos (token, puntuación/probabilidad).
        temperature: Suavizado (>=0). 1.0 = sin cambio.
        top_k: recorta a los k mejores (si >0).
        top_p: recorta por prob acumulada (0<p<=1).
        rng: generador aleatorio (reproducible si se pasa semilla).

    Returns:
        str: token elegido (o </s> si no hay candidatos).
    """
    if not scored:
        return "</s>"
    if rng is None:
        rng = random

    # Orden y top-k
    scored = sorted(scored, key=lambda x: x[1], reverse=True)
    if top_k and top_k > 0:
        scored = scored[:min(top_k, len(scored))]

    # Normaliza a prob y aplica top-p (nucleus)
    total = sum(max(0.0, p) for _, p in scored) or 1e-12
    probs = [max(0.0, p) / total for _, p in scored]

    cut: List[Tuple[str, float]] = []
    acc = 0.0
    for (tok, _), p in zip(scored, probs):
        cut.append((tok, p))
        acc += p
        if top_p is not None and acc >= top_p:
            break

    # Temperatura sobre log-prob
    logs = [math.log(max(1e-12, p)) / max(1e-6, temperature) for _, p in cut]
    m = max(logs)
    exps = [math.exp(x - m) for x in logs]
    z = sum(exps)
    adj = [e / z for e in exps]

    r = rng.random()
    c = 0.0
    for (tok, _), p in zip(cut, adj):
        c += p
        if r <= c:
            return tok
    return cut[-1][0]


# ---------- Generador (elige orden 1/2/3; por defecto 3 con backoff) ----------
def build_sentence_generator(prefix: str = "20N", default_seed: Optional[int] = 7):
    """
    Build a sentence generator for the given corpus.

    The unigram and bigram models are loaded into memory; trigram followers
    are queried from SQLite. By default sentences are generated from trigrams,
    falling back to unigram sampling when a context has no trigram followers.

    Args:
        prefix: Corpus prefix, "20N" or "BAC".
        default_seed: Seed used by ``gen`` when the caller does not pass one.

    Returns:
        (gen_fn, close_fn)
          - gen_fn(first_word, max_len=30, temperature=0.9, top_k=60, top_p=0.95,
                  seed=default_seed, order=3) -> (sentence, tokens)
          - close_fn(): closes the internal SQLite connection.
    """
    uni: Dict[str, float] = _json_load_any(_path(prefix, "uni"))      # {"w": p}
    bi: Dict[str, float] = _json_load_any(_path(prefix, "bi"))        # {"w1 w2": p}
    tri_followers, tri_conn = _make_tri_access(prefix)

    uni_items: List[Tuple[str, float]] = [(w, float(p)) for w, p in uni.items()]

    # Followers already collected per bigram context, so each context scans
    # the bigram dict at most once across all gen() calls.
    bi_followers_cache: Dict[str, List[Tuple[str, float]]] = {}

    def _bigram_followers(ctx: str) -> List[Tuple[str, float]]:
        """Return [(w2, p)] for every bigram key of the form "ctx w2"."""
        cached = bi_followers_cache.get(ctx)
        if cached is None:
            head = ctx + " "
            cached = [
                (key[len(head):], float(p))
                for key, p in bi.items()
                if key.startswith(head)
            ]
            bi_followers_cache[ctx] = cached
        return cached

    def gen(
        first_word: str,
        max_len: int = 30,
        temperature: float = 0.9,
        top_k: int = 60,
        top_p: float = 0.95,
        seed: Optional[int] = default_seed,
        order: int = 3
    ) -> Tuple[str, List[str]]:
        """
        Generate a sentence starting from the given first word.

        Args:
            first_word: First word of the sentence (stripped and lower-cased;
                replaced by "<UNK>" when unknown and the model has "<UNK>").
            max_len: Maximum number of tokens, boundary tags included.
            temperature: Sampling temperature.
            top_k: Keep only the k best candidates.
            top_p: Nucleus sampling threshold.
            seed: Seed for reproducibility.
            order: 1 = unigram, 2 = bigram (unigram fallback), any other value =
                trigram (unigram fallback).

        Returns:
            (surface text without tags, token list including <s>/</s>)
        """
        rng = random.Random(seed)
        fw = first_word.strip().lower()
        if fw not in uni and "<UNK>" in uni:
            fw = "<UNK>"

        tokens: List[str] = ["<s>", fw]

        def _next_from_bigram(a: str) -> str:
            # Followers of the bigram context a -> ?; unigrams when there are none.
            cands = _bigram_followers(a)
            return _sample(cands if cands else uni_items, temperature, top_k, top_p, rng)

        while len(tokens) < max_len:
            if tokens[-1] == "</s>":
                break

            if order == 1:
                nxt = _sample(uni_items, temperature, top_k, top_p, rng)

            elif order == 2:
                nxt = _next_from_bigram(tokens[-1])

            else:  # order == 3 (default)
                prev2 = tokens[-2] if len(tokens) >= 2 else "<s>"
                prev1 = tokens[-1]
                cand = tri_followers(prev2, prev1)
                if cand:
                    nxt = _sample(cand, temperature, top_k, top_p, rng)
                else:
                    # Simple fallback to unigrams when the context has no followers.
                    nxt = _sample(uni_items, temperature, top_k, top_p, rng)

            tokens.append(nxt)
            # Force the closing tag once the length budget is used up, unless
            # the sampled token already closed the sentence.
            if nxt != "</s>" and len(tokens) >= max_len - 1 and "</s>" in uni:
                tokens.append("</s>")

        surface = [t for t in tokens if t not in ("<s>", "</s>")]
        return " ".join(surface), tokens

    def close() -> None:
        """Release the SQLite connection (best effort; close errors are ignored)."""
        try:
            tri_conn.close()
        except sqlite3.Error:
            pass

    return gen, close


# ===============================
# PRUEBAS (documentación breve)
# ===============================
if __name__ == '__main__':
    os.makedirs("quinto-punto", exist_ok=True)

    # Starter words tried for each corpus.
    tests = [
        ("20N", ["the", "this", "i", "if"]),
        ("BAC", ["i", "my", "today", "we"]),
    ]
    configs = [
        # Quality-oriented runs (trigram with fallback)
        {"order": 3, "temperature": 0.7, "top_k": 60, "top_p": 0.95, "seed": 7},
        {"order": 3, "temperature": 1.0, "top_k": 60, "top_p": 0.95, "seed": 21},
        # Strict "best-by-perplexity" run (unigram)
        {"order": 1, "temperature": 1.0, "top_k": 0, "top_p": 1.0, "seed": 7},
    ]

    report_lines: List[Dict[str, object]] = []
    for prefix, starters in tests:
        gen, close = build_sentence_generator(prefix)
        try:
            print(f"\n--- Samples {prefix} ---")
            for cfg in configs:
                for w in starters:
                    s, toks = gen(w, max_len=25, **cfg)  # type: ignore[arg-type]
                    print(f"[{prefix}] w='{w}' cfg={cfg} -> {s}")
                    report_lines.append({
                        "corpus": prefix, "first_word": w, **cfg,
                        "max_len": 25, "sentence": s, "tokens": toks
                    })
        finally:
            # Release the SQLite connection even if generation fails midway.
            close()

    # Save the samples (they document the tests that were run).
    with open("quinto-punto/sentence_samples.jsonl", "w", encoding="utf-8") as f:
        for r in report_lines:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
    print("\n[OK] Muestras guardadas en quinto-punto/sentence_samples.jsonl")


--- Samples 20N ---
[20N] w='the' cfg={'order': 3, 'temperature': 0.7, 'top_k': 60, 'top_p': 0.95, 'seed': 7} -> the NUM and NUM
[20N] w='this' cfg={'order': 3, 'temperature': 0.7, 'top_k': 60, 'top_p': 0.95, 'seed': 7} -> this is a matter of have you ever hated someone
[20N] w='i' cfg={'order': 3, 'temperature': 0.7, 'top_k': 60, 'top_p': 0.95, 'seed': 7} -> i dont know about the clipper chip and other <UNK> assholes just move closer
[20N] w='if' cfg={'order': 3, 'temperature': 0.7, 'top_k': 60, 'top_p': 0.95, 'seed': 7} -> if you are
[20N] w='the' cfg={'order': 3, 'temperature': 1.0, 'top_k': 60, 'top_p': 0.95, 'seed': 21} -> the first three decades before embarking on a good life
[20N] w='this' cfg={'order': 3, 'temperature': 1.0, 'top_k': 60, 'top_p': 0.95, 'seed': 21} -> this is that he would have an argument is invalid because it was NUM NUM NUM NUM NUM NUM a clone of mswindows
[20N] w='i' cfg={'order': 3, 'temperature': 1.0, 'top_k': 60, 'top_p': 0.95, 'seed': 21} -> i dont bel