(15p) Using the test dataset, calculate the perplexity of each language model. Report the results obtained. If you experience variable overflow, use probabilities in log space.

El link de la carpeta donde están los conjuntos de entrenamiento + testing y los modelos entrenados es:

https://uniandes-my.sharepoint.com/:f:/g/personal/a_mosquerah2_uniandes_edu_co/Em-od1gldI9BnnTjpUXqZXcB8fjXUoybI35zktWPWzyqpw?e=6dT4ng

(Solo se puede abrir con correo uniandes)

In [None]:
# Folder holding the training and testing sets for 20news and BAC.
tercer_punto_folder = "tercer-punto"
# Folder holding the unigram, bigram and trigram models of each corpus.
cuarto_punto_folder = "cuarto-punto"

In [2]:
import json
import os
import math
from collections import defaultdict

# ---------- Utilidades de carga/normalización ----------

def read_ngram_model(file_path):
    """Load an n-gram model stored as a JSON file.

    A status line is printed for every outcome (success, missing file, any
    other failure).

    Args:
        file_path: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The decoded JSON content, or ``None`` when the file is missing or
        cannot be loaded.
    """
    loaded = None
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            loaded = json.load(handle)
        print(f"[OK] Modelo cargado desde '{file_path}'")
    except FileNotFoundError:
        loaded = None
        print(f"[ERROR] Archivo no encontrado: '{file_path}'")
    except Exception as e:
        # Best-effort loader: every other failure is reported, not raised.
        loaded = None
        print(f"[ERROR] No se pudo cargar el modelo desde '{file_path}': {e}")
    return loaded

def _normalize_ngram_keys(model_dict, n):
    """
    Convierte llaves 'a b' / 'a b c' en tuplas ('a','b'[, 'c']) para O(1) consistente y evita splits repetidos.
    Devuelve (dict_normalizado, seguidores_por_contexto)
    seguidores_por_contexto:
        - bigrama: dict prev -> #tipos_siguientes
        - trigrama: dict (prev1,prev2) -> #tipos_siguientes
    """
    out = {}
    followers = defaultdict(int)
    if n == 1:
        # unigramas ya están por token -> p(token)
        # no se necesita followers
        for k, v in model_dict.items():
            out[(k,)] = float(v)
        return out, {}
    elif n == 2:
        next_types = defaultdict(set)
        for k, v in model_dict.items():
            a, b = k.split(' ', 1)
            out[(a, b)] = float(v)
            next_types[a].add(b)
        followers = {a: len(bs) for a, bs in next_types.items()}
        return out, followers
    elif n == 3:
        next_types = defaultdict(set)
        for k, v in model_dict.items():
            a, b, c = k.split(' ', 2)
            out[(a, b, c)] = float(v)
            next_types[(a, b)].add(c)
        followers = {ctx: len(cs) for ctx, cs in next_types.items()}
        return out, followers
    else:
        raise ValueError("n debe ser 1, 2 o 3")

# ---------- Perplejidad optimizada ----------

def calculate_perplexity(
    testing_file,
    unigram_model, bigram_model, trigram_model,
    vocab_size=None
):
    """
    Compute unigram, bigram and trigram perplexity over a test file.

    The file is streamed line by line; every non-blank line is split on
    whitespace and scored as one sequence. Log-probabilities are accumulated
    to avoid underflow. An n-gram with no positive value in its model is
    scored as 1 / (distinct followers of its context + vocab_size); for
    unigrams the denominator is (number of unigrams + vocab_size).

    Args:
        testing_file: path of the test file (read as UTF-8).
        unigram_model, bigram_model, trigram_model: mappings from
            space-joined n-gram strings to their values.
        vocab_size: vocabulary size used in the fallback denominators;
            defaults to the number of distinct unigrams.

    Returns:
        (ppl_unigram, ppl_bigram, ppl_trigram). (None, None, None) when any
        model is None or an error occurs while processing the file;
        (inf, inf, inf) when the file holds no tokens.
    """
    if unigram_model is None or bigram_model is None or trigram_model is None:
        return None, None, None

    # Tuple-keyed models plus the per-context follower counts.
    uni, _ = _normalize_ngram_keys(unigram_model, 1)
    bi, followers_bi = _normalize_ngram_keys(bigram_model, 2)
    tri, followers_tri = _normalize_ngram_keys(trigram_model, 3)

    if vocab_size is None:
        vocab_size = len(uni)

    unseen_uni_denom = len(uni) + vocab_size

    def unigram_logp(word):
        # log P(word), using the fallback mass when the unigram is unseen.
        prob = uni.get((word,), 0.0)
        if prob <= 0.0:
            prob = 1.0 / unseen_uni_denom
        return math.log(prob)

    def bigram_logp(prev, word):
        # log P(word | prev), using the fallback mass when the bigram is unseen.
        prob = bi.get((prev, word), 0.0)
        if prob <= 0.0:
            prob = 1.0 / (followers_bi.get(prev, 0) + vocab_size)
        return math.log(prob)

    def trigram_logp(prev2, prev1, word):
        # log P(word | prev2, prev1), using the fallback mass when unseen.
        prob = tri.get((prev2, prev1, word), 0.0)
        if prob <= 0.0:
            prob = 1.0 / (followers_tri.get((prev2, prev1), 0) + vocab_size)
        return math.log(prob)

    log_sum_uni = 0.0
    log_sum_bi = 0.0
    log_sum_tri = 0.0
    token_count = 0

    try:
        with open(testing_file, 'r', encoding='utf-8') as fh:
            for line in fh:
                words = line.split()
                if not words:
                    continue

                token_count += len(words)

                # Unigram model: every token scored independently.
                for word in words:
                    log_sum_uni += unigram_logp(word)

                # Bigram model: P(w1) * prod P(w_i | w_{i-1}).
                log_sum_bi += unigram_logp(words[0])
                for prev, word in zip(words, words[1:]):
                    log_sum_bi += bigram_logp(prev, word)

                # Trigram model: P(w1) * P(w2 | w1) * prod P(w_i | w_{i-2}, w_{i-1}).
                log_sum_tri += unigram_logp(words[0])
                if len(words) >= 2:
                    log_sum_tri += bigram_logp(words[0], words[1])
                for prev2, prev1, word in zip(words, words[1:], words[2:]):
                    log_sum_tri += trigram_logp(prev2, prev1, word)

    except Exception as e:
        print(f"[ERROR] No se pudo leer el archivo de prueba '{testing_file}': {e}")
        return None, None, None

    if token_count == 0:
        return float('inf'), float('inf'), float('inf')

    return (
        math.exp(-log_sum_uni / token_count),
        math.exp(-log_sum_bi / token_count),
        math.exp(-log_sum_tri / token_count),
    )

# ---------- Script principal ----------

if __name__ == '__main__':
    # Computes and prints the perplexity of the unigram, bigram and trigram
    # models of both corpora (20N and BAC) on their testing files.
    group_code = "my_group"

    training_file_20N = os.path.join(tercer_punto_folder, f"20N_{group_code}_training.txt")
    testing_file_20N  = os.path.join(tercer_punto_folder, f"20N_{group_code}_testing.txt")
    training_file_BAC = os.path.join(tercer_punto_folder, f"BAC_{group_code}_training.txt")
    testing_file_BAC  = os.path.join(tercer_punto_folder, f"BAC_{group_code}_testing.txt")

    print("\n\n--- Calculando la perplejidad ---")

    # 20N models
    unigrams_20N = read_ngram_model(os.path.join(cuarto_punto_folder, "models", f"20N_{group_code}_unigrams.json"))
    bigrams_20N  = read_ngram_model(os.path.join(cuarto_punto_folder, "models", f"20N_{group_code}_bigrams.json"))
    trigrams_20N = read_ngram_model(os.path.join(cuarto_punto_folder, "models", f"20N_{group_code}_trigrams.json"))

    # Vocabulary size = number of unigrams; avoids re-reading the whole corpus.
    vocab_size_20N = len(unigrams_20N) if unigrams_20N else None

    if all([unigrams_20N, bigrams_20N, trigrams_20N]):
        pp_uni_20N, pp_bi_20N, pp_tri_20N = calculate_perplexity(
            testing_file_20N,
            unigrams_20N, bigrams_20N, trigrams_20N,
            vocab_size=vocab_size_20N
        )
        # calculate_perplexity returns (None, None, None) on failure (e.g. a
        # missing testing file); formatting None with ':.2f' would raise.
        if pp_uni_20N is None:
            print("[ERROR] No se pudo calcular la perplejidad para 20N.")
        else:
            print(f"\nResultados de Perplejidad para 20N (Corpus):")
            print(f"  Unigramas: {pp_uni_20N:.2f}")
            print(f"  Bigramas:  {pp_bi_20N:.2f}")
            print(f"  Trigramas: {pp_tri_20N:.2f}")

    # BAC models
    unigrams_BAC = read_ngram_model(os.path.join(cuarto_punto_folder, "models", f"BAC_{group_code}_unigrams.json"))
    bigrams_BAC  = read_ngram_model(os.path.join(cuarto_punto_folder, "models", f"BAC_{group_code}_bigrams.json"))
    trigrams_BAC = read_ngram_model(os.path.join(cuarto_punto_folder, "models", f"BAC_{group_code}_trigrams.json"))

    vocab_size_BAC = len(unigrams_BAC) if unigrams_BAC else None

    if all([unigrams_BAC, bigrams_BAC, trigrams_BAC]):
        pp_uni_BAC, pp_bi_BAC, pp_tri_BAC = calculate_perplexity(
            testing_file_BAC,
            unigrams_BAC, bigrams_BAC, trigrams_BAC,
            vocab_size=vocab_size_BAC
        )
        # Same failure guard as for 20N.
        if pp_uni_BAC is None:
            print("[ERROR] No se pudo calcular la perplejidad para BAC.")
        else:
            print(f"\nResultados de Perplejidad para BAC (Corpus):")
            print(f"  Unigramas: {pp_uni_BAC:.2f}")
            print(f"  Bigramas:  {pp_bi_BAC:.2f}")
            print(f"  Trigramas: {pp_tri_BAC:.2f}")



--- Calculando la perplejidad ---
[OK] Modelo cargado desde 'cuarto-punto/models/20N_my_group_unigrams.json'
[OK] Modelo cargado desde 'cuarto-punto/models/20N_my_group_bigrams.json'
[OK] Modelo cargado desde 'cuarto-punto/models/20N_my_group_trigrams.json'

Resultados de Perplejidad para 20N (Corpus):
  Unigramas: 1103.00
  Bigramas:  2362.26
  Trigramas: 11280.97
[OK] Modelo cargado desde 'cuarto-punto/models/BAC_my_group_unigrams.json'
[OK] Modelo cargado desde 'cuarto-punto/models/BAC_my_group_bigrams.json'
[OK] Modelo cargado desde 'cuarto-punto/models/BAC_my_group_trigrams.json'

Resultados de Perplejidad para BAC (Corpus):
  Unigramas: 848.49
  Bigramas:  1212.09
  Trigramas: 13106.30


(15p) Using your best language model, build a method/function that automatically generates sentences by receiving the first word of a sentence as input. Take different tests and document them.

In [3]:
import json, os, math, random
from collections import defaultdict

def _load_trigram_unigram(prefix="20N", group_code="my_group", base_dir=cuarto_punto_folder+"/models"):
    """Load the trigram and unigram JSON models of one corpus.

    Args:
        prefix: corpus prefix used in the model file names.
        group_code: group code used in the model file names.
        base_dir: directory containing the model JSON files.

    Returns:
        (tri, followers, uni, vocab):
            tri: (w1, w2, w3) -> float value of the trigram.
            followers: defaultdict mapping (w1, w2) -> set of tokens seen
                after that pair.
            uni: (w,) -> float value of the unigram.
            vocab: set of all unigram tokens.
    """
    def _read_json(file_name):
        # Decode one model file located in base_dir.
        with open(os.path.join(base_dir, file_name), "r", encoding="utf-8") as fh:
            return json.load(fh)

    trigram_table = _read_json(f"{prefix}_{group_code}_trigrams.json")
    unigram_table = _read_json(f"{prefix}_{group_code}_unigrams.json")

    tri = {}
    followers = defaultdict(set)
    for key, value in trigram_table.items():
        w1, w2, w3 = key.split(" ", 2)
        tri[(w1, w2, w3)] = float(value)
        followers[(w1, w2)].add(w3)

    uni = {}
    vocab = set()
    for word, value in unigram_table.items():
        uni[(word,)] = float(value)
        vocab.add(word)

    return tri, followers, uni, vocab

def _sample(scored, temperature=0.9, top_k=50, top_p=0.95, rng=None):
    if rng is None: rng = random
    scored = sorted(scored, key=lambda x: x[1], reverse=True)
    if top_k and top_k>0:
        scored = scored[:min(top_k, len(scored))]
    total = sum(s for _, s in scored) or 1e-12
    probs = [s/total for _, s in scored]
    # top-p
    cut, acc = [], 0.0
    for (t,s), p in zip(scored, probs):
        cut.append((t,s))
        acc += p
        if top_p is not None and acc >= top_p: break
    scored = cut
    # temperatura sobre log-score
    logs = [math.log(max(1e-12, s))/max(1e-6, temperature) for _, s in scored]
    m = max(logs)
    exps = [math.exp(x-m) for x in logs]
    z = sum(exps)
    probs = [e/z for e in exps]
    r, acc = rng.random(), 0.0
    for (tok,_), p in zip(scored, probs):
        acc += p
        if r <= acc: return tok
    return scored[-1][0]

def build_trigram_only_generator(prefix="20N", group_code="my_group", base_dir=cuarto_punto_folder+"/models"):
    """Build a sentence generator driven by one corpus' trigram model.

    Args:
        prefix: corpus prefix used in the model file names.
        group_code: group code used in the model file names.
        base_dir: directory containing the model JSON files.

    Returns:
        A function ``gen(first_word, max_len=30, temperature=0.9, top_k=50,
        top_p=0.95, seed=None)`` returning ``(sentence, tokens)``:
        ``tokens`` is the generated sequence including the ``<s>`` / ``</s>``
        markers and ``sentence`` is the same sequence, markers removed,
        joined with single spaces.
    """
    tri, followers, uni, vocab = _load_trigram_unigram(prefix, group_code, base_dir)

    # The unigram fallback is identical for every step, so build it once.
    # Tokens are sorted because set iteration order is not stable across
    # interpreter runs, which would defeat `seed`. "<s>" is left out: a
    # start marker is never a valid continuation.
    fallback = [(w, uni.get((w,), 0.0)) for w in sorted(vocab) if w != "<s>"]
    has_end_marker = "</s>" in vocab

    def gen(first_word, max_len=30, temperature=0.9, top_k=50, top_p=0.95, seed=None):
        rng = random.Random(seed)
        fw = first_word.strip().lower()
        if fw not in vocab and "<unk>" in vocab:
            fw = "<unk>"
        tokens = ["<s>", fw]
        while len(tokens) < max_len and tokens[-1] != "</s>":
            prev2, prev1 = tokens[-2], tokens[-1]
            cand = followers.get((prev2, prev1))
            if cand:
                # Sorted for the same reproducibility reason as the fallback.
                scored = [(w, tri.get((prev2, prev1, w), 0.0)) for w in sorted(cand)]
            else:
                # Minimal fallback to the unigram distribution when the
                # context has no known trigram continuation.
                scored = fallback
            if not scored:
                # Nothing can be generated (empty vocabulary): stop here.
                break
            nxt = _sample(scored, temperature, top_k, top_p, rng)
            tokens.append(nxt)
            # Close the sentence when the length limit is reached, unless
            # the sampled token already closed it.
            if nxt != "</s>" and len(tokens) >= max_len - 1 and has_end_marker:
                tokens.append("</s>")
                break
        surface = [t for t in tokens if t not in ("<s>", "</s>")]
        return " ".join(surface), tokens
    return gen

if __name__ == '__main__':
    # Demo: generate sentences with the trigram-only generator of each corpus.
    gen_tri_20n = build_trigram_only_generator(prefix="20N", group_code="my_group")
    gen_tri_bac = build_trigram_only_generator(prefix="BAC", group_code="my_group")

    # Sampling settings shared by every demo call (fixed seed).
    sampling = dict(max_len=25, temperature=0.9, top_k=60, top_p=0.95, seed=7)

    for w in ("the", "this", "i", "if"):
        s, _ = gen_tri_20n(w, **sampling)
        print(f"[20N|tri-only] '{w}': {s}")

    for w in ("i", "my", "today", "we"):
        s, _ = gen_tri_bac(w, **sampling)
        print(f"[BAC|tri-only] '{w}': {s}")

[20N|tri-only] 'the': the fact that he was still able to read the readme file for info related to the security options to srt may NUM
[20N|tri-only] 'this': this is a general rule
[20N|tri-only] 'i': i am not asking much
[20N|tri-only] 'if': if you are correct
[BAC|tri-only] 'i': i was a big deal but it was like a normal person
[BAC|tri-only] 'my': my first day at work and it was like a normal person
[BAC|tri-only] 'today': today i was supposed to be in the back of my life
[BAC|tri-only] 'we': we have to admit that im going to get a good thing
