(5p) Read the files and build two large consolidate files that are the union of all the documents in 20N and BAC.

**SECTION FOR UPLOADING DATASETS OF 20NEWS & BAC**

In [None]:
# "CAMBIAR POR TU UBICACION EN TU PC"
newsgroups_dataset = "20news-18828.tar.gz" # http://qwone.com/~jason/20Newsgroups/20news-18828.tar.gz
bac_dataset = "BAC.zip" # https://huggingface.co/datasets/barilan/blog_authorship_corpus

!mkdir -p datasets
!tar -xzf {newsgroups_dataset} -C datasets
!unzip -o {bac_dataset} -d datasets
!unzip -o "datasets/blogs.zip" -d datasets
!mkdir -p large-files

Archive:  BAC.zip
mapname:  conversion of  failed
 extracting: datasets/blogs.zip      ^C
Archive:  datasets/blogs.zip
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of datasets/blogs.zip or
        datasets/blogs.zip.zip, and cannot find datasets/blogs.zip.ZIP, period.


In [None]:
import os
import sys
from pathlib import Path
import shutil

def iter_files(folder_path: str):
    """Itera recursivamente con os.scandir (menos overhead que os.walk)."""
    stack = [Path(folder_path)]
    while stack:
        p = stack.pop()
        for entry in os.scandir(p):
            if entry.is_dir(follow_symlinks=False):
                stack.append(entry.path)
            elif entry.is_file(follow_symlinks=False):
                yield Path(entry.path)

def join_files_in_one_stream(folder_path: str, output_file_path: str, sep=b"\n"):
    """
    Une todos los archivos de texto en un solo archivo, copiando en binario por bloques.
    - O(n) en tamaño total de datos.
    - Casi cero overhead de Python: usa buffers grandes.
    """
    out_path = Path(output_file_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # 'xb' para fallar si existe; si prefieres sobrescribir, usa 'wb'
    with open(out_path, 'wb') as out_f:
        for fpath in iter_files(folder_path):
            try:
                # Copia por bloques grandes (shutil.copyfileobj ya usa buffering)
                with open(fpath, 'rb') as in_f:
                    shutil.copyfileobj(in_f, out_f, length=1024 * 1024)  # 1 MiB
                out_f.write(sep)
            except Exception as e:
                print(f"[WARN] No se pudo copiar {fpath}: {e}", file=sys.stderr)

if __name__ == '__main__':
    join_files_in_one_stream("datasets/20news-18828", "large-files/20news.txt")
    join_files_in_one_stream("datasets/blogs", "large-files/blogs.txt")

(5p) Tokenize by sentence.
- Normalize, but DO NOT eliminate stop words.
- Replace numbers with a token named NUM.
- Add sentence start and end tags "\<s>\</s>".
- Tokens with unit frequency should be modeled as "\<UNK>".

Tokenize by sentence

In [None]:
from nltk.tokenize import sent_tokenize
import re

def tokenize_by_sentence(input_file, is_xml=False, max_chars=None):
    try:
        print(f"\n[INFO] Abriendo archivo: {input_file}")
        with open(input_file, 'r', encoding='latin-1') as infile:
            if max_chars:
                text = infile.read(max_chars) # Lee solo los primeros 'max_chars'
            else:
                text = infile.read()
            print(f"[OK] Archivo leído correctamente. Longitud del texto: {len(text)} caracteres")
    except Exception as e:
        print(f"[ERROR] No se pudo leer el archivo {input_file}: {e}")
        return []
    
    try:
        if is_xml:
            print("[INFO] Extrayendo contenido de <post>...</post>")
            posts = re.findall(r"<post>(.*?)</post>", text, flags=re.DOTALL)
            print(f"[OK] Se encontraron {len(posts)} posts")
            if len(posts) == 0:
                print("[WARN] No se encontró nada dentro de <post>...</post>")
            text = "\n".join(posts)
    except Exception as e:
        print(f"[ERROR] Falló la extracción de posts: {e}")
        return []
    
    try:
        sentences = sent_tokenize(text)
        print(f"[OK] Se tokenizaron {len(sentences)} oraciones")
        return sentences
    except Exception as e:
        print(f"[ERROR] Falló la tokenización: {e}")
        return []

if __name__ == '__main__':
    sentences_20news = tokenize_by_sentence("large-files/20news.txt")
    sentences_blogs = tokenize_by_sentence("large-files/blogs.txt", is_xml=True)
    # sentences_blogs = tokenize_by_sentence("large-files/blogs.txt", is_xml=True, max_chars=10000)
    
    print("\nEjemplo primeras 5 oraciones 20news:")
    print("20news:", sentences_20news[:5])
    print("\nEjemplo primeras 5 oraciones blogs:")
    print("Blogs:", sentences_blogs[:5])


--- Prueba rápida con 10,000 caracteres ---

[INFO] Abriendo archivo: large-files/20news.txt
[OK] Archivo leído correctamente. Longitud del texto: 33930324 caracteres
[OK] Se tokenizaron 290240 oraciones

[INFO] Abriendo archivo: large-files/blogs.txt
[OK] Archivo leído correctamente. Longitud del texto: 799624275 caracteres
[INFO] Extrayendo contenido de <post>...</post>
[OK] Se encontraron 681288 posts
[OK] Se tokenizaron 8813223 oraciones

Ejemplo primeras 5 oraciones:
20news: ['From: CGKarras@world.std.com (Christopher G Karras)\nSubject: Need Maintenance tips\n\n\nAfter reading the service manual for my bike (Suzuki GS500E--1990) I have\na couple of questions I hope you can answer:\n\nWhen checking the oil level with the dip stick built into the oil fill\ncap, does one check it with the cap screwed in or not?', 'I am more used to\nthe dip stick for a cage where the stick is extracted fully, wiped clean\nand reinserted fully, then withdrawn and read.', 'The dip stick on my bike\nis

Normalize, but DO NOT eliminate stop words.

In [281]:
import re

def normalize_text(sentences):
    normalized_sentences = []
    for sentence in sentences:
        sentence = sentence.lower()  # Convertir a minúsculas
        # Ahora el patrón incluye los dígitos del 0 al 9
        sentence = re.sub(r'[^a-záéíóúüñ\s0-9]', '', sentence)  # Eliminar solo puntuación y caracteres no alfanuméricos
        sentence = re.sub(r'\s+', ' ', sentence).strip()  # Eliminar espacios extra
        normalized_sentences.append(sentence)
    return normalized_sentences


if __name__ == '__main__':
    normalized_20news = normalize_text(sentences_20news)
    normalized_blogs = normalize_text(sentences_blogs)
    
    print("\nEjemplo primeras 5 oraciones normalizadas:")
    print("20news:", normalized_20news[:5])
    print("Blogs:", normalized_blogs[:5])



[INFO] Lectura completada.
[INFO] Lectura completada.

Ejemplo primeras 5 oraciones normalizadas:
20news: ['from cgkarrasworldstdcom christopher g karras subject need maintenance tips after reading the service manual for my bike suzuki gs500e1990 i have a couple of questions i hope you can answer when checking the oil level with the dip stick built into the oil fill cap does one check it with the cap screwed in or not', 'i am more used to the dip stick for a cage where the stick is extracted fully wiped clean and reinserted fully then withdrawn and read', 'the dip stick on my bike is part of the oil filler cap and has about 12 inch of threads on it', 'do i remove the cap wipe the stick clean and reinsert it withwithout screwing it down before reading', 'the service manual calls for the application of suzuki bond no']
Blogs: ['today is another first day i put up a few links that i either a find handy or b like because i can either relate or they just plain cracked me up', 'it is finally

Replace numbers with a token named NUM.

In [282]:
import re

def replace_numbers_with_token(normalized_sentences):
    """
    Reemplaza todos los números en una lista de oraciones por el token 'NUM'.
    """
    tokenized_sentences = []
    for sentence in normalized_sentences:
        # La expresión regular \d+ busca uno o más dígitos (0-9).
        # Reemplaza cualquier secuencia de números con la cadena 'NUM'.
        sentence_with_token = re.sub(r'\d+', 'NUM', sentence)
        tokenized_sentences.append(sentence_with_token)
    return tokenized_sentences

if __name__ == '__main__':
    tokenized_num_20news = replace_numbers_with_token(normalized_20news)
    tokenized_num_blogs = replace_numbers_with_token(normalized_blogs)
    
    print("\nEjemplo primeras 5 oraciones con números reemplazados por 'NUM':")
    print("20news:", tokenized_num_20news[:5])
    print("Blogs:", tokenized_num_blogs[:5])

[INFO] Lectura completada.

Ejemplo primeras 5 oraciones con números reemplazados por 'NUM':
20news: ['from cgkarrasworldstdcom christopher g karras subject need maintenance tips after reading the service manual for my bike suzuki gsNUMeNUM i have a couple of questions i hope you can answer when checking the oil level with the dip stick built into the oil fill cap does one check it with the cap screwed in or not', 'i am more used to the dip stick for a cage where the stick is extracted fully wiped clean and reinserted fully then withdrawn and read', 'the dip stick on my bike is part of the oil filler cap and has about NUM inch of threads on it', 'do i remove the cap wipe the stick clean and reinsert it withwithout screwing it down before reading', 'the service manual calls for the application of suzuki bond no']
Blogs: ['today is another first day i put up a few links that i either a find handy or b like because i can either relate or they just plain cracked me up', 'it is finally frid

- Add sentence start and end tags "\<s>\</s>".

In [283]:
def add_sentence_tags(sentences):
    """
    Agrega tags de inicio y fin a cada oración en una lista.

    Args:
        sentences (list): Una lista de cadenas, donde cada cadena es una oración.

    Returns:
        list: Una nueva lista con las oraciones que tienen los tags agregados.
    """
    tagged_sentences = []
    for sentence in sentences:
        # Añade <s> al principio y </s> al final de la oración
        tagged_sentence = f"<s> {sentence} </s>"
        tagged_sentences.append(tagged_sentence)
    return tagged_sentences

# Ejemplo de uso
if __name__ == '__main__':
    tokenized_tags_20news = add_sentence_tags(tokenized_num_20news)
    tokenized_tags_blogs = add_sentence_tags(tokenized_num_blogs)

    print(tokenized_tags_20news[:5])
    print(tokenized_tags_blogs[:5])

['<s> from cgkarrasworldstdcom christopher g karras subject need maintenance tips after reading the service manual for my bike suzuki gsNUMeNUM i have a couple of questions i hope you can answer when checking the oil level with the dip stick built into the oil fill cap does one check it with the cap screwed in or not </s>', '<s> i am more used to the dip stick for a cage where the stick is extracted fully wiped clean and reinserted fully then withdrawn and read </s>', '<s> the dip stick on my bike is part of the oil filler cap and has about NUM inch of threads on it </s>', '<s> do i remove the cap wipe the stick clean and reinsert it withwithout screwing it down before reading </s>', '<s> the service manual calls for the application of suzuki bond no </s>']
['<s> today is another first day i put up a few links that i either a find handy or b like because i can either relate or they just plain cracked me up </s>', '<s> it is finally friday </s>', '<s> has anyone else been suffering thro

- Tokens with unit frequency should be modeled as "\<UNK>".

In [284]:
import collections
import re

def replace_single_occurrence_tokens(sentences):
    """
    Reemplaza los tokens que aparecen solo una vez en el corpus proporcionado
    con el token '<UNK>'.
    
    Args:
        sentences (list): Una lista de oraciones para procesar.
    
    Returns:
        list: Una nueva lista con los tokens de ocurrencia única reemplazados.
    """
    # 1. Combina todas las oraciones en una lista de tokens.
    all_tokens = []
    for sentence in sentences:
        tokens = sentence.split()
        all_tokens.extend(tokens)
    
    # 2. Cuenta la frecuencia de cada token.
    token_counts = collections.Counter(all_tokens)
    
    # 3. Reemplaza los tokens de ocurrencia única con '<UNK>'.
    processed_sentences = []
    for sentence in sentences:
        current_sentence_tokens = sentence.split()
        new_tokens = []
        for token in current_sentence_tokens:
            if token_counts[token] == 1:
                new_tokens.append('<UNK>')
            else:
                new_tokens.append(token)
        processed_sentences.append(" ".join(new_tokens))
    
    return processed_sentences


if __name__ == '__main__':
    processed_20news = replace_single_occurrence_tokens(tokenized_tags_20news)
    processed_blogs = replace_single_occurrence_tokens(tokenized_tags_blogs)

    print("--- Oraciones procesadas de 20news ---")
    print(processed_20news[:10])  # Muestra solo las primeras 5 oraciones procesadas
    print("\n--- Oraciones procesadas de Blogs ---")
    print(processed_blogs[:10])  # Muestra solo las primeras 5 oraciones procesadas

--- Oraciones procesadas de 20news ---
['<s> from cgkarrasworldstdcom christopher g karras subject need maintenance tips after reading the service manual for my bike suzuki gsNUMeNUM i have a couple of questions i hope you can answer when checking the oil level with the dip stick built into the oil fill cap does one check it with the cap screwed in or not </s>', '<s> i am more used to the dip stick for a cage where the stick is extracted fully wiped clean and reinserted fully then withdrawn and read </s>', '<s> the dip stick on my bike is part of the oil filler cap and has about NUM inch of threads on it </s>', '<s> do i remove the cap wipe the stick clean and reinsert it withwithout screwing it down before reading </s>', '<s> the service manual calls for the application of suzuki bond no </s>', '<s> NUMb on the head cover </s>', '<s> i guess this is some sort of liquid gasket material </s>', '<s> do you know of a generic cheaper substitute </s>', '<s> my headlight is a halogen NUM w b

(10p) Select 80% of the resulting sentences---random without replacement---to build the N-gram model and the remaining 20 for evaluation. Create the following files:

- 20N\_<group\_code>\_training (training sentences)
- 20N\_<group\_code>\_testing (testing sentences)
- BAC\_<group\_code>\_training (training sentences)
- BAC\_<group\_code>\_testing (testing sentences)

In [285]:
import random
import os

def split_and_save_data(sentences, group_code, file_prefix):
    """
    Randomly splits a list of sentences into 80% training and 20% testing sets,
    and saves them to respective files.

    Args:
        sentences (list): The list of sentences to split.
        group_code (str): A code to be included in the filename (e.g., 'your_group_name').
        file_prefix (str): The prefix for the output filenames (e.g., '20N' or 'BAC').
    """
    print(f"\n[INFO] Procesando {len(sentences)} oraciones para {file_prefix}")

    # Mezcla las oraciones aleatoriamente para asegurar una división imparcial
    random.shuffle(sentences)
    
    # Calcula el índice de división (80% para entrenamiento)
    split_index = int(len(sentences) * 0.8)
    
    # Divide las oraciones en conjuntos de entrenamiento y prueba
    training_set = sentences[:split_index]
    testing_set = sentences[split_index:]
    
    # Define la carpeta de destino
    target_directory = "tercer-punto"
    
    # Crea el directorio si no existe
    if not os.path.exists(target_directory):
        os.makedirs(target_directory)
    
    # Define los nombres y rutas de los archivos de salida
    training_filename = os.path.join(target_directory, f"{file_prefix}_{group_code}_training.txt")
    testing_filename = os.path.join(target_directory, f"{file_prefix}_{group_code}_testing.txt")
    
    try:
        # Guarda el conjunto de entrenamiento
        with open(training_filename, 'w', encoding='utf-8') as f:
            for sentence in training_set:
                f.write(f"{sentence}\n")
        print(f"[OK] Guardado el conjunto de entrenamiento en '{training_filename}' ({len(training_set)} oraciones)")
        
        # Guarda el conjunto de prueba
        with open(testing_filename, 'w', encoding='utf-8') as f:
            for sentence in testing_set:
                f.write(f"{sentence}\n")
        print(f"[OK] Guardado el conjunto de prueba en '{testing_filename}' ({len(testing_set)} oraciones)")
    except Exception as e:
        print(f"[ERROR] No se pudieron guardar los archivos: {e}")


if __name__ == '__main__':
    group_code = "my_group"

    split_and_save_data(processed_20news, group_code, "20N")
    split_and_save_data(processed_blogs, group_code, "BAC")


[INFO] Procesando 290240 oraciones para 20N
[OK] Guardado el conjunto de entrenamiento en 'tercer-punto/20N_my_group_training.txt' (232192 oraciones)
[OK] Guardado el conjunto de prueba en 'tercer-punto/20N_my_group_testing.txt' (58048 oraciones)

[INFO] Procesando 8813223 oraciones para BAC
[OK] Guardado el conjunto de entrenamiento en 'tercer-punto/BAC_my_group_training.txt' (7050578 oraciones)
[OK] Guardado el conjunto de prueba en 'tercer-punto/BAC_my_group_testing.txt' (1762645 oraciones)


(50p) Build the following N-gram models using Laplace smoothing and generate an output file for each one (you choose the output structure, but be sure to provide an appropriate Python reading method/function):

- 20N\_<group\_code>\_unigrams
- 20N\_<group\_code>\_bigrams
- 20N\_<group\_code>\_trigrams
- BAC\_<group\_code>\_unigrams
- BAC\_<group\_code>\_bigrams
- BAC\_<group\_code>\_trigrams

In [286]:
import json
import collections
import os
import re

def build_ngram_models(corpus_file, group_code, file_prefix):
    """
    Construye modelos de unigramas, bigramas y trigramas con suavizado de Laplace
    a partir de un corpus de texto y los guarda como archivos JSON.

    Args:
        corpus_file (str): La ruta al archivo de texto (datos de entrenamiento).
        group_code (str): Un código para los nombres de los archivos de salida.
        file_prefix (str): El prefijo para los nombres de los archivos (ej., '20N' o 'BAC').
    """
    # Crea el directorio para los modelos si no existe
    models_dir = os.path.join("cuarto-punto", "models")
    if not os.path.exists(models_dir):
        os.makedirs(models_dir)

    # Lee el corpus de texto
    try:
        with open(corpus_file, 'r', encoding='utf-8') as f:
            sentences = [line.strip() for line in f if line.strip()]
        if not sentences:
            print(f"[WARN] No se encontraron oraciones en el archivo: {corpus_file}")
            return
        print(f"[OK] Leído el corpus de entrenamiento de '{corpus_file}' con {len(sentences)} oraciones.")
    except Exception as e:
        print(f"[ERROR] No se pudo leer el archivo '{corpus_file}': {e}")
        return

    # Procesa los tokens para todos los modelos
    all_tokens = ' '.join(sentences).split()
    vocab_size = len(set(all_tokens))
    
    # 1. Construye y guarda el modelo de Unigramas
    unigram_counts = collections.Counter(all_tokens)
    unigram_model = {}
    total_tokens = len(all_tokens)
    for token, count in unigram_counts.items():
        unigram_model[token] = (count + 1) / (total_tokens + vocab_size)
    
    unigram_path = os.path.join(models_dir, f"{file_prefix}_{group_code}_unigrams.json")
    with open(unigram_path, 'w', encoding='utf-8') as f:
        json.dump(unigram_model, f, ensure_ascii=False, indent=4)
    print(f"[OK] Modelo de unigramas guardado en '{unigram_path}'")
    
    # 2. Construye y guarda el modelo de Bigramas
    bigram_counts = collections.Counter()
    unigram_context_counts = collections.Counter()
    for sentence in sentences:
        tokens = sentence.split()
        for i in range(len(tokens) - 1):
            bigram_counts[(tokens[i], tokens[i+1])] += 1
            unigram_context_counts[tokens[i]] += 1
    
    bigram_model = {}
    for (token1, token2), count in bigram_counts.items():
        context_count = unigram_context_counts[token1]
        probability = (count + 1) / (context_count + vocab_size)
        bigram_model[f"{token1} {token2}"] = probability
        
    bigram_path = os.path.join(models_dir, f"{file_prefix}_{group_code}_bigrams.json")
    with open(bigram_path, 'w', encoding='utf-8') as f:
        json.dump(bigram_model, f, ensure_ascii=False, indent=4)
    print(f"[OK] Modelo de bigramas guardado en '{bigram_path}'")
    
    # 3. Construye y guarda el modelo de Trigramas
    trigram_counts = collections.Counter()
    bigram_context_counts = collections.Counter()
    for sentence in sentences:
        tokens = sentence.split()
        for i in range(len(tokens) - 2):
            trigram_counts[(tokens[i], tokens[i+1], tokens[i+2])] += 1
            bigram_context_counts[(tokens[i], tokens[i+1])] += 1
            
    trigram_model = {}
    for (token1, token2, token3), count in trigram_counts.items():
        context_count = bigram_context_counts.get((token1, token2), 0)
        probability = (count + 1) / (context_count + vocab_size)
        trigram_model[f"{token1} {token2} {token3}"] = probability

    trigram_path = os.path.join(models_dir, f"{file_prefix}_{group_code}_trigrams.json")
    with open(trigram_path, 'w', encoding='utf-8') as f:
        json.dump(trigram_model, f, ensure_ascii=False, indent=4)
    print(f"[OK] Modelo de trigramas guardado en '{trigram_path}'")


if __name__ == '__main__':
    group_code = "my_group"
    
    training_file_20N = os.path.join("tercer-punto", f"20N_{group_code}_training.txt")
    training_file_BAC = os.path.join("tercer-punto", f"BAC_{group_code}_training.txt")
    
    # Construye y guarda los modelos
    build_ngram_models(training_file_20N, group_code, "20N")
    build_ngram_models(training_file_BAC, group_code, "BAC")


[OK] Leído el corpus de entrenamiento de 'tercer-punto/20N_my_group_training.txt' con 232192 oraciones.
[OK] Modelo de unigramas guardado en 'cuarto-punto/models/20N_my_group_unigrams.json'
[OK] Modelo de bigramas guardado en 'cuarto-punto/models/20N_my_group_bigrams.json'
[OK] Modelo de trigramas guardado en 'cuarto-punto/models/20N_my_group_trigrams.json'
[OK] Leído el corpus de entrenamiento de 'tercer-punto/BAC_my_group_training.txt' con 7050578 oraciones.
[OK] Modelo de unigramas guardado en 'cuarto-punto/models/BAC_my_group_unigrams.json'
[OK] Modelo de bigramas guardado en 'cuarto-punto/models/BAC_my_group_bigrams.json'
[OK] Modelo de trigramas guardado en 'cuarto-punto/models/BAC_my_group_trigrams.json'


(15p) Using the test dataset, calculate the perplexity of each language model. Report the results obtained. If you experience variable overflow, use probabilities in log space.

In [287]:
import json
import os
import math
from collections import defaultdict

# ---------- Utilidades de carga/normalización ----------

def read_ngram_model(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            model = json.load(f)
        print(f"[OK] Modelo cargado desde '{file_path}'")
        return model
    except FileNotFoundError:
        print(f"[ERROR] Archivo no encontrado: '{file_path}'")
        return None
    except Exception as e:
        print(f"[ERROR] No se pudo cargar el modelo desde '{file_path}': {e}")
        return None

def _normalize_ngram_keys(model_dict, n):
    """
    Convierte llaves 'a b' / 'a b c' en tuplas ('a','b'[, 'c']) para O(1) consistente y evita splits repetidos.
    Devuelve (dict_normalizado, seguidores_por_contexto)
    seguidores_por_contexto:
        - bigrama: dict prev -> #tipos_siguientes
        - trigrama: dict (prev1,prev2) -> #tipos_siguientes
    """
    out = {}
    followers = defaultdict(int)
    if n == 1:
        # unigramas ya están por token -> p(token)
        # no se necesita followers
        for k, v in model_dict.items():
            out[(k,)] = float(v)
        return out, {}
    elif n == 2:
        next_types = defaultdict(set)
        for k, v in model_dict.items():
            a, b = k.split(' ', 1)
            out[(a, b)] = float(v)
            next_types[a].add(b)
        followers = {a: len(bs) for a, bs in next_types.items()}
        return out, followers
    elif n == 3:
        next_types = defaultdict(set)
        for k, v in model_dict.items():
            a, b, c = k.split(' ', 2)
            out[(a, b, c)] = float(v)
            next_types[(a, b)].add(c)
        followers = {ctx: len(cs) for ctx, cs in next_types.items()}
        return out, followers
    else:
        raise ValueError("n debe ser 1, 2 o 3")

# ---------- Perplejidad optimizada ----------

def calculate_perplexity(
    testing_file,
    unigram_model, bigram_model, trigram_model,
    vocab_size=None
):
    """
    Calcula perplejidad usando:
      - Accesos O(1) (tuplas como claves)
      - Conteos de contexto precomputados (evita sum(...) por llave)
      - Log-probabilidades para estabilidad
      - Lectura streaming del archivo de prueba
    """
    if any(m is None for m in (unigram_model, bigram_model, trigram_model)):
        return None, None, None

    # Normaliza llaves y precomputa seguidores/contexts
    uni, _ = _normalize_ngram_keys(unigram_model, 1)
    bi, followers_bi = _normalize_ngram_keys(bigram_model, 2)
    tri, followers_tri = _normalize_ngram_keys(trigram_model, 3)

    # Tamaño de vocabulario: por defecto = #unigramas distintos
    if vocab_size is None:
        vocab_size = len(uni)

    # Precalcula denominadores de Laplace para velocidad
    # Nota: tu fórmula original usa (#tipos_siguientes + V) como denominador de suavizado.
    # Mantengo esa lógica para equivalencia funcional, pero ahora O(1).
    laplace_uni_denom = (len(uni) + vocab_size)

    total_log_u = 0.0
    total_log_b = 0.0
    total_log_t = 0.0
    total_tokens = 0

    # Lectura streaming (sin cargar todo a memoria)
    try:
        with open(testing_file, 'r', encoding='utf-8') as f:
            for raw in f:
                s = raw.strip()
                if not s:
                    continue
                tokens = s.split()
                n = len(tokens)
                if n == 0:
                    continue

                total_tokens += n

                # -------- Unigram --------
                for w in tokens:
                    p = uni.get((w,), 0.0)
                    if p <= 0.0:
                        p = 1.0 / laplace_uni_denom
                    total_log_u += math.log(p)

                # -------- Bigram --------
                # P(w1) + Π P(w_i | w_{i-1})
                # w1 como unigrama
                p1 = uni.get((tokens[0],), 0.0)
                if p1 <= 0.0:
                    p1 = 1.0 / laplace_uni_denom
                total_log_b += math.log(p1)

                for i in range(n - 1):
                    a, b_ = tokens[i], tokens[i + 1]
                    p = bi.get((a, b_), 0.0)
                    if p <= 0.0:
                        context_types = followers_bi.get(a, 0)
                        p = 1.0 / (context_types + vocab_size)
                    total_log_b += math.log(p)

                # -------- Trigram --------
                # P(w1) * P(w2|w1) * Π P(w_i | w_{i-2}, w_{i-1})
                # w1 como unigrama
                p1 = uni.get((tokens[0],), 0.0)
                if p1 <= 0.0:
                    p1 = 1.0 / laplace_uni_denom
                total_log_t += math.log(p1)

                if n >= 2:
                    p2 = bi.get((tokens[0], tokens[1]), 0.0)
                    if p2 <= 0.0:
                        ctx_types = followers_bi.get(tokens[0], 0)
                        p2 = 1.0 / (ctx_types + vocab_size)
                    total_log_t += math.log(p2)

                for i in range(n - 2):
                    a, b_, c = tokens[i], tokens[i + 1], tokens[i + 2]
                    p = tri.get((a, b_, c), 0.0)
                    if p <= 0.0:
                        ctx_types = followers_tri.get((a, b_), 0)
                        p = 1.0 / (ctx_types + vocab_size)
                    total_log_t += math.log(p)

    except Exception as e:
        print(f"[ERROR] No se pudo leer el archivo de prueba '{testing_file}': {e}")
        return None, None, None

    if total_tokens == 0:
        return float('inf'), float('inf'), float('inf')

    ppl_u = math.exp(-total_log_u / total_tokens)
    ppl_b = math.exp(-total_log_b / total_tokens)
    ppl_t = math.exp(-total_log_t / total_tokens)
    return ppl_u, ppl_b, ppl_t

# ---------- Script principal ----------

if __name__ == '__main__':
    group_code = "my_group"

    training_file_20N = os.path.join("tercer-punto", f"20N_{group_code}_training.txt")
    testing_file_20N  = os.path.join("tercer-punto", f"20N_{group_code}_testing.txt")
    training_file_BAC = os.path.join("tercer-punto", f"BAC_{group_code}_training.txt")
    testing_file_BAC  = os.path.join("tercer-punto", f"BAC_{group_code}_testing.txt")

    print("\n\n--- Calculando la perplejidad ---")

    # Modelos 20N
    unigrams_20N = read_ngram_model(os.path.join("cuarto-punto", "models", f"20N_{group_code}_unigrams.json"))
    bigrams_20N  = read_ngram_model(os.path.join("cuarto-punto", "models", f"20N_{group_code}_bigrams.json"))
    trigrams_20N = read_ngram_model(os.path.join("cuarto-punto", "models", f"20N_{group_code}_trigrams.json"))

    # vocab: usa #unigramas; evita re-leer el corpus completo
    vocab_size_20N = len(unigrams_20N) if unigrams_20N else None

    if all([unigrams_20N, bigrams_20N, trigrams_20N]):
        pp_uni_20N, pp_bi_20N, pp_tri_20N = calculate_perplexity(
            testing_file_20N,
            unigrams_20N, bigrams_20N, trigrams_20N,
            vocab_size=vocab_size_20N
        )
        print(f"\nResultados de Perplejidad para 20N (Corpus):")
        print(f"  Unigramas: {pp_uni_20N:.2f}")
        print(f"  Bigramas:  {pp_bi_20N:.2f}")
        print(f"  Trigramas: {pp_tri_20N:.2f}")

    # Modelos BAC
    unigrams_BAC = read_ngram_model(os.path.join("cuarto-punto", "models", f"BAC_{group_code}_unigrams.json"))
    bigrams_BAC  = read_ngram_model(os.path.join("cuarto-punto", "models", f"BAC_{group_code}_bigrams.json"))
    trigrams_BAC = read_ngram_model(os.path.join("cuarto-punto", "models", f"BAC_{group_code}_trigrams.json"))

    vocab_size_BAC = len(unigrams_BAC) if unigrams_BAC else None

    if all([unigrams_BAC, bigrams_BAC, trigrams_BAC]):
        pp_uni_BAC, pp_bi_BAC, pp_tri_BAC = calculate_perplexity(
            testing_file_BAC,
            unigrams_BAC, bigrams_BAC, trigrams_BAC,
            vocab_size=vocab_size_BAC
        )
        print(f"\nResultados de Perplejidad para BAC (Corpus):")
        print(f"  Unigramas: {pp_uni_BAC:.2f}")
        print(f"  Bigramas:  {pp_bi_BAC:.2f}")
        print(f"  Trigramas: {pp_tri_BAC:.2f}")



--- Calculando la perplejidad ---
[OK] Modelo cargado desde 'cuarto-punto/models/20N_my_group_unigrams.json'
[OK] Modelo cargado desde 'cuarto-punto/models/20N_my_group_bigrams.json'
[OK] Modelo cargado desde 'cuarto-punto/models/20N_my_group_trigrams.json'

Resultados de Perplejidad para 20N (Corpus):
  Unigramas: 1256.35
  Bigramas:  3104.00
  Trigramas: 15012.65
[OK] Modelo cargado desde 'cuarto-punto/models/BAC_my_group_unigrams.json'
[OK] Modelo cargado desde 'cuarto-punto/models/BAC_my_group_bigrams.json'
[OK] Modelo cargado desde 'cuarto-punto/models/BAC_my_group_trigrams.json'

Resultados de Perplejidad para BAC (Corpus):
  Unigramas: 825.99
  Bigramas:  1163.46
  Trigramas: 12544.12


(15p) Using your best language model, build a method/function that automatically generates sentences by receiving the first word of a sentence as input. Take different tests and document them.

In [None]:
import json, math, random
from typing import Dict, List, Tuple, Iterable

# ---------------- Utilidades ----------------

def load_unigram_model(path:str) -> Dict[str, float]:
    """Carga un JSON {token: prob} o {token: count}. Normaliza a probas."""
    with open(path, "r", encoding="utf-8") as f:
        raw = json.load(f)
    # Si ya son probabilidades que suman ~1, úsalo; si parecen conteos, normaliza.
    vals = list(raw.values())
    s = sum(vals)
    # Si hay NaNs o s==0, error
    if not math.isfinite(s) or s <= 0:
        raise ValueError("Modelo vacío o inválido")
    # Normaliza siempre (robusto ante floats que no sumen 1 exactamente)
    return {tok: float(c)/s for tok, c in raw.items()}

def _build_temperature_probs(p: Dict[str,float], temperature: float) -> Tuple[List[str], List[float]]:
    """Aplica temperatura y devuelve listas paralelas (vocab, cdf)."""
    if temperature <= 0:
        raise ValueError("temperature debe ser > 0")
    # p^ (1/T), renormaliza
    powed = {t: (pi ** (1.0/temperature)) for t, pi in p.items() if pi > 0.0}
    z = sum(powed.values())
    vocab = []
    cdf = []
    acc = 0.0
    for t, w in powed.items():
        acc += w / z
        vocab.append(t)
        cdf.append(acc)
    # Asegura que el último sea 1.0 exacto
    cdf[-1] = 1.0
    return vocab, cdf

def _sample_from_cdf(vocab: List[str], cdf: List[float]) -> str:
    r = random.random()
    # búsqueda lineal (rápida en práctica). Cambia a bisect si tu vocab es enorme.
    for t, c in zip(vocab, cdf):
        if r <= c:
            return t
    return vocab[-1]

def _should_stop(token: str, stop_tokens: Iterable[str]) -> bool:
    return token in stop_tokens or any(token.endswith(st) for st in stop_tokens)

# ---------------- Generador ----------------

class UnigramSentenceGenerator:
    def __init__(self, unigram_probs: Dict[str,float], temperature: float = 1.0,
                 stop_tokens: Iterable[str] = (".", "!", "?", "</s>")):
        self.vocab, self.cdf = _build_temperature_probs(unigram_probs, temperature)
        self.stop_tokens = set(stop_tokens)

    def generate(self, first_word: str, max_len: int = 30, seed: int = None) -> str:
        """
        Genera una oración que inicia con first_word y luego muestrea unigramas.
        No re-tokeniza; asume que los tokens del modelo son por palabra.
        """
        if seed is not None:
            random.seed(seed)
        tokens = [first_word]
        # Si el primer token ya es stop, devuelve inmediato
        if _should_stop(first_word, self.stop_tokens):
            return first_word
        # Completa hasta max_len o hasta stop
        for _ in range(max_len - 1):
            nxt = _sample_from_cdf(self.vocab, self.cdf)
            tokens.append(nxt)
            if _should_stop(nxt, self.stop_tokens):
                break
        # Pegado simple (si tu corpus maneja signos como tokens separados, esto es suficiente)
        sent = " ".join(tokens)
        # Arreglo menor de espacios antes de puntuación común
        sent = sent.replace(" .", ".").replace(" ,", ",").replace(" !", "!").replace(" ?", "?")
        return sent

if __name__ == '__main__':
    # 1) Carga tu mejor modelo: BAC_unigramas
    uni = load_unigram_model("cuarto-punto/models/BAC_my_group_unigrams.json")

    # 2) Crea el generador (puedes jugar con la temperatura)
    gen_cool = UnigramSentenceGenerator(uni, temperature=0.8)  # más conservador
    gen_neutral = UnigramSentenceGenerator(uni, temperature=1.0)
    gen_hot = UnigramSentenceGenerator(uni, temperature=1.3)   # más diverso

    # 3) Pruebas
    print(gen_neutral.generate("Colombia", max_len=20, seed=42))
    print(gen_cool.generate("Economía", max_len=20, seed=42))
    print(gen_hot.generate("Gobierno", max_len=20, seed=42))
    print(gen_neutral.generate("Bogotá", max_len=30, seed=7))

Colombia okay <s> and it world where flicks i have <s> it one <s> </s>
Economía do <s> </s>
Gobierno excellent i their one dat wash thunderbolts this hes i at gov i im rape taller your families endangered
Bogotá to just NUM i there the today will <s> was day i youll nasty that it thinking willis some what okinawa <s> brain take can a to nap </s>
