(5p) Read the files and build two large consolidate files that are the union of all the documents in 20N and BAC.

**SECCION PARA COLOCAR DONDE TIENES UBICADOS LOS DATASETS DE 20NEWS.ZIP Y BAC.ZIP**

In [1]:
# "CAMBIAR POR TU UBICACION EN TU PC"
newsgroups_dataset = "20news-18828.tar.gz" # http://qwone.com/~jason/20Newsgroups/20news-18828.tar.gz
bac_dataset = "BAC.zip" # https://huggingface.co/datasets/barilan/blog_authorship_corpus

!mkdir -p datasets
!tar -xzf {newsgroups_dataset} -C datasets
!unzip -o {bac_dataset} -d datasets
!unzip -o "datasets/blogs.zip" -d datasets
!mkdir -p large-files

Archive:  BAC.zip
mapname:  conversion of  failed
 extracting: datasets/blogs.zip      
 extracting: datasets/5114.male.25.indUnk.Scorpio.xml  
Archive:  datasets/blogs.zip
  inflating: datasets/blogs/1000331.female.37.indUnk.Leo.xml  
  inflating: datasets/blogs/1000866.female.17.Student.Libra.xml  
  inflating: datasets/blogs/1004904.male.23.Arts.Capricorn.xml  
  inflating: datasets/blogs/1005076.female.25.Arts.Cancer.xml  
  inflating: datasets/blogs/1005545.male.25.Engineering.Sagittarius.xml  
  inflating: datasets/blogs/1007188.male.48.Religion.Libra.xml  
  inflating: datasets/blogs/100812.female.26.Architecture.Aries.xml  
  inflating: datasets/blogs/1008329.female.16.Student.Pisces.xml  
  inflating: datasets/blogs/1009572.male.25.indUnk.Cancer.xml  
  inflating: datasets/blogs/1011153.female.27.Technology.Virgo.xml  
  inflating: datasets/blogs/1011289.female.25.indUnk.Libra.xml  
  inflating: datasets/blogs/1011311.female.17.indUnk.Scorpio.xml  
  inflating: datasets/blogs/

In [2]:
import os
import sys
from pathlib import Path
import shutil
from typing import Generator


def iter_files(folder_path: str) -> Generator[Path, None, None]:
    """
    Recorre recursivamente un directorio y genera rutas de archivos encontrados.

    Args:
        folder_path (str): Ruta del directorio raíz desde el cual se inicia la búsqueda.

    Yields:
        Path: Objeto Path correspondiente a cada archivo encontrado.
    """
    stack: list[Path] = [Path(folder_path)]
    while stack:
        p: Path = stack.pop()
        for entry in os.scandir(p):
            if entry.is_dir(follow_symlinks=False):
                stack.append(Path(entry.path))
            elif entry.is_file(follow_symlinks=False):
                yield Path(entry.path)


def join_files_in_one_stream(
    folder_path: str, 
    output_file_path: str, 
    sep: bytes = b"\n"
) -> None:
    """
    Une todos los archivos de texto en un solo archivo binario.

    Los archivos se concatenan usando buffers grandes para minimizar
    la sobrecarga de Python. Cada archivo se separa con un delimitador.

    Args:
        folder_path (str): Directorio raíz que contiene los archivos a concatenar.
        output_file_path (str): Ruta del archivo de salida que contendrá el stream unido.
        sep (bytes, opcional): Separador a insertar entre archivos. Por defecto b"\\n".

    Raises:
        Exception: Si ocurre algún error al copiar un archivo.
    """
    out_path: Path = Path(output_file_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    with open(out_path, "wb") as out_f:
        for fpath in iter_files(folder_path):
            try:
                with open(fpath, "rb") as in_f:
                    shutil.copyfileobj(in_f, out_f, length=1024 * 1024)
                out_f.write(sep)
            except Exception as e:
                print(f"[WARN] No se pudo copiar {fpath}: {e}", file=sys.stderr)


if __name__ == "__main__":
    join_files_in_one_stream("datasets/20news-18828", "large-files/20news.txt")
    join_files_in_one_stream("datasets/blogs", "large-files/blogs.txt")

(5p) Tokenize by sentence.
- Normalize, but DO NOT eliminate stop words.
- Replace numbers with a token named NUM.
- Add sentence start and end tags "\<s>\</s>".
- Tokens with unit frequency should be modeled as "\<UNK>".

Tokenize by sentence

In [3]:
!pip install nltk



In [None]:
import nltk
nltk.download('punkt_tab')

In [4]:
from nltk.tokenize import sent_tokenize
import re
from typing import List, Optional


def tokenize_by_sentence(
    input_file: str, 
    is_xml: bool = False, 
    max_chars: Optional[int] = None
) -> List[str]:
    """
    Lee un archivo de texto y lo tokeniza en oraciones.

    Si el archivo contiene formato XML, puede extraer el contenido de las etiquetas <post>...</post>.
    Permite limitar la cantidad de caracteres leídos.

    Args:
        input_file (str): Ruta del archivo de entrada.
        is_xml (bool, opcional): Indica si debe extraer contenido de etiquetas <post>. 
                                 Por defecto False.
        max_chars (int | None, opcional): Número máximo de caracteres a leer del archivo. 
                                          Si es None, se lee todo el archivo.

    Returns:
        List[str]: Lista de oraciones tokenizadas. Si ocurre un error, devuelve lista vacía.
    """
    try:
        print(f"\n[INFO] Abriendo archivo: {input_file}")
        with open(input_file, "r", encoding="latin-1") as infile:
            text: str = infile.read(max_chars) if max_chars else infile.read()
            print(f"[OK] Archivo leído correctamente. Longitud del texto: {len(text)} caracteres")
    except Exception as e:
        print(f"[ERROR] No se pudo leer el archivo {input_file}: {e}")
        return []

    try:
        if is_xml:
            print("[INFO] Extrayendo contenido de <post>...</post>")
            posts: List[str] = re.findall(r"<post>(.*?)</post>", text, flags=re.DOTALL)
            print(f"[OK] Se encontraron {len(posts)} posts")
            if len(posts) == 0:
                print("[WARN] No se encontró nada dentro de <post>...</post>")
            text = "\n".join(posts)
    except Exception as e:
        print(f"[ERROR] Falló la extracción de posts: {e}")
        return []

    try:
        sentences: List[str] = sent_tokenize(text)
        print(f"[OK] Se tokenizaron {len(sentences)} oraciones")
        return sentences
    except Exception as e:
        print(f"[ERROR] Falló la tokenización: {e}")
        return []


if __name__ == "__main__":
    sentences_20news: List[str] = tokenize_by_sentence("large-files/20news.txt")
    sentences_blogs: List[str] = tokenize_by_sentence("large-files/blogs.txt", is_xml=True)

    print("\nEjemplo primeras 5 oraciones 20news:")
    print("20news:", sentences_20news[:5])
    print("\nEjemplo primeras 5 oraciones blogs:")
    print("Blogs:", sentences_blogs[:5])


[INFO] Abriendo archivo: large-files/20news.txt
[OK] Archivo leído correctamente. Longitud del texto: 33930324 caracteres
[OK] Se tokenizaron 290240 oraciones

[INFO] Abriendo archivo: large-files/blogs.txt
[OK] Archivo leído correctamente. Longitud del texto: 799624275 caracteres
[INFO] Extrayendo contenido de <post>...</post>
[OK] Se encontraron 681288 posts
[OK] Se tokenizaron 8813223 oraciones

Ejemplo primeras 5 oraciones 20news:
20news: ['From: CGKarras@world.std.com (Christopher G Karras)\nSubject: Need Maintenance tips\n\n\nAfter reading the service manual for my bike (Suzuki GS500E--1990) I have\na couple of questions I hope you can answer:\n\nWhen checking the oil level with the dip stick built into the oil fill\ncap, does one check it with the cap screwed in or not?', 'I am more used to\nthe dip stick for a cage where the stick is extracted fully, wiped clean\nand reinserted fully, then withdrawn and read.', 'The dip stick on my bike\nis part of the oil filler cap and has a

Normalize, but DO NOT eliminate stop words.

In [5]:
import re

def normalize_text(sentences: List[str]) -> List[str]:
    """
    Normaliza una lista de oraciones:
      - Convierte a minúsculas.
      - Elimina caracteres no alfanuméricos (excepto dígitos y acentos).
      - Reduce espacios múltiples a uno solo.

    Args:
        sentences (List[str]): Lista de oraciones originales.

    Returns:
        List[str]: Lista de oraciones normalizadas.
    """
    normalized_sentences: List[str] = []
    for sentence in sentences:
        sentence = sentence.lower()
        sentence = re.sub(r"[^a-záéíóúüñ\s0-9]", "", sentence)
        sentence = re.sub(r"\s+", " ", sentence).strip()
        normalized_sentences.append(sentence)
    return normalized_sentences


if __name__ == '__main__':
    normalized_20news = normalize_text(sentences_20news)
    normalized_blogs = normalize_text(sentences_blogs)
    
    print("\nEjemplo primeras 5 oraciones normalizadas:")
    print("20news:", normalized_20news[:5])
    print("Blogs:", normalized_blogs[:5])


Ejemplo primeras 5 oraciones normalizadas:
20news: ['from cgkarrasworldstdcom christopher g karras subject need maintenance tips after reading the service manual for my bike suzuki gs500e1990 i have a couple of questions i hope you can answer when checking the oil level with the dip stick built into the oil fill cap does one check it with the cap screwed in or not', 'i am more used to the dip stick for a cage where the stick is extracted fully wiped clean and reinserted fully then withdrawn and read', 'the dip stick on my bike is part of the oil filler cap and has about 12 inch of threads on it', 'do i remove the cap wipe the stick clean and reinsert it withwithout screwing it down before reading', 'the service manual calls for the application of suzuki bond no']
Blogs: ['today is another first day i put up a few links that i either a find handy or b like because i can either relate or they just plain cracked me up', 'it is finally friday', 'has anyone else been suffering through this

Replace numbers with a token named NUM.

In [6]:
import re

def replace_numbers_with_token(normalized_sentences: List[str]) -> List[str]:
    """
    Reemplaza todos los números en una lista de oraciones por el token 'NUM'.

    Args:
        normalized_sentences (List[str]): Lista de oraciones normalizadas.

    Returns:
        List[str]: Lista de oraciones con los números reemplazados por 'NUM'.
    """
    tokenized_sentences: List[str] = []
    for sentence in normalized_sentences:
        sentence_with_token: str = re.sub(r"\d+", "NUM", sentence)
        tokenized_sentences.append(sentence_with_token)
    return tokenized_sentences

if __name__ == '__main__':
    tokenized_num_20news = replace_numbers_with_token(normalized_20news)
    tokenized_num_blogs = replace_numbers_with_token(normalized_blogs)
    
    print("\nEjemplo primeras 5 oraciones con números reemplazados por 'NUM':")
    print("20news:", tokenized_num_20news[:5])
    print("Blogs:", tokenized_num_blogs[:5])


Ejemplo primeras 5 oraciones con números reemplazados por 'NUM':
20news: ['from cgkarrasworldstdcom christopher g karras subject need maintenance tips after reading the service manual for my bike suzuki gsNUMeNUM i have a couple of questions i hope you can answer when checking the oil level with the dip stick built into the oil fill cap does one check it with the cap screwed in or not', 'i am more used to the dip stick for a cage where the stick is extracted fully wiped clean and reinserted fully then withdrawn and read', 'the dip stick on my bike is part of the oil filler cap and has about NUM inch of threads on it', 'do i remove the cap wipe the stick clean and reinsert it withwithout screwing it down before reading', 'the service manual calls for the application of suzuki bond no']
Blogs: ['today is another first day i put up a few links that i either a find handy or b like because i can either relate or they just plain cracked me up', 'it is finally friday', 'has anyone else been 

- Add sentence start and end tags "\<s>\</s>".

In [7]:
def add_sentence_tags(sentences: List[str]) -> List[str]:
    """
    Agrega etiquetas de inicio <s> y fin </s> a cada oración de una lista.

    Args:
        sentences (List[str]): Lista de oraciones.

    Returns:
        List[str]: Lista de oraciones con etiquetas de inicio y fin.
    """
    tagged_sentences: List[str] = []
    for sentence in sentences:
        tagged_sentence: str = f"<s> {sentence} </s>"
        tagged_sentences.append(tagged_sentence)
    return tagged_sentences


if __name__ == '__main__':
    tokenized_tags_20news = add_sentence_tags(tokenized_num_20news)
    tokenized_tags_blogs = add_sentence_tags(tokenized_num_blogs)

    print(tokenized_tags_20news[:5])
    print(tokenized_tags_blogs[:5])

['<s> from cgkarrasworldstdcom christopher g karras subject need maintenance tips after reading the service manual for my bike suzuki gsNUMeNUM i have a couple of questions i hope you can answer when checking the oil level with the dip stick built into the oil fill cap does one check it with the cap screwed in or not </s>', '<s> i am more used to the dip stick for a cage where the stick is extracted fully wiped clean and reinserted fully then withdrawn and read </s>', '<s> the dip stick on my bike is part of the oil filler cap and has about NUM inch of threads on it </s>', '<s> do i remove the cap wipe the stick clean and reinsert it withwithout screwing it down before reading </s>', '<s> the service manual calls for the application of suzuki bond no </s>']
['<s> today is another first day i put up a few links that i either a find handy or b like because i can either relate or they just plain cracked me up </s>', '<s> it is finally friday </s>', '<s> has anyone else been suffering thro

- Tokens with unit frequency should be modeled as "\<UNK>".

In [8]:
import collections
import re

def replace_single_occurrence_tokens(sentences: List[str]) -> List[str]:
    """
    Reemplaza los tokens que aparecen solo una vez en un corpus por '<UNK>'.

    Args:
        sentences (List[str]): Lista de oraciones a procesar.

    Returns:
        List[str]: Lista de oraciones con tokens de ocurrencia única reemplazados.
    """
    all_tokens: List[str] = []
    for sentence in sentences:
        tokens: List[str] = sentence.split()
        all_tokens.extend(tokens)

    token_counts: collections.Counter = collections.Counter(all_tokens)

    processed_sentences: List[str] = []
    for sentence in sentences:
        current_sentence_tokens: List[str] = sentence.split()
        new_tokens: List[str] = []
        for token in current_sentence_tokens:
            new_tokens.append("<UNK>" if token_counts[token] == 1 else token)
        processed_sentences.append(" ".join(new_tokens))

    return processed_sentences


if __name__ == '__main__':
    processed_20news = replace_single_occurrence_tokens(tokenized_tags_20news)
    processed_blogs = replace_single_occurrence_tokens(tokenized_tags_blogs)

    print("--- Oraciones procesadas de 20news ---")
    print(processed_20news[:10])
    print("\n--- Oraciones procesadas de Blogs ---")
    print(processed_blogs[:10])

--- Oraciones procesadas de 20news ---
['<s> from cgkarrasworldstdcom christopher g karras subject need maintenance tips after reading the service manual for my bike suzuki gsNUMeNUM i have a couple of questions i hope you can answer when checking the oil level with the dip stick built into the oil fill cap does one check it with the cap screwed in or not </s>', '<s> i am more used to the dip stick for a cage where the stick is extracted fully wiped clean and reinserted fully then withdrawn and read </s>', '<s> the dip stick on my bike is part of the oil filler cap and has about NUM inch of threads on it </s>', '<s> do i remove the cap wipe the stick clean and reinsert it withwithout screwing it down before reading </s>', '<s> the service manual calls for the application of suzuki bond no </s>', '<s> NUMb on the head cover </s>', '<s> i guess this is some sort of liquid gasket material </s>', '<s> do you know of a generic cheaper substitute </s>', '<s> my headlight is a halogen NUM w b

(10p) Select 80% of the resulting sentences---random without replacement---to build the N-gram model and the remaining 20 for evaluation. Create the following files:

- 20N\_<group\_code>\_training (training sentences)
- 20N\_<group\_code>\_testing (testing sentences)
- BAC\_<group\_code>\_training (training sentences)
- BAC\_<group\_code>\_testing (testing sentences)

In [9]:
import random
import os
from typing import List


def split_and_save_data(
    sentences: List[str], 
    group_code: str, 
    file_prefix: str
) -> None:
    """
    Divide aleatoriamente una lista de oraciones en 80% entrenamiento y 20% prueba,
    y guarda los conjuntos resultantes en archivos de texto.

    Args:
        sentences (List[str]): Lista de oraciones a dividir.
        group_code (str): Código a incluir en los nombres de archivo (por ejemplo, 'my_group').
        file_prefix (str): Prefijo para los nombres de archivo de salida (por ejemplo, '20N' o 'BAC').

    Returns:
        None: La función no retorna nada. Genera archivos de salida en disco.
    """
    print(f"\n[INFO] Procesando {len(sentences)} oraciones para {file_prefix}")

    random.shuffle(sentences)
    split_index: int = int(len(sentences) * 0.8)

    training_set: List[str] = sentences[:split_index]
    testing_set: List[str] = sentences[split_index:]

    target_directory: str = "tercer-punto"
    if not os.path.exists(target_directory):
        os.makedirs(target_directory)

    training_filename: str = os.path.join(
        target_directory, f"{file_prefix}_{group_code}_training.txt"
    )
    testing_filename: str = os.path.join(
        target_directory, f"{file_prefix}_{group_code}_testing.txt"
    )

    try:
        with open(training_filename, "w", encoding="utf-8") as f:
            for sentence in training_set:
                f.write(f"{sentence}\n")
        print(
            f"[OK] Guardado el conjunto de entrenamiento en '{training_filename}' "
            f"({len(training_set)} oraciones)"
        )

        with open(testing_filename, "w", encoding="utf-8") as f:
            for sentence in testing_set:
                f.write(f"{sentence}\n")
        print(
            f"[OK] Guardado el conjunto de prueba en '{testing_filename}' "
            f"({len(testing_set)} oraciones)"
        )
    except Exception as e:
        print(f"[ERROR] No se pudieron guardar los archivos: {e}")


if __name__ == "__main__":
    group_code: str = "my_group"

    split_and_save_data(processed_20news, group_code, "20N")
    split_and_save_data(processed_blogs, group_code, "BAC")


[INFO] Procesando 290240 oraciones para 20N
[OK] Guardado el conjunto de entrenamiento en 'tercer-punto/20N_my_group_training.txt' (232192 oraciones)
[OK] Guardado el conjunto de prueba en 'tercer-punto/20N_my_group_testing.txt' (58048 oraciones)

[INFO] Procesando 8813223 oraciones para BAC
[OK] Guardado el conjunto de entrenamiento en 'tercer-punto/BAC_my_group_training.txt' (7050578 oraciones)
[OK] Guardado el conjunto de prueba en 'tercer-punto/BAC_my_group_testing.txt' (1762645 oraciones)


(50p) Build the following N-gram models using Laplace smoothing and generate an output file for each one (you choose the output structure, but be sure to provide an appropriate Python reading method/function):

- 20N\_<group\_code>\_unigrams
- 20N\_<group\_code>\_bigrams
- 20N\_<group\_code>\_trigrams
- BAC\_<group\_code>\_unigrams
- BAC\_<group\_code>\_bigrams
- BAC\_<group\_code>\_trigrams

In [11]:
import json
import collections
import os
import re

def build_ngram_models(
    corpus_file: str, 
    group_code: str, 
    file_prefix: str
) -> None:
    """
    Construye modelos de unigramas, bigramas y trigramas con suavizado de Laplace 
    a partir de un corpus de texto y los guarda como archivos JSON.

    Args:
        corpus_file (str): Ruta del archivo de texto (datos de entrenamiento).
        group_code (str): Código a incluir en los nombres de archivo de salida.
        file_prefix (str): Prefijo para los nombres de archivo (ejemplo: '20N' o 'BAC').

    Returns:
        None: La función no devuelve nada. Genera archivos JSON con los modelos.
    """
    # Crea el directorio para los modelos si no existe
    models_dir = os.path.join("cuarto-punto", "models")
    if not os.path.exists(models_dir):
        os.makedirs(models_dir)

    # Lee el corpus de texto
    try:
        with open(corpus_file, 'r', encoding='utf-8') as f:
            sentences = [line.strip() for line in f if line.strip()]
        if not sentences:
            print(f"[WARN] No se encontraron oraciones en el archivo: {corpus_file}")
            return
        print(f"[OK] Leído el corpus de entrenamiento de '{corpus_file}' con {len(sentences)} oraciones.")
    except Exception as e:
        print(f"[ERROR] No se pudo leer el archivo '{corpus_file}': {e}")
        return

    # Procesa los tokens para todos los modelos
    all_tokens = ' '.join(sentences).split()
    vocab_size = len(set(all_tokens))
    
    # 1. Construye y guarda el modelo de Unigramas
    unigram_counts = collections.Counter(all_tokens)
    unigram_model = {}
    total_tokens = len(all_tokens)
    for token, count in unigram_counts.items():
        unigram_model[token] = (count + 1) / (total_tokens + vocab_size)
    
    unigram_path = os.path.join(models_dir, f"{file_prefix}_{group_code}_unigrams.json")
    with open(unigram_path, 'w', encoding='utf-8') as f:
        json.dump(unigram_model, f, ensure_ascii=False, indent=4)
    print(f"[OK] Modelo de unigramas guardado en '{unigram_path}'")
    
    # 2. Construye y guarda el modelo de Bigramas
    bigram_counts = collections.Counter()
    unigram_context_counts = collections.Counter()
    for sentence in sentences:
        tokens = sentence.split()
        for i in range(len(tokens) - 1):
            bigram_counts[(tokens[i], tokens[i+1])] += 1
            unigram_context_counts[tokens[i]] += 1
    
    bigram_model = {}
    for (token1, token2), count in bigram_counts.items():
        context_count = unigram_context_counts[token1]
        probability = (count + 1) / (context_count + vocab_size)
        bigram_model[f"{token1} {token2}"] = probability
        
    bigram_path = os.path.join(models_dir, f"{file_prefix}_{group_code}_bigrams.json")
    with open(bigram_path, 'w', encoding='utf-8') as f:
        json.dump(bigram_model, f, ensure_ascii=False, indent=4)
    print(f"[OK] Modelo de bigramas guardado en '{bigram_path}'")
    
    # 3. Construye y guarda el modelo de Trigramas
    trigram_counts = collections.Counter()
    bigram_context_counts = collections.Counter()
    for sentence in sentences:
        tokens = sentence.split()
        for i in range(len(tokens) - 2):
            trigram_counts[(tokens[i], tokens[i+1], tokens[i+2])] += 1
            bigram_context_counts[(tokens[i], tokens[i+1])] += 1
            
    trigram_model = {}
    for (token1, token2, token3), count in trigram_counts.items():
        context_count = bigram_context_counts.get((token1, token2), 0)
        probability = (count + 1) / (context_count + vocab_size)
        trigram_model[f"{token1} {token2} {token3}"] = probability

    trigram_path = os.path.join(models_dir, f"{file_prefix}_{group_code}_trigrams.json")
    with open(trigram_path, 'w', encoding='utf-8') as f:
        json.dump(trigram_model, f, ensure_ascii=False, indent=4)
    print(f"[OK] Modelo de trigramas guardado en '{trigram_path}'")


if __name__ == '__main__':
    group_code = "my_group"
    
    training_file_20N = os.path.join("tercer-punto", f"20N_{group_code}_training.txt")
    training_file_BAC = os.path.join("tercer-punto", f"BAC_{group_code}_training.txt")
    
    # Construye y guarda los modelos
    build_ngram_models(training_file_20N, group_code, "20N")
    build_ngram_models(training_file_BAC, group_code, "BAC")


[OK] Leído el corpus de entrenamiento de 'tercer-punto/20N_my_group_training.txt' con 232192 oraciones.
[OK] Modelo de unigramas guardado en 'cuarto-punto/models/20N_my_group_unigrams.json'
[OK] Modelo de bigramas guardado en 'cuarto-punto/models/20N_my_group_bigrams.json'
[OK] Modelo de trigramas guardado en 'cuarto-punto/models/20N_my_group_trigrams.json'
[OK] Leído el corpus de entrenamiento de 'tercer-punto/BAC_my_group_training.txt' con 7050578 oraciones.
[OK] Modelo de unigramas guardado en 'cuarto-punto/models/BAC_my_group_unigrams.json'
[OK] Modelo de bigramas guardado en 'cuarto-punto/models/BAC_my_group_bigrams.json'
[OK] Modelo de trigramas guardado en 'cuarto-punto/models/BAC_my_group_trigrams.json'
