# TraducciÃ³n InglÃ©s -> Sipakapense

## LibrerÃ­as

In [10]:
!pip install transformers sentencepiece
!pip install evaluate
!pip install datasets
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=7fdfb72db4ea7cacb776d18977ba344148a1a235ce19b80733db9028cf991e35
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict, Dataset
import evaluate
from tqdm import tqdm
import csv

## Datasets

Dataset se puede encontrar en el [siguiente enlace](https://github.com/transducens/mayanv).

In [3]:
!git clone https://github.com/transducens/mayanv.git

Cloning into 'mayanv'...
remote: Enumerating objects: 177, done.[K
remote: Counting objects: 100% (177/177), done.[K
remote: Compressing objects: 100% (163/163), done.[K
remote: Total 177 (delta 22), reused 157 (delta 14), pack-reused 0 (from 0)[K
Receiving objects: 100% (177/177), 1.35 MiB | 2.80 MiB/s, done.
Resolving deltas: 100% (22/22), done.


In [4]:
def generate_dataset(language, train_folder="train", test_folder="test", base_path="mayanv/MayanV"):
    # Rutas
    train_lang_path = f"{base_path}/{language}/{train_folder}/data.{language}"
    test_lang_path = f"{base_path}/{language}/{test_folder}/data.{language}"

    train_es_path = f"{base_path}/{language}/{train_folder}/data.es"
    test_es_path = f"{base_path}/{language}/{test_folder}/data.es"

    # Carga manual de archivos
    def load_lines(path):
        with open(path, encoding="utf-8") as f:
            return [line.strip() for line in f if line.strip()]

    train_src = load_lines(train_lang_path)
    train_tgt = load_lines(train_es_path)

    test_src = load_lines(test_lang_path)
    test_tgt = load_lines(test_es_path)

    # Crea datasets
    train_dataset = Dataset.from_dict({"input": train_src, "target": train_tgt})
    test_dataset = Dataset.from_dict({"input": test_src, "target": test_tgt})

    return DatasetDict({
        "train": train_dataset,
        "test": test_dataset,
    })

In [6]:
es_qum_dataset = generate_dataset("qum", train_folder="test", test_folder="dev")

## Modelo

### 1. Modelo Zero-shot

In [7]:
model_name = "facebook/nllb-200-distilled-600M"
zeroshot_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
zeroshot_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [8]:
zeroshot_tokenizer.src_lang = "eng_Latn"
forced_bos_id = zeroshot_tokenizer.convert_tokens_to_ids("quc_Latn")

### 2. Modelo con Finetuning EspaÃ±ol-Sipakapense

In [None]:
finetune_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
finetune_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
def preprocess_function(examples):
    max_length = 128
    # Tokeniza el input
    inputs = finetune_tokenizer(examples["input"], max_length=max_length, truncation=True)
    # Tokeniza la salida (target)
    targets = finetune_tokenizer(examples["target"], max_length=max_length, truncation=True)

    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_train = es_qum_dataset["train"].map(preprocess_function, batched=True)
tokenized_test = es_qum_dataset["test"].map(preprocess_function, batched=True)

In [None]:
def translate_text(model, tokenizer, text, src_lang, tgt_lang):
    tokenizer.src_lang = src_lang
    forced_bos_id = tokenizer.convert_tokens_to_ids(tgt_lang)
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    outputs = model.generate(**inputs, forced_bos_token_id=forced_bos_id)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

In [None]:
training_args = TrainingArguments(
    output_dir="./finetuned_qum_nllb",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
)

trainer = Trainer(
    model=finetune_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=finetune_tokenizer,
)

trainer.train()

### 3. Modelo con Finetuning InglÃ©s-Sipakapense

In [None]:
def build_en_qum_split(split):
    new_inputs = []
    new_targets = []

    print(f"ðŸ”„ Traduciendo split '{split}' de espaÃ±ol a inglÃ©s...\n")
    for example in tqdm(es_qum_dataset[split]):
        es_text = example["input"]
        en_text = translate_text(model, tokenizer, es_text, "spa_Latn", "eng_Latn")
        new_inputs.append(en_text)
        new_targets.append(example["target"])

    return Dataset.from_dict({
        "input": new_inputs,
        "target": new_targets
    })

# Generar splits
en_qum_train = build_en_qum_split("train")
en_qum_test = build_en_qum_split("test")

# Combinar
en_qum_dataset = DatasetDict({
    "train": en_qum_train,
    "test": en_qum_test
})

In [None]:
output_dir = "."

import os
os.makedirs(output_dir, exist_ok=True)

# Guardar cada split en .en (inglÃ©s) y .quc (quichÃ©)
for split in ["train", "test"]:
    with open(f"{output_dir}/{split}.en", "w", encoding="utf-8") as f_en, \
         open(f"{output_dir}/{split}.quc", "w", encoding="utf-8") as f_quc:

        for example in en_qum_dataset[split]:
            f_en.write(example["input"].strip() + "\n")
            f_quc.write(example["target"].strip() + "\n")

print("âœ… Archivos .en y .quc guardados en", output_dir)

In [None]:
model_name = "facebook/nllb-200-distilled-600M"
en_qum_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
en_qum_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
def preprocess_function_en_qum(examples):
    max_length = 128
    inputs = en_qum_tokenizer(examples["input"], max_length=max_length, truncation=True)
    targets = en_qum_tokenizer(examples["target"], max_length=max_length, truncation=True)
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_en_qum_train = en_qum_dataset["train"].map(preprocess_function_en_qum, batched=True)
tokenized_en_qum_test = en_qum_dataset["test"].map(preprocess_function_en_qum, batched=True)

In [None]:
training_args_en_qum = TrainingArguments(
    output_dir="./finetuned_en_qum_nllb",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    logging_dir="./logs_en_qum",
    logging_steps=100,
)

In [None]:
trainer_en_qum = Trainer(
    model=en_qum_model,
    args=training_args_en_qum,
    train_dataset=tokenized_en_qum_train,
    eval_dataset=tokenized_en_qum_test,
    tokenizer=en_qum_tokenizer,
)

trainer_en_qum.train()

## EvaluaciÃ³n

### MÃ©tricas

In [11]:
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

### Experimentos

#### 1. Zero-shot

Como NLLB tiene el idioma Kâ€™icheâ€™, se coloca este como idioma objetivo para las traducciones directas.

In [12]:
predictions = []
references = []

for item in tqdm(es_qum_dataset["test"]):
    input_text = item["input"]
    reference = item["target"]

    inputs = zeroshot_tokenizer(input_text, return_tensors="pt", truncation=True)
    outputs = zeroshot_model.generate(**inputs, forced_bos_token_id=forced_bos_id)
    pred = zeroshot_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    predictions.append(pred)
    references.append(reference)

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 321/321 [1:16:41<00:00, 14.33s/it]


In [13]:
# Mostrar algunas predicciones
print("\n=== Ejemplos ===\n")
for i in range(5):
    print(f"> Entrada:     {es_qum_dataset['test'][i]['input']}")
    print(f"> Referencia:  {es_qum_dataset['test'][i]['target']}")
    print(f"> PredicciÃ³n:  {predictions[i]}\n")


=== Ejemplos ===

> Entrada:     wu rxqiil taÊ¼ xuux xaq utz laj siÊ¼ kchuknik chre
> Referencia:  #qum# La esposa de don Jesus solo utiliza buena leÃ±a
> PredicciÃ³n:                                                                                                    

> Entrada:     Njeel ri xkarawaÊ¼n xpochÊ¼ik rech chenim xchaqÊ¼jik
> Referencia:  #qum# Todo la arveja se
> PredicciÃ³n:  Njeel ri xkarawaÊ¼n xpochÊ¼ik rech chenim xchaqÊ¼jik

> Entrada:     Pri qtinmit qal chik kchkunsxik ri qyolbÊ¼aal
> Referencia:  #qum# En nuestro pueblo ya no se usa nuestro idioma
> PredicciÃ³n:  

> Entrada:     Ri ajtijnel tjin kirtijuuj c hemo rchkunsxik ri suÊ¼
> Referencia:  #qum# El estudiante estÃ¡ aprendiendo como
> PredicciÃ³n:                                                                                                    

> Entrada:     Ri ixoq tzÊ¼ulmaj rum ri rchjiil
> Referencia:  #qum# La mujer estaba abrazada por su marido
> PredicciÃ³n:  



In [15]:
bleu_score = bleu.compute(predictions=predictions, references=[[ref] for ref in references])

print("\n=== MÃ©tricas BLEU ===")
print(f"BLEU-1: {bleu_score['precisions'][0]*100:.2f}")
print(f"BLEU-2: {bleu_score['precisions'][1]*100:.2f}")
print(f"BLEU-3: {bleu_score['precisions'][2]*100:.2f}")
print(f"BLEU-4: {bleu_score['precisions'][3]*100:.2f}")
print(f"BLEU total: {bleu_score['bleu']*100:.2f}")

rouge_score = rouge.compute(predictions=predictions, references=references)

print("\n=== MÃ©tricas ROUGE ===")
print(f"ROUGE-1:   {rouge_score['rouge1']*100:.2f}")
print(f"ROUGE-2:   {rouge_score['rouge2']*100:.2f}")
print(f"ROUGE-L:   {rouge_score['rougeL']*100:.2f}")
print(f"ROUGE-Lsum:{rouge_score['rougeLsum']*100:.2f}")


=== MÃ©tricas BLEU ===
BLEU-1: 0.21
BLEU-2: 0.00
BLEU-3: 0.00
BLEU-4: 0.00
BLEU total: 0.00

=== MÃ©tricas ROUGE ===
ROUGE-1:   0.44
ROUGE-2:   0.00
ROUGE-L:   0.43
ROUGE-Lsum:0.44


In [16]:
with open("results_es_qum.tsv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["input", "reference", "prediction"])

    for example, pred, ref in zip(es_qum_dataset["test"], predictions, references):
        writer.writerow([example["input"], ref, pred])

print("âœ… Resultados guardados en 'results_es_qum.tsv'")

âœ… Resultados guardados en 'results_es_qum.tsv'


#### 2. Finetuning ES-QUC y Pipeline

In [None]:
predictions_intermediate = []
references_intermediate = []

print("\nðŸ”¹ Traduciendo: inglÃ©s â†’ espaÃ±ol â†’ Kâ€™icheâ€™ (vÃ­a intermediaria)\n")

for item in tqdm(es_qum_dataset["test"]):
    input_text = item["input"]
    reference = item["target"]

    # Paso 1: inglÃ©s â†’ espaÃ±ol (modelo base)
    inter = translate_text(zeroshot_model, zeroshot_tokenizer, input_text, "eng_Latn", "spa_Latn")

    # Paso 2: espaÃ±ol â†’ Kâ€™icheâ€™ (modelo fine-tuneado)
    pred = translate_text(finetune_model, finetune_tokenizer, inter, "spa_Latn", "quc_Latn")

    predictions_intermediate.append(pred)
    references_intermediate.append(reference)

In [None]:
print("\n=== Ejemplos Fine-Tuned ===\n")
for i in range(5):
    print(f"> Entrada:     {es_qum_dataset['test'][i]['input']}")
    print(f"> Referencia:  {es_qum_dataset['test'][i]['target']}")
    print(f"> PredicciÃ³n:  {predictions_intermediate[i]}\n")

In [None]:
bleu_score_ft = bleu.compute(predictions=predictions_intermediate, references=[[r] for r in references_intermediate])
rouge_score_ft = rouge.compute(predictions=predictions_intermediate, references=references_intermediate)

print("\n=== MÃ©tricas Fine-Tuned ===")
print(f"BLEU: {bleu_score_ft['bleu']:.2f}")
print(f"ROUGE-L: {rouge_score_ft['rougeL']:.2f}")

#### 3. Finetuning ES-QUC y TraducciÃ³n Directa

In [None]:
predictions_direct = []
references_direct = []

print("\nðŸ”¹ Traduciendo: inglÃ©s â†’ Kâ€™icheâ€™ (directo)\n")

for item in tqdm(es_qum_dataset["test"]):
    input_text = item["input"]        # asumimos que esto estÃ¡ en inglÃ©s
    reference = item["target"]        # y esto en Kâ€™icheâ€™
    pred = translate_text(finetune_model, finetune_tokenizer, input_text, "eng_Latn", "quc_Latn")

    predictions_direct.append(pred)
    references_direct.append(reference)

In [None]:
print("\n=== Ejemplos Fine-Tuned ===\n")
for i in range(5):
    print(f"> Entrada:     {es_qum_dataset['test'][i]['input']}")
    print(f"> Referencia:  {es_qum_dataset['test'][i]['target']}")
    print(f"> PredicciÃ³n:  {predictions_direct[i]}\n")

In [None]:
bleu_score_ft = bleu.compute(predictions=predictions_direct, references=[[r] for r in references_direct])
rouge_score_ft = rouge.compute(predictions=predictions_direct, references=references_direct)

print("\n=== MÃ©tricas Fine-Tuned ===")
print(f"BLEU: {bleu_score_ft['bleu']:.2f}")
print(f"ROUGE-L: {rouge_score_ft['rougeL']:.2f}")

#### 4. Finetuning EN-QUC