# Inglés -> Sipakapense

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()

## Librerías

In [1]:
from IPython.display import clear_output

In [2]:
!pip install -U bitsandbytes
clear_output()

In [3]:
!pip install -U accelerate peft transformers sentencepiece datasets
!pip install evaluate
!pip install rouge_score
!pip install nltk
clear_output()

In [None]:
# !pip install comet

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset, DatasetDict, Dataset
# from comet import download_model, load_from_checkpoint
import evaluate
from tqdm import tqdm
import csv
import torch
import os

## Datasets

Dataset se puede encontrar en el [siguiente enlace](https://github.com/transducens/mayanv).

In [5]:
!git clone https://github.com/transducens/mayanv.git

Cloning into 'mayanv'...
remote: Enumerating objects: 177, done.[K
remote: Counting objects: 100% (177/177), done.[K
remote: Compressing objects: 100% (163/163), done.[K
remote: Total 177 (delta 22), reused 157 (delta 14), pack-reused 0 (from 0)[K
Receiving objects: 100% (177/177), 1.35 MiB | 6.67 MiB/s, done.
Resolving deltas: 100% (22/22), done.


In [6]:
def generate_dataset(language, train_folder="train", test_folder="test", base_path="mayanv/MayanV"):
    # Rutas
    train_lang_path = f"{base_path}/{language}/{train_folder}/data.{language}"
    test_lang_path = f"{base_path}/{language}/{test_folder}/data.{language}"

    train_es_path = f"{base_path}/{language}/{train_folder}/data.es"
    test_es_path = f"{base_path}/{language}/{test_folder}/data.es"

    # Carga manual de archivos
    def load_lines(path):
        with open(path, encoding="utf-8") as f:
            return [line.strip() for line in f if line.strip()]

    train_src = load_lines(train_lang_path)
    train_tgt = load_lines(train_es_path)

    test_src = load_lines(test_lang_path)
    test_tgt = load_lines(test_es_path)

    # Crea datasets
    train_dataset = Dataset.from_dict({"input": train_src, "target": train_tgt})
    test_dataset = Dataset.from_dict({"input": test_src, "target": test_tgt})

    return DatasetDict({
        "train": train_dataset,
        "test": test_dataset,
    })

In [7]:
es_qum_dataset = generate_dataset("qum", train_folder="test", test_folder="dev")

## Modelo

### 1. Modelo Zero-shot

In [34]:
model_name = "facebook/nllb-200-distilled-600M"
zeroshot_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
zeroshot_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [35]:
zeroshot_tokenizer.src_lang = "eng_Latn"
forced_bos_id = zeroshot_tokenizer.convert_tokens_to_ids("quc_Latn")

### 2. Modelo con Finetuning Español-Sipakapense

In [8]:
model_name = "facebook/nllb-200-distilled-600M"
finetune_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)


finetune_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Configurar LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

# Aplicar QLoRA
finetune_model = get_peft_model(finetune_model, lora_config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [12]:
def preprocess_function(examples):
    max_length = 128

    inputs = finetune_tokenizer(
        examples["input"],
        max_length=max_length,
        truncation=True,
        padding="max_length"
    )

    targets = finetune_tokenizer(
        examples["target"],
        max_length=max_length,
        truncation=True,
        padding="max_length"
    )

    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_train = es_qum_dataset["train"].map(preprocess_function, batched=True)
tokenized_test = es_qum_dataset["test"].map(preprocess_function, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/321 [00:00<?, ? examples/s]

In [17]:
def translate_text(model, tokenizer, text, src_lang, tgt_lang):
    tokenizer.src_lang = src_lang
    forced_bos_id = tokenizer.convert_tokens_to_ids(tgt_lang)

    # Detecta el dispositivo (GPU o CPU)
    device = next(model.parameters()).device

    # Tokeniza y mueve a la misma device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

    # Generación
    outputs = model.generate(**inputs, forced_bos_token_id=forced_bos_id)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]


In [13]:
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir="./qlora_finetuned_qum_nllb",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    save_strategy="epoch",
    logging_dir="./logs_qlora",
    save_total_limit=2,
    bf16=False,
    fp16=True
)

trainer = Trainer(
    model=finetune_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=finetune_tokenizer,
)

trainer.train()
finetune_model.save_pretrained("./qlora_finetuned_qum_nllb")
finetune_tokenizer.save_pretrained("./qlora_finetuned_qum_nllb")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,8.0782


('./qlora_finetuned_qum_nllb/tokenizer_config.json',
 './qlora_finetuned_qum_nllb/special_tokens_map.json',
 './qlora_finetuned_qum_nllb/sentencepiece.bpe.model',
 './qlora_finetuned_qum_nllb/added_tokens.json',
 './qlora_finetuned_qum_nllb/tokenizer.json')

### 3. Modelo con Finetuning Inglés-Sipakapense

Para ejecutar esta parte se requiere haber definido el modelo Zeroshot para generar las traducciones Español -> Inglés

In [None]:
def build_en_mayaV_split(split, model, tokenizer):
    new_inputs = []
    new_targets = []

    print(f"🔄 Traduciendo split '{split}' de español a inglés...\n")
    for example in tqdm(es_qum_dataset[split]):
        es_text = example["input"]
        en_text = translate_text(model, tokenizer, es_text, "spa_Latn", "eng_Latn")
        new_inputs.append(en_text)
        new_targets.append(example["target"])

    return Dataset.from_dict({
        "input": new_inputs,
        "target": new_targets
    })

# Generar splits
en_qum_train = build_en_mayaV_split("train", zeroshot_model, zeroshot_tokenizer)
en_qum_test = build_en_mayaV_split("test", zeroshot_model, zeroshot_tokenizer)

# Combinar
en_qum_dataset = DatasetDict({
    "train": en_qum_train,
    "test": en_qum_test
})

🔄 Traduciendo split 'train' de español a inglés...



 82%|████████▏ | 822/1000 [48:09<11:47,  3.98s/it]

In [None]:
output_dir = "."

import os
os.makedirs(output_dir, exist_ok=True)

# Guardar cada split en .en (inglés) y .quc (quiché)
for split in ["train", "test"]:
    with open(f"{output_dir}/{split}.en", "w", encoding="utf-8") as f_en, \
         open(f"{output_dir}/{split}.quc", "w", encoding="utf-8") as f_quc:

        for example in en_qum_dataset[split]:
            f_en.write(example["input"].strip() + "\n")
            f_quc.write(example["target"].strip() + "\n")

print("✅ Archivos .en y .quc guardados en", output_dir)

In [None]:
model_name = "facebook/nllb-200-distilled-600M"
en_qum_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

base_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

en_qum_model = get_peft_model(base_model, lora_config)

In [None]:
def preprocess_function_en_qum(examples):
    max_length = 128
    inputs = en_qum_tokenizer(examples["input"], max_length=max_length, truncation=True, padding=True)
    targets = en_qum_tokenizer(examples["target"], max_length=max_length, truncation=True, padding=True)
    inputs["labels"] = targets["input_ids"]
    return inputs


tokenized_en_qum_train = en_qum_dataset["train"].map(preprocess_function_en_qum, batched=True)
tokenized_en_qum_test = en_qum_dataset["test"].map(preprocess_function_en_qum, batched=True)

In [None]:
os.environ["WANDB_DISABLED"] = "true"

training_args_en_qum = TrainingArguments(
    output_dir="./finetuned_en_qum_nllb",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    logging_dir="./logs_en_qum",
    logging_steps=100,
)

In [None]:
trainer_en_qum = Trainer(
    model=en_qum_model,
    args=training_args_en_qum,
    train_dataset=tokenized_en_qum_train,
    eval_dataset=tokenized_en_qum_test,
    tokenizer=en_qum_tokenizer,
)

trainer_en_qum.train()

## Evaluación

### Métricas

In [14]:
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

meteor = evaluate.load("meteor")
# comet_path = download_model("Unbabel/wmt22-comet-da")
# comet_model = load_from_checkpoint(comet_path)

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


### Experimentos

#### 1. Zero-shot

Como NLLB tiene el idioma K’iche’, se coloca este como idioma objetivo para las traducciones directas.

In [36]:
predictions = []
references = []

for item in tqdm(es_qum_dataset["test"]):
    input_text = item["input"]
    reference = item["target"]

    inputs = zeroshot_tokenizer(input_text, return_tensors="pt", truncation=True)
    outputs = zeroshot_model.generate(**inputs, forced_bos_token_id=forced_bos_id)
    pred = zeroshot_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    predictions.append(pred)
    references.append(reference)

100%|██████████| 321/321 [1:17:38<00:00, 14.51s/it]


In [37]:
# Mostrar algunas predicciones
print("\n=== Ejemplos ===\n")
for i in range(5):
    print(f"> Entrada:     {es_qum_dataset['test'][i]['input']}")
    print(f"> Referencia:  {es_qum_dataset['test'][i]['target']}")
    print(f"> Predicción:  {predictions[i]}\n")


=== Ejemplos ===

> Entrada:     wu rxqiil taʼ xuux xaq utz laj siʼ kchuknik chre
> Referencia:  #qum# La esposa de don Jesus solo utiliza buena leña
> Predicción:                                                                                                    

> Entrada:     Njeel ri xkarawaʼn xpochʼik rech chenim xchaqʼjik
> Referencia:  #qum# Todo la arveja se
> Predicción:  Njeel ri xkarawaʼn xpochʼik rech chenim xchaqʼjik

> Entrada:     Pri qtinmit qal chik kchkunsxik ri qyolbʼaal
> Referencia:  #qum# En nuestro pueblo ya no se usa nuestro idioma
> Predicción:  

> Entrada:     Ri ajtijnel tjin kirtijuuj c hemo rchkunsxik ri suʼ
> Referencia:  #qum# El estudiante está aprendiendo como
> Predicción:                                                                                                    

> Entrada:     Ri ixoq tzʼulmaj rum ri rchjiil
> Referencia:  #qum# La mujer estaba abrazada por su marido
> Predicción:  



In [38]:
bleu_score = bleu.compute(predictions=predictions, references=[[ref] for ref in references])

print("\n=== Métricas BLEU ===")
print(f"BLEU-1: {bleu_score['precisions'][0]*100:.2f}")
print(f"BLEU-2: {bleu_score['precisions'][1]*100:.2f}")
print(f"BLEU-3: {bleu_score['precisions'][2]*100:.2f}")
print(f"BLEU-4: {bleu_score['precisions'][3]*100:.2f}")
print(f"BLEU total: {bleu_score['bleu']*100:.2f}")

rouge_score = rouge.compute(predictions=predictions, references=references)

print("\n=== Métricas ROUGE ===")
print(f"ROUGE-1:   {rouge_score['rouge1']*100:.2f}")
print(f"ROUGE-2:   {rouge_score['rouge2']*100:.2f}")
print(f"ROUGE-L:   {rouge_score['rougeL']*100:.2f}")
print(f"ROUGE-Lsum:{rouge_score['rougeLsum']*100:.2f}")

meteor_score = meteor.compute(predictions=predictions, references=references)
# comet_data = [
#     {"src": src, "mt": pred, "ref": ref}
#     for src, pred, ref in zip(
#         [ex["input"] for ex in es_qum_dataset["test"]],
#         predictions_intermediate,
#         references_intermediate
#     )
# ]
# comet_score = comet_model.predict(comet_data, batch_size=8, gpus=1)

print("\n=== Otras Métricas ===")
print(f"METEOR:     {meteor_score['meteor']*100:.2f}")
# print(f"COMET:      {comet_score.system_score*100:.2f}")


=== Métricas BLEU ===
BLEU-1: 0.21
BLEU-2: 0.00
BLEU-3: 0.00
BLEU-4: 0.00
BLEU total: 0.00

=== Métricas ROUGE ===
ROUGE-1:   0.44
ROUGE-2:   0.00
ROUGE-L:   0.43
ROUGE-Lsum:0.45

=== Otras Métricas ===
METEOR:     0.09


In [41]:
with open("results_es_qum_zeroshot.tsv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["input", "reference", "prediction"])

    for example, pred, ref in zip(es_qum_dataset["test"], predictions, references):
        writer.writerow([example["input"], ref, pred])

print("✅ Resultados guardados en 'results_es_qum_zeroshot.tsv'")

✅ Resultados guardados en 'results_es_qum_zeroshot.tsv'


#### 2. Finetuning ES-QUC y Pipeline

In [18]:
predictions_intermediate = []
references_intermediate = []

print("\n🔹 Traduciendo: inglés → español → K’iche’ (vía intermediaria)\n")

for item in tqdm(es_qum_dataset["test"]):
    input_text = item["input"]
    reference = item["target"]

    # Paso 1: inglés → español (modelo base)
    inter = translate_text(finetune_model, finetune_tokenizer, input_text, "eng_Latn", "spa_Latn")

    # Paso 2: español → K’iche’ (modelo fine-tuneado)
    pred = translate_text(finetune_model, finetune_tokenizer, inter, "spa_Latn", "quc_Latn")

    predictions_intermediate.append(pred)
    references_intermediate.append(reference)


🔹 Traduciendo: inglés → español → K’iche’ (vía intermediaria)



100%|██████████| 321/321 [06:39<00:00,  1.24s/it]


In [19]:
print("\n=== Ejemplos Fine-Tuned ===\n")
for i in range(5):
    print(f"> Entrada:     {es_qum_dataset['test'][i]['input']}")
    print(f"> Referencia:  {es_qum_dataset['test'][i]['target']}")
    print(f"> Predicción:  {predictions_intermediate[i]}\n")


=== Ejemplos Fine-Tuned ===

> Entrada:     wu rxqiil taʼ xuux xaq utz laj siʼ kchuknik chre
> Referencia:  #qum# La esposa de don Jesus solo utiliza buena leña
> Predicción:  #qum# El hombre que se ha tirado el pelo

> Entrada:     Njeel ri xkarawaʼn xpochʼik rech chenim xchaqʼjik
> Referencia:  #qum# Todo la arveja se
> Predicción:  # #qum# La mujer se ha ido

> Entrada:     Pri qtinmit qal chik kchkunsxik ri qyolbʼaal
> Referencia:  #qum# En nuestro pueblo ya no se usa nuestro idioma
> Predicción:  # La mujer se ha hecho el trabajo

> Entrada:     Ri ajtijnel tjin kirtijuuj c hemo rchkunsxik ri suʼ
> Referencia:  #qum# El estudiante está aprendiendo como
> Predicción:  La gente se ha echado

> Entrada:     Ri ixoq tzʼulmaj rum ri rchjiil
> Referencia:  #qum# La mujer estaba abrazada por su marido
> Predicción:  La gente se va a casa



In [29]:
bleu_score = bleu.compute(predictions=predictions_intermediate, references=[[ref] for ref in references_intermediate])

print("\n=== Métricas BLEU ===")
print(f"BLEU-1: {bleu_score['precisions'][0]*100:.2f}")
print(f"BLEU-2: {bleu_score['precisions'][1]*100:.2f}")
print(f"BLEU-3: {bleu_score['precisions'][2]*100:.2f}")
print(f"BLEU-4: {bleu_score['precisions'][3]*100:.2f}")
print(f"BLEU total: {bleu_score['bleu']*100:.2f}")

rouge_score = rouge.compute(predictions=predictions_intermediate, references=references_intermediate)

print("\n=== Métricas ROUGE ===")
print(f"ROUGE-1:   {rouge_score['rouge1']*100:.2f}")
print(f"ROUGE-2:   {rouge_score['rouge2']*100:.2f}")
print(f"ROUGE-L:   {rouge_score['rougeL']*100:.2f}")
print(f"ROUGE-Lsum:{rouge_score['rougeLsum']*100:.2f}")

meteor_score = meteor.compute(predictions=predictions_intermediate, references=references_intermediate)
# comet_data = [
#     {"src": src, "mt": pred, "ref": ref}
#     for src, pred, ref in zip(
#         [ex["input"] for ex in es_qum_dataset["test"]],
#         predictions_intermediate,
#         references_intermediate
#     )
# ]
# comet_score = comet_model.predict(comet_data, batch_size=8, gpus=1)

print("\n=== Otras Métricas ===")
print(f"METEOR:     {meteor_score['meteor']*100:.2f}")
# print(f"COMET:      {comet_score.system_score*100:.2f}")


=== Métricas BLEU ===
BLEU-1: 19.84
BLEU-2: 6.16
BLEU-3: 3.32
BLEU-4: 0.78
BLEU total: 2.82

=== Métricas ROUGE ===
ROUGE-1:   12.99
ROUGE-2:   1.31
ROUGE-L:   12.57
ROUGE-Lsum:12.52

=== Otras Métricas ===
METEOR:     11.11


In [31]:
with open("results_es_qum_finetune_intermediate.tsv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["input", "reference", "prediction"])

    for example, pred, ref in zip(es_qum_dataset["test"], predictions_intermediate, references_intermediate):
        writer.writerow([example["input"], ref, pred])

print("✅ Resultados guardados en 'results_es_qum_finetune_intermediate.tsv'")

✅ Resultados guardados en 'results_es_qum_finetune_intermediate.tsv'


#### 3. Finetuning ES-QUC y Traducción Directa

In [23]:
predictions_direct = []
references_direct = []

print("\n🔹 Traduciendo: inglés → K’iche’ (directo)\n")

for item in tqdm(es_qum_dataset["test"]):
    input_text = item["input"]        # asumimos que esto está en inglés
    reference = item["target"]        # y esto en K’iche’
    pred = translate_text(finetune_model, finetune_tokenizer, input_text, "eng_Latn", "quc_Latn")

    predictions_direct.append(pred)
    references_direct.append(reference)


🔹 Traduciendo: inglés → K’iche’ (directo)



100%|██████████| 321/321 [03:44<00:00,  1.43it/s]


In [24]:
print("\n=== Ejemplos Fine-Tuned ===\n")
for i in range(5):
    print(f"> Entrada:     {es_qum_dataset['test'][i]['input']}")
    print(f"> Referencia:  {es_qum_dataset['test'][i]['target']}")
    print(f"> Predicción:  {predictions_direct[i]}\n")


=== Ejemplos Fine-Tuned ===

> Entrada:     wu rxqiil taʼ xuux xaq utz laj siʼ kchuknik chre
> Referencia:  #qum# La esposa de don Jesus solo utiliza buena leña
> Predicción:  #qum# La mujer se ha hecho el uso de la

> Entrada:     Njeel ri xkarawaʼn xpochʼik rech chenim xchaqʼjik
> Referencia:  #qum# Todo la arveja se
> Predicción:  # Cuando se despierta la casa de la mujer

> Entrada:     Pri qtinmit qal chik kchkunsxik ri qyolbʼaal
> Referencia:  #qum# En nuestro pueblo ya no se usa nuestro idioma
> Predicción:  # El hombre que se ha quedado en la casa

> Entrada:     Ri ajtijnel tjin kirtijuuj c hemo rchkunsxik ri suʼ
> Referencia:  #qum# El estudiante está aprendiendo como
> Predicción:  #qum# El hombre se ha ido a la casa

> Entrada:     Ri ixoq tzʼulmaj rum ri rchjiil
> Referencia:  #qum# La mujer estaba abrazada por su marido
> Predicción:  # El hombre que se ha ido



In [28]:
bleu_score = bleu.compute(predictions=predictions_direct, references=[[ref] for ref in references_direct])

print("\n=== Métricas BLEU ===")
print(f"BLEU-1: {bleu_score['precisions'][0]*100:.2f}")
print(f"BLEU-2: {bleu_score['precisions'][1]*100:.2f}")
print(f"BLEU-3: {bleu_score['precisions'][2]*100:.2f}")
print(f"BLEU-4: {bleu_score['precisions'][3]*100:.2f}")
print(f"BLEU total: {bleu_score['bleu']*100:.2f}")

rouge_score = rouge.compute(predictions=predictions_direct, references=references_direct)

print("\n=== Métricas ROUGE ===")
print(f"ROUGE-1:   {rouge_score['rouge1']*100:.2f}")
print(f"ROUGE-2:   {rouge_score['rouge2']*100:.2f}")
print(f"ROUGE-L:   {rouge_score['rougeL']*100:.2f}")
print(f"ROUGE-Lsum:{rouge_score['rougeLsum']*100:.2f}")

meteor_score = meteor.compute(predictions=predictions_direct, references=references_direct)
# comet_data = [
#     {"src": src, "mt": pred, "ref": ref}
#     for src, pred, ref in zip(
#         [ex["input"] for ex in es_qum_dataset["test"]],
#         predictions_intermediate,
#         references_intermediate
#     )
# ]
# comet_score = comet_model.predict(comet_data, batch_size=8, gpus=1)

print("\n=== Otras Métricas ===")
print(f"METEOR:     {meteor_score['meteor']*100:.2f}")
# print(f"COMET:      {comet_score.system_score*100:.2f}")


=== Métricas BLEU ===
BLEU-1: 21.22
BLEU-2: 9.45
BLEU-3: 5.45
BLEU-4: 1.59
BLEU total: 5.68

=== Métricas ROUGE ===
ROUGE-1:   15.55
ROUGE-2:   2.23
ROUGE-L:   14.50
ROUGE-Lsum:14.46

=== Otras Métricas ===
METEOR:     16.58


In [33]:
filename = "results_es_qum_finetune_direct.tsv"
with open(filename, "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["input", "reference", "prediction"])

    for example, pred, ref in zip(es_qum_dataset["test"], predictions_direct, references_direct):
        writer.writerow([example["input"], ref, pred])

print(f"✅ Resultados guardados en '{filename}'")

✅ Resultados guardados en 'results_es_qum_finetune_direct.tsv'


#### 4. Finetuning EN-QUC

In [None]:
predictions_en_qum = []
references_en_qum = []

print("\n🔹 Traduciendo: inglés → K’iche’ (dataset intermedio)\n")

for item in tqdm(en_qum_dataset["test"]):
    input_text = item["input"]
    reference = item["target"]
    pred = translate_text(en_qum_model, en_qum_tokenizer, input_text, "eng_Latn", "quc_Latn")

    predictions_en_qum.append(pred)
    references_en_qum.append(reference)

In [None]:
print("\n=== Ejemplos Fine-Tuned ===\n")
for i in range(5):
    print(f"> Entrada:     {en_qum_dataset['test'][i]['input']}")
    print(f"> Referencia:  {en_qum_dataset['test'][i]['target']}")
    print(f"> Predicción:  {predictions_en_qum[i]}\n")

In [None]:
bleu_score = bleu.compute(predictions=predictions_en_qum, references=[[ref] for ref in references_en_qum])

print("\n=== Métricas BLEU ===")
print(f"BLEU-1: {bleu_score['precisions'][0]*100:.2f}")
print(f"BLEU-2: {bleu_score['precisions'][1]*100:.2f}")
print(f"BLEU-3: {bleu_score['precisions'][2]*100:.2f}")
print(f"BLEU-4: {bleu_score['precisions'][3]*100:.2f}")
print(f"BLEU total: {bleu_score['bleu']*100:.2f}")

rouge_score = rouge.compute(predictions=predictions_en_qum, references=references_en_qum)

print("\n=== Métricas ROUGE ===")
print(f"ROUGE-1:   {rouge_score['rouge1']*100:.2f}")
print(f"ROUGE-2:   {rouge_score['rouge2']*100:.2f}")
print(f"ROUGE-L:   {rouge_score['rougeL']*100:.2f}")
print(f"ROUGE-Lsum:{rouge_score['rougeLsum']*100:.2f}")

meteor_score = meteor.compute(predictions=predictions_en_qum, references=references_en_qum)
# comet_data = [
#     {"src": src, "mt": pred, "ref": ref}
#     for src, pred, ref in zip(
#         [ex["input"] for ex in es_qum_dataset["test"]],
#         predictions_intermediate,
#         references_intermediate
#     )
# ]
# comet_score = comet_model.predict(comet_data, batch_size=8, gpus=1)

print("\n=== Otras Métricas ===")
print(f"METEOR:     {meteor_score['meteor']*100:.2f}")
# print(f"COMET:      {comet_score.system_score*100:.2f}")

In [None]:
filename = "results_es_qum_finetune_en_qum.tsv"
with open(filename, "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["input", "reference", "prediction"])

    for example, pred, ref in zip(es_qum_dataset["test"], predictions_en_qum, references_en_qum):
        writer.writerow([example["input"], ref, pred])

print(f"✅ Resultados guardados en '{filename}'")