# Experimentos de Traducción Alemán -> Sipakapense

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
lang = "de"
n_epochs = 15

## Librerías

In [3]:
from IPython.display import clear_output

In [4]:
!pip install -U bitsandbytes
clear_output()

In [5]:
!pip install -U accelerate peft transformers sentencepiece datasets
!pip install evaluate
!pip install rouge_score
!pip install nltk
clear_output()

In [5]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset, DatasetDict, Dataset
import evaluate
from tqdm import tqdm
import csv
import torch
import os

## Datasets
!REQUIERE SUBIR LOS ARCHIVOS DEL DATASET SINTETICO

Dataset se puede encontrar en el [siguiente enlace](https://github.com/transducens/mayanv).

In [7]:
!git clone https://github.com/transducens/mayanv.git

Cloning into 'mayanv'...
remote: Enumerating objects: 177, done.[K
remote: Counting objects: 100% (177/177), done.[K
remote: Compressing objects: 100% (163/163), done.[K
remote: Total 177 (delta 22), reused 157 (delta 14), pack-reused 0 (from 0)[K
Receiving objects: 100% (177/177), 1.35 MiB | 5.46 MiB/s, done.
Resolving deltas: 100% (22/22), done.


In [2]:
def load_lines(path):
    with open(path, encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]

def generate_dataset(language, train_folder="train", test_folder="test", base_path="mayanv/MayanV"):
    # Rutas
    train_lang_path = f"{base_path}/{language}/{train_folder}/data.{language}"
    test_lang_path = f"{base_path}/{language}/{test_folder}/data.{language}"

    train_es_path = f"{base_path}/{language}/{train_folder}/data.es"
    test_es_path = f"{base_path}/{language}/{test_folder}/data.es"

    train_src = load_lines(train_lang_path)
    train_tgt = load_lines(train_es_path)

    test_src = load_lines(test_lang_path)
    test_tgt = load_lines(test_es_path)

    # Crea datasets
    train_dataset = Dataset.from_dict({"input": train_src, "target": train_tgt})
    test_dataset = Dataset.from_dict({"input": test_src, "target": test_tgt})

    return DatasetDict({
        "train": train_dataset,
        "test": test_dataset,
    })

In [3]:
def invertir_input_target(example):
    return {
        "input": example["target"],
        "target": example["input"]
    }

In [6]:
es_qum_dataset = generate_dataset("qum", train_folder="test", test_folder="dev")
es_qum_dataset = DatasetDict({
    "train": es_qum_dataset["train"].map(invertir_input_target),
    "test": es_qum_dataset["test"].map(invertir_input_target),
})

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/321 [00:00<?, ? examples/s]

In [7]:
def generate_en_qum_dataset_from_files(
    data_dir="./",
    base_path="mayanv/MayanV",
    language="qum",
    train_folder="test",
    test_folder="dev",
    lang=lang
):
    # Paths a los archivos .en
    train_en_path = os.path.join(data_dir, f"train.{lang}")
    test_en_path = os.path.join(data_dir, f"test.{lang}")

    # Paths a los archivos .qum
    train_qum_path = os.path.join(base_path, language, train_folder, "data.qum")
    test_qum_path = os.path.join(base_path, language, test_folder, "data.qum")

    # Carga las líneas
    train_en = load_lines(train_en_path)
    test_en = load_lines(test_en_path)

    train_qum = load_lines(train_qum_path)
    test_qum = load_lines(test_qum_path)

    # Validación
    assert len(train_en) == len(train_qum), f"Train mismatch: {len(train_en)} vs {len(train_qum)}"
    assert len(test_en) == len(test_qum), f"Test mismatch: {len(test_en)} vs {len(test_qum)}"

    # Combina en datasets Hugging Face
    train_dataset = Dataset.from_dict({"input": train_en, "target": train_qum})
    test_dataset = Dataset.from_dict({"input": test_en, "target": test_qum})

    return DatasetDict({
        "train": train_dataset,
        "test": test_dataset,
    })


en_qum_dataset = generate_en_qum_dataset_from_files()

## Funciones

In [8]:
def preprocess_function(examples):
    max_length = 128

    inputs = finetune_tokenizer(
        examples["input"],
        max_length=max_length,
        truncation=True,
        padding="max_length"
    )

    targets = finetune_tokenizer(
        examples["target"],
        max_length=max_length,
        truncation=True,
        padding="max_length"
    )

    inputs["labels"] = targets["input_ids"]
    return inputs


def translate_text(model, tokenizer, text, src_lang, tgt_lang):
    tokenizer.src_lang = src_lang
    forced_bos_id = tokenizer.convert_tokens_to_ids(tgt_lang)

    # Detecta el dispositivo (GPU o CPU)
    device = next(model.parameters()).device

    # Tokeniza y mueve a la misma device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

    # Generación
    outputs = model.generate(**inputs, forced_bos_token_id=forced_bos_id)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]


def build_and_save_split_en_only(split, model, tokenizer, output_dir):
    input_path = os.path.join(output_dir, f"{split}.{lang}")
    os.makedirs(output_dir, exist_ok=True)

    with open(input_path, "w", encoding="utf-8") as f_en:
        print(f"🔄 Traduciendo y guardando split '{split}' al {lang}...\n")

        for example in tqdm(es_qum_dataset[split]):
            es_text = example["input"].replace("#qum#", "").strip()

            try:
                en_text = translate_text(model, tokenizer, es_text, "spa_Latn", f"{lang}_Latn")
                en_text = "#qum# " + en_text.strip()
            except Exception as e:
                print(f"❌ Error traduciendo: {es_text}\n{e}")
                en_text = "#qum#"

            f_en.write(en_text + "\n")

    print(f"✅ Archivo '{split}.{lang}' guardado en {output_dir}")

## Crear dataset sintético
!SOLO EJECUTAR SI NO SE TIENE EL DATASET TODAVIA

In [None]:
model_name = "facebook/nllb-200-distilled-600M"
mid_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
mid_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
mid_tokenizer.src_lang = "spa_Latn"
mid_forced_bos_id = mid_tokenizer.convert_tokens_to_ids(f"{lang}_Latn")

In [None]:
output_dir = "/content/drive/MyDrive/en_qum_dataset"

build_and_save_split_en_only("test", mid_model, mid_tokenizer, output_dir)
build_and_save_split_en_only("train", mid_model, mid_tokenizer, output_dir)

## Modelos

### 1. Modelo con Finetuning Español-Sipakapense

In [None]:
model_name = "facebook/nllb-200-distilled-600M"
finetune_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)


finetune_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Configurar LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

# Aplicar QLoRA
finetune_model = get_peft_model(finetune_model, lora_config)

In [None]:
# Añadir el token especial "#qum#" como marcador de idioma
special_tokens_dict = {"additional_special_tokens": ["#qum#"]}
finetune_tokenizer.add_special_tokens(special_tokens_dict)

# Redimensionar embeddings del modelo para incluir el nuevo token
finetune_model.resize_token_embeddings(len(finetune_tokenizer))

In [None]:
tokenized_train = es_qum_dataset["train"].map(preprocess_function, batched=True)
tokenized_test = es_qum_dataset["test"].map(preprocess_function, batched=True)

In [None]:
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir="./qlora_finetuned_qum_nllb",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=n_epochs,
    learning_rate=2e-4,
    save_strategy="epoch",
    logging_dir="./logs_qlora",
    save_total_limit=2,
    bf16=False,
    fp16=True
)

trainer = Trainer(
    model=finetune_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=finetune_tokenizer,
)

trainer.train()
finetune_model.save_pretrained("./qlora_finetuned_qum_nllb")
finetune_tokenizer.save_pretrained("./qlora_finetuned_qum_nllb")

### 2. Modelo con Finetuning Portugues-Sipakapense

In [9]:
model_name = "facebook/nllb-200-distilled-600M"
en_qum_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

base_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

en_qum_model = get_peft_model(base_model, lora_config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
# Añadir el token especial "#qum#" como marcador de idioma
special_tokens_dict = {"additional_special_tokens": ["#qum#"]}
en_qum_tokenizer.add_special_tokens(special_tokens_dict)

# Redimensionar embeddings del modelo para incluir el nuevo token
en_qum_model.resize_token_embeddings(len(en_qum_tokenizer))

M2M100ScaledWordEmbedding(256205, 1024, padding_idx=1)

In [11]:
def preprocess_function_en_qum(examples):
    max_length = 128
    inputs = en_qum_tokenizer(examples["input"], max_length=max_length, truncation=True, padding=True)
    targets = en_qum_tokenizer(examples["target"], max_length=max_length, truncation=True, padding=True)
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_en_qum_train = en_qum_dataset["train"].map(preprocess_function_en_qum, batched=True)
tokenized_en_qum_test = en_qum_dataset["test"].map(preprocess_function_en_qum, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/321 [00:00<?, ? examples/s]

In [12]:
os.environ["WANDB_DISABLED"] = "true"

training_args_en_qum = TrainingArguments(
    output_dir="./finetuned_en_qum_nllb",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=n_epochs,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    logging_dir="./logs_en_qum",
    logging_steps=100,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [13]:
trainer_en_qum = Trainer(
    model=en_qum_model,
    args=training_args_en_qum,
    train_dataset=tokenized_en_qum_train,
    eval_dataset=tokenized_en_qum_test,
    tokenizer=en_qum_tokenizer,
)

trainer_en_qum.train()

  trainer_en_qum = Trainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
100,10.4381
200,9.075
300,8.1297
400,7.2777
500,6.8176
600,6.6152
700,6.5136
800,6.4297
900,6.4002
1000,6.341




TrainOutput(global_step=1875, training_loss=6.857385416666666, metrics={'train_runtime': 720.0946, 'train_samples_per_second': 20.831, 'train_steps_per_second': 2.604, 'total_flos': 796271616000000.0, 'train_loss': 6.857385416666666, 'epoch': 15.0})

## Evaluación

### Métricas

In [14]:
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
meteor = evaluate.load("meteor")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Experimentos

In [15]:
origin_lang = "deu"

#### 1. Con Finetuning Español-Sipakapense

In [None]:
predictions_intermediate = []
references_intermediate = []

print(f"\n🔹 Traduciendo: {origin_lang} → Español → Sipakapense\n")

for item in tqdm(en_qum_dataset["test"]):
    input_text = item["input"]
    reference = item["target"]

    # Paso 1: origin_lang → español
    inter = translate_text(finetune_model, finetune_tokenizer, input_text, f"{origin_lang}_Latn", "spa_Latn")

    # Paso 2: español → "Sipakapense"
    pred = translate_text(finetune_model, finetune_tokenizer, inter, "spa_Latn", "qum")

    predictions_intermediate.append(pred)
    references_intermediate.append(reference)

In [None]:
clean_pred_intermediate = [pred.removeprefix("qum").strip() for pred in predictions_intermediate]

In [None]:
print("\n=== Ejemplos Fine-Tuned ===\n")
for i in range(5):
    print(f"> Entrada:     {en_qum_dataset['test'][i]['input']}")
    print(f"> Referencia:  {en_qum_dataset['test'][i]['target']}")
    print(f"> Predicción:  {clean_pred_intermediate[i]}\n")

In [None]:
bleu_score = bleu.compute(predictions=clean_pred_intermediate, references=[[ref] for ref in references_intermediate])

print("\n=== Métricas BLEU ===")
print(f"BLEU-1: {bleu_score['precisions'][0]*100:.2f}")
print(f"BLEU-2: {bleu_score['precisions'][1]*100:.2f}")
print(f"BLEU-3: {bleu_score['precisions'][2]*100:.2f}")
print(f"BLEU-4: {bleu_score['precisions'][3]*100:.2f}")
print(f"BLEU total: {bleu_score['bleu']*100:.2f}")

rouge_score = rouge.compute(predictions=clean_pred_intermediate, references=references_intermediate)

print("\n=== Métricas ROUGE ===")
print(f"ROUGE-1:   {rouge_score['rouge1']*100:.2f}")
print(f"ROUGE-2:   {rouge_score['rouge2']*100:.2f}")
print(f"ROUGE-L:   {rouge_score['rougeL']*100:.2f}")
print(f"ROUGE-Lsum:{rouge_score['rougeLsum']*100:.2f}")

meteor_score = meteor.compute(predictions=clean_pred_intermediate, references=references_intermediate)

print("\n=== Otras Métricas ===")
print(f"METEOR:     {meteor_score['meteor']*100:.2f}")

In [None]:
with open(f"results_{lang}_qum_finetune_intermediate.tsv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["input", "reference", "prediction"])

    for example, pred, ref in zip(es_qum_dataset["test"], predictions_intermediate, references_intermediate):
        writer.writerow([example["input"], ref, pred])

print("✅ Resultados guardados en 'results_es_qum_finetune_intermediate.tsv'")

#### 2. Con Finetuning Portugues-Sipakapense

In [16]:
finetune_model = en_qum_model
finetune_tokenizer = en_qum_tokenizer

finetune_model.save_pretrained("./qlora_finetuned_model_deu_quc_nllb")
finetune_tokenizer.save_pretrained("./qlora_finetuned_model_deu_quc_nllb")



('./qlora_finetuned_model_deu_quc_nllb/tokenizer_config.json',
 './qlora_finetuned_model_deu_quc_nllb/special_tokens_map.json',
 './qlora_finetuned_model_deu_quc_nllb/sentencepiece.bpe.model',
 './qlora_finetuned_model_deu_quc_nllb/added_tokens.json',
 './qlora_finetuned_model_deu_quc_nllb/tokenizer.json')

In [17]:
predictions_direct = []
references_direct = []

print(f"\n🔹 Traduciendo: {origin_lang} → Sipakapense\n")

for item in tqdm(en_qum_dataset["test"]):
    input_text = item["input"]
    reference = item["target"]

    pred = translate_text(finetune_model, finetune_tokenizer, input_text, f"{origin_lang}_Latn", "qum")

    predictions_direct.append(pred)
    references_direct.append(reference)


🔹 Traduciendo: deu → Sipakapense



100%|██████████| 321/321 [08:39<00:00,  1.62s/it]


In [18]:
clean_pred_direct = [pred.removeprefix("qum").strip() for pred in predictions_direct]

In [19]:
print("\n=== Ejemplos Fine-Tuned ===\n")
for i in range(5):
    print(f"> Entrada:     {en_qum_dataset['test'][i]['input']}")
    print(f"> Referencia:  {en_qum_dataset['test'][i]['target']}")
    print(f"> Predicción:  {clean_pred_direct[i]}\n")


=== Ejemplos Fine-Tuned ===

> Entrada:     #qum# Don Jesus' Frau benutzt nur gutes Holz.
> Referencia:  wu rxqiil taʼ xuux xaq utz laj siʼ kchuknik chre
> Predicción:  , and the other side is a right-hander than a right-hander.

> Entrada:     #qum# Die ganze Pflaume ist weg.
> Referencia:  Njeel ri xkarawaʼn xpochʼik rech chenim xchaqʼjik
> Predicción:  's all that is left of the earth.

> Entrada:     #qum# In unserem Dorf wird unsere Sprache nicht mehr gesprochen
> Referencia:  Pri qtinmit qal chik kchkunsxik ri qyolbʼaal
> Predicción:  in our village our language is no longer spoken

> Entrada:     #qum# Der Schüler lernt, wie
> Referencia:  Ri ajtijnel tjin kirtijuuj c hemo rchkunsxik ri suʼ
> Predicción:  ate, and the student learns how to

> Entrada:     #qum# Die Frau war von ihrem Mann umarmt.
> Referencia:  Ri ixoq tzʼulmaj rum ri rchjiil
> Predicción:  , the woman was in the arms of her husband.



In [21]:
bleu_score = bleu.compute(predictions=clean_pred_direct, references=[[ref] for ref in references_direct])

print("\n=== Métricas BLEU ===")
print(f"BLEU-1: {bleu_score['precisions'][0]*100:.2f}")
print(f"BLEU-2: {bleu_score['precisions'][1]*100:.2f}")
print(f"BLEU-3: {bleu_score['precisions'][2]*100:.2f}")
print(f"BLEU-4: {bleu_score['precisions'][3]*100:.2f}")
print(f"BLEU total: {bleu_score['bleu']*100:.2f}")

rouge_score = rouge.compute(predictions=clean_pred_direct, references=references_direct)

print("\n=== Métricas ROUGE ===")
print(f"ROUGE-1:   {rouge_score['rouge1']*100:.2f}")
print(f"ROUGE-2:   {rouge_score['rouge2']*100:.2f}")
print(f"ROUGE-L:   {rouge_score['rougeL']*100:.2f}")
print(f"ROUGE-Lsum:{rouge_score['rougeLsum']*100:.2f}")

meteor_score = meteor.compute(predictions=clean_pred_direct, references=references_direct)

print("\n=== Otras Métricas ===")
print(f"METEOR:     {meteor_score['meteor']*100:.2f}")


=== Métricas BLEU ===
BLEU-1: 0.00
BLEU-2: 0.00
BLEU-3: 0.00
BLEU-4: 0.00
BLEU total: 0.00

=== Métricas ROUGE ===
ROUGE-1:   0.34
ROUGE-2:   0.00
ROUGE-L:   0.33
ROUGE-Lsum:0.34

=== Otras Métricas ===
METEOR:     0.02


In [22]:
filename = f"results_{lang}_qum_finetune_direct.tsv"
with open(filename, "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["input", "reference", "prediction"])

    for example, pred, ref in zip(es_qum_dataset["test"], predictions_direct, references_direct):
        writer.writerow([example["input"], ref, pred])

print(f"✅ Resultados guardados en '{filename}'")

✅ Resultados guardados en 'results_de_qum_finetune_direct.tsv'
