# mT5 Model

This section contains experiments and implementations using the **mT5** model.


In [None]:
!pip -q install pandas pyarrow


In [None]:
from datasets import load_dataset

ds = load_dataset("opus_books", "en-es", split="train")

print(ds)
print(ds[0])


en-es/train-00000-of-00001.parquet:   0%|          | 0.00/16.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/93470 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'translation'],
    num_rows: 93470
})
{'id': '0', 'translation': {'en': 'Source: Project GutenbergAudiobook available here', 'es': 'Source: Wikisource & librodot.com'}}


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")


tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
max_input_length = 128
max_target_length = 128

def preprocess(batch):
    # Spanish -> input
    inputs = [ex["es"] for ex in batch["translation"]]
    # English -> target
    targets = [ex["en"] for ex in batch["translation"]]

    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True,
        padding="max_length"
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=max_target_length,
            truncation=True,
            padding="max_length"
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [None]:
small_ds = ds.select(range(2000))

tokenized_ds = small_ds.map(
    preprocess,
    batched=True,
    remove_columns=small_ds.column_names
)

print(tokenized_ds)


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]



Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 2000
})


In [None]:
tokenized_ds[0]


{'input_ids': [18510,
  267,
  20100,
  15785,
  549,
  11393,
  20038,
  260,
  284,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


In [None]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model
)


In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./mt5-es-en",
    eval_strategy="no",          # <-- DEĞİŞTİ
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    fp16=True,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    report_to="none"
)


In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    tokenizer=tokenizer,
    data_collator=data_collator
)


  trainer = Trainer(


In [None]:
trainer.train()


Step,Training Loss
50,0.0
100,0.0


TrainOutput(global_step=125, training_loss=0.0, metrics={'train_runtime': 125.2234, 'train_samples_per_second': 15.971, 'train_steps_per_second': 0.998, 'total_flos': 264374845440000.0, 'train_loss': 0.0, 'epoch': 1.0})

In [None]:
max_input_length = 128
max_target_length = 128

def preprocess(batch):
    # PREFIX EKLENİYOR 🔥
    inputs = [
        "translate Spanish to English: " + ex["es"]
        for ex in batch["translation"]
    ]
    targets = [ex["en"] for ex in batch["translation"]]

    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True,
        padding="max_length"
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=max_target_length,
            truncation=True,
            padding="max_length"
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [None]:
small_ds = ds.select(range(2000))

tokenized_ds = small_ds.map(
    preprocess,
    batched=True,
    remove_columns=small_ds.column_names
)

tokenized_ds[0]


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]



{'input_ids': [37194,
  259,
  29037,
  288,
  5413,
  267,
  18510,
  267,
  20100,
  15785,
  549,
  11393,
  20038,
  260,
  284,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0

In [None]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()


  trainer = Trainer(


Step,Training Loss
50,0.0
100,0.0


TrainOutput(global_step=125, training_loss=0.0, metrics={'train_runtime': 190.3438, 'train_samples_per_second': 10.507, 'train_steps_per_second': 0.657, 'total_flos': 264374845440000.0, 'train_loss': 0.0, 'epoch': 1.0})

In [None]:
def translate_es_to_en(text):
    prefixed_text = "translate Spanish to English: " + text

    inputs = tokenizer(
        prefixed_text,
        return_tensors="pt",
        truncation=True,
        max_length=128
    ).to(model.device)

    outputs = model.generate(
        **inputs,
        max_length=128,
        num_beams=4
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
print(translate_es_to_en("Este proyecto es muy interesante y educativo."))


<0x03>


In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")

max_input_length = 128
max_target_length = 128

def preprocess(batch):
    # Spanish -> input (prefix ile)
    inputs = ["translate Spanish to English: " + ex["es"] for ex in batch["translation"]]
    targets = [ex["en"] for ex in batch["translation"]]

    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True
        # padding YOK! (dinamik padding'i data collator yapacak)
    )

    labels = tokenizer(
        targets,
        max_length=max_target_length,
        truncation=True
        # padding YOK!
    )["input_ids"]

    # pad tokenlarını loss'tan çıkar: pad_token_id -> -100
    pad = tokenizer.pad_token_id
    labels = [[(tok if tok != pad else -100) for tok in seq] for seq in labels]

    model_inputs["labels"] = labels
    return model_inputs




In [None]:
small_ds = ds.select(range(4000))  # 2000 de olur, biraz artırdım

tokenized_ds = small_ds.map(
    preprocess,
    batched=True,
    remove_columns=small_ds.column_names
)

print(tokenized_ds[0].keys())


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

dict_keys(['input_ids', 'attention_mask', 'labels'])


In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Trainer

model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()


  trainer = Trainer(


Step,Training Loss
50,0.0
100,0.0
150,0.0
200,0.0
250,0.0


TrainOutput(global_step=250, training_loss=0.0, metrics={'train_runtime': 280.3474, 'train_samples_per_second': 14.268, 'train_steps_per_second': 0.892, 'total_flos': 352704959815680.0, 'train_loss': 0.0, 'epoch': 1.0})

In [None]:
def translate_es_to_en(text):
    prefixed = "translate Spanish to English: " + text
    inputs = tokenizer(prefixed, return_tensors="pt", truncation=True, max_length=128).to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=64, num_beams=4)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

print(translate_es_to_en("Este proyecto es muy interesante y educativo."))


<0x03>


In [None]:
print(ds[200]["translation"]["es"])
print(ds[200]["translation"]["en"])


Es evidente, a pesar de su asidua atención cuando ella dibuja, que de hecho no sabe nada en esta materia.
It is evident, in spite of his frequent attention to her while she draws, that in fact he knows nothing of the matter.


In [None]:
test_es = ds[200]["translation"]["es"]
print("Spanish:", test_es)
print("English (model):", translate_es_to_en(test_es))
print("English (gold):", ds[200]["translation"]["en"])


Spanish: Es evidente, a pesar de su asidua atención cuando ella dibuja, que de hecho no sabe nada en esta materia.
English (model): <0x03>
English (gold): It is evident, in spite of his frequent attention to her while she draws, that in fact he knows nothing of the matter.


In [None]:
model.config.decoder_start_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id


In [None]:
def translate_es_to_en(text):
    prefixed = "translate Spanish to English: " + text

    inputs = tokenizer(
        prefixed,
        return_tensors="pt",
        truncation=True,
        max_length=128
    ).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=64,
        num_beams=4,
        early_stopping=True,
        do_sample=False
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
print("Spanish:", ds[200]["translation"]["es"])
print("English (model):", translate_es_to_en(ds[200]["translation"]["es"]))
print("English (gold):", ds[200]["translation"]["en"])


Spanish: Es evidente, a pesar de su asidua atención cuando ella dibuja, que de hecho no sabe nada en esta materia.
English (model): <0x03>
English (gold): It is evident, in spite of his frequent attention to her while she draws, that in fact he knows nothing of the matter.


In [None]:
print(translate_es_to_en("Este proyecto es muy interesante y educativo."))


<0x03>


# FLAN-T5 Model

This section contains experiments and implementations using the **FLAN-T5** model.

In [45]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "google/flan-t5-base"  # RAM zorlanırsa flan-t5-small
flan_tokenizer = AutoTokenizer.from_pretrained(model_name)
flan_model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

def flan_translate_es_en(text):
    prompt = f"Translate Spanish to English: {text}"
    inputs = flan_tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=256
    ).to(flan_model.device)

    outputs = flan_model.generate(
        **inputs,
        max_new_tokens=80,
        num_beams=4
    )
    return flan_tokenizer.decode(outputs[0], skip_special_tokens=True)


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [46]:
print("FLAN:", flan_translate_es_en("Este proyecto es muy interesante y educativo."))
print("FLAN (dataset):", flan_translate_es_en(ds[200]["translation"]["es"]))


FLAN: This project is very interesting and educational.
FLAN (dataset): It is obvious, despite her attention to detail when she sketches, that she in fact does not know anything about this subject.


In [50]:
from datasets import DatasetDict

splits = ds.train_test_split(test_size=0.1, seed=42)
train_ds = splits["train"]
test_ds  = splits["test"]

print(train_ds, test_ds)


Dataset({
    features: ['id', 'translation'],
    num_rows: 84123
}) Dataset({
    features: ['id', 'translation'],
    num_rows: 9347
})


In [51]:
N = 200
test_sample = test_ds.select(range(N))

print("Test sample size:", len(test_sample))


Test sample size: 200


In [52]:
!pip -q install evaluate sacrebleu

import evaluate
bleu = evaluate.load("sacrebleu")


In [53]:
preds = []
refs = []

for ex in test_sample:
    src = ex["translation"]["es"]
    ref = ex["translation"]["en"]

    pred = flan_translate_es_en(src)
    preds.append(pred)
    refs.append([ref])   # sacrebleu formatı

result = bleu.compute(predictions=preds, references=refs)
print("FLAN-T5 BLEU:", result["score"])


FLAN-T5 BLEU: 9.584217024662964


In [54]:
import evaluate
chrf = evaluate.load("chrf")

result_chrf = chrf.compute(predictions=preds, references=refs)
print("FLAN-T5 chrF:", result_chrf["score"])


Downloading builder script: 0.00B [00:00, ?B/s]

FLAN-T5 chrF: 34.019770630584205


In [55]:
import pandas as pd

rows = []
for i in range(5):
    ex = test_sample[i]
    src = ex["translation"]["es"]
    ref = ex["translation"]["en"]
    flan_out = flan_translate_es_en(src)

    rows.append({
        "Spanish": src,
        "FLAN-T5 Output": flan_out,
        "Reference (EN)": ref,
        "Meaning Preserved?": "Yes" if flan_out else "—"
    })

df = pd.DataFrame(rows)
df


Unnamed: 0,Spanish,FLAN-T5 Output,Reference (EN),Meaning Preserved?
0,"Kitty, al contrario, estaba más activa a inclu...","Kitty, on the other hand, was more active and ...","Kitty, on the contrary, was more active than u...",Yes
1,Se darán las órdenes necesarias para su regreso.,You will be given the orders needed for your r...,All necessary preparations shall be made for y...,Yes
2,¡Demasiado larga ha sido ya la pausa! ¡Adelante!,Very long has already been the pain!,"Maintenant partons, allons, allons vers Sion.",Yes
3,Jueves 20 de agosto.,Thursday 20 August.,"Thursday, Aug.",Yes
4,Richelieu»,Richelieu»,"""Richelieu""",Yes
