In [1]:
!pip install transformers datasets evaluate sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 KB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting tabulate>=0.8.9
  Using cached tabulate-0.9.0-py3-none-any.whl (35 kB)
Collecting lxml
  Downloading lxml-6.0.0-cp310-cp310-macosx_10_9_universal2.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Collecting portalocker
  Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: tabulate, portalocker, lxml, colorama, sacrebleu
Successfully installed colorama-0.4.6 lxml-6.0.0 portalocker-3.2.0 sacrebleu-2.5.1 tabulate-0.9.0
You should consider upgrading via the '/Users/bagchi/projects/transformers/.venv/bin/python3 -m pip install --upgrade pip' command.[0m[33m
[0m

In [6]:
from datasets import load_dataset

# Load the "en-es" configuration of the "opus_books" dataset.
books = load_dataset("opus_books", "en-es")

Generating train split: 100%|██████████| 93470/93470 [00:00<00:00, 1855185.90 examples/s]


In [10]:
# Inspect one raw record: an id plus a "translation" dict keyed by language code.
books['train'][20]

{'id': '20',
 'translation': {'en': 'He meant not to be unkind, however, and, as a mark of his affection for the three girls, he left them a thousand pounds a-piece.',
  'es': 'No era su intención, sin embargo, faltar a la bondad, y como señal de su afecto por las tres niñas le dejó mil libras a cada una.'}}

In [11]:
# Hold out 20% of the examples for evaluation. The fixed seed makes the split
# (and therefore the reported metrics) reproducible across kernel restarts.
# Note: this cell rebinds `books`; re-running it without reloading the dataset
# splits the already-reduced train portion again.
books = books["train"].train_test_split(test_size=0.2, seed=42)

In [12]:
from transformers import AutoTokenizer

# Checkpoint identifier shared by the tokenizer here and the model loaded later.
checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [14]:
# Keys into each example's "translation" dict, and the task prefix that
# preprocess_function prepends to every source sentence.
source_lang = "en"
target_lang = "es"
prefix = "translate English to Spanish: "


def preprocess_function(examples):
    """Tokenize a batch of translation pairs.

    Every entry of ``examples["translation"]`` is indexed with the
    module-level ``source_lang`` and ``target_lang`` keys. The source text
    gets the module-level ``prefix`` prepended and the target text is handed
    to the tokenizer as ``text_target``. The tokenizer is called with
    ``max_length=128`` and ``truncation=True``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    pairs = examples["translation"]

    prompts = []
    for pair in pairs:
        prompts.append(prefix + pair[source_lang])

    references = []
    for pair in pairs:
        references.append(pair[target_lang])

    return tokenizer(prompts, text_target=references, max_length=128, truncation=True)

# Run the preprocessing over `books`, passing batches of examples per call.
tokenized_books = books.map(preprocess_function, batched=True)

Map: 100%|██████████| 74776/74776 [00:04<00:00, 15955.97 examples/s]
Map: 100%|██████████| 18694/18694 [00:01<00:00, 16399.06 examples/s]


In [16]:
from transformers import DataCollatorForSeq2Seq

# Collator handed to the trainer to assemble batches.
# NOTE(review): `model` receives the checkpoint name (a str), not a loaded
# model object; the model itself is only created in a later cell. Confirm that
# DataCollatorForSeq2Seq is meant to be given a string here.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [17]:
import evaluate

# SacreBLEU metric object; compute_metrics reads the "score" field of its result.
metric = evaluate.load("sacrebleu")

Downloading builder script: 8.15kB [00:00, 6.14MB/s]


In [18]:
import numpy as np


def postprocess_text(preds, labels):
    """Strip surrounding whitespace and put the labels in reference-list form.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple where ``preds`` is a list of stripped
        strings and ``labels`` is a list of one-element lists, each holding
        the stripped reference for the prediction at the same position.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        # One reference per prediction, wrapped in its own list.
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Compute SacreBLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds: a ``(preds, labels)`` pair of token-id arrays. ``preds``
            may itself be a tuple, in which case its first element is used.

    Returns:
        dict with ``"bleu"`` (the metric's ``"score"``) and ``"gen_len"``
        (mean count of non-pad tokens per prediction), rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a valid token id and cannot be decoded. It marks ignored
    # label positions, and it can also appear in the predictions when
    # generated batches of different lengths are padded together, so both
    # arrays get the placeholder swapped for the real pad token id.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Length is measured in non-pad tokens; the -100 placeholders were mapped
    # to the pad id above, so they are not counted as generated tokens.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [19]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the seq2seq model from the same checkpoint that the tokenizer was built from.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
# Training configuration: evaluate once per epoch, generate during evaluation
# (predict_with_generate) so compute_metrics receives generated token ids, and
# keep everything local (push_to_hub=False).
training_args = Seq2SeqTrainingArguments(
    output_dir="./models/my_awesome_opus_books_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    # fp16=True,  # left disabled: fp16 mixed precision raised a ValueError on the 'mps' device; use bf16=True for XPU
    push_to_hub=False,
)

# Wire the model, training arguments, tokenized splits, tokenizer, collator and
# metric callback into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_books["train"],
    eval_dataset=tokenized_books["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

train_result = trainer.train()

# Write the final model (and the tokenizer passed as processing_class) to
# training_args.output_dir so it can be loaded from that directory afterwards.
trainer.save_model()

# Keep the training summary as the cell's displayed value.
train_result

ValueError: fp16 mixed precision requires a GPU (not 'mps').

In [None]:
from transformers import pipeline

# The model path must match `output_dir` from the training arguments.
translator = pipeline("translation_en_to_es", model="./models/my_awesome_opus_books_model")

In [None]:
# The input carries the same task prefix that was prepended during preprocessing.
text = "translate English to Spanish: Legumes share resources with nitrogen-fixing bacteria."
translator(text)