In [23]:
!pip install datasets
!pip install evaluate
!pip install accelerate -U
!pip install sacrebleu

NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968

In [None]:
pip install transformers -U


In [24]:
from datasets import load_dataset

dataset = load_dataset("mt_eng_vietnamese", "iwslt2015-en-vi")

In [35]:
dataset['train'][1]

{'translation': {'en': 'In 4 minutes , atmospheric chemist Rachel Pike provides a glimpse of the massive scientific effort behind the bold headlines on climate change , with her team -- one of thousands who contributed -- taking a risky flight over the rainforest in pursuit of data on a key molecule .',
  'vi': 'Trong 4 phút , chuyên gia hoá học khí quyển Rachel Pike giới thiệu sơ lược về những nỗ lực khoa học miệt mài đằng sau những tiêu đề táo bạo về biến đổi khí hậu , cùng với đoàn nghiên cứu của mình -- hàng ngàn người đã cống hiến cho dự án này -- một chuyến bay mạo hiểm qua rừng già để tìm kiếm thông tin về một phân tử then chốt .'}}

In [36]:
dataset['test'][1]

{'translation': {'en': 'And I was very proud .',
  'vi': 'Tôi đã rất tự hào về đất nước tôi .'}}

In [37]:
from transformers import AutoTokenizer

checkpoint = "/content/drive/MyDrive/translate_machine/last_15epoch"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [38]:
source_lang = "en"
target_lang = "vi"
prefix = "translate English to Vietnamese: "


def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

In [39]:
tokenized_books = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/133318 [00:00<?, ? examples/s]

Map:   0%|          | 0/1269 [00:00<?, ? examples/s]

Map:   0%|          | 0/1269 [00:00<?, ? examples/s]

In [40]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [41]:
import evaluate

metric = evaluate.load("sacrebleu")

In [42]:
import numpy as np


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [43]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [44]:
model = model.to('cuda')

In [45]:
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/translate_machine/weights",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
    logging_steps=1000,
    warmup_steps=10,

)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_books["train"],
    eval_dataset=tokenized_books["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.5599,1.400907,0.2547,18.7486
2,1.4977,1.346427,0.3728,18.7675
3,1.465,1.317261,0.441,18.7533
4,1.4493,1.308039,0.454,18.7533




TrainOutput(global_step=8336, training_loss=1.5026135545316905, metrics={'train_runtime': 3511.3992, 'train_samples_per_second': 151.869, 'train_steps_per_second': 2.374, 'total_flos': 1.431530059628544e+16, 'train_loss': 1.5026135545316905, 'epoch': 4.0})

In [46]:
trainer.save_model('/content/drive/MyDrive/translate_machine/last_8_epoch')


In [2]:
text = "i like to play video game online. In evening i usually go outside with my friend to the coffe shop"

In [3]:
from transformers import pipeline

translator = pipeline("translation", model='./last_8_epoch' )
translator(text)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[{'translation_text': 'tôi mt video game online tr vào nhng tôi gii hi vi bn ca tôi vào các ca tôi .'}]