# Import libs

In [3]:
from tokenizers import Tokenizer, pre_tokenizers, trainers, models
from datasets import load_dataset

ds = load_dataset("ncduy/mt-en-vi")

In [144]:
ds

DatasetDict({
    train: Dataset({
        features: ['en', 'vi', 'source'],
        num_rows: 2884451
    })
    validation: Dataset({
        features: ['en', 'vi', 'source'],
        num_rows: 11316
    })
    test: Dataset({
        features: ['en', 'vi', 'source'],
        num_rows: 11225
    })
})

In [2]:
ds.remove_columns(["source"])

DatasetDict({
    train: Dataset({
        features: ['en', 'vi'],
        num_rows: 2884451
    })
    validation: Dataset({
        features: ['en', 'vi'],
        num_rows: 11316
    })
    test: Dataset({
        features: ['en', 'vi'],
        num_rows: 11225
    })
})

# Pretrained Model

## BART

In [None]:
from transformers import AutoTokenizer

MAX_LEN = 50 

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")

def preprocess_function(examples):
    src_texts = examples["en"]
    tgt_texts = examples["vi"]
    src_encodings = tokenizer(src_texts, padding="max_length", truncation=True, max_length=MAX_LEN)
    tgt_encodings = tokenizer(tgt_texts, padding="max_length", truncation=True, max_length=MAX_LEN)
    return {
        "input_ids": src_encodings["input_ids"],
        "labels": tgt_encodings["input_ids"],
    }

preprocessed_ds = ds.map(preprocess_function, batched=True)

## mBART

In [4]:
from transformers import MBart50TokenizerFast

MAX_LEN = 50  

tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="vi_VN")

def preprocess_function(examples):
    src_texts = examples["en"]
    tgt_texts = examples["vi"]
    src_encodings = tokenizer(src_texts, padding="max_length", truncation=True, max_length=MAX_LEN)
    with tokenizer.as_target_tokenizer():
        tgt_encodings = tokenizer(tgt_texts, padding="max_length", truncation=True, max_length=MAX_LEN)
    return {
        "input_ids": src_encodings["input_ids"],
        "labels": tgt_encodings["input_ids"],
    }


preprocessed_ds = ds.map(preprocess_function, batched=True)



In [5]:
tokenizer.convert_ids_to_tokens(250004)

'en_XX'

In [None]:
preprocessed_ds['train'][20]['input_ids']

In [8]:
from transformers import MBartForConditionalGeneration

model_mbart = MBartForConditionalGeneration.from_pretrained('facebook/mbart-large-50')

# Testing

In [9]:
import torch
input_ids = torch.tensor([preprocessed_ds["train"][10]["input_ids"]])
labels = torch.tensor([preprocessed_ds["train"][10]["labels"]])
pred = model_mbart(input_ids=input_ids, labels=labels)

In [12]:
pred['logits'].shape

torch.Size([1, 50, 250054])

# Trainer

In [None]:
# Disable wandb
import os

os.environ["WANDB_DISABLED"] = "true"
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Training
training_args = Seq2SeqTrainingArguments(
    output_dir="./mbart-50-en-vi",
    logging_dir="logs",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    learning_rate=2e-05,
    save_total_limit=1,
    predict_with_generate=True,
    fp16=True,
    lr_scheduler_type="cosine",
    weight_decay=0.05,
    gradient_checkpointing=True,
    gradient_accumulation_steps=2,
    #report_to="wandb",
)


In [9]:
trainer = Seq2SeqTrainer(
    model=model_mbart,
    args=training_args,
    train_dataset=preprocessed_ds["train"],
    eval_dataset=preprocessed_ds["validation"],
    tokenizer=tokenizer,
)
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/90139 [00:00<?, ?it/s]

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


{'loss': 0.6184, 'grad_norm': 1.205190658569336, 'learning_rate': 6.219338429813704e-12, 'epoch': 1.0}


  0%|          | 0/708 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


{'eval_loss': 0.4920928478240967, 'eval_runtime': 27.8232, 'eval_samples_per_second': 406.711, 'eval_steps_per_second': 25.446, 'epoch': 1.0}
{'train_runtime': 39861.936, 'train_samples_per_second': 72.361, 'train_steps_per_second': 2.261, 'train_loss': 0.6184480098098493, 'epoch': 1.0}


TrainOutput(global_step=90139, training_loss=0.6184480098098493, metrics={'train_runtime': 39861.936, 'train_samples_per_second': 72.361, 'train_steps_per_second': 2.261, 'total_flos': 3.052232971124736e+17, 'train_loss': 0.6184480098098493, 'epoch': 0.999994453042229})

In [11]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [43]:
input=tokenizer("I am going to bed", return_tensors="pt")['input_ids'].to(model_mbart.device)

In [44]:
output=model_mbart.generate(input, max_length=50, num_beams=5, early_stopping=True, temperature=1.0, do_sample=True, forced_bos_token_id=tokenizer.lang_code_to_id["vi_VN"])

In [51]:
tokenizer.convert_ids_to_tokens(4724)

'▁đang'

In [46]:
output

tensor([[     2, 250024,    384,  38068,   4724,   2467,  27421,   4600,      5,
              2]], device='cuda:0')

In [49]:
tokenizer.decode(output[0], skip_special_tokens=True)

'Tớ đang đi ngủ đây.'

In [20]:
trainer.push_to_hub(commit_message="mBART-50 EN-VI") 

Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/binhphap5/mbart-50-en-vi/commit/41bc2dee30026ba8283440d56276a8f174ce3b0c', commit_message='mBART-50 EN-VI', commit_description='', oid='41bc2dee30026ba8283440d56276a8f174ce3b0c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/binhphap5/mbart-50-en-vi', endpoint='https://huggingface.co', repo_type='model', repo_id='binhphap5/mbart-50-en-vi'), pr_revision=None, pr_num=None)

In [5]:
torch.cuda.empty_cache()

In [22]:
trainer.evaluate(preprocessed_ds["test"])

  0%|          | 0/702 [00:00<?, ?it/s]

{'eval_loss': 0.49249181151390076,
 'eval_runtime': 28.4953,
 'eval_samples_per_second': 393.924,
 'eval_steps_per_second': 24.636,
 'epoch': 0.999994453042229}

In [1]:
from transformers import pipeline
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
translator = pipeline(
    model="binhphap5/mbart-50-en-vi",
    device=device,
)
translator(
    "A cat is going to the moon with the astronauts",
    max_length=50,
    num_beams=5,
    early_stopping=True,
    temperature=1,
    do_sample=True,
)[0]["generated_text"]

'Một con mèo sẽ lên mặt trăng cùng các phi hành gia'

# BLEU Score

In [7]:
import sacrebleu

def compute_bleu_score_pipeline_sacrebleu(translator, test_dataset, batch_size=32):
    """
    Tính BLEU score sử dụng sacrebleu cho pipeline HuggingFace.
    
    Args:
        translator: pipeline dịch máy HuggingFace
        test_dataset: dict với khóa 'en' và 'vi'
        batch_size: số câu xử lý mỗi lần
    
    Returns:
        BLEU score float
    """
    src_sentences = test_dataset['en']
    tgt_sentences = test_dataset['vi']

    predictions = []
    n = len(src_sentences)
    for i in range(0, n, batch_size):
        batch = src_sentences[i:i+batch_size]
        results = translator(
            batch,
            max_length=50,
            num_beams=5,
            early_stopping=True,
            temperature=1,
            do_sample=True,
        )
        preds = [res["generated_text"] for res in results]
        predictions.extend(preds)
        print(f"Processed {min(i+batch_size, n)} / {n}")

    # Tính BLEU bằng sacrebleu
    bleu = sacrebleu.corpus_bleu(predictions, [tgt_sentences])
    return bleu.score

bleu_score = compute_bleu_score_pipeline_sacrebleu(translator, preprocessed_ds['test'], batch_size=32)
# free up VRAM
torch.cuda.empty_cache()
print(f"\nBLEU score (sử dụng sacrebleu): {bleu_score:.2f}")

Processed 32 / 11225
Processed 64 / 11225
Processed 96 / 11225
Processed 128 / 11225


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Processed 160 / 11225
Processed 192 / 11225
Processed 224 / 11225
Processed 256 / 11225
Processed 288 / 11225
Processed 320 / 11225
Processed 352 / 11225
Processed 384 / 11225
Processed 416 / 11225
Processed 448 / 11225
Processed 480 / 11225
Processed 512 / 11225
Processed 544 / 11225
Processed 576 / 11225
Processed 608 / 11225
Processed 640 / 11225
Processed 672 / 11225
Processed 704 / 11225
Processed 736 / 11225
Processed 768 / 11225
Processed 800 / 11225
Processed 832 / 11225
Processed 864 / 11225
Processed 896 / 11225
Processed 928 / 11225
Processed 960 / 11225
Processed 992 / 11225
Processed 1024 / 11225
Processed 1056 / 11225
Processed 1088 / 11225
Processed 1120 / 11225
Processed 1152 / 11225
Processed 1184 / 11225
Processed 1216 / 11225
Processed 1248 / 11225
Processed 1280 / 11225
Processed 1312 / 11225
Processed 1344 / 11225
Processed 1376 / 11225
Processed 1408 / 11225
Processed 1440 / 11225
Processed 1472 / 11225
Processed 1504 / 11225
Processed 1536 / 11225
Processed 1568 