## Loading the Data

In [1]:
from datasets import load_dataset

dataset = load_dataset('openpecha/cleaned_MT_v1.0.3')

Using the latest cached version of the dataset since openpecha/cleaned_MT_v1.0.3 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/j/.cache/huggingface/datasets/openpecha___cleaned_mt_v1.0.3/default/0.0.0/e3a09af43e787a278f3affb4112319edf75bfdfc (last modified on Mon Dec  9 21:54:04 2024).


In [2]:
dataset['train'][0]

{'Source': 'ཐུབ་པས་རྟག་ཏུ་དེ་བཞིན་སྤྱད།།',
 'Target': 'The aspirant should move in such a way at all times.',
 'File_Name': 'TM2382',
 'Machine Aligned': True,
 '__index_level_0__': 0}

In [3]:
dataset['test'][0]

{'Source': 'ཚད་མེད་བཏང་སྙོམས་གསུམ་ལས།',
 'Target': '3. Immeasureable equanimity ',
 'File_Name': 'TM2203',
 'Machine Aligned': True,
 '__index_level_0__': 0}

## Load Tokenizer, Model, and Data Collator

In [4]:
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM

checkpoint = "billingsmoore/tibetan-to-english-translation"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, device_map="cuda:0")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [5]:
# Generate a list of all Tibetan Unicode characters (U+0F00 to U+0FFF)
tibetan_chars = [chr(codepoint) for codepoint in range(0x0F00, 0x0FFF)]

# Add the Tibetan characters to the tokenizer's vocabulary
new_tokens = [char for char in tibetan_chars if char not in tokenizer.get_vocab()]

In [6]:
len(new_tokens)

162

In [7]:
# Add new tokens to the tokenizer
tokenizer.add_tokens(new_tokens)

# Resize model embeddings to accommodate the new vocabulary size
model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(32407, 1024)

In [8]:
enc = tokenizer.encode(dataset['train'][0]['Source'])
dec = tokenizer.decode(enc)
dec

'ཐུབ་པས་རྟག་ཏུ་དེ་བཞིན་སྤྱད།།</s>'

## Preprocess Data

The dataset can now be tokenized for training.

In [9]:
source_lang = 'Source'
target_lang = 'Target'

def preprocess_function(examples):

    inputs = [example for example in examples[source_lang]]
    targets = [example for example in examples[target_lang]]
    
    model_inputs = tokenizer(inputs, text_target=targets, max_length=256, truncation=True, padding="max_length")

    return model_inputs

In [10]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1429192 [00:00<?, ? examples/s]

Map:   0%|          | 0/9066 [00:00<?, ? examples/s]

## Define Metric

In [11]:
import numpy as np

import evaluate

metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

## Train the Model

Finally, we can train the model. Note that the optimizer used is Adafactor. This is the optimizer that is preferred for translation tasks and for the T5 model in general. The transformers api includes a built in version of Adafactor, but I define it separately here so that we can optimize it with the 'accelerate' library.

In [12]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, Adafactor
from accelerate import Accelerator

accelerator = Accelerator()

optimizer = Adafactor(
    model.parameters(), 
    scale_parameter=True, 
    relative_step=False, 
    warmup_init=False, 
    lr=3e-4
)

model, optimizer = accelerator.prepare(model, optimizer)

In [13]:
training_args = Seq2SeqTrainingArguments(
    output_dir=f"op-poc",
    auto_find_batch_size=True,
    predict_with_generate=True,
    fp16=False, #check this
    push_to_hub=False,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    num_train_epochs=3
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    optimizers=(optimizer, None),
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Seq2SeqTrainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112462322221796, max=1.0…

  0%|          | 0/535947 [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


  0%|          | 0/1071894 [00:00<?, ?it/s]

{'loss': 1.0866, 'grad_norm': 0.23311997950077057, 'learning_rate': 0.00029986006078959297, 'epoch': 0.0}
{'loss': 0.929, 'grad_norm': 0.3048689663410187, 'learning_rate': 0.00029972012157918597, 'epoch': 0.0}
{'loss': 0.9101, 'grad_norm': 0.39472708106040955, 'learning_rate': 0.00029958018236877896, 'epoch': 0.0}
{'loss': 0.9577, 'grad_norm': 0.5777726173400879, 'learning_rate': 0.00029944024315837196, 'epoch': 0.01}
{'loss': 0.8766, 'grad_norm': 0.6334481835365295, 'learning_rate': 0.000299300303947965, 'epoch': 0.01}
{'loss': 0.8911, 'grad_norm': 0.5862141251564026, 'learning_rate': 0.000299160364737558, 'epoch': 0.01}
{'loss': 0.874, 'grad_norm': 0.34396687150001526, 'learning_rate': 0.00029902042552715094, 'epoch': 0.01}
{'loss': 0.8883, 'grad_norm': 1.5457463264465332, 'learning_rate': 0.000298880486316744, 'epoch': 0.01}
{'loss': 0.8521, 'grad_norm': 0.6387004852294922, 'learning_rate': 0.000298740547106337, 'epoch': 0.01}
{'loss': 0.8359, 'grad_norm': 0.25920066237449646, 'lear

  0%|          | 0/1134 [00:00<?, ?it/s]

{'eval_loss': 0.2127106934785843, 'eval_bleu': 6.6205, 'eval_gen_len': 17.2423, 'eval_runtime': 1300.9022, 'eval_samples_per_second': 6.969, 'eval_steps_per_second': 0.872, 'epoch': 1.0}
{'loss': 0.5349, 'grad_norm': 0.16815313696861267, 'learning_rate': 0.00019994346455899558, 'epoch': 1.0}
{'loss': 0.4922, 'grad_norm': 0.3978942036628723, 'learning_rate': 0.00019980352534858855, 'epoch': 1.0}
{'loss': 0.488, 'grad_norm': 0.44972002506256104, 'learning_rate': 0.00019966358613818155, 'epoch': 1.0}
{'loss': 0.5184, 'grad_norm': 7.861936092376709, 'learning_rate': 0.00019952364692777457, 'epoch': 1.0}
{'loss': 0.498, 'grad_norm': 0.2684919536113739, 'learning_rate': 0.00019938370771736756, 'epoch': 1.01}
{'loss': 0.5357, 'grad_norm': 0.529466450214386, 'learning_rate': 0.00019924376850696053, 'epoch': 1.01}
{'loss': 0.4922, 'grad_norm': 0.786566972732544, 'learning_rate': 0.00019910382929655356, 'epoch': 1.01}
{'loss': 0.529, 'grad_norm': 0.28487923741340637, 'learning_rate': 0.000198963

  0%|          | 0/1134 [00:00<?, ?it/s]

{'eval_loss': 0.1998659074306488, 'eval_bleu': 7.4168, 'eval_gen_len': 17.2781, 'eval_runtime': 1301.0791, 'eval_samples_per_second': 6.968, 'eval_steps_per_second': 0.872, 'epoch': 2.0}
{'loss': 0.5108, 'grad_norm': 0.21731719374656677, 'learning_rate': 9.988692911799113e-05, 'epoch': 2.0}
{'loss': 0.5152, 'grad_norm': 0.5265568494796753, 'learning_rate': 9.974698990758414e-05, 'epoch': 2.0}
{'loss': 0.4861, 'grad_norm': 0.633124589920044, 'learning_rate': 9.960705069717713e-05, 'epoch': 2.0}
{'loss': 0.461, 'grad_norm': 0.33089324831962585, 'learning_rate': 9.946711148677013e-05, 'epoch': 2.01}
{'loss': 0.518, 'grad_norm': 0.23990435898303986, 'learning_rate': 9.932717227636313e-05, 'epoch': 2.01}
{'loss': 0.4604, 'grad_norm': 0.1814083307981491, 'learning_rate': 9.918723306595614e-05, 'epoch': 2.01}
{'loss': 0.4504, 'grad_norm': 0.4819490611553192, 'learning_rate': 9.904729385554915e-05, 'epoch': 2.01}
{'loss': 0.4881, 'grad_norm': 0.3725143373012543, 'learning_rate': 9.890735464514

  0%|          | 0/1134 [00:00<?, ?it/s]

{'eval_loss': 0.19567222893238068, 'eval_bleu': 7.7955, 'eval_gen_len': 17.2892, 'eval_runtime': 1303.3534, 'eval_samples_per_second': 6.956, 'eval_steps_per_second': 0.87, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


{'train_runtime': 757130.6143, 'train_samples_per_second': 5.663, 'train_steps_per_second': 1.416, 'train_loss': 0.5213899086590627, 'epoch': 3.0}


TrainOutput(global_step=1071894, training_loss=0.5213899086590627, metrics={'train_runtime': 757130.6143, 'train_samples_per_second': 5.663, 'train_steps_per_second': 1.416, 'total_flos': 4.641402412597248e+18, 'train_loss': 0.5213899086590627, 'epoch': 3.0})