## Loading the Data

In [None]:
from datasets import load_dataset, concatenate_datasets

dataset = load_dataset('billingsmoore/84000-bo-en')

{'bo': 'བཅོམ་ལྡན་འདས་ཅི་དེ་བཞིན་གཤེགས་པ་དགྲ་བཅོམ་པ་ཡང་དག་པར་རྫོགས་པའི་སངས་རྒྱས་རྣམས་ཡོན་ཏན་དཔག་ཏུ་མ་མཆིས་པ་དང་ལྡན་ལགས་སམ། དེ་སྐད་ཅེས་གསོལ་པ་དང་།',
 'en': 'Blessed One, do thus-gone, worthy, perfectly complete buddhas possess limitless good qualities?”',
 'source_file': 'toh126'}

## Load Unfinetuned Tokenizer, Model, and Data Collator

In [5]:
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained('billingsmoore/getok-v0')
model = AutoModelForSeq2SeqLM.from_pretrained("google/t5-efficient-tiny", device_map="cuda:0")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

config.json:   0%|          | 0.00/628 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/62.3M [00:00<?, ?B/s]

RuntimeError: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero.

## Preprocess Data

The dataset can now be tokenized for training.

In [4]:
def bo_en_preprocess_function(examples):

    # Prepare translation inputs and targets
    translation_inputs = ['Translate Tibetan to English: ' + example for example in examples['tibetan']]
    translation_targets = [example for example in examples['english']]
    
    # Tokenize translation inputs and targets
    bo_en_model_inputs = tokenizer(translation_inputs, text_target=translation_targets, 
                                         max_length=300, truncation=True, padding="max_length")
    
    
    return bo_en_model_inputs

In [5]:
def en_bo_preprocess_function(examples):

    # Prepare translation inputs and targets
    translation_inputs = ['Translate English to Tibetan: ' + example for example in examples['english']]
    translation_targets = [example for example in examples['tibetan']]
    
    # Tokenize translation inputs and targets
    en_bo_model_inputs = tokenizer(translation_inputs, text_target=translation_targets, 
                                         max_length=300, truncation=True, padding="max_length")
    
    
    return en_bo_model_inputs

In [6]:
bo_en_tokenized_dataset = dataset.map(bo_en_preprocess_function, batched=True)

Map:   0%|          | 0/69042 [00:00<?, ? examples/s]

Map:   0%|          | 0/12184 [00:00<?, ? examples/s]

In [7]:
en_bo_tokenized_dataset = dataset.map(en_bo_preprocess_function, batched=True)

Map:   0%|          | 0/69042 [00:00<?, ? examples/s]

Map:   0%|          | 0/12184 [00:00<?, ? examples/s]

In [8]:
tokenized_dataset = {}

tokenized_dataset['train'] = concatenate_datasets([bo_en_tokenized_dataset['train'], en_bo_tokenized_dataset['train']])
tokenized_dataset['test'] = bo_en_tokenized_dataset['test']

## Train the Model

Finally, we can train the model. Note that the optimizer used is Adafactor. This is the optimizer that is preferred for translation tasks and for the T5 model in general. The transformers api includes a built in version of Adafactor, but I define it separately here so that we can optimize it with the 'accelerate' library.

In [9]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, Adafactor
from accelerate import Accelerator

accelerator = Accelerator()

optimizer = Adafactor(
    model.parameters(), 
    scale_parameter=True, 
    relative_step=False, 
    warmup_init=False, 
    lr=3e-4
)

model, optimizer = accelerator.prepare(model, optimizer)

In [10]:
import numpy as np
import evaluate

metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [11]:
import wandb

wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbillingsmoore[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [12]:
training_args = Seq2SeqTrainingArguments(
    output_dir=f"bidirectional",
    auto_find_batch_size=True,
    predict_with_generate=True,
    fp16=False, #check this
    push_to_hub=False,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    num_train_epochs=3
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    optimizers=(optimizer, None),
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()



  0%|          | 0/51783 [00:00<?, ?it/s]

{'loss': 0.6332, 'grad_norm': 0.17786581814289093, 'learning_rate': 0.0002971032964486414, 'epoch': 0.03}
{'loss': 0.299, 'grad_norm': 0.12014766037464142, 'learning_rate': 0.00029420659289728285, 'epoch': 0.06}
{'loss': 0.2795, 'grad_norm': 0.16227394342422485, 'learning_rate': 0.0002913098893459243, 'epoch': 0.09}
{'loss': 0.2722, 'grad_norm': 0.21368838846683502, 'learning_rate': 0.00028841318579456577, 'epoch': 0.12}
{'loss': 0.2595, 'grad_norm': 0.14164778590202332, 'learning_rate': 0.0002855164822432072, 'epoch': 0.14}
{'loss': 0.2529, 'grad_norm': 0.1506086140871048, 'learning_rate': 0.00028261977869184864, 'epoch': 0.17}
{'loss': 0.2533, 'grad_norm': 0.14093157649040222, 'learning_rate': 0.0002797230751404901, 'epoch': 0.2}
{'loss': 0.2459, 'grad_norm': 0.13410639762878418, 'learning_rate': 0.0002768263715891315, 'epoch': 0.23}
{'loss': 0.2392, 'grad_norm': 0.16665083169937134, 'learning_rate': 0.00027392966803777295, 'epoch': 0.26}
{'loss': 0.2356, 'grad_norm': 0.1214125454425



  0%|          | 0/1523 [00:00<?, ?it/s]

{'eval_loss': 0.14627183973789215, 'eval_bleu': 3.6518, 'eval_gen_len': 16.1257, 'eval_runtime': 414.8777, 'eval_samples_per_second': 29.368, 'eval_steps_per_second': 3.671, 'epoch': 1.0}
{'loss': 0.1894, 'grad_norm': 0.13471776247024536, 'learning_rate': 0.00019861537570245062, 'epoch': 1.01}
{'loss': 0.1862, 'grad_norm': 0.1104794517159462, 'learning_rate': 0.00019571867215109205, 'epoch': 1.04}
{'loss': 0.1858, 'grad_norm': 0.15300562977790833, 'learning_rate': 0.0001928219685997335, 'epoch': 1.07}
{'loss': 0.1839, 'grad_norm': 0.13286134600639343, 'learning_rate': 0.00018992526504837493, 'epoch': 1.1}
{'loss': 0.1819, 'grad_norm': 0.1635775864124298, 'learning_rate': 0.00018702856149701636, 'epoch': 1.13}
{'loss': 0.1857, 'grad_norm': 0.11667294055223465, 'learning_rate': 0.00018413185794565783, 'epoch': 1.16}
{'loss': 0.1816, 'grad_norm': 0.18541409075260162, 'learning_rate': 0.0001812351543942993, 'epoch': 1.19}
{'loss': 0.1787, 'grad_norm': 0.1281498521566391, 'learning_rate': 0



  0%|          | 0/1523 [00:00<?, ?it/s]

{'eval_loss': 0.1341230273246765, 'eval_bleu': 4.6624, 'eval_gen_len': 16.1466, 'eval_runtime': 422.1308, 'eval_samples_per_second': 28.863, 'eval_steps_per_second': 3.608, 'epoch': 2.0}
{'loss': 0.1629, 'grad_norm': 0.18075750768184662, 'learning_rate': 9.723075140490121e-05, 'epoch': 2.03}
{'loss': 0.1627, 'grad_norm': 0.29260414838790894, 'learning_rate': 9.433404785354267e-05, 'epoch': 2.06}
{'loss': 0.1626, 'grad_norm': 0.1274515837430954, 'learning_rate': 9.14373443021841e-05, 'epoch': 2.09}
{'loss': 0.1629, 'grad_norm': 0.16370031237602234, 'learning_rate': 8.854064075082554e-05, 'epoch': 2.11}
{'loss': 0.164, 'grad_norm': 0.1507793813943863, 'learning_rate': 8.5643937199467e-05, 'epoch': 2.14}
{'loss': 0.1666, 'grad_norm': 0.11839388310909271, 'learning_rate': 8.274723364810844e-05, 'epoch': 2.17}
{'loss': 0.1612, 'grad_norm': 0.15337583422660828, 'learning_rate': 7.985053009674989e-05, 'epoch': 2.2}
{'loss': 0.1639, 'grad_norm': 0.2055007815361023, 'learning_rate': 7.695382654



  0%|          | 0/1523 [00:00<?, ?it/s]

{'eval_loss': 0.1302887201309204, 'eval_bleu': 5.4383, 'eval_gen_len': 16.0153, 'eval_runtime': 421.8028, 'eval_samples_per_second': 28.886, 'eval_steps_per_second': 3.611, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


{'train_runtime': 10903.968, 'train_samples_per_second': 37.991, 'train_steps_per_second': 4.749, 'train_loss': 0.18972355383442457, 'epoch': 3.0}


TrainOutput(global_step=51783, training_loss=0.18972355383442457, metrics={'train_runtime': 10903.968, 'train_samples_per_second': 37.991, 'train_steps_per_second': 4.749, 'total_flos': 3.28509444980736e+16, 'train_loss': 0.18972355383442457, 'epoch': 3.0})