## Loading the Data

In [None]:
from datasets import load_from_disk
dataset = load_from_disk('../../../tibetan_english_dataset')

In [None]:
dataset = dataset.train_test_split(.10)

In [None]:
dataset['train'][0]

In [None]:
dataset['test'][0]

## Load Tokenizer, Model, and Data Collator

In [None]:
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM

checkpoint = "billingsmoore/tibetan-to-english-translation"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, device_map="cuda:0")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [10]:
# Generate a list of all Tibetan Unicode characters (U+0F00 to U+0FFF)
tibetan_chars = [chr(codepoint) for codepoint in range(0x0F00, 0x0FFF)]

# Add the Tibetan characters to the tokenizer's vocabulary
new_tokens = [char for char in tibetan_chars if char not in tokenizer.get_vocab()]



In [11]:
len(new_tokens)

162

In [13]:
# Add new tokens to the tokenizer
tokenizer.add_tokens(new_tokens)

# Resize model embeddings to accommodate the new vocabulary size
model.resize_token_embeddings(len(tokenizer))

Embedding(32407, 1024)

In [14]:
enc = tokenizer.encode(dataset['train'][0]['tibetan'])
dec = tokenizer.decode(enc)
dec

'སྣ་ཚོགས་གོས་ནི་འཆང་བ་དང་།།</s>'

## Preprocess Data

The dataset can now be tokenized for training.

In [20]:
source_lang = 'tibetan'
target_lang = 'english'

def preprocess_function(examples):

    inputs = [example for example in examples[source_lang]]
    targets = [example for example in examples[target_lang]]
    
    model_inputs = tokenizer(inputs, text_target=targets, max_length=256, truncation=True, padding="max_length")

    return model_inputs

In [21]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/112006 [00:00<?, ? examples/s]

Map:   0%|          | 0/12446 [00:00<?, ? examples/s]

## Define Metric

In [17]:
import numpy as np

import evaluate

metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

## Train the Model

Finally, we can train the model. Note that the optimizer used is Adafactor. This is the optimizer that is preferred for translation tasks and for the T5 model in general. The transformers api includes a built in version of Adafactor, but I define it separately here so that we can optimize it with the 'accelerate' library.

In [18]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, Adafactor
from accelerate import Accelerator

accelerator = Accelerator()

optimizer = Adafactor(
    model.parameters(), 
    scale_parameter=True, 
    relative_step=False, 
    warmup_init=False, 
    lr=3e-4
)

model, optimizer = accelerator.prepare(model, optimizer)

In [22]:
training_args = Seq2SeqTrainingArguments(
    output_dir=f"84000",
    auto_find_batch_size=True,
    predict_with_generate=True,
    fp16=False, #check this
    push_to_hub=False,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    num_train_epochs=3
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    optimizers=(optimizer, None),
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

  0%|          | 0/42003 [00:00<?, ?it/s]

  0%|          | 0/84006 [00:00<?, ?it/s]

{'loss': 0.4933, 'grad_norm': 0.5370489358901978, 'learning_rate': 0.00029821441325619596, 'epoch': 0.02}
{'loss': 0.4026, 'grad_norm': 0.3901846706867218, 'learning_rate': 0.00029642882651239195, 'epoch': 0.04}
{'loss': 0.3724, 'grad_norm': 0.3839608430862427, 'learning_rate': 0.00029464323976858794, 'epoch': 0.05}
{'loss': 0.3316, 'grad_norm': 0.5529310703277588, 'learning_rate': 0.0002928576530247839, 'epoch': 0.07}
{'loss': 0.3331, 'grad_norm': 0.288876473903656, 'learning_rate': 0.0002910720662809799, 'epoch': 0.09}
{'loss': 0.3235, 'grad_norm': 0.36818790435791016, 'learning_rate': 0.0002892864795371759, 'epoch': 0.11}
{'loss': 0.2949, 'grad_norm': 0.326768159866333, 'learning_rate': 0.0002875008927933719, 'epoch': 0.12}
{'loss': 0.3075, 'grad_norm': 0.2530459761619568, 'learning_rate': 0.0002857153060495679, 'epoch': 0.14}
{'loss': 0.285, 'grad_norm': 0.2755714952945709, 'learning_rate': 0.00028392971930576386, 'epoch': 0.16}
{'loss': 0.2915, 'grad_norm': 0.26459965109825134, 'l



  0%|          | 0/1556 [00:00<?, ?it/s]

{'eval_loss': 0.1986752450466156, 'eval_bleu': 8.2157, 'eval_gen_len': 17.8406, 'eval_runtime': 1739.7393, 'eval_samples_per_second': 7.154, 'eval_steps_per_second': 0.894, 'epoch': 1.0}
{'loss': 0.2001, 'grad_norm': 0.6920514106750488, 'learning_rate': 0.0001982215556031712, 'epoch': 1.02}
{'loss': 0.1998, 'grad_norm': 0.32884547114372253, 'learning_rate': 0.00019643596885936718, 'epoch': 1.04}
{'loss': 0.1909, 'grad_norm': 0.30329251289367676, 'learning_rate': 0.00019465038211556316, 'epoch': 1.05}
{'loss': 0.208, 'grad_norm': 0.37133264541625977, 'learning_rate': 0.00019286479537175915, 'epoch': 1.07}
{'loss': 0.206, 'grad_norm': 0.2181055098772049, 'learning_rate': 0.00019107920862795514, 'epoch': 1.09}
{'loss': 0.199, 'grad_norm': 0.14069849252700806, 'learning_rate': 0.00018929362188415113, 'epoch': 1.11}
{'loss': 0.1949, 'grad_norm': 0.15009649097919464, 'learning_rate': 0.0001875080351403471, 'epoch': 1.12}
{'loss': 0.2005, 'grad_norm': 0.3371492624282837, 'learning_rate': 0.00



  0%|          | 0/1556 [00:00<?, ?it/s]

{'eval_loss': 0.18215888738632202, 'eval_bleu': 8.7419, 'eval_gen_len': 17.7906, 'eval_runtime': 1730.5984, 'eval_samples_per_second': 7.192, 'eval_steps_per_second': 0.899, 'epoch': 2.0}
{'loss': 0.1713, 'grad_norm': 0.15688832104206085, 'learning_rate': 9.822869795014642e-05, 'epoch': 2.02}
{'loss': 0.1682, 'grad_norm': 0.2840486764907837, 'learning_rate': 9.64431112063424e-05, 'epoch': 2.04}
{'loss': 0.1713, 'grad_norm': 0.09711557626724243, 'learning_rate': 9.465752446253839e-05, 'epoch': 2.05}
{'loss': 0.1671, 'grad_norm': 0.1733408123254776, 'learning_rate': 9.287193771873438e-05, 'epoch': 2.07}
{'loss': 0.1802, 'grad_norm': 0.30460530519485474, 'learning_rate': 9.108635097493037e-05, 'epoch': 2.09}
{'loss': 0.1609, 'grad_norm': 0.5159547328948975, 'learning_rate': 8.930076423112634e-05, 'epoch': 2.11}
{'loss': 0.1733, 'grad_norm': 0.20846009254455566, 'learning_rate': 8.751517748732232e-05, 'epoch': 2.12}
{'loss': 0.1729, 'grad_norm': 0.29432353377342224, 'learning_rate': 8.5729



  0%|          | 0/1556 [00:00<?, ?it/s]

{'eval_loss': 0.17730243504047394, 'eval_bleu': 9.123, 'eval_gen_len': 17.7157, 'eval_runtime': 1733.5152, 'eval_samples_per_second': 7.18, 'eval_steps_per_second': 0.898, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


{'train_runtime': 64914.4181, 'train_samples_per_second': 5.176, 'train_steps_per_second': 1.294, 'train_loss': 0.20581387315418448, 'epoch': 3.0}


TrainOutput(global_step=84006, training_loss=0.20581387315418448, metrics={'train_runtime': 64914.4181, 'train_samples_per_second': 5.176, 'train_steps_per_second': 1.294, 'total_flos': 3.63747431153664e+17, 'train_loss': 0.20581387315418448, 'epoch': 3.0})

In [25]:
dataset['test'].save_to_disk('test-set')

Saving the dataset (0/1 shards):   0%|          | 0/12446 [00:00<?, ? examples/s]