## Loading the Data

In [1]:
from datasets import load_dataset

dataset = load_dataset('openpecha/cleaned_MT_v1.0.3')

In [2]:
len(dataset['train'])

1429192

In [3]:
dataset['train'][0]

{'Source': 'ཐུབ་པས་རྟག་ཏུ་དེ་བཞིན་སྤྱད།།',
 'Target': 'The aspirant should move in such a way at all times.',
 'File_Name': 'TM2382',
 'Machine Aligned': True,
 '__index_level_0__': 0}

In [4]:
dataset['test'][0]

{'Source': 'ཚད་མེད་བཏང་སྙོམས་གསུམ་ལས།',
 'Target': '3. Immeasureable equanimity ',
 'File_Name': 'TM2203',
 'Machine Aligned': True,
 '__index_level_0__': 0}

## Load Tokenizer, Model, and Data Collator

In [5]:
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM

model_id = "google/t5-v1_1-small"

tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map="auto")

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_id)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [14]:
# Generate a list of all Tibetan Unicode characters (U+0F00 to U+0FFF)
tibetan_chars = [chr(codepoint) for codepoint in range(0x0F00, 0x0FFF)]

# Add the Tibetan characters to the tokenizer's vocabulary
new_tokens = [char for char in tibetan_chars if char not in tokenizer.get_vocab()]

# Add new tokens to the tokenizer
tokenizer.add_tokens(new_tokens)

# Resize model embeddings to accommodate the new vocabulary size
model.resize_token_embeddings(len(tokenizer), mean_resizing=False)

Embedding(32355, 512)

In [15]:
enc = tokenizer.encode(dataset['train'][0]['Source'])
dec = tokenizer.decode(enc)
dec

'ཐུབ་པས་རྟག་ཏུ་དེ་བཞིན་སྤྱད།།</s>'

## Preprocess Data

The dataset can now be tokenized for training.

In [16]:
source_lang = 'Source'
target_lang = 'Target'

def preprocess_function(examples):

    inputs = [example for example in examples[source_lang]]
    targets = [example for example in examples[target_lang]]
    
    model_inputs = tokenizer(inputs, text_target=targets, max_length=256, truncation=True, padding="max_length")

    return model_inputs

In [17]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1429192 [00:00<?, ? examples/s]

Map:   0%|          | 0/9066 [00:00<?, ? examples/s]

## Define Metric

In [11]:
import numpy as np

import evaluate

metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

## Train the Model

In [12]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, Adafactor
from accelerate import Accelerator

accelerator = Accelerator()

optimizer = Adafactor(
    model.parameters(), 
    scale_parameter=True, 
    relative_step=False, 
    warmup_init=False, 
    lr=3e-4
)

model, optimizer = accelerator.prepare(model, optimizer)

In [18]:
training_args = Seq2SeqTrainingArguments(
    output_dir=f"v1_1-small-poc",
    auto_find_batch_size=True,
    predict_with_generate=True,
    #max_grad_norm=1.0,
    fp16=False, #check this
    push_to_hub=False,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    num_train_epochs=1
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    optimizers=(optimizer, None),
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Seq2SeqTrainer(


  0%|          | 0/178649 [00:00<?, ?it/s]

{'loss': 1.1001, 'grad_norm': 1.1236166954040527, 'learning_rate': 0.000299160364737558, 'epoch': 0.0}
{'loss': 1.107, 'grad_norm': 0.4474409222602844, 'learning_rate': 0.000298320729475116, 'epoch': 0.01}
{'loss': 1.1037, 'grad_norm': 0.45653262734413147, 'learning_rate': 0.000297481094212674, 'epoch': 0.01}
{'loss': 1.1049, 'grad_norm': 0.4536183774471283, 'learning_rate': 0.000296641458950232, 'epoch': 0.01}
{'loss': 1.063, 'grad_norm': 0.2168353646993637, 'learning_rate': 0.00029580182368779, 'epoch': 0.01}
{'loss': 1.0191, 'grad_norm': 0.4735202193260193, 'learning_rate': 0.000294962188425348, 'epoch': 0.02}
{'loss': 1.056, 'grad_norm': 0.3177906274795532, 'learning_rate': 0.000294122553162906, 'epoch': 0.02}
{'loss': 1.053, 'grad_norm': 0.4990245997905731, 'learning_rate': 0.000293282917900464, 'epoch': 0.02}
{'loss': 0.9733, 'grad_norm': 0.3776359558105469, 'learning_rate': 0.000292443282638022, 'epoch': 0.03}
{'loss': 1.0178, 'grad_norm': 0.24495179951190948, 'learning_rate': 0

  0%|          | 0/1134 [00:00<?, ?it/s]

{'eval_loss': 0.37846577167510986, 'eval_bleu': 0.566, 'eval_gen_len': 18.7067, 'eval_runtime': 430.9215, 'eval_samples_per_second': 21.039, 'eval_steps_per_second': 2.632, 'epoch': 1.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


{'train_runtime': 38543.2152, 'train_samples_per_second': 37.08, 'train_steps_per_second': 4.635, 'train_loss': 0.8592925987040585, 'epoch': 1.0}


TrainOutput(global_step=178649, training_loss=0.8592925987040585, metrics={'train_runtime': 38543.2152, 'train_samples_per_second': 37.08, 'train_steps_per_second': 4.635, 'total_flos': 1.3309175054676787e+17, 'train_loss': 0.8592925987040585, 'epoch': 1.0})