## Loading the Data

In [1]:
from datasets import load_dataset, DatasetDict

dataset = DatasetDict()

dataset['train'] = load_dataset('openpecha/cleaned_MT_v1.0.2', split='train')
dataset['test'] = load_dataset('openpecha/cleaned_MT_v1.0.3', split='test')


Using the latest cached version of the dataset since openpecha/cleaned_MT_v1.0.2 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/j/.cache/huggingface/datasets/openpecha___cleaned_mt_v1.0.2/default/0.0.0/f89a8ce696a5711c5dfd57352677aad125224ee9 (last modified on Fri Dec  6 10:55:31 2024).


In [2]:
dataset['train'][0]

{'Source': 'ཐུབ་པས་རྟག་ཏུ་དེ་བཞིན་སྤྱད།།',
 'Target': 'The aspirant should move in such a way at all times.',
 'File_Name': 'TM2382',
 'Machine Aligned': True}

In [3]:
dataset['test'][0]

{'Source': 'ཚད་མེད་བཏང་སྙོམས་གསུམ་ལས།',
 'Target': '3. Immeasureable equanimity ',
 'File_Name': 'TM2203',
 'Machine Aligned': True,
 '__index_level_0__': 0}

## Load Tokenizer, Model, and Data Collator

In [4]:
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM

checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, device_map="cuda:0")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [5]:
# Generate a list of all Tibetan Unicode characters (U+0F00 to U+0FFF)
tibetan_chars = [chr(codepoint) for codepoint in range(0x0F00, 0x0FFF)]

# Add the Tibetan characters to the tokenizer's vocabulary
new_tokens = [char for char in tibetan_chars if char not in tokenizer.get_vocab()]

# Add new tokens to the tokenizer
tokenizer.add_tokens(new_tokens)

# Resize model embeddings to accommodate the new vocabulary size
model.resize_token_embeddings(len(tokenizer))

Embedding(32355, 512)

In [6]:
enc = tokenizer.encode(dataset['train'][0]['Source'])
dec = tokenizer.decode(enc)
dec

'ཐུབ་པས་རྟག་ཏུ་དེ་བཞིན་སྤྱད།།</s>'

## Preprocess Data

The dataset can now be tokenized for training.

In [7]:
source_lang = 'Source'
target_lang = 'Target'

def preprocess_function(examples):

    inputs = [example for example in examples[source_lang]]
    targets = [example for example in examples[target_lang]]
    
    model_inputs = tokenizer(inputs, text_target=targets, max_length=256, truncation=True, padding="max_length")

    return model_inputs

In [8]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1562949 [00:00<?, ? examples/s]

## Define Metric

In [9]:
import numpy as np

import evaluate

metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

## Train the Model

In [10]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, Adafactor
from accelerate import Accelerator

accelerator = Accelerator()

optimizer = Adafactor(
    model.parameters(), 
    scale_parameter=True, 
    relative_step=False, 
    warmup_init=False, 
    lr=3e-4
)

model, optimizer = accelerator.prepare(model, optimizer)

In [11]:
training_args = Seq2SeqTrainingArguments(
    output_dir=f"clean_1.0.2",
    auto_find_batch_size=True,
    predict_with_generate=True,
    fp16=False, #check this
    push_to_hub=False,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    num_train_epochs=1
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    optimizers=(optimizer, None),
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbillingsmoore[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/195369 [00:00<?, ?it/s]

{'loss': 1.1938, 'grad_norm': 0.224343940615654, 'learning_rate': 0.00029923222210279007, 'epoch': 0.0}
{'loss': 0.96, 'grad_norm': 0.1544794887304306, 'learning_rate': 0.00029846444420558016, 'epoch': 0.01}
{'loss': 0.9863, 'grad_norm': 1.4241544008255005, 'learning_rate': 0.00029769666630837026, 'epoch': 0.01}
{'loss': 0.9716, 'grad_norm': 0.1773810237646103, 'learning_rate': 0.0002969288884111604, 'epoch': 0.01}
{'loss': 0.9727, 'grad_norm': 0.324550062417984, 'learning_rate': 0.0002961611105139505, 'epoch': 0.01}
{'loss': 0.9705, 'grad_norm': 0.29743683338165283, 'learning_rate': 0.0002953933326167406, 'epoch': 0.02}
{'loss': 0.9208, 'grad_norm': 0.17542213201522827, 'learning_rate': 0.0002946255547195307, 'epoch': 0.02}
{'loss': 0.9064, 'grad_norm': 0.1885157972574234, 'learning_rate': 0.0002938577768223208, 'epoch': 0.02}
{'loss': 0.9468, 'grad_norm': 0.5190813541412354, 'learning_rate': 0.00029308999892511094, 'epoch': 0.02}
{'loss': 0.9297, 'grad_norm': 0.3789009749889374, 'lea



  0%|          | 0/1134 [00:00<?, ?it/s]

{'eval_loss': 0.3263661861419678, 'eval_bleu': 2.6598, 'eval_gen_len': 16.7499, 'eval_runtime': 289.8016, 'eval_samples_per_second': 31.283, 'eval_steps_per_second': 3.913, 'epoch': 1.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


{'train_runtime': 32229.7966, 'train_samples_per_second': 48.494, 'train_steps_per_second': 6.062, 'train_loss': 0.7735757870275787, 'epoch': 1.0}


TrainOutput(global_step=195369, training_loss=0.7735757870275787, metrics={'train_runtime': 32229.7966, 'train_samples_per_second': 48.494, 'train_steps_per_second': 6.062, 'total_flos': 1.0576616663443046e+17, 'train_loss': 0.7735757870275787, 'epoch': 1.0})