## Loading the Data

In [2]:
from datasets import load_from_disk

dataset = load_from_disk('rat-poc-ds')

In [3]:
dataset['train'][0]

{'Source': 'འཇིག་ལས་འདས་པའི་གང་འདུལ་ལོ།།',
 'Target': 'Taming with transcendent beings.',
 'File_Name': 'TM3076',
 'Machine Aligned': False,
 '__index_level_0__': 1176089,
 'Tag': 'Intrinsic Existence, Conventional Existence'}

In [4]:
dataset['test'][0]

{'Source': ' དབང་ཤེས་ནི་རྟགས་ལས་དཔག་མི་དགོས་པར་མངོན་སུམ་དུ་ངེས་པའི་ཕྱིར་རོ།།',
 'Target': ' Sense cognitions need not infer from signs but can ascertain things directly.',
 'File_Name': 'TM0713',
 'Machine Aligned': False,
 '__index_level_0__': 168915,
 'Tag': 'Prophecies, Rituals'}

## Load Tokenizer, Model, and Data Collator

In [5]:
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM

checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, device_map="cuda:0")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [6]:
# Generate a list of all Tibetan Unicode characters (U+0F00 to U+0FFF)
tibetan_chars = [chr(codepoint) for codepoint in range(0x0F00, 0x0FFF)]

# Add the Tibetan characters to the tokenizer's vocabulary
new_tokens = [char for char in tibetan_chars if char not in tokenizer.get_vocab()]

# Add new tokens to the tokenizer
tokenizer.add_tokens(new_tokens)

# Resize model embeddings to accommodate the new vocabulary size
model.resize_token_embeddings(len(tokenizer))

Embedding(32355, 512)

In [7]:
enc = tokenizer.encode(dataset['train'][0]['Source'])
dec = tokenizer.decode(enc)
dec

'འཇིག་ལས་འདས་པའི་གང་འདུལ་ལོ།།</s>'

## Preprocess Data

The dataset can now be tokenized for training.

In [8]:
source_lang = 'Source'
target_lang = 'Target'

def preprocess_function(examples):

    inputs = [example for example in examples[source_lang]]
    targets = [example for example in examples[target_lang]]
    
    model_inputs = tokenizer(inputs, text_target=targets, max_length=256, truncation=True, padding="max_length")

    return model_inputs

In [9]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/45000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

## Define Metric

In [12]:
import numpy as np
import evaluate

# Load BLEU and CHRF metrics
bleu_metric = evaluate.load("sacrebleu")
chrf_metric = evaluate.load("chrf")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    
    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Postprocess text
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Compute BLEU score
    bleu_result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
    bleu_score = bleu_result["score"]

    # Compute CHRF score
    chrf_result = chrf_metric.compute(predictions=decoded_preds, references=decoded_labels)
    chrf_score = chrf_result["score"]

    # Compute generation length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    avg_gen_len = np.mean(prediction_lens)

    # Return rounded results
    return {
        "bleu": round(bleu_score, 4),
        "chrf": round(chrf_score, 4),
        "gen_len": round(avg_gen_len, 4),
    }

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

## Train the Model

In [13]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, Adafactor
from accelerate import Accelerator

accelerator = Accelerator()

optimizer = Adafactor(
    model.parameters(), 
    scale_parameter=True, 
    relative_step=False, 
    warmup_init=False, 
    lr=3e-4
)

model, optimizer = accelerator.prepare(model, optimizer)

In [15]:
import wandb
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/j/.netrc


True

In [16]:
training_args = Seq2SeqTrainingArguments(
    output_dir=f"rat-poc-no-context",
    auto_find_batch_size=True,
    predict_with_generate=True,
    fp16=False, #check this
    push_to_hub=False,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    num_train_epochs=3
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    optimizers=(optimizer, None),
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mbillingsmoore[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/16875 [00:00<?, ?it/s]

{'loss': 1.3069, 'grad_norm': 0.965528130531311, 'learning_rate': 0.0002911111111111111, 'epoch': 0.09}
{'loss': 1.0191, 'grad_norm': 0.27212992310523987, 'learning_rate': 0.00028222222222222223, 'epoch': 0.18}
{'loss': 0.9983, 'grad_norm': 0.18509145081043243, 'learning_rate': 0.00027333333333333333, 'epoch': 0.27}
{'loss': 0.9976, 'grad_norm': 0.33916306495666504, 'learning_rate': 0.00026444444444444443, 'epoch': 0.36}
{'loss': 1.0109, 'grad_norm': 0.33182889223098755, 'learning_rate': 0.00025555555555555553, 'epoch': 0.44}
{'loss': 1.0051, 'grad_norm': 0.48836204409599304, 'learning_rate': 0.0002466666666666666, 'epoch': 0.53}
{'loss': 0.9593, 'grad_norm': 0.3102256655693054, 'learning_rate': 0.00023777777777777775, 'epoch': 0.62}
{'loss': 0.973, 'grad_norm': 0.1904025375843048, 'learning_rate': 0.00022888888888888885, 'epoch': 0.71}
{'loss': 0.9951, 'grad_norm': 0.5006190538406372, 'learning_rate': 0.00021999999999999995, 'epoch': 0.8}
{'loss': 0.9268, 'grad_norm': 0.32680538296699



  0%|          | 0/625 [00:00<?, ?it/s]

{'eval_loss': 0.8925486207008362, 'eval_bleu': 0.0486, 'eval_chrf': 4.9351, 'eval_gen_len': 18.4756, 'eval_runtime': 105.0071, 'eval_samples_per_second': 47.616, 'eval_steps_per_second': 5.952, 'epoch': 1.0}
{'loss': 0.9535, 'grad_norm': 0.3119215965270996, 'learning_rate': 0.00019333333333333333, 'epoch': 1.07}
{'loss': 0.9136, 'grad_norm': 0.5302177667617798, 'learning_rate': 0.00018444444444444443, 'epoch': 1.16}
{'loss': 0.933, 'grad_norm': 0.4272601008415222, 'learning_rate': 0.00017555555555555553, 'epoch': 1.24}
{'loss': 0.9164, 'grad_norm': 0.2883760929107666, 'learning_rate': 0.00016666666666666666, 'epoch': 1.33}
{'loss': 0.9557, 'grad_norm': 0.3059743344783783, 'learning_rate': 0.00015777777777777776, 'epoch': 1.42}
{'loss': 0.9457, 'grad_norm': 0.3589041233062744, 'learning_rate': 0.00014888888888888886, 'epoch': 1.51}
{'loss': 0.943, 'grad_norm': 0.31390276551246643, 'learning_rate': 0.00014, 'epoch': 1.6}
{'loss': 0.949, 'grad_norm': 0.36544880270957947, 'learning_rate': 



  0%|          | 0/625 [00:00<?, ?it/s]

{'eval_loss': 0.8728377223014832, 'eval_bleu': 0.044, 'eval_chrf': 5.2642, 'eval_gen_len': 18.483, 'eval_runtime': 108.0452, 'eval_samples_per_second': 46.277, 'eval_steps_per_second': 5.785, 'epoch': 2.0}
{'loss': 0.9175, 'grad_norm': 0.2940704822540283, 'learning_rate': 9.555555555555555e-05, 'epoch': 2.04}
{'loss': 0.9474, 'grad_norm': 0.4207407236099243, 'learning_rate': 8.666666666666665e-05, 'epoch': 2.13}
{'loss': 0.9303, 'grad_norm': 0.335420697927475, 'learning_rate': 7.777777777777777e-05, 'epoch': 2.22}
{'loss': 0.9133, 'grad_norm': 0.5305365920066833, 'learning_rate': 6.888888888888888e-05, 'epoch': 2.31}
{'loss': 0.9284, 'grad_norm': 0.39540573954582214, 'learning_rate': 5.9999999999999995e-05, 'epoch': 2.4}
{'loss': 0.9084, 'grad_norm': 0.24254003167152405, 'learning_rate': 5.111111111111111e-05, 'epoch': 2.49}
{'loss': 0.9081, 'grad_norm': 0.37868040800094604, 'learning_rate': 4.222222222222222e-05, 'epoch': 2.58}
{'loss': 0.8959, 'grad_norm': 0.18758004903793335, 'learn



  0%|          | 0/625 [00:00<?, ?it/s]

{'eval_loss': 0.8670628666877747, 'eval_bleu': 0.0433, 'eval_chrf': 5.4459, 'eval_gen_len': 18.3346, 'eval_runtime': 108.7933, 'eval_samples_per_second': 45.959, 'eval_steps_per_second': 5.745, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


{'train_runtime': 4896.7708, 'train_samples_per_second': 27.569, 'train_steps_per_second': 3.446, 'train_loss': 0.957993236400463, 'epoch': 3.0}


TrainOutput(global_step=16875, training_loss=0.957993236400463, metrics={'train_runtime': 4896.7708, 'train_samples_per_second': 27.569, 'train_steps_per_second': 3.446, 'total_flos': 9135571599360000.0, 'train_loss': 0.957993236400463, 'epoch': 3.0})