## Loading the Data

In [9]:
from datasets import load_from_disk, DatasetDict
dataset = DatasetDict()
dataset['train'] = load_from_disk('ds-with-synth-data')
dataset['test'] = load_from_disk('test-ds')

## Load Tokenizer, Model, and Data Collator

In [10]:
from transformers import PreTrainedTokenizerFast, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM

checkpoint = "baseline-t5-small/checkpoint-8439"
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, device_map="cuda:0")

# Initialize and train the tokenizer
tokenizer = PreTrainedTokenizerFast.from_pretrained('custom-tokenizer')

# Resize the model's token embeddings if necessary
model.resize_token_embeddings(len(tokenizer))

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

## Preprocess Data

The dataset can now be tokenized for training.

In [11]:
def translation_preprocess_function(examples):

    # Prepare translation inputs and targets
    translation_inputs = ['Translate Tibetan to English: ' + example for example in examples['tibetan']]
    translation_targets = [example for example in examples['english']]
    
    # Tokenize translation inputs and targets
    translation_model_inputs = tokenizer(translation_inputs, text_target=translation_targets, 
                                         max_length=256, truncation=False, padding="max_length")
    
    
    return translation_model_inputs


In [12]:
tokenized_dataset = dataset.map(translation_preprocess_function, batched=True)

## Train the Model

Finally, we can train the model. Note that the optimizer used is Adafactor. This is the optimizer that is preferred for translation tasks and for the T5 model in general. The transformers api includes a built in version of Adafactor, but I define it separately here so that we can optimize it with the 'accelerate' library.

In [13]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, Adafactor, EarlyStoppingCallback
from accelerate import Accelerator

accelerator = Accelerator()

optimizer = Adafactor(
    model.parameters(), 
    scale_parameter=True, 
    relative_step=False, 
    warmup_init=False, 
    lr=3e-4
)

model, optimizer = accelerator.prepare(model, optimizer)

In [14]:
import numpy as np
import evaluate

# Load BLEU and CHRF metrics
bleu_metric = evaluate.load("sacrebleu")
chrf_metric = evaluate.load("chrf")
ter_metric = evaluate.load("ter")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    
    # Decode predictions and labels
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Postprocess text
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Compute BLEU score
    bleu_result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
    bleu_score = bleu_result["score"]

    # Compute CHRF score
    chrf_result = chrf_metric.compute(predictions=decoded_preds, references=decoded_labels)
    chrf_score = chrf_result["score"]

    # Compute TER score
    ter_result = ter_metric.compute(predictions=decoded_preds, references=decoded_labels)
    ter_score = ter_result["score"]

    # Return rounded results
    metrics = {
        "bleu": round(bleu_score, 4),
        "chrf": round(chrf_score, 4),
        "ter": round(ter_score, 4)
    }

    return metrics

In [15]:
%env WANDB_PROJECT=synth-data-experiment

env: WANDB_PROJECT=synth-data-experiment


In [16]:
training_args = Seq2SeqTrainingArguments(
    output_dir=f"ft-0%-synth-t5-small",
    auto_find_batch_size=True,
    predict_with_generate=True,
    fp16=False,
    push_to_hub=False,
    eval_strategy='epoch',
    save_strategy='epoch',
    num_train_epochs=1
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    processing_class=tokenizer,
    optimizers=(optimizer, None),
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Chrf,Ter
1,0.26,0.261638,5.1576,15.8623,116.187


TrainOutput(global_step=2813, training_loss=0.267848853237723, metrics={'train_runtime': 715.8716, 'train_samples_per_second': 31.43, 'train_steps_per_second': 3.929, 'total_flos': 1522868064878592.0, 'train_loss': 0.267848853237723, 'epoch': 1.0})