In [1]:
from datasets import load_dataset
# Open the training and evaluation DataFrame files in streaming mode.
# Each file is loaded on its own, so later cells index both results with ['train'].
# NOTE(review): the '.p' extension suggests pickled DataFrames - only load files you trust,
# and consider moving these absolute paths into a configurable data directory.
train_dataset = load_dataset(
    'pandas',
    streaming=True,
    data_files='/home/j/Documents/Projects/MLotsawa/data/large-dfs/10M-train.p',
)
eval_dataset = load_dataset(
    'pandas',
    streaming=True,
    data_files='/home/j/Documents/Projects/MLotsawa/data/large-dfs/100k-eval.p',
)

In [2]:
from transformers import AutoTokenizer, DataCollatorForSeq2Seq

# Local checkpoint directory; the tokenizer here and the model in a later cell are both loaded from it.
checkpoint = "/home/j/Documents/Projects/MLotsawa/models/final-model/small-data/checkpoint-375000"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# NOTE(review): `model` receives the checkpoint path string, not a loaded model object.
# Presumably the collator then skips any model-specific preparation - confirm against the
# DataCollatorForSeq2Seq documentation whether a model instance should be passed instead.
data_collator = DataCollatorForSeq2Seq(
    model=checkpoint,
    tokenizer=tokenizer,
)

2024-08-19 14:11:13.657326: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-19 14:11:13.657374: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-19 14:11:13.657416: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-19 14:11:13.667362: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Keys of each translation pair, and the task prefix prepended to every source sentence.
source_lang = 'bo'
target_lang = 'en'
prefix = "translate Tibetan to English: "


def preprocess_function(examples):
    """Tokenize one batch of translation pairs.

    Args:
        examples: A batch mapping whose 'translation' entry is a sequence of
            mappings indexed by ``source_lang`` and ``target_lang``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the prefixed source
        texts (with the target texts passed as ``text_target``), truncated to
        ``max_length=128``.
    """
    sources = []
    references = []
    for pair in examples['translation']:
        sources.append(prefix + pair[source_lang])
        references.append(pair[target_lang])

    return tokenizer(
        sources,
        text_target=references,
        max_length=128,
        truncation=True,
    )


In [4]:
# Apply the same batched preprocessing function to the training and evaluation datasets.
tokenized_train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
)
tokenized_eval_dataset = eval_dataset.map(
    preprocess_function,
    batched=True,
)

In [5]:
import evaluate

# Load the "sacrebleu" metric through `evaluate`; compute_metrics reports its "score" entry as BLEU.
metric = evaluate.load("sacrebleu")

In [6]:
import numpy as np


def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple: a list of stripped predictions, and a list
        in which each stripped reference is wrapped in its own single-item list.
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return stripped_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Compute BLEU and mean generated length for one evaluation pass.

    Args:
        eval_preds: A ``(preds, labels)`` pair of token-id arrays. If ``preds``
            is a tuple, only its first element is used.

    Returns:
        dict with ``"bleu"`` (the metric's ``"score"``) and ``"gen_len"`` (mean
        number of non-pad tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    # -100 is a fill value, not a real token id. Labels were already masked this
    # way; predictions need the same treatment, otherwise decoding can fail on
    # negative ids and the -100 positions are miscounted as generated tokens.
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generated length = number of non-pad positions in each (masked) prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [7]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback, Adafactor

# Load the seq2seq model from the same checkpoint the tokenizer came from.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, device_map="auto")

# Adafactor with an explicitly supplied learning rate; relative_step and warmup_init are disabled.
optimizer = Adafactor(
    model.parameters(),
    lr=2e-5,
    relative_step=False,
    warmup_init=False,
    scale_parameter=True,
)

# Early-stopping callback with its default settings; handed to the trainer's callbacks later.
early_stop = EarlyStoppingCallback()

In [8]:
from accelerate import Accelerator

# Pass the model and optimizer through Accelerate; both names are rebound to the prepared objects,
# which are what the Seq2SeqTrainer below receives.
# NOTE(review): Trainer is understood to manage Accelerate on its own, so this explicit
# prepare() may be redundant (or wrap the objects twice) - confirm before relying on it.
accelerator = Accelerator()
model, optimizer = accelerator.prepare(model, optimizer)

In [9]:
# Training configuration: step-bounded run, evaluating and checkpointing on the same
# 'epoch' schedule, with the best checkpoint reloaded when training ends.
training_args = Seq2SeqTrainingArguments(
    output_dir="../../models/final-model/",
    max_steps=12500000,
    auto_find_batch_size=True,
    fp16=False,  # TODO: check whether fp16 should be enabled
    predict_with_generate=True,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # Both datasets are indexed with 'train': each file was loaded separately,
    # so presumably 'train' is the only split either one has.
    train_dataset=tokenized_train_dataset['train'],
    eval_dataset=tokenized_eval_dataset['train'],
    compute_metrics=compute_metrics,
    # Custom optimizer; None in the scheduler slot leaves the scheduler to the trainer.
    optimizers=(optimizer, None),
    callbacks=[early_stop],
)

trainer.train()

max_steps is given, it will override any value given in num_train_epochs
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mbillingsmoore[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/12500000 [00:00<?, ?it/s]

{'loss': 0.1247, 'grad_norm': 0.4581294655799866, 'learning_rate': 1.99992e-05, 'epoch': 0.0}
{'loss': 0.1071, 'grad_norm': 0.5393722057342529, 'learning_rate': 1.99984e-05, 'epoch': 0.0}
{'loss': 0.1167, 'grad_norm': 4.868246078491211, 'learning_rate': 1.9997600000000003e-05, 'epoch': 0.0}
{'loss': 0.1111, 'grad_norm': 3.3111374378204346, 'learning_rate': 1.99968e-05, 'epoch': 0.0}
{'loss': 0.1056, 'grad_norm': 1.5437523126602173, 'learning_rate': 1.9996000000000003e-05, 'epoch': 0.0}
{'loss': 0.1172, 'grad_norm': 3.7210566997528076, 'learning_rate': 1.99952e-05, 'epoch': 0.0}
{'loss': 0.113, 'grad_norm': 1.6470668315887451, 'learning_rate': 1.9994400000000003e-05, 'epoch': 0.0}
{'loss': 0.1188, 'grad_norm': 2.9195923805236816, 'learning_rate': 1.99936e-05, 'epoch': 0.0}
{'loss': 0.1186, 'grad_norm': 4.325334072113037, 'learning_rate': 1.9992800000000003e-05, 'epoch': 0.0}
{'loss': 0.1186, 'grad_norm': 1.1441690921783447, 'learning_rate': 1.9992e-05, 'epoch': 0.0}
{'loss': 0.1042, 'gr

[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 2.1 seconds.), retrying request
[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 4.0 seconds.), retrying request
[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 8.9 seconds.), retrying request
[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 18.8 seconds.), retrying request
[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 38.5 seconds.), retrying request
[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 66.4 seconds.), retrying request
[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 151.5 seconds.), retrying request


{'loss': 0.0989, 'grad_norm': 0.49329954385757446, 'learning_rate': 1.99288e-05, 'epoch': 0.0}
{'loss': 0.0785, 'grad_norm': 4.125942707061768, 'learning_rate': 1.9928e-05, 'epoch': 0.0}
{'loss': 0.0893, 'grad_norm': 0.5031408071517944, 'learning_rate': 1.99272e-05, 'epoch': 0.0}
{'loss': 0.0918, 'grad_norm': 1.6843091249465942, 'learning_rate': 1.9926400000000003e-05, 'epoch': 0.0}
{'loss': 0.09, 'grad_norm': 0.264239639043808, 'learning_rate': 1.99256e-05, 'epoch': 0.0}
{'loss': 0.0811, 'grad_norm': 2.119865894317627, 'learning_rate': 1.9924800000000003e-05, 'epoch': 0.0}
{'loss': 0.0921, 'grad_norm': 0.9816790819168091, 'learning_rate': 1.9924e-05, 'epoch': 0.0}
{'loss': 0.0858, 'grad_norm': 0.18175910413265228, 'learning_rate': 1.9923200000000003e-05, 'epoch': 0.0}
{'loss': 0.0981, 'grad_norm': 6.538459777832031, 'learning_rate': 1.99224e-05, 'epoch': 0.0}
{'loss': 0.0828, 'grad_norm': 0.7483553290367126, 'learning_rate': 1.9921600000000003e-05, 'epoch': 0.0}
{'loss': 0.0782, 'grad



{'eval_loss': 0.04464036226272583, 'eval_bleu': 85.6363, 'eval_gen_len': 13.0149, 'eval_runtime': 10244.1751, 'eval_samples_per_second': 9.762, 'eval_steps_per_second': 1.22, 'epoch': 0.1}
{'loss': 0.0519, 'grad_norm': 0.6106557846069336, 'learning_rate': 1.7999200000000002e-05, 'epoch': 1.0}
{'loss': 0.0484, 'grad_norm': 0.24219955503940582, 'learning_rate': 1.7998400000000004e-05, 'epoch': 1.0}
{'loss': 0.0533, 'grad_norm': 2.8110477924346924, 'learning_rate': 1.7997600000000002e-05, 'epoch': 1.0}
{'loss': 0.0505, 'grad_norm': 0.8703691959381104, 'learning_rate': 1.79968e-05, 'epoch': 1.0}
{'loss': 0.0519, 'grad_norm': 0.3807297646999359, 'learning_rate': 1.7996000000000002e-05, 'epoch': 1.0}
{'loss': 0.0559, 'grad_norm': 0.24905918538570404, 'learning_rate': 1.79952e-05, 'epoch': 1.0}
{'loss': 0.0515, 'grad_norm': 0.2441745549440384, 'learning_rate': 1.7994400000000002e-05, 'epoch': 1.0}
{'loss': 0.0567, 'grad_norm': 1.4264875650405884, 'learning_rate': 1.7993600000000003e-05, 'epoc

KeyboardInterrupt: 