In [1]:
from datasets import load_dataset
# Load the training and evaluation data through the 'pandas' dataset builder,
# one file per call (presumably pickled DataFrames, given the builder — confirm).
# NOTE(review): both paths are absolute and tied to one machine's home directory.
train_dataset = load_dataset(
    'pandas',
    data_files='/home/j/Documents/Projects/MLotsawa/data/size-selection-data/1M-train.p',
)
eval_dataset = load_dataset(
    'pandas',
    data_files='/home/j/Documents/Projects/MLotsawa/data/size-selection-data/100k-eval.p',
)

In [2]:
from transformers import AutoTokenizer, DataCollatorForSeq2Seq

# Local checkpoint directory; the tokenizer is loaded from it here and the
# model weights are loaded from the same path in a later cell.
checkpoint = "/home/j/Documents/Projects/MLotsawa/models/size-selection/large/epoch-3"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# NOTE(review): `model` is given the checkpoint path string rather than a
# loaded model object — confirm this is intended.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=checkpoint,
)

2024-08-14 18:31:06.327574: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-14 18:31:06.327623: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-14 18:31:06.327667: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-14 18:31:06.337679: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Keys used to pick the source and target text out of each translation pair.
source_lang = 'bo'
target_lang = 'en'
# Task instruction prepended to every source text before tokenization.
prefix = "translate Tibetan to English: "


def preprocess_function(examples):
    """Tokenize one batch of translation pairs.

    Reads the pair dicts stored under ``examples['translation']``, prepends
    ``prefix`` to each ``source_lang`` text, and tokenizes sources and targets
    in a single tokenizer call (targets go in as ``text_target``) with
    ``max_length=128`` and truncation enabled.

    Returns:
        Whatever the tokenizer returns for that call.
    """
    source_texts = []
    target_texts = []
    for pair in examples['translation']:
        source_texts.append(prefix + pair[source_lang])
        target_texts.append(pair[target_lang])

    return tokenizer(
        source_texts,
        text_target=target_texts,
        max_length=128,
        truncation=True,
    )


In [4]:
# Run the tokenization step over both datasets; with batched=True the
# function is called with batches of rows instead of one row at a time.
tokenized_train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
)
tokenized_eval_dataset = eval_dataset.map(
    preprocess_function,
    batched=True,
)

In [5]:
import evaluate

# SacreBLEU scorer; compute_metrics calls it and reports its "score" field as "bleu".
metric = evaluate.load("sacrebleu")

In [6]:
import numpy as np


def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape the texts for the metric.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        ``(preds, labels)`` where ``preds`` is a list of stripped strings and
        ``labels`` is a list of one-element lists, each holding one stripped
        reference string.
    """
    stripped_preds = [text.strip() for text in preds]
    # Each reference is wrapped in its own list: one reference per prediction.
    wrapped_labels = [[text.strip()] for text in labels]
    return stripped_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Score one evaluation pass with SacreBLEU and the mean generated length.

    Args:
        eval_preds: ``(predictions, labels)`` pair. If ``predictions`` is a
            tuple, only its first element is used. Both are expected to be
            integer token-id arrays (presumably batch x sequence — confirm
            against the trainer that calls this).

    Returns:
        dict with ``"bleu"`` (the metric's ``"score"``) and ``"gen_len"``
        (mean number of non-pad tokens per prediction), both rounded to
        4 decimal places.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be
    # decoded. Swap it for the pad token in BOTH arrays: predictions can carry
    # -100 padding too, which would otherwise break batch_decode and be
    # counted as real tokens in gen_len below.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generated length = number of non-pad token ids in each prediction row.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [7]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback, Adafactor

# Seq2seq model loaded from the same checkpoint directory as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, device_map="auto")

# Adafactor with an explicit learning rate; relative-step sizing and warmup
# initialisation are switched off, parameter scaling stays on.
optimizer = Adafactor(
    model.parameters(),
    lr=3e-4,
    scale_parameter=True,
    relative_step=False,
    warmup_init=False,
)

# Early-stopping callback with its default settings; handed to the trainer's
# `callbacks` list in a later cell.
early_stop = EarlyStoppingCallback()

In [8]:
from accelerate import Accelerator

# Hand the model and optimizer to Accelerate; the objects returned by
# `prepare` replace the originals from here on.
# NOTE(review): this rebinds `model`/`optimizer` from their own previous
# values, so re-running the cell prepares already-prepared objects.
# NOTE(review): the Trainer built in a later cell presumably performs its own
# accelerator/device setup — confirm this explicit prepare step is still needed.
accelerator = Accelerator()
model, optimizer = accelerator.prepare(model, optimizer)

In [10]:
# Training configuration: evaluate and checkpoint once per epoch, and reload
# the best checkpoint when training finishes.
# NOTE(review): the saved output of this cell reports "max_steps is given" and
# a 12,500,000-step progress bar, which this configuration (num_train_epochs=5,
# no max_steps) does not produce — the stored output looks stale; re-run to confirm.
training_args = Seq2SeqTrainingArguments(
    output_dir="../../models/final-model/",
    auto_find_batch_size=True,
    predict_with_generate=True,
    fp16=False,  # TODO(review): check whether mixed precision should be enabled
    push_to_hub=False,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    num_train_epochs=5
)

# Both splits are read from the 'train' key because each file was loaded as
# its own dataset. The optimizer is supplied explicitly; the scheduler slot is
# left as None.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset['train'],
    eval_dataset=tokenized_eval_dataset['train'],
    tokenizer=tokenizer,
    optimizers=(optimizer, None),
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stop]
)

# Long-running: each per-epoch evaluation alone took roughly 10,300 s in the
# recorded run.
trainer.train()

max_steps is given, it will override any value given in num_train_epochs
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mbillingsmoore[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/12500000 [00:00<?, ?it/s]

{'loss': 0.7419, 'grad_norm': 5.968578815460205, 'learning_rate': 0.00029998799999999995, 'epoch': 0.0}
{'loss': 0.8053, 'grad_norm': 7.301441669464111, 'learning_rate': 0.000299976, 'epoch': 0.0}
{'loss': 0.8107, 'grad_norm': 5.739782333374023, 'learning_rate': 0.00029996399999999995, 'epoch': 0.0}
{'loss': 0.8066, 'grad_norm': 4.712608337402344, 'learning_rate': 0.000299952, 'epoch': 0.0}
{'loss': 0.8459, 'grad_norm': 6.24125862121582, 'learning_rate': 0.00029994, 'epoch': 0.0}
{'loss': 0.8059, 'grad_norm': 3.6026172637939453, 'learning_rate': 0.000299928, 'epoch': 0.0}
{'loss': 0.8353, 'grad_norm': 7.338786602020264, 'learning_rate': 0.000299916, 'epoch': 0.0}
{'loss': 0.8275, 'grad_norm': 4.893742561340332, 'learning_rate': 0.000299904, 'epoch': 0.0}
{'loss': 0.8493, 'grad_norm': 8.208993911743164, 'learning_rate': 0.00029989199999999995, 'epoch': 0.0}
{'loss': 0.8073, 'grad_norm': 8.872082710266113, 'learning_rate': 0.00029988, 'epoch': 0.0}
{'loss': 0.8515, 'grad_norm': 4.9855308



{'eval_loss': 0.15402033925056458, 'eval_bleu': 76.2479, 'eval_gen_len': 12.9687, 'eval_runtime': 10387.1373, 'eval_samples_per_second': 9.627, 'eval_steps_per_second': 1.203, 'epoch': 0.01}
{'loss': 0.2507, 'grad_norm': 4.469458103179932, 'learning_rate': 0.000296988, 'epoch': 1.0}
{'loss': 0.257, 'grad_norm': 2.307915687561035, 'learning_rate': 0.00029697599999999996, 'epoch': 1.0}
{'loss': 0.2545, 'grad_norm': 4.1516900062561035, 'learning_rate': 0.000296964, 'epoch': 1.0}
{'loss': 0.2554, 'grad_norm': 3.5532469749450684, 'learning_rate': 0.000296952, 'epoch': 1.0}
{'loss': 0.2547, 'grad_norm': 4.177746295928955, 'learning_rate': 0.00029694, 'epoch': 1.0}
{'loss': 0.2514, 'grad_norm': 2.5770959854125977, 'learning_rate': 0.00029692799999999996, 'epoch': 1.0}
{'loss': 0.2588, 'grad_norm': 3.0951461791992188, 'learning_rate': 0.000296916, 'epoch': 1.0}
{'loss': 0.2536, 'grad_norm': 3.9138803482055664, 'learning_rate': 0.00029690399999999996, 'epoch': 1.0}
{'loss': 0.2511, 'grad_norm':



{'eval_loss': 0.09588701277971268, 'eval_bleu': 82.0651, 'eval_gen_len': 13.0374, 'eval_runtime': 10281.1998, 'eval_samples_per_second': 9.726, 'eval_steps_per_second': 1.216, 'epoch': 1.01}
{'loss': 0.1376, 'grad_norm': 2.2709035873413086, 'learning_rate': 0.00029398799999999996, 'epoch': 2.0}
{'loss': 0.1442, 'grad_norm': 4.277287006378174, 'learning_rate': 0.000293976, 'epoch': 2.0}
{'loss': 0.1386, 'grad_norm': 2.5494027137756348, 'learning_rate': 0.00029396399999999997, 'epoch': 2.0}
{'loss': 0.1445, 'grad_norm': 6.307223320007324, 'learning_rate': 0.000293952, 'epoch': 2.0}
{'loss': 0.1372, 'grad_norm': 2.8489413261413574, 'learning_rate': 0.00029393999999999997, 'epoch': 2.0}
{'loss': 0.141, 'grad_norm': 1.927731990814209, 'learning_rate': 0.00029392799999999994, 'epoch': 2.0}
{'loss': 0.1424, 'grad_norm': 3.4384982585906982, 'learning_rate': 0.00029391599999999997, 'epoch': 2.0}
{'loss': 0.138, 'grad_norm': 3.9315600395202637, 'learning_rate': 0.000293904, 'epoch': 2.0}
{'loss'



{'eval_loss': 0.0802435427904129, 'eval_bleu': 83.4374, 'eval_gen_len': 13.0407, 'eval_runtime': 10299.0274, 'eval_samples_per_second': 9.71, 'eval_steps_per_second': 1.214, 'epoch': 2.01}
{'loss': 0.0978, 'grad_norm': 0.8594223856925964, 'learning_rate': 0.000290988, 'epoch': 3.0}
{'loss': 0.1034, 'grad_norm': 2.8545756340026855, 'learning_rate': 0.000290976, 'epoch': 3.0}
{'loss': 0.1012, 'grad_norm': 0.9948955774307251, 'learning_rate': 0.00029096399999999995, 'epoch': 3.0}
{'loss': 0.1063, 'grad_norm': 2.501620292663574, 'learning_rate': 0.000290952, 'epoch': 3.0}
{'loss': 0.1053, 'grad_norm': 2.350952386856079, 'learning_rate': 0.00029093999999999995, 'epoch': 3.0}
{'loss': 0.1026, 'grad_norm': 0.4806984066963196, 'learning_rate': 0.000290928, 'epoch': 3.0}
{'loss': 0.1075, 'grad_norm': 2.849700450897217, 'learning_rate': 0.000290916, 'epoch': 3.0}
{'loss': 0.1005, 'grad_norm': 0.37258675694465637, 'learning_rate': 0.000290904, 'epoch': 3.0}
{'loss': 0.0989, 'grad_norm': 1.8506498

KeyboardInterrupt: 