In [1]:
# !pip install torch==1.11.0
# !pip install transformers==4.31.0
# !pip install datasets==1.18.0
# !pip install sentencepiece==0.1.96
!pip install sacrebleu==2.0.0
# !pip install accelerate==0.20.3



In [2]:
!pip show sentencepiece

Name: sentencepiece
Version: 0.2.1
Summary: Unsupervised text tokenizer and detokenizer.
Home-page: https://github.com/google/sentencepiece
Author: 
Author-email: Taku Kudo <taku@google.com>
License: 
Location: /usr/local/lib/python3.12/dist-packages
Requires: 
Required-by: torchtune


In [3]:
pip install -U peft accelerate



In [4]:
!pip install evaluate



In [5]:
import os
import sys
import logging
import numpy as np
from datasets import load_dataset
from evaluate import load
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
)
import torch
import random

In [6]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print("USING DEVICE:", device)

USING DEVICE: cuda:0


In [7]:
def start_training(model_name_or_path, train_file, val_file, test_file, output_dir,
                  source_lang="pt_XX", target_lang="es_XX",
                  max_source_length=128, max_target_length=128,
                  num_train_epochs=3, batch_size=8, learning_rate=1e-5, num_beams=4):

  # log
  logging.basicConfig(
      format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
      datefmt="%m/%d/%Y %H:%M:%S",
      handlers=[logging.StreamHandler(sys.stdout)],
  )
  logger = logging.getLogger(__name__)
  logger.setLevel(logging.INFO)

  print("\n" + "=" * 50)
  print("Loading datasets . . . ")
  print("=" * 50)
  data_files = {"train": train_file, "validation": val_file, "test": test_file}
  raw_datasets = load_dataset("csv", data_files=data_files)

  # load model and tokenizer
  print("\n" + "=" * 50)
  print(f"Loading {model_name_or_path} model and tokenizer . . . ")
  print("=" * 50)
  tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
  config = AutoConfig.from_pretrained(model_name_or_path)

  # set language codes
  tokenizer.src_lang = source_lang
  tokenizer.tgt_lang = target_lang
  model.config.forced_bos_token_id = tokenizer.lang_code_to_id[target_lang]

  # tokenizing
  def preprocess_function(examples):
      inputs = examples["warao_sentence"]
      targets = examples["spanish_sentence"]
      model_inputs = tokenizer(inputs, max_length=max_source_length, truncation=True)
      labels = tokenizer(targets, max_length=max_target_length, truncation=True)
      model_inputs["labels"] = labels["input_ids"]
      return model_inputs

  tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  # data collator to handle different sizes of sentences
  data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

  metric = load("sacrebleu")

  def postprocess_text(preds, labels):
      preds = [p.strip() for p in preds]
      labels = [[l.strip()] for l in labels]
      return preds, labels

  def compute_metrics(eval_preds):
      preds, labels = eval_preds
      if isinstance(preds, tuple):
          preds = preds[0]
      decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
      labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
      decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
      decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
      result = metric.compute(predictions=decoded_preds, references=decoded_labels)
      return {"bleu": round(result["score"], 4)}


  training_args = Seq2SeqTrainingArguments(
      output_dir=output_dir,
      eval_strategy="epoch",
      save_strategy="epoch",
      learning_rate=learning_rate,
      per_device_train_batch_size=batch_size,
      per_device_eval_batch_size=batch_size,
      weight_decay=0.01,
      save_total_limit=1,
      num_train_epochs=num_train_epochs,
      predict_with_generate=True,
      generation_max_length=max_target_length,
      generation_num_beams=num_beams,
      logging_dir=os.path.join(output_dir, "logs"),
      logging_steps=100,
  )

  trainer = Seq2SeqTrainer(
      model=model,
      args=training_args,
      train_dataset=tokenized_datasets["train"],
      eval_dataset=tokenized_datasets["validation"],
      tokenizer=tokenizer,
      data_collator=data_collator,
      compute_metrics=compute_metrics,
  )

  print("\n" + "=" * 50)
  print("Finetuning . . . ")
  print("=" * 50)
  trainer.train()
  trainer.save_model()
  print("Model saved!")

  return tokenized_datasets["test"], trainer, tokenizer


  # # evaluate  # comment out for now because we evaluate at the end of every epoch so this gives the same result as epoch 3 print statement
  # eval_results = trainer.evaluate()
  # logger.info(f"BLEU on validation: {eval_results}")

In [17]:
def generate_predictions(output_dir, tokenized_dataset_test, trainer, auto_tokenizer):
   # log
  logging.basicConfig(
      format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
      datefmt="%m/%d/%Y %H:%M:%S",
      handlers=[logging.StreamHandler(sys.stdout)],
  )
  logger = logging.getLogger(__name__)
  logger.setLevel(logging.INFO)


  # create predictions for 500 random examples in test set for later evaluative use
  random.seed(21)
  num_samples = 500
  sampled_test_pairs = random.sample(tokenized_dataset_test['warao_sentence'], min(num_samples, len(tokenized_dataset_test)))
  predict_results = trainer.predict(sampled_test_pairs)
  preds = auto_tokenizer.batch_decode(predict_results.predictions, skip_special_tokens=True)
  preds = [p.strip().replace("\n", " ") for p in preds]

  # save predictions
  os.makedirs(output_dir, exist_ok=True)
  pred_file = os.path.join(output_dir, "mBART50_predictions.txt")
  with open(pred_file, "w", encoding="utf-8") as f:
      f.write("\n".join(preds))

  logger.info(f"Predictions saved to {pred_file}")

In [9]:
!pip install protobuf



In [10]:
tokenized_dataset_test, trainer, auto_tokenizer = start_training(
    model_name_or_path="facebook/mbart-large-50",
    train_file="toy_data.csv",
    val_file="toy_data.csv",
    test_file="toy_data.csv",
    output_dir="./mbart50-finetuned-warao-es",
    source_lang="pt_XX",    # fake code for Warao, this was suggested by the Few Thousand Translations paper
    target_lang="es_XX",    # Spanish code
    num_train_epochs=3,
    batch_size=8,
)



Loading datasets . . . 

Loading facebook/mbart-large-50 model and tokenizer . . . 


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/13 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(



Finetuning . . . 


[34m[1mwandb[0m: Currently logged in as: [33macolmena[0m ([33macolmena-stanford-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Bleu
1,No log,10.202593,0.1951
2,No log,9.092884,0.1628
3,No log,8.759064,0.1682




Model saved!


In [18]:
print('\n' + '=' * 50)
print('Generating predictions . . .')
print('=' * 50)

generate_predictions(
    output_dir="./mbart50-finetuned-warao-es",
    tokenized_dataset_test=tokenized_dataset_test,
    trainer=trainer,
    auto_tokenizer=auto_tokenizer,
    )


Generating predictions . . .


AttributeError: 'str' object has no attribute 'items'