Vitor Domingos Baldoino dos Santos<br>
Universidade Presbiteriana Mackenzie<br>
Faculdade de Computação e Informática<br>
[vdbaldoino@gmail.com](mailto:vdbaldoino@gmail.com)<br>

Dataset: [Portuguese Tweets for Sentiment Analysis](https://www.kaggle.com/datasets/augustop/portuguese-tweets-for-sentiment-analysis)

Recursos:

- [BERT Fine-Tuning Tutorial with PyTorch · Chris McCormick](https://mccormickml.com/2019/07/22/BERT-fine-tuning/)
- [Hyperparameter Search with Transformers and Ray Tune](https://huggingface.co/blog/ray-tune)
- [Text Classification on GLUE using `Trainer`](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb#scrollTo=8sgjdLKcIrJm)
- [BERT Finetuning with Hugging Face and Training Visualizations with TensorBoard](https://medium.com/nlplanet/bert-finetuning-with-hugging-face-and-training-visualizations-with-tensorboard-46368a57fc97)
- [Análise de sentimentos em português utilizando Pytorch e Python](https://medium.com/data-hackers/an%C3%A1lise-de-sentimentos-em-portugu%C3%AAs-utilizando-pytorch-e-python-91a232165ec0)
- [How to tweak `Trainer` to monitor other metrics on the training set](https://discuss.huggingface.co/t/metrics-for-training-set-in-trainer/2461/3)
- [Batch and Epoch training metrics for transformers `Trainer`](https://stackoverflow.com/questions/78311534/batch-and-epoch-training-metrics-for-transformers-trainer/78311535#78311535)
- [Performance tips for training](https://huggingface.co/docs/transformers/v4.18.0/en/performance)

## Configurações

In [None]:
%%shell
# Install the notebook's dependencies quietly.
pip install -q transformers datasets evaluate accelerate
pip install -q torch torchtext torchdata
# Quote the extra so the shell cannot interpret "[tune]" as a glob pattern.
pip install -q "ray[tune]"

In [10]:
import os
import torch
import evaluate
import numpy as np

from copy import deepcopy

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    TrainerCallback
)

from datasets import (load_from_disk,
                      DatasetDict)

from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [None]:
# from google.colab import drive
#
# drive.mount('/content/drive')
# os.chdir('/content/drive/MyDrive/sentiment-analysis/')

In [3]:
print(os.getcwd())

/home/baldoinov/Projetos/tweets-sentiment-analysis/notebooks


In [7]:
# --- Reproducibility and hardware -------------------------------------------
SEED = 42
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Batching / tokenization -------------------------------------------------
BATCH_SIZE = 128
MAX_LENGTH = 128

# --- Label space -------------------------------------------------------------
# The id -> label table is the single source of truth; the reverse mapping and
# the label count are derived from it so the three can never drift apart.
ID2LABEL = {0: "Neutro", 1: "Positivo", 2: "Negativo"}
LABEL2ID = {label: index for index, label in ID2LABEL.items()}
NUM_LABELS = len(ID2LABEL)

# --- Model identity and output location --------------------------------------
TASK = "sentiment-analysis"
MODEL_NAME = "bertimbau"
MODEL_CHECKPOINT = "neuralmind/bert-base-portuguese-cased"

OUTPUT_DIR = "models/{}-finetuned-{}".format(MODEL_NAME, TASK)


In [None]:
BATCH_SIZE

128

In [8]:
# F1 metric from the `evaluate` library.
# NOTE(review): nothing active in this notebook calls `metric`;
# `compute_metrics` relies on scikit-learn instead -- confirm it is still needed.
metric = evaluate.load("f1")
# Tokenizer and classification model both come from the same pretrained
# checkpoint; the label count and label maps are passed to the model here.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT, num_labels=NUM_LABELS, id2label=ID2LABEL, label2id=LABEL2ID
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
def tokenize_function(examples: dict):
    """Tokenize the ``"text"`` column of a batch of examples.

    Meant to be passed to ``Dataset.map(..., batched=True)``, which hands the
    function a mapping from column name to a list of values (not a
    ``DatasetDict``).

    Args:
        examples: Batch mapping that contains a ``"text"`` entry.

    Returns:
        The output of the module-level ``tokenizer`` for the batch, with
        every sequence padded/truncated to ``MAX_LENGTH`` tokens.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        max_length=MAX_LENGTH,
        truncation=True,
    )


def compute_metrics(eval_pred):
    """Score a batch of predictions against the reference labels.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        Dict with ``accuracy`` plus macro-averaged ``macro-f1-score``,
        ``precision`` and ``recall``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_classes = np.argmax(logits, axis=-1)

    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        labels, predicted_classes, average="macro"
    )

    scores = {"accuracy": accuracy_score(labels, predicted_classes)}
    scores["macro-f1-score"] = macro_f1
    scores["precision"] = macro_precision
    scores["recall"] = macro_recall
    return scores

## Fine-Tuning

---

In [None]:
# Load the saved dataset from disk and tokenize every split in batches.
# (Plain string literal: the path has no placeholders, so no f-string is needed.)
ds = load_from_disk("/content/drive/MyDrive/sentiment-analysis/data/intermediate/without-emoticons")
ds = ds.map(tokenize_function, batched=True)
ds

Map:   0%|          | 0/630481 [00:00<?, ? examples/s]

Map:   0%|          | 0/135103 [00:00<?, ? examples/s]

Map:   0%|          | 0/135104 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 630481
    })
    dev: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 135103
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 135104
    })
})

In [None]:
# Switch the dataset's output format to "torch" before handing it to the Trainer.
ds.set_format("torch")

In [None]:
# Fine-tuning configuration: evaluate, log and checkpoint once per epoch, keep
# at most two checkpoints, and reload the best one when training finishes.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    seed=SEED,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_strategy="epoch",
    save_total_limit=2,
    save_only_model=True,
    # Must name a key returned by `compute_metrics`: the Trainer looks the
    # value up as "eval_<name>" when picking the best checkpoint, and
    # `compute_metrics` reports the F1 score as "macro-f1-score" (not "f1").
    metric_for_best_model="macro-f1-score",
    greater_is_better=True,
    report_to="tensorboard",
    # Optional knobs, currently disabled:
    # logging_dir=model_logging_dir,
    # fp16=True,
    # warmup_ratio=0.01,
    # eval_steps=100,
    # logging_steps=100,
    # save_steps=500,
    # auto_find_batch_size=True,
    # ray_scope="",
)

In [None]:
class GetTrainingMetricsCallback(TrainerCallback):
    """Log training-set metrics once per epoch.

    The callback reads three per-epoch buffers from the wrapped trainer --
    ``epoch_predictions``, ``epoch_labels`` and ``epoch_loss`` -- aggregates
    them into accuracy / macro precision / recall / F1 / mean loss, logs the
    result through ``trainer.log`` and resets the buffers.

    The trainer is expected to fill those buffers itself while training (for
    instance from an overridden loss computation). A trainer that never
    creates them makes this callback a no-op instead of an ``AttributeError``.
    """

    def __init__(self, trainer) -> None:
        super().__init__()
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        """Aggregate, log and reset the trainer's per-epoch buffers.

        Returns:
            ``None``. A non-``None`` return value from a callback event is
            taken by the callback handler as the new ``TrainerControl``, so
            the metrics are logged rather than returned.
        """
        trainer = self._trainer
        batch_predictions = getattr(trainer, "epoch_predictions", None)
        batch_labels = getattr(trainer, "epoch_labels", None)
        batch_losses = getattr(trainer, "epoch_loss", None)

        # Nothing was collected this epoch (or the trainer does not collect
        # at all): there is nothing to aggregate.
        if not batch_predictions or not batch_labels:
            return None

        # Each buffer entry holds one batch; stitch them into epoch arrays.
        predicted_classes = np.concatenate(batch_predictions).argmax(axis=1)
        epoch_labels = np.concatenate(batch_labels)

        precision, recall, f1, _ = precision_recall_fscore_support(
            epoch_labels, predicted_classes, average="macro"
        )

        metrics = {
            "train_accuracy": float(np.mean(predicted_classes == epoch_labels)),
            "train_macro_f1_score": float(f1),
            "train_precision": float(precision),
            "train_recall": float(recall),
        }
        # Only report a mean loss when losses were actually recorded.
        if batch_losses:
            metrics["epoch_loss"] = float(np.mean(batch_losses))

        # Clear stored predictions, labels, and loss for the next epoch.
        trainer.epoch_predictions = []
        trainer.epoch_labels = []
        trainer.epoch_loss = []

        # Log epoch-level metrics through the trainer.
        trainer.log(metrics)
        return None


class CustomCallback(TrainerCallback):
    """Evaluate on the training set whenever an evaluation is due at epoch end.

    Results are reported under the ``train`` metric prefix.
    """

    def __init__(self, trainer) -> None:
        super().__init__()
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        """Run an extra evaluation pass over ``trainer.train_dataset``.

        Returns:
            A copy of ``control`` taken before the extra evaluation, or
            ``None`` when no evaluation is scheduled.
        """
        if not control.should_evaluate:
            return None

        # Snapshot the control flags first and return the snapshot, so the
        # flags as they were before the nested evaluate() call are kept.
        control_snapshot = deepcopy(control)
        self._trainer.evaluate(
            eval_dataset=self._trainer.train_dataset,
            metric_key_prefix="train",
        )
        return control_snapshot

# Build the trainer around the already-loaded `model`: train on the "train"
# split, evaluate on the "dev" split, and score with `compute_metrics`.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=ds["train"],
    eval_dataset=ds["dev"],
    compute_metrics=compute_metrics,
)

# The callback is handed the trainer itself because it reads state from it.
trainer.add_callback(GetTrainingMetricsCallback(trainer))

# Start fine-tuning.
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.3253,0.312839,0.881874
2,0.2762,0.313489,0.886566


Epoch,Training Loss,Validation Loss,F1
1,0.3253,0.312839,0.881874
2,0.2762,0.313489,0.886566
3,0.2358,0.322501,0.889371
4,0.202,0.351214,0.887342
5,0.1778,0.372704,0.887562


TrainOutput(global_step=24630, training_loss=0.2434224595299495, metrics={'train_runtime': 12342.0758, 'train_samples_per_second': 255.419, 'train_steps_per_second': 1.996, 'total_flos': 1.62000010471257e+17, 'train_loss': 0.2434224595299495, 'epoch': 5.0})

In [None]:
# Save the fine-tuned model to a fixed directory (distinct from OUTPUT_DIR).
trainer.save_model("models/bertimbau-full-dataset-no-hyperopt/")

In [None]:
# `Trainer.save_state` takes no arguments; passing `split`/`metrics` raises a
# TypeError. NOTE(review): if the intent was to persist metrics, that is
# `trainer.save_metrics(split, metrics_dict)` -- confirm which one was meant.
trainer.save_state()

In [None]:
trainer.state.log_history
# TODO: Get f1 in training set

[{'loss': 0.019,
  'grad_norm': 0.0008756146999076009,
  'learning_rate': 1.6000000000000003e-05,
  'epoch': 1.0,
  'step': 125},
 {'eval_loss': 2.4969770908355713,
  'eval_f1': 0.740659320342837,
  'eval_runtime': 2.0963,
  'eval_samples_per_second': 477.031,
  'eval_steps_per_second': 59.629,
  'epoch': 1.0,
  'step': 125},
 {'loss': 0.0041,
  'grad_norm': 0.0007979935617186129,
  'learning_rate': 1.2e-05,
  'epoch': 2.0,
  'step': 250},
 {'eval_loss': 2.237870216369629,
  'eval_f1': 0.7787831114180831,
  'eval_runtime': 2.3174,
  'eval_samples_per_second': 431.516,
  'eval_steps_per_second': 53.939,
  'epoch': 2.0,
  'step': 250},
 {'loss': 0.0375,
  'grad_norm': 0.01883525773882866,
  'learning_rate': 8.000000000000001e-06,
  'epoch': 3.0,
  'step': 375},
 {'eval_loss': 2.0437841415405273,
  'eval_f1': 0.7715542376439819,
  'eval_runtime': 2.3932,
  'eval_samples_per_second': 417.852,
  'eval_steps_per_second': 52.232,
  'epoch': 3.0,
  'step': 375},
 {'loss': 0.009,
  'grad_norm':

## Hyperparameter Search

---

In [None]:
def model_init():
    """Build a new classifier from ``MODEL_CHECKPOINT``.

    Passed to ``Trainer(model_init=...)`` in place of a model instance.

    Returns:
        A sequence-classification model configured with the notebook's label
        count and label maps.
    """
    label_config = {
        "num_labels": NUM_LABELS,
        "id2label": ID2LABEL,
        "label2id": LABEL2ID,
    }
    return AutoModelForSequenceClassification.from_pretrained(
        MODEL_CHECKPOINT, **label_config
    )


from ray import train, tune


def objective(config):
    """Toy objective: ``score = a ** 2 + b``.

    Args:
        config: Mapping with numeric entries ``"a"`` and ``"b"``.

    Returns:
        Dict with the single key ``"score"``.
    """
    a_value = config["a"]
    b_value = config["b"]
    return dict(score=a_value ** 2 + b_value)


# Search space for the toy objective: `a` is declared as a grid over four
# values, `b` as a choice among three.
# NOTE(review): this search only exercises `objective`; it does not touch the
# model or the dataset -- confirm it is meant to stay in the notebook.
search_space = {
    "a": tune.grid_search([0.001, 0.01, 0.1, 1.0]),
    "b": tune.choice([1, 2, 3]),
}

# Tuner that runs `objective` over `search_space`.
tuner = tune.Tuner(objective, param_space=search_space)

results = tuner.fit()
# Print the configuration of the result with the lowest "score".
print(results.get_best_result(metric="score", mode="min").config)

In [None]:
# Rebuild the trainer with `model_init` instead of a model instance; the other
# arguments match the fine-tuning trainer.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=ds["train"],
    eval_dataset=ds["dev"],
    compute_metrics=compute_metrics,
)

# Run 10 trials on the Ray backend, maximizing the objective.
# NOTE(review): no `hp_space` or `compute_objective` is passed, so the library
# defaults decide what is searched and what is maximized -- confirm they match
# the intended metric.
best = trainer.hyperparameter_search(n_trials=10, direction="maximize", backend="ray")