<!-- <center>
  <img src="https://drive.google.com/uc?export=view&id=1IJW7oAeXy1Ols7sSnupqWKK6GL7Jau03">
</center> -->

<br>

<div class="authors col-3" style="text-align:center;">
    <div class="author" style="text-align:center;">
      <div style="text-align:center;">Vitor Domingos Baldoino dos Santos</div>
      <div style="text-align:center;">Universidade Presbiteriana Mackenzie</div>
      <div style="text-align:center;">Faculdade de Computação e Informática</div>
      <div style="text-align:center;"><a href="mailto:vdbaldoino@gmail.com">vdbaldoino@gmail.com</a></div>
    </div>
</div>

Dataset: [Portuguese Tweets for Sentiment Analysis](https://www.kaggle.com/datasets/augustop/portuguese-tweets-for-sentiment-analysis)

Recursos:

- [BERT Fine-Tuning Tutorial with PyTorch · Chris McCormick](https://mccormickml.com/2019/07/22/BERT-fine-tuning/)
- [Hyperparameter Search with Transformers and Ray Tune](https://huggingface.co/blog/ray-tune)
- [Text Classification on GLUE using `Trainer`](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb#scrollTo=8sgjdLKcIrJm)
- [BERT Finetuning with Hugging Face and Training Visualizations with TensorBoard](https://medium.com/nlplanet/bert-finetuning-with-hugging-face-and-training-visualizations-with-tensorboard-46368a57fc97)
- [Análise de sentimentos em português utilizando Pytorch e Python](https://medium.com/data-hackers/an%C3%A1lise-de-sentimentos-em-portugu%C3%AAs-utilizando-pytorch-e-python-91a232165ec0)
- [How to tweak `Trainer` to monitor other metrics on the training set](https://discuss.huggingface.co/t/metrics-for-training-set-in-trainer/2461/3)
- [Batch and Epoch training metrics for transformers `Trainer`](https://stackoverflow.com/questions/78311534/batch-and-epoch-training-metrics-for-transformers-trainer/78311535#78311535)
- [Performance tips for training](https://huggingface.co/docs/transformers/v4.18.0/en/performance)

## Configurações

In [None]:
%%shell
# Install the Python packages for this notebook into the Colab runtime.
# NOTE(review): versions are unpinned; pin them (pkg==x.y.z) once a known-good
# combination has been confirmed, so the run stays reproducible.
pip install -q transformers datasets evaluate accelerate
pip install -q torch torchtext torchdata
# Quote the extras specifier so the shell can never glob-expand the brackets.
pip install -q "ray[tune]"

In [None]:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Sun Apr 28 21:08:26 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0              49W / 400W |   2833MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
import os

# Set CUDA_VISIBLE_DEVICES to "0" so only the first GPU is exposed to this process.
# This is done here, before `torch` is imported in the next cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
import os
import torch
import evaluate

import numpy as np

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

from datasets import (load_from_disk,
                      DatasetDict)
from google.colab import drive

# Mount Google Drive and make the project folder the working directory, so the
# relative paths used later (e.g. `output_dir`, "saved-models/...") resolve inside it.
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/sentiment-analysis/')

Mounted at /content/drive


In [None]:
# Sanity check: print the current working directory.
print(os.getcwd())

/content/drive/MyDrive/sentiment-analysis


In [None]:
# --- Reproducibility and hardware -------------------------------------------
SEED = 42
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# --- Batching / tokenisation -------------------------------------------------
BATCH_SIZE = 128   # used for both the train and eval per-device batch size
MAX_LENGTH = 128   # token length every example is padded/truncated to

# --- Task and label scheme ---------------------------------------------------
TASK = "sentiment-analysis"
# NOTE(review): confirm these ids match the encoding of the `labels` column in
# the dataset loaded further down; the preprocessing is not visible here.
ID2LABEL = {0: "Neutro", 1: "Positivo", 2: "Negativo"}
# Inverse mapping derived from ID2LABEL so the two can never drift apart.
LABEL2ID = {name: idx for idx, name in ID2LABEL.items()}
NUM_LABELS = len(ID2LABEL)

# --- Model identifiers and output location -----------------------------------
MODEL_NAME = "bertimbau"
model_checkpoint = "neuralmind/bert-base-portuguese-cased"

# Relative path: resolved against the current working directory.
output_dir = f"models/checkpoints/{MODEL_NAME}-finetuned-{TASK}"
# logging_dir = f"../models/logging/{MODEL_NAME}-finetuned-{TASK}"

In [None]:
# Echo the configured batch size.
BATCH_SIZE

128

In [None]:
# F1 metric object from `evaluate`; it is called inside `compute_metrics`.
metric = evaluate.load("f1")

# Fast tokenizer that belongs to the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

# Pretrained checkpoint loaded with a sequence-classification head for
# NUM_LABELS classes, carrying the human-readable label mappings in its config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    label2id=LABEL2ID,
    id2label=ID2LABEL,
    num_labels=NUM_LABELS,
)

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize_function(examples: dict):
    """Tokenize the ``text`` column of a batch into fixed-length model inputs.

    Written to be passed to ``ds.map(tokenize_function, batched=True)``.

    Args:
        examples: A batch keyed by column name (column -> list of values), not
            a whole ``DatasetDict``. Only ``examples["text"]`` is read.

    Returns:
        The output of the module-level ``tokenizer`` for those texts, with
        every sequence padded to, and truncated at, ``MAX_LENGTH`` tokens.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",  # pad every example to MAX_LENGTH, not to the batch maximum
        max_length=MAX_LENGTH,
        truncation=True,
    )


def compute_metrics(eval_pred):
    """Score one evaluation pass with macro-averaged F1.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions
        against the reference labels, using ``average="macro"``.
    """
    scores, references = eval_pred
    # Predicted class = index of the largest score along the last axis.
    predicted_ids = np.argmax(scores, axis=-1)

    return metric.compute(
        predictions=predicted_ids,
        references=references,
        average="macro",
    )

## Fine-Tuning

---

In [None]:
# Load the saved dataset splits from Drive and tokenize them batch by batch;
# the tokenizer outputs are added as new columns next to `text` and `labels`.
ds = load_from_disk(
    "/content/drive/MyDrive/sentiment-analysis/data/intermediate/without-emoticons"
).map(tokenize_function, batched=True)
ds

Map:   0%|          | 0/630481 [00:00<?, ? examples/s]

Map:   0%|          | 0/135103 [00:00<?, ? examples/s]

Map:   0%|          | 0/135104 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 630481
    })
    dev: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 135103
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 135104
    })
})

In [None]:
# Have the dataset return PyTorch tensors when indexed.
ds.set_format(type="torch")

In [None]:
training_args = TrainingArguments(
    # Output location and reproducibility.
    output_dir=output_dir,
    seed=SEED,
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluate, log and checkpoint once per epoch.
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    # Checkpoint retention: keep at most two, storing model weights only.
    save_total_limit=2,
    save_only_model=True,
    # After training, reload the checkpoint with the best "f1" evaluation metric.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="tensorboard",
    # Options tried and currently left disabled: logging_dir, fp16,
    # warmup_ratio, eval_steps, logging_steps, save_steps,
    # auto_find_batch_size, ray_scope.
)

In [None]:
# Fine-tune the already-loaded `model` on the train split, evaluating on dev.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=ds["train"],
    eval_dataset=ds["dev"],
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.3253,0.312839,0.881874
2,0.2762,0.313489,0.886566


Epoch,Training Loss,Validation Loss,F1
1,0.3253,0.312839,0.881874
2,0.2762,0.313489,0.886566
3,0.2358,0.322501,0.889371
4,0.202,0.351214,0.887342
5,0.1778,0.372704,0.887562


TrainOutput(global_step=24630, training_loss=0.2434224595299495, metrics={'train_runtime': 12342.0758, 'train_samples_per_second': 255.419, 'train_steps_per_second': 1.996, 'total_flos': 1.62000010471257e+17, 'train_loss': 0.2434224595299495, 'epoch': 5.0})

In [None]:
# Save the trainer's current model under a fixed folder (relative to the cwd).
trainer.save_model(output_dir="saved-models/bertimbau-full-dataset-no-hyperopt/")

In [None]:
# Persist the trainer state (including its log history) to disk.
# `Trainer.save_state()` takes no arguments; `split`/`metrics` are parameters
# of `Trainer.save_metrics`, and passing them here raises a TypeError.
trainer.save_state()

In [None]:
# Display the records the Trainer logged during training (training loss and
# evaluation metrics; logging and evaluation are both configured per epoch).
trainer.state.log_history
# TODO: Get f1 in training set
# NOTE(review): the output saved with this cell (steps 125-375, eval_f1 of about
# 0.74-0.78) does not match the training run shown above (24630 global steps,
# F1 of about 0.88); it appears to be stale from a smaller run. Re-execute to refresh.

[{'loss': 0.019,
  'grad_norm': 0.0008756146999076009,
  'learning_rate': 1.6000000000000003e-05,
  'epoch': 1.0,
  'step': 125},
 {'eval_loss': 2.4969770908355713,
  'eval_f1': 0.740659320342837,
  'eval_runtime': 2.0963,
  'eval_samples_per_second': 477.031,
  'eval_steps_per_second': 59.629,
  'epoch': 1.0,
  'step': 125},
 {'loss': 0.0041,
  'grad_norm': 0.0007979935617186129,
  'learning_rate': 1.2e-05,
  'epoch': 2.0,
  'step': 250},
 {'eval_loss': 2.237870216369629,
  'eval_f1': 0.7787831114180831,
  'eval_runtime': 2.3174,
  'eval_samples_per_second': 431.516,
  'eval_steps_per_second': 53.939,
  'epoch': 2.0,
  'step': 250},
 {'loss': 0.0375,
  'grad_norm': 0.01883525773882866,
  'learning_rate': 8.000000000000001e-06,
  'epoch': 3.0,
  'step': 375},
 {'eval_loss': 2.0437841415405273,
  'eval_f1': 0.7715542376439819,
  'eval_runtime': 2.3932,
  'eval_samples_per_second': 417.852,
  'eval_steps_per_second': 52.232,
  'epoch': 3.0,
  'step': 375},
 {'loss': 0.009,
  'grad_norm':

## Hyperparameter Search

---

In [None]:
def model_init():
    """Build a fresh sequence classifier from the pretrained checkpoint.

    Takes no arguments and reads the module-level ``model_checkpoint``,
    ``NUM_LABELS``, ``ID2LABEL`` and ``LABEL2ID``. Each call loads a new model
    object, which is what ``Trainer(model_init=...)`` is given below.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=NUM_LABELS,
        id2label=ID2LABEL,
        label2id=LABEL2ID,
    )
    return classifier


from ray import train, tune


def objective(config):
    """Toy objective for the Ray Tune smoke test: ``a ** 2 + b``.

    Args:
        config: Mapping holding the sampled hyperparameters ``"a"`` and ``"b"``.

    Returns:
        A dict with the single key ``"score"``.
    """
    squared_a = config["a"] ** 2
    return dict(score=squared_a + config["b"])


# Search space for the toy objective: every value of "a" is tried (grid),
# while "b" is sampled from the listed choices.
search_space = dict(
    a=tune.grid_search([0.001, 0.01, 0.1, 1.0]),
    b=tune.choice([1, 2, 3]),
)

# Run the trials, then report the configuration with the lowest "score".
tuner = tune.Tuner(objective, param_space=search_space)
results = tuner.fit()

best_result = results.get_best_result(metric="score", mode="min")
print(best_result.config)

In [None]:
# A Trainer built with `model_init` (instead of a ready `model`) so that a
# fresh model can be created for each hyperparameter-search trial.
trainer = Trainer(
    args=training_args,
    model_init=model_init,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=ds["train"],
    eval_dataset=ds["dev"],
)

# Ten trials on the Ray backend, maximising the search objective.
best = trainer.hyperparameter_search(
    backend="ray",
    direction="maximize",
    n_trials=10,
)