In [15]:
import os
import torch
import logging
import evaluate

import numpy as np

from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BertConfig,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
)

from datasets import load_from_disk
from torch.utils.data import Dataset

In [16]:
# --- Notebook-wide configuration -------------------------------------------
SEED = 42
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): the TrainingArguments cell hard-codes a per-device batch size
# of 16 and does not read BATCH_SIZE — confirm which value is intended.
BATCH_SIZE = 32
# Same location that the dataset-loading cell reads from; the previous value
# ("data/interim/with-emoticon") did not match that path.
DATA_DIR = "../data/interim/with-emoticons"

# Class-index <-> label-name mappings. label2id and NUM_LABELS are derived
# from id2label so the three can never drift apart.
id2label = {0: "Neutro", 1: "Positivo", 2: "Negativo"}
label2id = {name: idx for idx, name in id2label.items()}
NUM_LABELS = len(id2label)

model_checkpoint = "neuralmind/bert-base-portuguese-cased"
model_output_dir = "bert-base-portuguese-cased-finetuned-sentiment-analysis"

In [None]:
# Load the checkpoint's own configuration and attach the task-specific label
# metadata, so the fine-tuned model's saved config reports the real class
# names instead of the generic LABEL_0/LABEL_1/LABEL_2 placeholders.
config = AutoConfig.from_pretrained(
    model_checkpoint,
    num_labels=NUM_LABELS,
    id2label=id2label,
    label2id=label2id,
)

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# F1 metric object used by compute_metrics.
metric = evaluate.load("f1")
# The classification head is created fresh with NUM_LABELS outputs; the
# encoder weights come from the pretrained checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, config=config
)

In [20]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Every sequence is truncated and padded to a fixed length of 512 tokens.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw input(s).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those inputs.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, max_length=512, padding="max_length")
    return encoded


def compute_metrics(eval_pred):
    """Compute macro-averaged F1 from an evaluation prediction pair.

    Args:
        eval_pred: Two-item iterable ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max predictions against
        the reference labels, using ``average="macro"``.
    """
    scores, references = eval_pred
    # Pick the highest-scoring class along the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(
        predictions=predicted_ids,
        references=references,
        average="macro",
    )

In [21]:

# Read the saved dataset from disk, then tokenize it in batches.
raw_splits = load_from_disk("../data/interim/with-emoticons")
ds = raw_splits.map(tokenize_function, batched=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 630481/630481 [02:29<00:00, 4223.93 examples/s]
Map: 100%|██████████| 135103/135103 [00:32<00:00, 4102.07 examples/s]
Map: 100%|██████████| 135104/135104 [00:31<00:00, 4253.24 examples/s]


In [22]:
# Training configuration for the Trainer.
# NOTE(review): evaluation is triggered every 50 optimizer steps; if the
# evaluation split is large this can dominate wall-clock time — confirm the
# cadence is intended.
args = TrainingArguments(
    output_dir=model_output_dir,
    # Evaluate and log on the same 50-step cadence.
    evaluation_strategy="steps",
    eval_steps=50,
    logging_strategy="steps",
    logging_steps=50,
    # Checkpoint every 500 steps (a multiple of eval_steps).
    save_strategy="steps",
    save_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    # Reload the checkpoint with the best "f1" once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="tensorboard",
    # Make the run's seed explicit instead of relying on the library default.
    seed=SEED,
)

In [None]:
# Assemble the Trainer from the model, training arguments, and tokenized
# splits, then run fine-tuning.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["dev"],
    # Passing the tokenizer lets the Trainer save it next to each checkpoint,
    # so a checkpoint directory can be loaded on its own later.
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Load the TensorBoard notebook extension and open TensorBoard inline.
%load_ext tensorboard
# `{model_output_dir}` is interpolated from the Python variable of that name;
# the log directory is the `runs` folder beneath it (presumably where the
# Trainer writes its TensorBoard event files — confirm).
%tensorboard --logdir '{model_output_dir}'/runs

In [None]:
# Candidate BERT configuration values; most entries are wrapped in
# single-element lists (presumably a hyper-parameter grid — TODO confirm).
# NOTE(review): this rebinds `args`, replacing the TrainingArguments object
# created in an earlier cell; re-running the Trainer cell after this one
# would receive this dict instead.
# NOTE(review): max_position_embeddings is a bare int while the other numeric
# entries are lists — confirm whether that is intentional.
args = dict(
    attention_probs_dropout_prob=[0.1],
    # JSON `null` corresponds to Python None, not the string "null".
    classifier_dropout=[None],
    hidden_dropout_prob=[0.1],
    hidden_size=[768],
    id2label={"0": "LABEL_0", "1": "LABEL_1", "2": "LABEL_2"},
    label2id={"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2},
    initializer_range=[0.02],
    intermediate_size=[3072],
    layer_norm_eps=[1e-12],
    max_position_embeddings=512,
    num_attention_heads=[12],
    num_hidden_layers=[12],
)

In [None]:
# Model configuration kept for reference (appears to be pasted from a saved
# config.json — TODO confirm its origin). JSON's null/true are written as
# Python None/True so the cell runs without a NameError.
reference_model_config = {
    "_name_or_path": "neuralmind/bert-base-portuguese-cased",
    "architectures": ["BertForSequenceClassification"],
    "attention_probs_dropout_prob": 0.1,
    "classifier_dropout": None,
    "directionality": "bidi",
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "id2label": {"0": "LABEL_0", "1": "LABEL_1", "2": "LABEL_2"},
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "label2id": {"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2},
    "layer_norm_eps": 1e-12,
    "max_position_embeddings": 512,
    "model_type": "bert",
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "output_past": True,
    "pad_token_id": 0,
    "pooler_fc_size": 768,
    "pooler_num_attention_heads": 12,
    "pooler_num_fc_layers": 3,
    "pooler_size_per_head": 128,
    "pooler_type": "first_token_transform",
    "position_embedding_type": "absolute",
    "problem_type": "single_label_classification",
    "torch_dtype": "float32",
    "transformers_version": "4.38.2",
    "type_vocab_size": 2,
    "use_cache": True,
    "vocab_size": 29794,
}
# Display the dict as the cell's output.
reference_model_config