In [1]:
import os
import torch
import evaluate

import numpy as np

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

from datasets import (load_from_disk, 
                      DatasetDict)

In [15]:
# --- Reproducibility and hardware -------------------------------------------
SEED = 42
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Task and model identifiers ----------------------------------------------
TASK = "sentiment-analysis"
MODEL_NAME = "bertimbau"
model_checkpoint = "neuralmind/bert-base-portuguese-cased"

# --- Hyper-parameters --------------------------------------------------------
BATCH_SIZE = 8
MAX_LENGTH = 100  # token budget per example used when tokenizing

# --- Label vocabulary --------------------------------------------------------
# ID2LABEL is the single source of truth; the reverse mapping and the label
# count are derived from it so the three can never drift apart.
ID2LABEL = {0: "Neutro", 1: "Positivo", 2: "Negativo"}
LABEL2ID = {label_name: label_id for label_id, label_name in ID2LABEL.items()}
NUM_LABELS = len(ID2LABEL)

# --- Output locations --------------------------------------------------------
output_dir = f"../models/checkpoints/{MODEL_NAME}-finetuned-{TASK}"
# logging_dir = f"../models/logging/{MODEL_NAME}-finetuned-{TASK}"

In [12]:
# Load the dataset saved under data/interim and shuffle every split with a
# fixed seed so that the subsample taken below is reproducible.
ds = load_from_disk("../data/interim/without-emoticons")
ds = ds.shuffle(seed=SEED)

# Number of examples kept per split for quick experimentation.
SUBSET_SIZE = 1000
idxs = list(range(SUBSET_SIZE))

# Slicing `idxs` to the split length keeps the selection valid even when a
# split holds fewer than SUBSET_SIZE rows (select() would otherwise raise an
# IndexError on out-of-range indices).
ds = DatasetDict(
    {
        split: ds[split].select(idxs[: ds[split].num_rows])
        for split in ("train", "dev", "test")
    }
)

In [30]:
# F1 scorer used by `compute_metrics` during evaluation.
metric = evaluate.load("f1")

# Tokenizer and classification model both come from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)

# The label mappings are stored in the model config, and `num_labels` sizes
# the classification head that is created on top of the pretrained encoder.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=NUM_LABELS,
    id2label=ID2LABEL,
    label2id=LABEL2ID,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
def tokenize_function(examples: dict):
    """Tokenize the ``"text"`` column of a batch of rows.

    Meant to be passed to ``ds.map(..., batched=True)``: in that mode the
    callback receives a dict-like batch that maps column names to lists of
    values (not a ``DatasetDict``), and only ``examples["text"]`` is read.

    Every sequence is truncated to, and padded up to, ``MAX_LENGTH`` tokens,
    so all encoded rows have the same length.

    Args:
        examples: Batch of rows with a ``"text"`` entry holding the raw strings.

    Returns:
        The output of the notebook-level ``tokenizer`` for the batch.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        max_length=MAX_LENGTH,
        truncation=True,
    )


def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level ``metric``.

    Args:
        eval_pred: Pair ``(logits, labels)`` that is unpacked positionally.

    Returns:
        Whatever ``metric.compute`` returns when called with macro averaging.
    """
    scores, gold_labels = eval_pred

    # The predicted class is the index of the largest score on the last axis.
    predicted_labels = np.argmax(scores, axis=-1)

    return metric.compute(
        predictions=predicted_labels,
        references=gold_labels,
        average="macro",
    )

In [14]:
# Apply `tokenize_function` to the dataset in batches. The result is bound
# back to `ds`, so re-running this cell tokenizes the already-tokenized data.
ds = ds.map(tokenize_function, batched=True)
# Display the resulting dataset summary as the cell output.
ds

Map: 100%|██████████| 630481/630481 [00:54<00:00, 11663.87 examples/s]
Map: 100%|██████████| 135103/135103 [00:12<00:00, 11109.60 examples/s]
Map: 100%|██████████| 135104/135104 [00:12<00:00, 10644.60 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 630481
    })
    dev: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 135103
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 135104
    })
})

In [17]:
# Grab the first training row and list its fields as a quick sanity check
# that the tokenizer columns were added alongside the original ones.
example = ds["train"][0]
print(example.keys())

dict_keys(['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])


In [18]:
# Decode the token ids back to text to eyeball the special tokens and padding.
tokenizer.decode(example["input_ids"])

'[CLS] ate podia dormir no chao se fosse contigo [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [21]:
# Switch the dataset's output format to "torch" in place (no reassignment).
ds.set_format("torch")

In [26]:
# Training configuration for the fine-tuning run.
#
# `output_dir` is passed by keyword only. It is also the first positional
# parameter of TrainingArguments, so supplying a positional string as well
# raises "TypeError: got multiple values for argument 'output_dir'".
training_args = TrainingArguments(
    output_dir=output_dir,
    # Evaluate, checkpoint and log once per epoch. The evaluation and save
    # strategies must match for `load_best_model_at_end` to work.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # Optimisation hyper-parameters.
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=5,
    weight_decay=0.01,
    # Model selection: reload the checkpoint with the best "f1" value, the
    # key produced by `compute_metrics`, once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # Checkpoint housekeeping.
    save_total_limit=2,
    save_only_model=True,
    seed=SEED,
    report_to="tensorboard",
    # Options left disabled for now:
    # logging_dir=model_logging_dir,
    # fp16=True,
    # warmup_ratio=0.01,
    # eval_steps=100,
    # logging_steps=100,
    # save_steps=500,
    # auto_find_batch_size=True,
    # ray_scope="",
)

In [None]:
# Wire the model, data splits and metric callback into a Trainer.
trainer = Trainer(
    # What to train and how.
    model=model,
    args=training_args,
    # Data: train on the "train" split, evaluate on the "dev" split.
    train_dataset=ds["train"],
    eval_dataset=ds["dev"],
    # Tokenizer handed to the Trainer alongside the model.
    tokenizer=tokenizer,
    # Metric callback used at each evaluation.
    compute_metrics=compute_metrics,
)

# Run the fine-tuning loop; the returned value is shown as the cell output.
trainer.train()

In [None]:
# Show the records the Trainer accumulated in its state during the run.
trainer.state.log_history
# TODO: also compute F1 on the training set.