# Fine-tuning DistilBERT for multi-label emotion classification



In [None]:
# Install the Hugging Face libraries used by this notebook (shell commands).
# NOTE(review): no versions are pinned, so a re-run may pull different releases — consider pinning.
!pip install -q transformers datasets --quiet
!pip install accelerate -U --quiet

## Load dataset


In [None]:
from datasets import load_dataset

# Load the "subtask5.english" configuration of the "sem_eval_2018_task_1" dataset.
dataset = load_dataset("sem_eval_2018_task_1", "subtask5.english")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/605k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/291k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/81.3k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6838 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3259 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/886 [00:00<?, ? examples/s]

In [None]:
# Show the available splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 6838
    })
    test: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 3259
    })
    validation: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 886
    })
})

In [None]:
# Look at one raw training record: an ID, the tweet text and one boolean per emotion.
example = dataset["train"][0]
example

{'ID': '2017-En-21441',
 'Tweet': "“Worry is a down payment on a problem you may never have'. \xa0Joyce Meyer.  #motivation #leadership #worry",
 'anger': False,
 'anticipation': True,
 'disgust': False,
 'fear': False,
 'joy': False,
 'love': False,
 'optimism': True,
 'pessimism': False,
 'sadness': False,
 'surprise': False,
 'trust': True}

In [None]:
# Every column other than the record ID and the tweet text is an emotion label.
# Build the label list plus the two lookup tables the model config needs:
#   id2label: integer index -> label name
#   label2id: label name -> integer index
labels = [
    column
    for column in dataset["train"].features
    if column not in ("ID", "Tweet")
]
id2label = dict(enumerate(labels))
label2id = {name: index for index, name in id2label.items()}
labels

## Preprocess data


In [None]:
from transformers import AutoTokenizer
import numpy as np

# Create the tokenizer that matches the "distilbert-base-uncased" checkpoint
# loaded for the model later in the notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
# Data pre-processing


def preprocess_data(examples):
    """Tokenize a batch of tweets and attach a multi-hot label row to each one.

    Args:
        examples: A batch mapping column name -> list of values; must contain
            the "Tweet" column and one column per entry of the global ``labels``.

    Returns:
        The tokenizer encoding (padded/truncated to 128 tokens) with an added
        "labels" entry: one list of floats per tweet, ordered like ``labels``.
    """
    tweets = examples["Tweet"]
    encoding = tokenizer(
        tweets, truncation=True, padding="max_length", max_length=128
    )

    # Keep only the emotion columns of the incoming batch.
    emotion_columns = {
        name: examples[name] for name in examples.keys() if name in labels
    }

    # One row per tweet, one column per emotion. np.zeros defaults to float,
    # so the boolean flags end up stored as 0.0 / 1.0.
    targets = np.zeros((len(tweets), len(labels)))
    for column, name in enumerate(labels):
        targets[:, column] = emotion_columns[name]

    encoding["labels"] = targets.tolist()
    return encoding


# Apply the preprocessing to every split in batches. The original columns are
# removed so only the tokenizer outputs and "labels" remain.
encoded_dataset = dataset.map(
    preprocess_data, batched=True, remove_columns=dataset["train"].column_names
)
# Make indexing the dataset return PyTorch tensors.
encoded_dataset.set_format("torch")

## Define model

In [None]:
from transformers import AutoModelForSequenceClassification


# Load the pretrained checkpoint with a classification head sized to the label
# list. problem_type="multi_label_classification" selects a per-label binary
# loss (the forward-pass output below shows BinaryCrossEntropyWithLogits).
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=len(labels),
    # Store the index <-> name mappings in the model config.
    id2label=id2label,
    label2id=label2id,
)

## Model training


In [None]:
# Per-device batch size used for both training and evaluation.
batch_size = 8
# Key of the metric (as returned by compute_metrics) used to pick the best checkpoint.
metric_name = "f1"

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the best `metric_name` score when training finishes.
args = TrainingArguments(
    # Checkpoint directory. A plain string: there is nothing to interpolate,
    # so the former f-string prefix was dropped.
    output_dir="bert-finetuned-sem_eval-english",
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    # Must match the evaluation strategy for load_best_model_at_end to work.
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    # push_to_hub=True,
)

### Evaluation metrics

In [None]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch


# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-F1, micro ROC AUC and accuracy for multi-label logits.

    Args:
        predictions: Raw logits, array-like of shape (n_samples, n_labels).
        labels: Ground-truth 0/1 indicator matrix with the same shape.
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        Dict with the keys "f1", "roc_auc" and "accuracy".
    """
    # Logits -> independent per-label probabilities.
    probs = torch.sigmoid(torch.Tensor(predictions)).numpy()
    y_true = labels
    # Hard 0/1 decisions for the threshold-dependent metrics.
    y_pred = (probs >= threshold).astype(float)

    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average="micro")
    # ROC AUC is a ranking metric, so score it on the probabilities. Feeding it
    # thresholded 0/1 predictions collapses the curve to one operating point.
    roc_auc = roc_auc_score(y_true, probs, average="micro")
    # For multi-label input this is subset accuracy: every label of a sample
    # must match for the sample to count as correct.
    accuracy = accuracy_score(y_true, y_pred)

    metrics = {"f1": f1_micro_average, "roc_auc": roc_auc, "accuracy": accuracy}
    return metrics


def compute_metrics(p: EvalPrediction):
    """Metric callback for the Trainer.

    Takes the logits out of ``p.predictions`` (the first element when it is a
    tuple) and scores them against ``p.label_ids`` with ``multi_label_metrics``.
    """
    logits = p.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    return multi_label_metrics(predictions=logits, labels=p.label_ids)

In [None]:
# Sanity check: confirm the tensor type of the labels of the first training example.
encoded_dataset["train"][0]["labels"].type()

'torch.FloatTensor'

In [None]:
# Token ids of the first training example (padded to max_length=128 in preprocess_data).
encoded_dataset["train"]["input_ids"][0]

tensor([    2,    13,     1, 10041,   622,    25,    21,   125,  7582,    27,
           21,  1448,    42,   123,   243,    57,    22,     9, 12675, 10078,
            9,  6926, 24271,   857,  6926, 15689,  2418,  6926, 10041,   622,
            3,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [None]:
# Forward pass on a single example to check that model and data fit together.
# The inputs are padded to a fixed length, so the attention mask is passed too;
# without it the padding tokens would be attended to and skew the output.
first_example = encoded_dataset["train"][0]
outputs = model(
    # unsqueeze(0) adds the batch dimension the model expects.
    input_ids=first_example["input_ids"].unsqueeze(0),
    attention_mask=first_example["attention_mask"].unsqueeze(0),
    labels=first_example["labels"].unsqueeze(0),
)
outputs

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


SequenceClassifierOutput(loss=tensor(0.7346, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[ 0.0677, -0.6086, -0.2806, -0.3060,  0.5410,  0.2137, -0.2440, -0.2870,
          0.3030, -0.8823, -0.2126]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
# Bundle the model, training arguments, data splits and metric callback.
# The validation split is the one evaluated at the end of every epoch.
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Model training: runs the epochs configured in `args` and reports the
# validation metrics after each one.
trainer.train()

You're using a AlbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.443,0.361831,0.591082,0.723159,0.200903
2,0.348,0.345502,0.632388,0.748409,0.235892
3,0.3048,0.343536,0.637826,0.751979,0.252822
4,0.2816,0.346442,0.641706,0.755786,0.25395
5,0.2618,0.345702,0.64753,0.759753,0.260722


TrainOutput(global_step=4275, training_loss=0.3230514776218704, metrics={'train_runtime': 765.2833, 'train_samples_per_second': 44.676, 'train_steps_per_second': 5.586, 'total_flos': 204450755857920.0, 'train_loss': 0.3230514776218704, 'epoch': 5.0})

## Evaluation

In [None]:
# Score the trained model on the validation split passed to the Trainer.
trainer.evaluate()

{'eval_loss': 0.3457016050815582,
 'eval_f1': 0.6475303853116111,
 'eval_roc_auc': 0.759753214913992,
 'eval_accuracy': 0.26072234762979685,
 'eval_runtime': 6.8468,
 'eval_samples_per_second': 129.404,
 'eval_steps_per_second': 16.212,
 'epoch': 5.0}

## Save trained model

In [None]:
# Write the fine-tuned model to /content/out so it can be reloaded later.
# Only the model is saved by this call; the tokenizer is not.
model.save_pretrained("/content/out")

# EmotionBERT definition and test


In [None]:
import numpy as np
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

In [None]:
class EmotionBERT:
    """Inference wrapper around a fine-tuned multi-label emotion classifier.

    Loads a sequence-classification model and a tokenizer, and maps the model's
    per-label scores back to emotion names.
    """

    def __init__(
        self,
        path_load: str = None,
        path_save: str = None,
        tokenizer_name: str = "distilbert-base-uncased",
    ) -> None:
        """Load the fine-tuned model and its tokenizer.

        Args:
            path_load: Directory (or model id) holding the fine-tuned weights.
                Required, despite the ``None`` default kept for compatibility.
            path_save: Stored on the instance; not used by this class itself.
            tokenizer_name: Tokenizer checkpoint to load.

        Raises:
            ValueError: If ``path_load`` is not provided.
        """
        # Fail early with a clear message instead of an obscure error from
        # inside from_pretrained(None).
        if path_load is None:
            raise ValueError(
                "path_load is required: pass the directory (or model id) of "
                "the fine-tuned weights to load."
            )
        self.path_load = path_load
        self.path_save = path_save
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.path_load
        )
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        # Label names in the same order as the model's output logits.
        self.labels = [
            "anger",
            "anticipation",
            "disgust",
            "fear",
            "joy",
            "love",
            "optimism",
            "pessimism",
            "sadness",
            "surprise",
            "trust",
        ]

    def predict(
        self, sentence: str, threshold: float = 0.4, percentile: float = 10
    ) -> list[str]:
        """Return the emotions predicted for one sentence.

        Args:
            sentence: Text to classify.
            threshold: Minimum sigmoid probability at least one label must
                reach; otherwise no emotion is reported.
            percentile: Size of the top slice of labels to keep, in percent.
                Labels at or above the ``100 - percentile`` percentile of the
                normalised probabilities are returned.

        Returns:
            Selected label names in ``self.labels`` order; empty list when no
            label reaches ``threshold``.
        """
        # Truncate so over-long inputs cannot exceed the model's maximum length.
        encoding = self.tokenizer(sentence, return_tensors="pt", truncation=True)
        encoding = {k: v.to(self.model.device) for k, v in encoding.items()}

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            logits = self.model(**encoding).logits

        # Drop only the batch dimension, then apply sigmoid for per-label probabilities.
        probs = torch.sigmoid(logits.squeeze(0).cpu()).numpy()

        if not (probs >= threshold).any():
            return []

        # Normalise so the scores sum to 1, then keep the top slice.
        probs = probs / probs.sum()
        cutoff = np.percentile(probs, 100 - percentile)
        return [
            label for label, prob in zip(self.labels, probs) if prob >= cutoff
        ]

### EmotionBERT test

In [None]:
# If you did not run the training cells above, first make the saved weights
# available and set `path_load` to the directory that contains them.
model_emotion = EmotionBERT(path_load="/content/out", path_save="/content/out")

In [None]:
# Smoke test: classify one sample sentence and display the predicted emotions.
phrase = "Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too . "
o = model_emotion.predict(phrase)
o

['joy', 'optimism']