# Frozen: Finetuning BERTweet

Pretrained BERTweet: https://huggingface.co/vinai/bertweet-base

Fine tuning tutorial: https://huggingface.co/docs/transformers/training

Layer freezing: https://discuss.huggingface.co/t/freeze-lower-layers-with-auto-classification-model/11386

F1 score: https://torchmetrics.readthedocs.io/en/stable/classification/f1_score.html

In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the shared project folder on Drive; the relative
# paths used later in this notebook ('data/...', "test_trainer") resolve against it.
%cd /content/drive/My Drive/247 Project
# List the folder contents as a quick check that the mount and the path are correct.
%ls

/content/drive/.shortcut-targets-by-id/11dCuBITl5umJqjJki52-YAId8zeeW9i7/247 Project
'Christina marg-finetuning-tweetbert.ipynb'
'Claire Christina marg-finetuning-tweetbert.ipynb'
'Claire Copy of Copy of Copy of marg-finetuning-tweetbert.ipynb'
'Copy of Claire Christina marg-finetuning-tweetbert.ipynb'
'Copy of Copy of marg-finetuning-tweetbert.ipynb'
'Copy of marg-finetuning-tweetbert.ipynb'
'CS 247 Project Results.gsheet'
 [0m[01;34mdata[0m/
 finetuning-hatebert-full.ipynb
 finetuning-hatebert.ipynb
 lda-split.ipynb
 lda-topic-modeling.ipynb
 marg-finetuning-tweetbert-full.ipynb
 marg-finetuning-tweetbert.ipynb
 [01;34mmodels[0m/
 nlpositionality-analysis.ipynb
'Project Ideas.gdoc'
'Project Proposal.gdoc'
 [01;34mruns[0m/
 [01;34mtest_trainer[0m/


In [None]:
!pip install accelerate -U
!pip install transformers[torch]
!pip install datasets
!pip install transformers
!pip3 install emoji==0.6.0
!pip install evaluate
!pip install transformers[torch]

Collecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/280.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/280.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━[0m [32m225.3/280.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.27.2
Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━

In [None]:
import torch
# Seed PyTorch's global RNG so that randomly initialised weights created after this
# point are repeatable between runs.
# NOTE(review): transformers.Trainer re-seeds from TrainingArguments.seed when it is
# constructed, so this call alone presumably does not govern the training loop's
# randomness — confirm.
torch.manual_seed(3)

<torch._C.Generator at 0x7f218035b570>

In [None]:
# Shift every label up by one, because negative labels cannot be used for training.

def shift(example):
    """Increment ``example["labels"]`` by one in place and return the same example.

    Intended to be mapped over a dataset row by row.
    NOTE(review): presumably the raw labels lie in {-1, 0, 1} and become {0, 1, 2},
    matching a 3-class head — confirm against the CSV files.
    """
    shifted_label = example["labels"] + 1
    example["labels"] = shifted_label
    return example

In [None]:
import pandas as pd
from datasets import load_dataset
import datasets
from datasets import Dataset, DatasetDict

def load_split(csv_path, label_column='litw'):
    """Load one CSV split as a ``datasets.Dataset`` with shifted integer labels.

    Args:
        csv_path: Path to the split's CSV file.
        label_column: Column whose values become the training target. They are
            cast to ``int`` and stored in a new ``labels`` column.

    Returns:
        The split as a ``Dataset`` after ``shift`` has been mapped over every row.
    """
    frame = pd.read_csv(csv_path)
    frame['labels'] = frame[label_column].astype(int)
    return Dataset.from_pandas(frame).map(shift)


# All three splits go through the same preparation; a single helper replaces the
# copy-pasted stanzas so the splits cannot drift apart.
train_dataset = load_split('data/toxicity_processed_train.csv')
val_dataset = load_split('data/toxicity_processed_val.csv')
test_dataset = load_split('data/toxicity_processed_test.csv')


Map:   0%|          | 0/5107 [00:00<?, ? examples/s]

Map:   0%|          | 0/568 [00:00<?, ? examples/s]

Map:   0%|          | 0/1419 [00:00<?, ? examples/s]

In [None]:
# create tokenizer and tokenizer function

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")


def tokenize_function(examples):
    """Tokenize the ``action`` text of a batch of examples.

    Sequences are padded to a fixed maximum length and truncated when longer,
    so every encoded example has the same length.
    """
    texts = examples["action"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Tokenize the three splits in batches, in train / val / test order.
tokenized_train, tokenized_val, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.91M [00:00<?, ?B/s]

Map:   0%|          | 0/5107 [00:00<?, ? examples/s]

Map:   0%|          | 0/568 [00:00<?, ? examples/s]

Map:   0%|          | 0/1419 [00:00<?, ? examples/s]

In [None]:
# import pretrained bertweet model

from transformers import AutoModelForSequenceClassification

# Pretrained BERTweet encoder with a 3-way sequence-classification head on top.
model = AutoModelForSequenceClassification.from_pretrained(
    "vinai/bertweet-base",
    num_labels=3,
)

pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Freeze the encoder: every parameter whose name starts with "roberta" stops
# receiving gradients, so only the remaining parameters (e.g. the classifier
# head) are updated during training. Each parameter name is printed on the way.
for name, param in model.named_parameters():
    print(name)
    if not name.startswith("roberta"):  # choose whatever you like here
        continue  # parameters outside the encoder stay trainable
    param.requires_grad = False

roberta.embeddings.word_embeddings.weight
roberta.embeddings.position_embeddings.weight
roberta.embeddings.token_type_embeddings.weight
roberta.embeddings.LayerNorm.weight
roberta.embeddings.LayerNorm.bias
roberta.encoder.layer.0.attention.self.query.weight
roberta.encoder.layer.0.attention.self.query.bias
roberta.encoder.layer.0.attention.self.key.weight
roberta.encoder.layer.0.attention.self.key.bias
roberta.encoder.layer.0.attention.self.value.weight
roberta.encoder.layer.0.attention.self.value.bias
roberta.encoder.layer.0.attention.output.dense.weight
roberta.encoder.layer.0.attention.output.dense.bias
roberta.encoder.layer.0.attention.output.LayerNorm.weight
roberta.encoder.layer.0.attention.output.LayerNorm.bias
roberta.encoder.layer.0.intermediate.dense.weight
roberta.encoder.layer.0.intermediate.dense.bias
roberta.encoder.layer.0.output.dense.weight
roberta.encoder.layer.0.output.dense.bias
roberta.encoder.layer.0.output.LayerNorm.weight
roberta.encoder.layer.0.output.LayerNorm

In [None]:
# Show the tokenized training split's columns and row count before unused columns are dropped.
tokenized_train

Dataset({
    features: ['action', 'litw', 'dynahate', 'perspective', 'rewire', 'hateroberta', 'gpt4', 'gender', 'ethnicity', 'annotator_id', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 5107
})

In [None]:
# using litw as the target for now
# The same columns are removed from every split, so the list is written once.
columns_to_drop = [
    'litw', 'action', 'dynahate', 'perspective', 'rewire',
    'hateroberta', 'gpt4', 'gender', 'ethnicity', 'token_type_ids',
]
tokenized_train = tokenized_train.remove_columns(columns_to_drop)
tokenized_val = tokenized_val.remove_columns(columns_to_drop)
tokenized_test = tokenized_test.remove_columns(columns_to_drop)

In [None]:
# Check which columns remain in the training split after the column removal.
tokenized_train

Dataset({
    features: ['annotator_id', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 5107
})

In [None]:
# Make every split return PyTorch tensors; set_format updates each dataset in place.
for split in (tokenized_train, tokenized_val, tokenized_test):
    split.set_format("torch")

In [None]:
import torch

# Use the GPU when one is available, otherwise fall back to the CPU, and move the
# model there. The call is left as the last expression so the cell displays the model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
import numpy as np
import evaluate

metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Return the macro-averaged F1 score for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            example is the index of its largest logit along the last axis.

    Returns:
        Whatever ``metric.compute`` returns for the predicted classes against
        the reference labels with ``average="macro"``.
    """
    logits, references = eval_pred
    return metric.compute(
        predictions=np.argmax(logits, axis=-1),
        references=references,
        average="macro",
    )

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [None]:
from transformers import TrainingArguments, Trainer

# Pass the notebook's seed explicitly. It matches the value given to
# torch.manual_seed and the name under which the model is saved
# ("full-frozen-seed-3"). Without `seed`, TrainingArguments defaults to 42 and
# the Trainer re-seeds with that value, so training-time randomness would not
# follow the notebook's seed.
# Checkpoints are written under "test_trainer"; evaluation runs once per epoch.
training_args = TrainingArguments(output_dir="test_trainer",
                                  num_train_epochs=70,
                                  evaluation_strategy="epoch",
                                  seed=3)

In [None]:
# Bundle the model, the training configuration, the train/validation splits and
# the F1 metric callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

In [None]:
# Run training with the configuration held by `trainer`.
trainer.train()
# Save the trained model to the project's models folder on Drive.
trainer.save_model("/content/drive/My Drive/247 Project/models/full-frozen-seed-3")

Epoch,Training Loss,Validation Loss,F1
1,0.8934,0.851776,0.250549
2,0.8764,0.827572,0.342037
3,0.8631,0.805931,0.356545
4,0.8442,0.795471,0.386168
5,0.8392,0.791285,0.377924
6,0.8254,0.784359,0.367575
7,0.831,0.772894,0.382921
8,0.8291,0.768051,0.404588
9,0.8184,0.775031,0.373016
10,0.817,0.770084,0.441539


Checkpoint destination directory test_trainer/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory test_trainer/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory test_trainer/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory test_trainer/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory test_trainer/checkpoint-2500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory test_trainer/checkpoint-3000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory test_trainer/checkpoint-3500 already exists and is non-empty. Saving will 

In [None]:
# Score the trained model on the validation split with macro-averaged F1.
# NOTE(review): this scores tokenized_val; tokenized_test is prepared earlier but
# is not evaluated in the visible cells — confirm that is intended.
predictions = trainer.predict(tokenized_val)
# Predicted class = index of the largest logit for each example.
preds = np.argmax(predictions.predictions, axis=-1)
metric = evaluate.load("f1")
metric.compute(predictions=preds, references=predictions.label_ids, average="macro")

{'f1': 0.4610244668974759}