# Topic Finetuning BERTweet

# Frozen: Fine-tuning BERTweet

Pretrained BERTweet: https://huggingface.co/vinai/bertweet-base

Fine tuning tutorial: https://huggingface.co/docs/transformers/training

Layer freezing: https://discuss.huggingface.co/t/freeze-lower-layers-with-auto-classification-model/11386

F1 score: https://torchmetrics.readthedocs.io/en/stable/classification/f1_score.html

In [None]:
# Colab-only: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Work from the shared project folder: the relative paths used in this notebook
# (data/, results/, test_trainer/) are resolved against this directory.
%cd /content/drive/My Drive/247 Project
%ls

/content/drive/.shortcut-targets-by-id/11dCuBITl5umJqjJki52-YAId8zeeW9i7/247 Project
'Christina marg-finetuning-tweetbert.ipynb'
'Claire Christina marg-finetuning-tweetbert.ipynb'
 claire-finetuning-tweetbert.ipynb
'Copy of Claire Christina marg-finetuning-tweetbert.ipynb'
'Copy of Claire Copy of Copy of Copy of marg-finetuning-tweetbert (1).ipynb'
'Copy of Claire Copy of Copy of Copy of marg-finetuning-tweetbert.ipynb'
'Copy of Copy of marg-finetuning-tweetbert.ipynb'
'Copy of marg-finetuning-tweetbert.ipynb'
'CS 247 Project Results.gsheet'
 [0m[01;34mdata[0m/
 evp-marg-finetuning-tweetbert-full.ipynb
 finetuning-hatebert-full.ipynb
 finetuning-hatebert.ipynb
 lda-split.ipynb
 lda-topic-modeling.ipynb
 marg-finetuning-tweetbert-full.ipynb
 marg-finetuning-tweetbert.ipynb
 [01;34mmodels[0m/
 nlpositionality-analysis.ipynb
'Project Ideas.gdoc'
'Project Proposal.gdoc'
 [01;34mresults[0m/
 [01;34mruns[0m/
 tech-marg-finetuning-tweetbert-full.ipynb
 [01;34mtest_trainer[0m/


In [None]:
!pip install accelerate -U
!pip install transformers[torch]
!pip install datasets
!pip install transformers
!pip3 install emoji==0.6.0
!pip install evaluate
!pip install transformers[torch]



In [None]:
import torch
# Seed PyTorch's global random number generator so the run is repeatable.
# NOTE(review): this seeds torch only; confirm whether Trainer re-seeds from
# TrainingArguments.seed when it is constructed.
torch.manual_seed(9)

<torch._C.Generator at 0x7df5156cb5f0>

In [None]:
# Shift the labels up by one, since negative labels cannot be used for training.

def shift(example):
    """Add 1 to ``example["labels"]`` in place and return the same example."""
    shifted_label = example["labels"] + 1
    example["labels"] = shifted_label
    return example

In [None]:
import pandas as pd
from datasets import load_dataset
import datasets
from datasets import Dataset, DatasetDict

def _load_split(csv_path):
    """Load one CSV split as a `Dataset` with shifted integer labels.

    Reads `csv_path` with pandas, copies the `litw` column into an integer
    `labels` column, converts the frame to a `Dataset`, and applies `shift`
    to every example.
    """
    frame = pd.read_csv(csv_path)
    frame['labels'] = frame['litw'].astype(int)
    return Dataset.from_pandas(frame).map(shift)


# One helper call per split replaces three copy-pasted loading pipelines.
train_dataset = _load_split('data/toxicity_processed_train_topic_2.csv')
val_dataset = _load_split('data/toxicity_processed_val_topic_2.csv')
test_dataset = _load_split('data/toxicity_processed_test_topic_2.csv')


Map:   0%|          | 0/1672 [00:00<?, ? examples/s]

Map:   0%|          | 0/177 [00:00<?, ? examples/s]

Map:   0%|          | 0/437 [00:00<?, ? examples/s]

In [None]:
# create tokenizer and tokenizer function

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")


def tokenize_function(examples):
    """Tokenize the "action" text of a batch, padding to max length and truncating."""
    texts = examples["action"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded


# Tokenize the three splits in train / val / test order.
tokenized_train, tokenized_val, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

Map:   0%|          | 0/1672 [00:00<?, ? examples/s]

Map:   0%|          | 0/177 [00:00<?, ? examples/s]

Map:   0%|          | 0/437 [00:00<?, ? examples/s]

In [None]:
# Load pretrained BERTweet with a 3-class sequence-classification head.
# NOTE(review): num_labels=3 presumably matches the shifted `litw` labels (0, 1, 2) — confirm against the data.

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base", num_labels=3)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Freeze the pretrained backbone: every parameter whose name starts with "roberta"
# stops receiving gradients; all other parameters stay trainable.
# (The prefix can be changed later to freeze fewer layers.)
for param_name, parameter in model.named_parameters():
    print(param_name)
    is_backbone = param_name.startswith("roberta")  # choose whatever prefix you like here
    if is_backbone:
        parameter.requires_grad = False

roberta.embeddings.word_embeddings.weight
roberta.embeddings.position_embeddings.weight
roberta.embeddings.token_type_embeddings.weight
roberta.embeddings.LayerNorm.weight
roberta.embeddings.LayerNorm.bias
roberta.encoder.layer.0.attention.self.query.weight
roberta.encoder.layer.0.attention.self.query.bias
roberta.encoder.layer.0.attention.self.key.weight
roberta.encoder.layer.0.attention.self.key.bias
roberta.encoder.layer.0.attention.self.value.weight
roberta.encoder.layer.0.attention.self.value.bias
roberta.encoder.layer.0.attention.output.dense.weight
roberta.encoder.layer.0.attention.output.dense.bias
roberta.encoder.layer.0.attention.output.LayerNorm.weight
roberta.encoder.layer.0.attention.output.LayerNorm.bias
roberta.encoder.layer.0.intermediate.dense.weight
roberta.encoder.layer.0.intermediate.dense.bias
roberta.encoder.layer.0.output.dense.weight
roberta.encoder.layer.0.output.dense.bias
roberta.encoder.layer.0.output.LayerNorm.weight
roberta.encoder.layer.0.output.LayerNorm

In [None]:
# Inspect the tokenized training split (column names and row count) before pruning columns.
tokenized_train

Dataset({
    features: ['action', 'litw', 'dynahate', 'perspective', 'rewire', 'hateroberta', 'gpt4', 'gender', 'ethnicity', 'annotator_id', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1672
})

In [None]:
# using litw as the target for now
# Columns that are dropped before training; declared once so the three splits
# cannot drift apart.
unused_columns = ['litw', 'action', 'dynahate', 'perspective', 'rewire', 'hateroberta',
                  'gpt4', 'gender', 'ethnicity', 'annotator_id', 'token_type_ids']


def _drop_unused(split):
    """Return `split` without the entries of `unused_columns` that it still contains.

    Only columns currently present are removed, so re-running this cell after
    the columns are already gone is a no-op instead of an error.
    """
    present = [column for column in unused_columns if column in split.column_names]
    return split.remove_columns(present) if present else split


tokenized_train = _drop_unused(tokenized_train)
tokenized_val = _drop_unused(tokenized_val)
tokenized_test = _drop_unused(tokenized_test)

In [None]:
# Re-inspect the training split after the unused columns were removed.
tokenized_train

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 1672
})

In [None]:
# Have every split return torch tensors (format is switched in place).
for split in (tokenized_train, tokenized_val, tokenized_test):
    split.set_format("torch")

In [None]:
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
import numpy as np
import evaluate

metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute macro-averaged F1 from an evaluation pass.

    `eval_pred` unpacks into `(logits, labels)`; the predicted class of each
    example is the argmax of its logits over the last axis.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    return metric.compute(
        predictions=predicted_classes,
        references=labels,
        average="macro",
    )

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration for the topic-2 / seed-9 / frozen-backbone run.
# - output_dir: named after this run so checkpoints do not land in (and collide
#   with) the "topic-0" checkpoint directory.
# - seed: passed explicitly so the Trainer's own seeding matches the seed used
#   in the saved model / results file names instead of the library default.
training_args = TrainingArguments(
    output_dir="test_trainer/topic-2-seed-9-frozen",
    num_train_epochs=70,
    evaluation_strategy="epoch",  # evaluate on the validation split after every epoch
    seed=9,
)

In [None]:
# Collect the Trainer inputs: the model, its training configuration, the
# train/validation splits, and the metric callback used at evaluation time.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

In [None]:
# Run fine-tuning as configured by `training_args`.
trainer.train()


Epoch,Training Loss,Validation Loss,F1
1,No log,0.844646,0.261168
2,No log,0.819219,0.261168
3,0.876000,0.814091,0.304722
4,0.876000,0.793832,0.309195
5,0.838500,0.804383,0.425999
6,0.838500,0.777329,0.3319
7,0.838500,0.773462,0.383784
8,0.812400,0.758011,0.387536
9,0.812400,0.755275,0.428109
10,0.795200,0.751234,0.4342


Checkpoint destination directory test_trainer/topic-0/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory test_trainer/topic-0/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory test_trainer/topic-0/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory test_trainer/topic-0/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory test_trainer/topic-0/checkpoint-2500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory test_trainer/topic-0/checkpoint-3000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory test_trainer/topic-0/check

TrainOutput(global_step=14630, training_loss=0.7532243276245367, metrics={'train_runtime': 1293.4311, 'train_samples_per_second': 90.488, 'train_steps_per_second': 11.311, 'total_flos': 7698698602721280.0, 'train_loss': 0.7532243276245367, 'epoch': 70.0})

In [None]:
# Persist the fine-tuned model under the project's models/ folder on Drive.
trainer.save_model("/content/drive/My Drive/247 Project/models/topic-2-seed-9-frozen")

In [None]:
# Score the held-out test split: predicted class = argmax over the logits,
# then macro-averaged F1 against the reference label ids.
predictions = trainer.predict(tokenized_test)
test_logits = predictions.predictions
preds = np.argmax(test_logits, axis=-1)
metric = evaluate.load("f1")
metric.compute(
    references=predictions.label_ids,
    predictions=preds,
    average="macro",
)

{'f1': 0.46003380006379935}

In [None]:
# Show the predicted class ids for the test split (labels are in the shifted encoding).
print(preds)

[2 2 2 2 2 2 2 0 2 0 0 2 2 2 2 0 2 2 2 0 2 0 2 2 2 2 2 2 0 0 2 2 0 2 0 2 2
 2 2 0 0 2 2 2 2 2 0 0 0 0 2 0 0 2 2 2 2 2 0 2 0 2 2 0 2 2 2 0 2 0 2 2 0 0
 0 2 2 0 2 2 2 0 2 2 0 2 2 2 0 2 0 0 0 0 2 2 2 2 0 2 2 0 0 0 2 2 0 0 0 2 2
 0 0 2 2 0 2 2 2 2 2 2 2 2 0 2 2 2 0 2 2 2 0 2 2 0 0 2 2 2 2 2 2 0 2 0 0 0
 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 0 2 2 2 0 2 2 0 0 2 2 2 2 2
 0 2 0 2 0 0 2 2 2 2 2 2 2 2 2 2 2 2 0 0 2 2 2 2 2 0 2 0 0 2 2 2 2 2 2 2 0
 0 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 0 2 2 2 2 0 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 0 0 2 2 2 2 2 0 2 2 2 2 2 2 2 0 0 0 0 2 0 2 0 2 2 2 2 2 0
 2 2 2 2 2 2 2 2 0 0 0 2 2 2 0 2 2 0 2 2 0 2 2 2 2 2 2 0 2 2 2 2 0 0 2 2 0
 2 2 0 0 2 2 0 2 2 2 0 2 0 0 2 2 2 2 2 2 2 0 2 0 2 2 2 2 2 2 2 2 2 0 2 2 2
 2 2 2 2 2 0 2 2 0 2 0 2 2 2 2 2 2 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2
 2 0 0 2 0 2 0 2 0 2 2 0 2 2 0 2 0 2 0 2 2 2 2 0 2 2 2 2 2 0]


In [None]:
# Export the (untokenized) test split to a column dictionary and wrap it in a
# DataFrame so the predictions can be stored next to the original columns.
test_dataset_dict = test_dataset.to_dict()
results_df = pd.DataFrame(data=test_dataset_dict)

In [None]:
annotator_ids = test_dataset["annotator_id"]

# Attach the predicted class ids as a new column alongside the test rows.
results_df['predictions'] = preds

# Write the per-example results for this run (no index column).
results_csv_path = "./results/metric_results-topic-2-seed-9-frozen.csv"
results_df.to_csv(results_csv_path, index=False)