In [1]:
import json
import numpy as np
import pandas as pd
from pathlib import Path
import torch

from datasets import load_dataset
import evaluate
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

In [2]:
# --- Data locations ---
corpus_path = "datasets/sbic/"
p = Path(corpus_path)

csv_trn = "cate-SBIC.v2.trn.csv"
csv_dev = "cate-SBIC.v2.dev.csv"

# --- Pretrained checkpoint to fine-tune ---
checkpoint = "cardiffnlp/twitter-roberta-base"

# --- Hyper-parameters ---
max_len = 64  # roughly the 97th percentile of post length in tokens (see the length analysis)
batch_size = 32
n_epochs = 5
learning_rate = 5e-5
# Mixed precision is only enabled when a CUDA device is present.
use_fp16 = torch.cuda.is_available()

# --- Target categories, numbered in the listed order ---
label2id = {
    category: index
    for index, category in enumerate(
        ["race", "gender", "culture", "victim", "disabled", "social", "body"]
    )
}
n_classes = len(label2id)

In [3]:
# Fetch the tokenizer that matches the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)

In [4]:
# Read both CSVs from the corpus folder; the dev file is exposed as the "test" split.
split_files = {"train": csv_trn, "test": csv_dev}
dataset = load_dataset(
    "csv",
    data_files={split: str(p / filename) for split, filename in split_files.items()},
)

Using custom data configuration default-0561e86159bdbcf5


Downloading and preparing dataset csv/default to /Users/admin/.cache/huggingface/datasets/csv/default-0561e86159bdbcf5/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /Users/admin/.cache/huggingface/datasets/csv/default-0561e86159bdbcf5/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Inspect the loaded splits: column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['post', 'targetCategory'],
        num_rows: 12131
    })
    test: Dataset({
        features: ['post', 'targetCategory'],
        num_rows: 1807
    })
})

In [11]:
# Analyze the number of tokens per training post (no padding, no truncation,
# so the raw lengths are visible).
# NOTE: stored as `tokenized_dataset` (singular) because that is the name the
# length-statistics cell reads; it also keeps this exploratory encoding from
# clashing with `tokenized_datasets`, the mapped DatasetDict used for training.
tokenized_dataset = tokenizer(dataset['train']['post'], padding=False, truncation=False)

In [12]:
# Token count of every encoded post, summarised with upper-tail percentiles
# to help pick a truncation length.
input_lens = list(map(len, tokenized_dataset['input_ids']))
df_len = pd.DataFrame({"len": input_lens})
df_len.describe(percentiles=[0.95, 0.97, 0.99])

Unnamed: 0,len
count,12131.0
mean,27.192647
std,17.708654
min,4.0
50%,23.0
95%,53.0
97%,64.1
99%,94.0
max,389.0


### Preprocess data

In [6]:
def tokenize_function(batch):
    """Encode a batch of posts and attach integer class labels.

    Args:
        batch: mapping with a "post" column (texts) and a "targetCategory"
            column (category names that are keys of `label2id`).

    Returns:
        The tokenizer output for the posts, truncated to `max_len` tokens,
        with an added "labels" entry holding the id of each category.

    Raises:
        KeyError: if a category name is not present in `label2id`.
    """
    encoded = tokenizer(batch["post"], truncation=True, max_length=max_len)
    encoded["labels"] = list(map(label2id.__getitem__, batch["targetCategory"]))
    return encoded

In [7]:
# Tokenize and label every split; batched mode hands tokenize_function lists of rows.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)

  0%|          | 0/13 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [9]:
# Drop the raw string columns: the data collator cannot batch string fields.
tokenized_datasets = tokenized_datasets.remove_columns(
    column_names=['post', 'targetCategory']
)

In [10]:
# Collator that pads each batch with the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer)

In [11]:
# Take the first eight tokenized training examples for a collator sanity check.
# The raw 'post' / 'targetCategory' columns have already been removed, so only
# the tokenizer outputs and "labels" remain and no key filtering is needed
# (the former filter on "idx"/"sentence1"/"sentence2" targeted columns that
# never exist in this dataset).
samples = tokenized_datasets["train"][:8]
# Unpadded length of each sampled sequence.
[len(x) for x in samples["input_ids"]]

[23, 18, 8, 13, 19, 25, 9, 19]

In [12]:
# Collate the samples into one padded batch and show the shape of each field.
batch = data_collator(samples)
{name: tensor.shape for name, tensor in batch.items()}

{'input_ids': torch.Size([8, 25]),
 'attention_mask': torch.Size([8, 25]),
 'labels': torch.Size([8])}

### Training the model

In [13]:
# Load the pretrained encoder with a classification head sized for our categories.
# The label mappings are stored in the model config so that saved checkpoints
# report category names rather than generic LABEL_<n> placeholders.
id2label = {index: name for name, index in label2id.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=n_classes,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and

In [34]:
# Load each metric once: `compute_metrics` is handed to the Trainer and runs at
# every evaluation pass, so reloading the metrics inside it is wasted work.
_f1_metric = evaluate.load("f1")
_precision_metric = evaluate.load("precision")
_recall_metric = evaluate.load("recall")


def compute_metrics(eval_preds):
    """Compute macro-averaged F1 / precision / recall and plain accuracy.

    Args:
        eval_preds: pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict with the entries produced by the F1, precision and recall
        metrics (in that order) followed by ``"acc"``, the fraction of
        predictions equal to the reference label.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    scores = {}
    for metric in (_f1_metric, _precision_metric, _recall_metric):
        scores.update(
            metric.compute(predictions=predictions, references=labels, average='macro')
        )
    # Cast to a built-in float so the value is a plain Python number in logs.
    scores["acc"] = float(np.mean(predictions == labels))
    return scores

In [16]:
# Approximate total number of optimisation steps: examples x epochs / batch size.
n_train_examples = len(tokenized_datasets['train'])
n_steps = (n_train_examples * n_epochs) // batch_size
n_steps

1895

In [17]:
# Training configuration.
# The batch size must be passed explicitly: `n_steps` (and therefore the warmup
# and checkpoint intervals below) was computed from `batch_size`, so leaving the
# Trainer on its default per-device batch size would make those step counts
# refer to a different schedule than the one actually run.
# NOTE(review): these are per-device sizes; with more than one device the
# effective batch size is larger than `batch_size` — confirm the target setup.
training_args = TrainingArguments("roberta-hatespeech",
                                  evaluation_strategy="epoch",
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  learning_rate=learning_rate,
                                  lr_scheduler_type='cosine',
                                  warmup_steps=n_steps//5,          # first fifth of the estimated steps
                                  save_steps=n_steps//n_epochs,     # roughly one checkpoint per epoch
                                  save_total_limit=3,
                                  num_train_epochs=n_epochs,
                                  fp16=use_fp16,
                                  seed=1,
                                  group_by_length=True
                                 )

In [18]:
# Wire the model, arguments, data splits, collator and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the configuration in `training_args`
# (evaluation is scheduled once per epoch there).
trainer.train()