# Install and Import Dependencies

In [None]:
!pip install datasets transformers[sentencepiece] wandb



In [None]:
# Basic libraries
import numpy as np

# Metrics and datasets
from datasets import load_metric, load_dataset

# Model and training
from transformers import (
  TrainingArguments,
  Trainer,
  AutoTokenizer,
  AutoModelForSequenceClassification
)
import torch
from torch import nn

# Class imbalance
from sklearn.utils.class_weight import compute_class_weight

# For future experiment reporting
import wandb

In [None]:
# Authenticate with Weights & Biases; training below reports to "wandb"
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33maidan-o-brien[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# Pick the compute device: GPU when PyTorch can see one, CPU otherwise
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
print(device)

cuda


# Load Data

In [None]:
# Project root on the mounted drive and a template for files in its data folder
fpath = "/content/drive/MyDrive/Colab Notebooks/Rewire/"
fpath_data = fpath + "data/{}"

# One CSV per split, keyed by split name
data_files = {
  split: fpath_data.format(f"{split}.csv")
  for split in ("train", "val", "test")
}
datasets = load_dataset('csv', data_files=data_files)

Using custom data configuration default-bd22d5c989d26128
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-bd22d5c989d26128/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Rename the target ("class") and text ("tweet") columns to the
# conventional "labels" / "text" names in every split
for split_name in datasets:
  datasets[split_name] = (
    datasets[split_name]
    .rename_column("class", "labels")
    .rename_column("tweet", "text")
  )

In [None]:
# Display the splits with their column names and row counts
datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 19826
    })
    val: Dataset({
        features: ['labels', 'text'],
        num_rows: 2478
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 2479
    })
})

# Tokenise Data

+ DistilBERT was chosen because it is lightweight: it trains faster than larger models such as BERT, while retaining much of BERT's performance despite being smaller.

In [None]:
# Load tokenizer
# CHECKPOINT names the pretrained checkpoint; the model is loaded from the
# same constant later so tokenizer and model stay matched.
CHECKPOINT = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

In [None]:
# Add new tokens to tokenizer vocab
# NOTE(review): these placeholders presumably stand in for mentions, emojis,
# URLs and hashtags substituted during preprocessing elsewhere — confirm.
special_tokens_dict = {'additional_special_tokens': ['[MENTION]','[EMOJI]','[URL]', '[HASH]']}
# The return value (kept for inspection) is the count reported by the tokenizer;
# the model's embedding matrix is resized to the new vocab size further down.
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

In [None]:
# Tokenize every split
def tokenize(batch):
  """Tokenize the "text" column of a batch, truncating and padding.

  Padding is to the longest sequence in the batch that is passed in. The
  calls below use `batch_size=None`, so each split is handed over as a
  single batch and is padded to the longest sequence in that split.
  """
  texts = batch["text"]
  return tokenizer(texts, truncation=True, padding=True)


# Encode the three splits in order: train, val, test
train_encoded, val_encoded, test_encoded = (
  datasets[split].map(tokenize, batched=True, batch_size=None)
  for split in ("train", "val", "test")
)

Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-bd22d5c989d26128/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-4d3708cdcc528ba2.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-bd22d5c989d26128/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-dffd4ef9644c9adf.arrow


In [None]:
# Walk one training example through the tokenization process
example_idx = 12
example_tweet = train_encoded[example_idx]

# Raw text, then its vocabulary IDs, then the IDs mapped back to tokens
print("Original tweet:", example_tweet["text"], sep="\n")
print("Corresponding input IDs:", example_tweet["input_ids"], sep="\n")
print(
  "Tokenized tweet:",
  tokenizer.convert_ids_to_tokens(example_tweet["input_ids"]),
  sep="\n",
)

Original tweet:
that nigguh will never play football again
Corresponding input IDs:
[101, 2008, 9152, 13871, 27225, 2097, 2196, 2377, 2374, 2153, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Tokenized tweet:
['[CLS]', 'that', 'ni', '##gg', '##uh', 'will', 'never', 'play', 'football', 'again', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[

+ Subword tokenization appears to split some of the profanity into several sub-word pieces.

# Prepare Data For Training and Evaluation

In [None]:
# Return only the model inputs and labels, as PyTorch tensors, from every split
cols = ['input_ids', 'attention_mask', 'labels']
for encoded_split in (train_encoded, val_encoded, test_encoded):
  encoded_split.set_format(type='torch', columns=cols)

# Weighted Trainer

In [None]:
# Derive "balanced" per-class weights from the training labels; they are
# consumed by the weighted trainer's loss
labels = datasets["train"]["labels"]
class_weights = compute_class_weight(
  'balanced',
  classes=np.unique(labels),
  y=labels,
)

# Same weights as a float32 tensor placed on the training device
pt_class_weights = torch.tensor(class_weights, dtype=torch.float32, device=device)

print(class_weights)

[0.53061771 8.66520979]


In [None]:
class WeightedTrainer(Trainer):
  """`Trainer` whose loss is a class-weighted cross-entropy.

  The per-class weights are read from the notebook-level `pt_class_weights`
  tensor, so errors on classes with larger weights cost more.
  """

  def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
    """Compute the class-weighted cross-entropy loss for one batch.

    Args:
      model: Model used for the forward pass.
      inputs: Batch mapping holding "labels" together with the model inputs.
      return_outputs: When True, return `(loss, outputs)` instead of `loss`.
      **kwargs: Extra keyword arguments that newer `transformers` releases
        pass to `compute_loss` (e.g. `num_items_in_batch`). They are accepted
        so this override keeps working, but are not used.

    Returns:
      The loss tensor, or a `(loss, outputs)` tuple when `return_outputs`
      is True.
    """
    labels = inputs.get("labels")

    # Forward pass
    outputs = model(**inputs)
    logits = outputs.get("logits")

    # Compute custom loss; keep the weights on the same device as the logits
    # instead of relying on a notebook-global device
    weights = pt_class_weights.to(logits.device)
    loss_fct = nn.CrossEntropyLoss(weight=weights)
    loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

    return (loss, outputs) if return_outputs else loss

# Train Model

In [None]:
# Load model
# Same checkpoint as the tokenizer, with a sequence-classification head on top
model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT)

# Resize model vocabulary to match tokenizer vocab
# (special tokens were added to the tokenizer above, so len(tokenizer) changed)
model.resize_token_embeddings(len(tokenizer))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

Embedding(30526, 768)

In [None]:
# Define metrics
# NOTE(review): `load_metric` is deprecated in later `datasets` releases in
# favour of the separate `evaluate` package — confirm against the installed version.
f1_metric = load_metric("f1")
acc_metric = load_metric("accuracy")

In [None]:
# Create function for calculating metrics
def compute_metrics(eval_pred):
  """Compute accuracy and F1 from the trainer's evaluation predictions.

  Args:
    eval_pred: A `(logits, labels)` pair as handed over by the trainer.

  Returns:
    A flat dict `{"accuracy": <float>, "f1": <float>}`. Each metric's
    `compute` call returns its own one-entry dict, so the scalar is
    extracted here; returning those dicts directly would nest them
    (e.g. `{"accuracy": {"accuracy": ...}}`).
  """
  logits, labels = eval_pred
  # The predicted class is the index of the largest logit
  predictions = np.argmax(logits, axis=-1)
  f1 = f1_metric.compute(predictions=predictions, references=labels)["f1"]
  acc = acc_metric.compute(predictions=predictions, references=labels)["accuracy"]
  return {"accuracy": acc, "f1": f1}

In [None]:
# Instantiate training arguments
training_args = TrainingArguments(
  # Send training/evaluation logs to Weights & Biases
  report_to="wandb",
  # Checkpoints are written under the project folder
  output_dir=fpath + "model",
  # Evaluate and checkpoint once per epoch (same cadence for both)
  evaluation_strategy="epoch",
  save_strategy="epoch",
  # Reload the best checkpoint once training ends; no `metric_for_best_model`
  # is given, so the library's default criterion is used
  load_best_model_at_end=True,
  num_train_epochs=10,
  # Fixed seed passed to the trainer
  seed=210522
)

In [None]:
# Instantiate trainer
# Uses the class-weighted loss from WeightedTrainer; the validation split is
# the evaluation set and compute_metrics supplies accuracy / F1.
trainer = WeightedTrainer(
  model=model,
  args=training_args,
  train_dataset=train_encoded,
  eval_dataset=val_encoded,
  compute_metrics=compute_metrics
)

In [None]:
# Train
# Runs the full training loop configured in `training_args`
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 19826
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 24790
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7167,0.673604,{'accuracy': 0.9402744148506861},{'f1': 0.39344262295081966}
2,0.673,0.63397,{'accuracy': 0.9386602098466506},{'f1': 0.44525547445255476}
3,0.5415,0.762612,{'accuracy': 0.9394673123486683},{'f1': 0.423076923076923}
4,0.4627,0.748174,{'accuracy': 0.940677966101695},{'f1': 0.3287671232876712}
5,0.4937,1.118401,{'accuracy': 0.9334140435835351},{'f1': 0.4}
6,0.3287,1.037279,{'accuracy': 0.9354317998385795},{'f1': 0.39849624060150374}
7,0.2476,1.142038,{'accuracy': 0.9322033898305084},{'f1': 0.373134328358209}
8,0.2206,1.348335,{'accuracy': 0.9398708635996772},{'f1': 0.34934497816593885}
9,0.1367,1.46033,{'accuracy': 0.9346246973365617},{'f1': 0.37692307692307697}
10,0.106,1.500519,{'accuracy': 0.937046004842615},{'f1': 0.35}


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2478
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/Rewire/model/checkpoint-2479
Configuration saved in /content/drive/MyDrive/Colab Notebooks/Rewire/model/checkpoint-2479/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/Rewire/model/checkpoint-2479/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2478
  Batch size = 8
Saving model 

TrainOutput(global_step=24790, training_loss=0.3927558774282587, metrics={'train_runtime': 2075.4125, 'train_samples_per_second': 95.528, 'train_steps_per_second': 11.945, 'total_flos': 4513950797389440.0, 'train_loss': 0.3927558774282587, 'epoch': 10.0})

<iframe src="https://wandb.ai/aidan-o-brien/huggingface/reports/Shared-panel-22-05-24-08-05-65--VmlldzoyMDU4ODA4?highlightShare" style="border:none;height:1024px;width:100%">

# Save Model

In [None]:
# Save the fine-tuned model and the tokenizer (which includes the added tokens).
# `fpath` already ends with "/", so the sub-paths must not start with one;
# otherwise the saved paths contain a double slash ("Rewire//model/...").
trainer.save_model(fpath + "model/final/distilBERT_hatespeech")
tokenizer.save_pretrained(fpath + "model/final/tokenizer")

Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/Rewire//model/final/distilBERT_hatespeech
Configuration saved in /content/drive/MyDrive/Colab Notebooks/Rewire//model/final/distilBERT_hatespeech/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/Rewire//model/final/distilBERT_hatespeech/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Colab Notebooks/Rewire//model/final/tokenizer/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Colab Notebooks/Rewire//model/final/tokenizer/special_tokens_map.json


('/content/drive/MyDrive/Colab Notebooks/Rewire//model/final/tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/Rewire//model/final/tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/Rewire//model/final/tokenizer/vocab.txt',
 '/content/drive/MyDrive/Colab Notebooks/Rewire//model/final/tokenizer/added_tokens.json',
 '/content/drive/MyDrive/Colab Notebooks/Rewire//model/final/tokenizer/tokenizer.json')

__Improvement:__

+ Use `model_init` so that the model's weight initialisation is seeded, improving reproducibility
+ Data augmentation, e.g. oversampling the minority class - how does this differ from using `WeightedTrainer`?
+ Hyperparameter optimisation (warm-up steps, learning rate, weight decay, Adam hyperparameters)
+ Remove hard-coded file paths from all notebooks