<a href="https://colab.research.google.com/github/bhadreshpsavani/UnderstandingNLP/blob/master/go_emotion_of_transformers_multilabel_text_classification_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Thu Nov 11 12:58:59 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8    26W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
%%capture
!pip install transformers==4.2.1 pandas torch

In [3]:
import transformers

print(f"Running on transformers v{transformers.__version__}")

Running on transformers v4.2.1


## Imports

In [4]:
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, 
                          PreTrainedModel, BertModel, BertForSequenceClassification,
                          TrainingArguments, Trainer)
from transformers.modeling_outputs import SequenceClassifierOutput

## Load data

In [6]:
!pip install -q datasets

In [None]:
from datasets import load_dataset
emotions = load_dataset("go_emotions", "raw")

Downloading and preparing dataset go_emotions/raw (download: 40.76 MiB, generated: 52.78 MiB, post-processed: Unknown size, total: 93.54 MiB) to /root/.cache/huggingface/datasets/go_emotions/raw/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d...


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/14.2M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/14.2M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/14.4M [00:00<?, ?B/s]

  0%|          | 0/3 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

In [15]:
emotions['train']['labels']

[[27],
 [27],
 [2],
 [14],
 [3],
 [26],
 [15],
 [8, 20],
 [0],
 [27],
 [6],
 [1, 4],
 [27],
 [5],
 [3],
 [3, 12],
 [15],
 [2],
 [27],
 [6, 22],
 [6, 9, 27],
 [12],
 [27],
 [27],
 [27],
 [2],
 [27],
 [16, 25],
 [15],
 [27],
 [2],
 [6],
 [27],
 [2, 7],
 [6],
 [17],
 [27],
 [0],
 [25],
 [27],
 [0, 15],
 [15, 18],
 [16, 27],
 [27],
 [7, 13],
 [10],
 [20],
 [27],
 [27],
 [27],
 [27],
 [27],
 [4],
 [27],
 [13, 15],
 [10],
 [27],
 [27],
 [27],
 [15],
 [0, 1],
 [12],
 [27],
 [13],
 [27],
 [0, 15],
 [27],
 [1],
 [27],
 [0],
 [27],
 [0, 5],
 [3],
 [27],
 [27],
 [27],
 [0, 13, 15],
 [0],
 [27],
 [1],
 [13],
 [4],
 [25],
 [4],
 [27],
 [25],
 [0, 15],
 [9],
 [4, 22],
 [27],
 [4],
 [27],
 [24],
 [18],
 [4],
 [27],
 [7],
 [27],
 [7],
 [27],
 [0],
 [3],
 [10],
 [27],
 [27],
 [5],
 [27],
 [6, 7],
 [27],
 [15],
 [27],
 [0],
 [22],
 [27],
 [17],
 [27],
 [2],
 [2],
 [27],
 [27],
 [9, 27],
 [4, 5],
 [27],
 [3],
 [2, 3],
 [26],
 [7],
 [2],
 [27],
 [0, 8, 15, 17],
 [15],
 [27],
 [11],
 [27],
 [27],
 [9],
 [7

In [13]:
# path to train.csv test.csv and test_labels.csv
emotions['train'][0]

{'id': 'eebbqej',
 'labels': [27],
 'text': "My favourite food is anything I didn't have to cook myself."}

## Preprocess data

In [None]:
# create labels column
label_cols = [c for c in df.columns if c not in ["id", "comment_text"]]
label_cols

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [None]:
df["labels"] = df[label_cols].values.tolist()
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,labels
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"


In [None]:
# take sample for quick prototyping
df_sample = df.sample(n=1000)
df_sample.shape

(1000, 9)

In [None]:
# create train / test splits
mask = np.random.rand(len(df_sample)) < 0.8
df_train = df_sample[mask]
df_test = df_sample[~mask]

(df_train.shape, df_test.shape)

((799, 9), (201, 9))

## Tokenize and encode 

In [None]:
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [None]:
train_encodings = tokenizer(df_train["comment_text"].values.tolist(), truncation=True)
test_encodings = tokenizer(df_test["comment_text"].values.tolist(), truncation=True)

In [None]:
train_labels = df_train["labels"].values.tolist()
test_labels = df_test["labels"].values.tolist()

In [None]:
class JigsawDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [None]:
train_dataset = JigsawDataset(train_encodings, train_labels)
test_dataset = JigsawDataset(test_encodings, test_labels)

In [None]:
train_dataset[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'input_ids': tensor([  101,  6148,  2022,  1996,  2689,  2017,  2215,  2000,  2156,  1012,
          1011,  1043, 11774,  2072,  2065,  2017,  2215,  2000,  2224,  1996,
          2831,  3931,  2079,  2061,  1010,  2021,  1999,  2023,  2553,  1010,
          2009,  2001, 23197,  7481,  2054,  1045,  2001,  2725,  1010,  1998,
          1045,  2106,  2421,  1037,  3120,  2065,  2017,  2246,  2054,  2017,
          2020, 25672,  2075,  1012,  1996,  8154,  2038,  2042, 25963,  2005,
          2070, 

In [None]:
# sanity check
tokenizer.decode(train_dataset[0]["input_ids"])

"[CLS] seek be the change you want to see. - ghandi if you want to use the talk page do so, but in this case, it was brutally honest what i was doing, and i did include a source if you looked what you were undoing. the logo has been outdated for some time, so it also shows me i'm the olny one actively involved with the party. i'm going to read, no suprise reverts to the page please, i skipped the damn deep breathes and my heart might not make it past another one of your ( undo's ) come morning. harhar... 74. 14. 147. 245 [SEP]"

## Fine-tuning

There are two ways we can implement multi-label classification:

* Creating a custom BERT model that overrides the `forward` method
* Creating a custom `Trainer` that overrides the `compute_loss` method

The second method does not work with v4.2.1 of `transformers` due to some bugs, so we'll work with the first approach instead :)

### Creating a Custom Model

In [None]:
class BertForMultilabelSequenceClassification(BertForSequenceClassification):
    def __init__(self, config):
      super().__init__(config)

    def forward(self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict)

        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = torch.nn.BCEWithLogitsLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), 
                            labels.float().view(-1, self.num_labels))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions)

In [None]:
num_labels=6
model = BertForMultilabelSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels).to('cuda')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertForMultilabelSequenceClassification: ['distilbert.embeddings.word_embeddings.weight', 'distilbert.embeddings.position_embeddings.weight', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.0.sa_layer_norm.weight', 'distilbert.transformer.layer.0.sa_layer_norm.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transforme

In [None]:
def accuracy_thresh(y_pred, y_true, thresh=0.5, sigmoid=True): 
    y_pred = torch.from_numpy(y_pred)
    y_true = torch.from_numpy(y_true)
    if sigmoid: 
      y_pred = y_pred.sigmoid()
    return ((y_pred>thresh)==y_true.bool()).float().mean().item()

In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    return {'accuracy_thresh': accuracy_thresh(predictions, labels)}

In [None]:
batch_size = 8
# configure logging so we see training loss
logging_steps = len(train_dataset) // batch_size

args = TrainingArguments(
    output_dir="jigsaw",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=logging_steps
)

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer)

In [None]:
# sanity check that we can run evaluation
trainer.evaluate()

{'eval_accuracy_thresh': 0.42122718691825867,
 'eval_loss': 0.6822524666786194,
 'eval_runtime': 3.6796,
 'eval_samples_per_second': 54.626}

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy Thresh,Runtime,Samples Per Second
1,0.1967,0.139653,0.965174,3.5604,56.455
2,0.1721,0.137785,0.965174,3.7675,53.35
3,0.1687,0.137138,0.965174,3.9282,51.169


TrainOutput(global_step=300, training_loss=0.17782570227980612, metrics={'train_runtime': 140.9167, 'train_samples_per_second': 2.129, 'total_flos': 447555763333332, 'epoch': 3.0})