<a href="https://colab.research.google.com/github/bhadreshpsavani/UnderstandingNLP/blob/master/go_emotion_of_transformers_multilabel_text_classification_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Mon Nov 29 09:17:00 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8    29W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
%%capture
!pip install transformers==4.12.5 pandas torch

In [3]:
import transformers

print(f"Running on transformers v{transformers.__version__}")

Running on transformers v4.12.5


## Imports

In [4]:
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, 
                          PreTrainedModel, DistilBertModel, DistilBertForSequenceClassification,
                          TrainingArguments, Trainer)
from transformers.modeling_outputs import SequenceClassifierOutput

## Load data

In [5]:
!pip install -q datasets

In [6]:
from datasets import load_dataset
emotions = load_dataset("go_emotions", "raw")

Reusing dataset go_emotions (/root/.cache/huggingface/datasets/go_emotions/raw/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d)


  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear', 'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'],
        num_rows: 211225
    })
})

In [8]:
df = emotions['train'].to_pandas()

In [9]:
label_cols = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']
len(label_cols)

28

In [10]:
id2label = {str(i):label for i, label in enumerate(label_cols)}
label2id = {label:str(i) for i, label in enumerate(label_cols)}

In [11]:
id2label

{'0': 'admiration',
 '1': 'amusement',
 '10': 'disapproval',
 '11': 'disgust',
 '12': 'embarrassment',
 '13': 'excitement',
 '14': 'fear',
 '15': 'gratitude',
 '16': 'grief',
 '17': 'joy',
 '18': 'love',
 '19': 'nervousness',
 '2': 'anger',
 '20': 'optimism',
 '21': 'pride',
 '22': 'realization',
 '23': 'relief',
 '24': 'remorse',
 '25': 'sadness',
 '26': 'surprise',
 '27': 'neutral',
 '3': 'annoyance',
 '4': 'approval',
 '5': 'caring',
 '6': 'confusion',
 '7': 'curiosity',
 '8': 'desire',
 '9': 'disappointment'}

In [12]:
label2id

{'admiration': '0',
 'amusement': '1',
 'anger': '2',
 'annoyance': '3',
 'approval': '4',
 'caring': '5',
 'confusion': '6',
 'curiosity': '7',
 'desire': '8',
 'disappointment': '9',
 'disapproval': '10',
 'disgust': '11',
 'embarrassment': '12',
 'excitement': '13',
 'fear': '14',
 'gratitude': '15',
 'grief': '16',
 'joy': '17',
 'love': '18',
 'nervousness': '19',
 'neutral': '27',
 'optimism': '20',
 'pride': '21',
 'realization': '22',
 'relief': '23',
 'remorse': '24',
 'sadness': '25',
 'surprise': '26'}

## Preprocess data

In [13]:
df["labels"] = df[label_cols].values.tolist()
df.head()

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,disapproval,disgust,embarrassment,excitement,fear,gratitude,grief,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,labels
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [14]:
# take sample for quick prototyping
df_sample = df.sample(n=1000)
df_sample.shape

(1000, 38)

In [15]:
# create train / test splits
mask = np.random.rand(len(df)) < 0.8
df_train = df[mask]
df_test = df[~mask]

(df_train.shape, df_test.shape)

((168844, 38), (42381, 38))

## Tokenize and encode 

In [16]:
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [17]:
train_encodings = tokenizer(df_train["text"].values.tolist(), truncation=True)
test_encodings = tokenizer(df_test["text"].values.tolist(), truncation=True)

In [18]:
train_labels = df_train["labels"].values.tolist()
test_labels = df_test["labels"].values.tolist()

In [19]:
class GoEmotionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [20]:
train_dataset = GoEmotionDataset(train_encodings, train_labels)
test_dataset = GoEmotionDataset(test_encodings, test_labels)

In [21]:
train_dataset[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1]),
 'input_ids': tensor([ 101, 2008, 2208, 3480, 1012,  102]),
 'labels': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0])}

In [22]:
# sanity check
tokenizer.decode(train_dataset[0]["input_ids"])

'[CLS] that game hurt. [SEP]'

## Fine-tuning

There are two ways we can implement multi-label classification:

* Creating a custom BERT model that overrides the `forward` method
* Creating a custom `Trainer` that overrides the `compute_loss` method

The second method does not work with v4.2.1 of `transformers` due to some bugs, so we'll work with the first approach instead :)

### Creating a Custom Model

In [23]:
class DistilBertForMultilabelSequenceClassification(DistilBertForSequenceClassification):
    def __init__(self, config):
      super().__init__(config)

    def forward(self,
        input_ids=None,
        attention_mask=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.distilbert(input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict)

        hidden_state = outputs[0]
        pooled_output = hidden_state[:, 0]  
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = torch.nn.BCEWithLogitsLoss()
            loss = loss_fct(logits.view(-1, self.num_labels).cuda(), 
                            labels.float().view(-1, self.num_labels).cuda() )

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions)

In [24]:
num_labels=28
model = DistilBertForMultilabelSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels).to('cuda')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForMultilabelSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForMultilabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForMultilabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForMultilabelSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier

In [25]:
model.config.id2label = {
    "0": "admiration",
    "1": "amusement",
    "2": "anger",
    "3": "annoyance",
    "4": "approval",
    "5": "caring",
    "6": "confusion",
    "7": "curiosity",
    "8": "desire",
    "9": "disappointment",
    "10": "disapproval",
    "11": "disgust",
    "12": "embarrassment",
    "13": "excitement",
    "14": "fear",
    "15": "gratitude",
    "16": "grief",
    "17": "joy",
    "18": "love",
    "19": "nervousness",
    "20": "optimism",
    "21": "pride",
    "22": "realization",
    "23": "relief",
    "24": "remorse",
    "25": "sadness",
    "26": "surprise",
    "27": "neutral"
  },
model.config.label2id ={
    "admiration": 0,
    "amusement": 1,
    "anger": 2,
    "annoyance": 3,
    "approval": 4,
    "caring": 5,
    "confusion": 6,
    "curiosity": 7,
    "desire": 8,
    "disappointment": 9,
    "disapproval": 10,
    "disgust": 11,
    "embarrassment": 12,
    "excitement": 13,
    "fear": 14,
    "gratitude": 15,
    "grief": 16,
    "joy": 17,
    "love": 18,
    "nervousness": 19,
    "neutral": 27,
    "optimism": 20,
    "pride": 21,
    "realization": 22,
    "relief": 23,
    "remorse": 24,
    "sadness": 25,
    "surprise": 26
  }
model.config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": [
    {
      "0": "admiration",
      "1": "amusement",
      "10": "disapproval",
      "11": "disgust",
      "12": "embarrassment",
      "13": "excitement",
      "14": "fear",
      "15": "gratitude",
      "16": "grief",
      "17": "joy",
      "18": "love",
      "19": "nervousness",
      "2": "anger",
      "20": "optimism",
      "21": "pride",
      "22": "realization",
      "23": "relief",
      "24": "remorse",
      "25": "sadness",
      "26": "surprise",
      "27": "neutral",
      "3": "annoyance",
      "4": "approval",
      "5": "caring",
      "6": "confusion",
      "7": "curiosity",
      "8": "desire",
      "9": "disappointment"
    }
  ],
  "initializer_range": 0.02,
  "label2id": {
    "admiration": 0,
    "amusement": 1

In [26]:
def accuracy_thresh(y_pred, y_true, thresh=0.5, sigmoid=True): 
    y_pred = torch.from_numpy(y_pred)
    y_true = torch.from_numpy(y_true)
    if sigmoid: 
      y_pred = y_pred.sigmoid()
    return ((y_pred>thresh)==y_true.bool()).float().mean().item()

In [27]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    return {'accuracy_thresh': accuracy_thresh(predictions, labels)}

In [28]:
batch_size = 32
# configure logging so we see training loss
logging_steps = len(train_dataset) // batch_size

args = TrainingArguments(
    output_dir="emotion",
    evaluation_strategy = "epoch",
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=logging_steps
)

In [29]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer)

In [30]:
# sanity check that we can run evaluation
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 42381
  Batch size = 32


{'eval_accuracy_thresh': 0.4506003260612488,
 'eval_loss': 0.7234644889831543,
 'eval_runtime': 95.663,
 'eval_samples_per_second': 443.024,
 'eval_steps_per_second': 13.851}

In [31]:
trainer.train()

***** Running training *****
  Num examples = 168844
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 15831


Epoch,Training Loss,Validation Loss,Accuracy Thresh
1,0.1262,0.113293,0.961871
2,0.11,0.110863,0.962302
3,0.1055,0.110903,0.962024


***** Running Evaluation *****
  Num examples = 42381
  Batch size = 32


Saving model checkpoint to emotion/checkpoint-5277
Configuration saved in emotion/checkpoint-5277/config.json
Model weights saved in emotion/checkpoint-5277/pytorch_model.bin
tokenizer config file saved in emotion/checkpoint-5277/tokenizer_config.json
Special tokens file saved in emotion/checkpoint-5277/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 42381
  Batch size = 32
Saving model checkpoint to emotion/checkpoint-10554
Configuration saved in emotion/checkpoint-10554/config.json
Model weights saved in emotion/checkpoint-10554/pytorch_model.bin
tokenizer config file saved in emotion/checkpoint-10554/tokenizer_config.json
Special tokens file saved in emotion/checkpoint-10554/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 42381
  Batch size = 32
Saving model checkpoint to emotion/checkpoint-15831
Configuration saved in emotion/checkpoint-15831/config.json
Model weights saved in emotion/checkpoint-15831/pytorch_model.bin
tokenizer confi

TrainOutput(global_step=15831, training_loss=0.11387918509534, metrics={'train_runtime': 4028.2919, 'train_samples_per_second': 125.744, 'train_steps_per_second': 3.93, 'total_flos': 4792093044377088.0, 'train_loss': 0.11387918509534, 'epoch': 3.0})

In [32]:
# sanity check that we can run evaluation
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 42381
  Batch size = 32


{'epoch': 3.0,
 'eval_accuracy_thresh': 0.962023913860321,
 'eval_loss': 0.11090277135372162,
 'eval_runtime': 97.0732,
 'eval_samples_per_second': 436.588,
 'eval_steps_per_second': 13.649}

In [33]:
!transformers-cli login


        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        
Username: bhadresh-savani
Password: 
Login successful
Your token: USSrKekMAIdNYfcxwutSzdOSvEXmxQJcQWvJIQYlcxLUEOdLuBlwYFYRPUksnUhbBXDghVggoqQhdHoXexdlwHWVyytklJLRARIxPrkYfecZgatctBQwalZlMrMVWVyG 

Your token has been saved to /root/.huggingface/token


In [34]:
!sudo apt-get install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 37 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 1s (1,808 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package git-lfs.
(Reading database ... 155222 files and directories c

In [35]:
!git config --global user.email "bhadreshpsavani@gmail.com"
!git config --global user.name "bhadresh-savani"
!git config --global user.password "#####"

In [36]:
trainer.model.push_to_hub('distilbert-base-uncased-go-emotion')

Cloning https://huggingface.co/bhadresh-savani/distilbert-base-uncased-go-emotion into local empty directory.
Configuration saved in distilbert-base-uncased-go-emotion/config.json
Model weights saved in distilbert-base-uncased-go-emotion/pytorch_model.bin


Upload file pytorch_model.bin:   0%|          | 3.36k/256M [00:00<?, ?B/s]

To https://huggingface.co/bhadresh-savani/distilbert-base-uncased-go-emotion
   f6496b4..8a54d32  main -> main



'https://huggingface.co/bhadresh-savani/distilbert-base-uncased-go-emotion/commit/8a54d32eda0d6a25c3aabb08c4ffa907f54b3d5f'

In [37]:
trainer.tokenizer.push_to_hub('distilbert-base-uncased-go-emotion')

tokenizer config file saved in distilbert-base-uncased-go-emotion/tokenizer_config.json
Special tokens file saved in distilbert-base-uncased-go-emotion/special_tokens_map.json
To https://huggingface.co/bhadresh-savani/distilbert-base-uncased-go-emotion
   8a54d32..8e86a59  main -> main



'https://huggingface.co/bhadresh-savani/distilbert-base-uncased-go-emotion/commit/8e86a597d46ce7e42b16ba8dd9ce516d5279d6f0'