In [12]:
import torch
import jsonlines
import numpy as np
import transformers
import pandas as pd
import pickle as pkl
from torch import nn
from tqdm import tqdm
from os.path import join
import multiprocessing as mp
from importlib import reload
from collections import Counter
from datasets import load_dataset
from torch.utils.data import Dataset
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report
from transformers import (WEIGHTS_NAME,
                          BertConfig, BertForSequenceClassification, BertTokenizer,
                          XLMConfig, XLMForSequenceClassification, XLMTokenizer,
                          DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer,
                          RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)
from transformers import BertPreTrainedModel, BertModel, AdamW, get_linear_schedule_with_warmup, AutoTokenizer, AutoModel

In [13]:
class emotionDataset(Dataset):
    """Class to load the dataset and get batches of paras"""
    
    def __init__(self, list_data, 
                 tokenizer, max_length):
        
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = list_data
        self.pad_token = 1
    
    def __len__(self):
        """Return length of dataset."""
        return self.data.__len__()

    def __getitem__(self, i):
        """Return sample from dataset at index i."""
        example = self.data[i]
        inputs = self.tokenizer.encode_plus(example['text'],
                                            add_special_tokens=True,
                                            truncation=True,
                                            max_length=self.max_length)
                
        input_ids = inputs["input_ids"]
        input_ids = input_ids[:self.max_length]
        attention_mask = [1] * len(input_ids)
        
        padding_length = self.max_length - len(input_ids)
        input_ids = input_ids + ([self.pad_token] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        
        assert len(input_ids) == self.max_length, "Error with input length {} vs {}".format(len(input_ids), self.max_length)
        
        nli_label = example['labels'][0]
        
        return_dict = {'input_ids':torch.LongTensor(input_ids),
                       'attention_mask':torch.LongTensor(attention_mask),
                       'labels': torch.LongTensor([nli_label])}
        
        return return_dict

In [14]:
id2label = {0:"admiration",
            1:"amusement",
            2:"anger",
            3:"annoyance",
            4:"approval",
            5:"caring",
            6:"confusion",
            7:"curiosity",
            8:"desire",
            9:"disappointment",
            10:"disapproval",
            11:"disgust",
            12:"embarrassment",
            13:"excitement",
            14:"fear",
            15:"gratitude",
            16:"grief",
            17:"joy",
            18:"love",
            19:"nervousness",
            20:"optimism",
            21:"pride",
            22:"realization",
            23:"relief",
            24:"remorse",
            25:"sadness",
            26:"surprise",
            27:"neutral"}

In [15]:
dataset = load_dataset("go_emotions", "simplified")

Reusing dataset go_emotions (/home/brawat/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d)


  0%|          | 0/3 [00:00<?, ?it/s]

In [16]:
Counter([x['labels'].__len__() for x in dataset['train']])

Counter({1: 36308, 2: 6541, 3: 532, 4: 28, 5: 1})

In [17]:
dataset.keys()

dict_keys(['train', 'validation', 'test'])

In [18]:
dataset['train'][0]

{'text': "My favourite food is anything I didn't have to cook myself.",
 'labels': [27],
 'id': 'eebbqej'}

In [19]:
roberta_config = RobertaConfig.from_pretrained('roberta-base',
                                      num_labels=len(id2label),
                                      finetuning_task='GoEmotions',
                                      cache_dir=None,
                                      output_attentions=False,
                                      output_hidden_states=False)

In [20]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=False)

In [21]:
train_dataset = emotionDataset(list_data=dataset['train'], 
                               tokenizer=tokenizer, 
                               max_length=200)

In [22]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True,
                          num_workers=mp.cpu_count())

In [23]:
example_batch = next(iter(train_loader))

In [24]:
example_batch['input_ids'].shape, example_batch['attention_mask'].shape, example_batch['labels'].shape

(torch.Size([32, 200]), torch.Size([32, 200]), torch.Size([32, 1]))

In [25]:
args = {'weight_decay':0.0,
        'learning_rate':2e-5,
        'epochs':5,
        'gradient_accumulation_steps':1,
        'adam_epsilon':1e-8}
args['t_total'] = len(train_loader) // args['gradient_accumulation_steps'] * args['epochs']
args['warmup_steps'] = int(0.10*args['t_total'])

In [26]:
model = RobertaForSequenceClassification(config=roberta_config).cuda()

In [27]:
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args['weight_decay']},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
optimizer = AdamW(optimizer_grouped_parameters, lr=args['learning_rate'], eps=args['adam_epsilon'])
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args['warmup_steps'],
                                            num_training_steps=args['t_total'])

In [28]:
model = nn.DataParallel(model.cuda())

In [32]:
model.train()
for each_epoch in range(args['epochs']):
    for batch in tqdm(train_loader):
        model.zero_grad()
        outputs = model(**batch)
        ### Loss calculation
        loss = outputs[0].mean()
        
        loss.backward()
        optimizer.step()
        scheduler.step()  # Update learning rate schedule

100%|██████████| 1357/1357 [07:50<00:00,  2.89it/s]
100%|██████████| 1357/1357 [07:42<00:00,  2.94it/s]
100%|██████████| 1357/1357 [07:45<00:00,  2.91it/s]
100%|██████████| 1357/1357 [07:46<00:00,  2.91it/s]
100%|██████████| 1357/1357 [07:45<00:00,  2.91it/s]


In [34]:
test_dataset = emotionDataset(list_data=dataset['test'], 
                               tokenizer=tokenizer, 
                               max_length=200)

In [35]:
test_loader = DataLoader(test_dataset, batch_size=32, 
                         shuffle=False, num_workers=mp.cpu_count())

In [38]:
def evaluate(model_, eval_loader):
    model.eval()
    dict_result = {'actual':[],
                   'preds':[]}
    with torch.no_grad():
        for batch in tqdm(eval_loader):
            outputs = model(**batch)
            logits = outputs[1]
            
            dict_result['actual'] += batch['labels'].numpy().tolist()
            dict_result['preds'] += np.argmax(logits.detach().cpu().numpy(), axis=1).tolist()
    return dict_result

In [39]:
dict_test_results = evaluate(model_=model,
                             eval_loader=test_loader)

100%|██████████| 170/170 [00:27<00:00,  6.22it/s]


In [129]:
def get_performance(actual_og, preds_og, dict_mapping, avoid_labels=[]):
    actual_og = [x[0] for x in actual_og]
    actual_ = []
    preds_ = []
    for ind in tqdm(range(actual_og.__len__())):
        if actual_og[ind] not in avoid_labels and preds_og[ind] not in avoid_labels:
            actual_.append(actual_og[ind])
            preds_.append(preds_og[ind])
    df_report = classification_report(actual_, preds_)
    print(df_report)
    print('--'*20)
    print('STATS')
    print('--'*20)
    print('Actual counter:', Counter(actual_))
    print('Prediction counter:', Counter(preds_))
    print('Mapping:', dict_mapping)
    return df_report

In [122]:
df_test = get_performance(actual_og=dict_test_results['actual'], 
                          preds_og=dict_test_results['preds'], 
                          dict_mapping=id2label)

100%|██████████| 5427/5427 [00:00<00:00, 1191503.76it/s]

              precision    recall  f1-score   support

           0       0.61      0.63      0.62       504
           1       0.70      0.87      0.78       252
           2       0.45      0.43      0.44       197
           3       0.26      0.20      0.22       286
           4       0.34      0.29      0.31       318
           5       0.29      0.21      0.24       114
           6       0.31      0.28      0.29       139
           7       0.46      0.50      0.48       233
           8       0.51      0.34      0.41        74
           9       0.30      0.13      0.18       127
          10       0.32      0.22      0.26       220
          11       0.50      0.42      0.45        84
          12       0.36      0.13      0.20        30
          13       0.37      0.33      0.35        84
          14       0.59      0.59      0.59        74
          15       0.82      0.86      0.84       288
          16       0.00      0.00      0.00         6
          17       0.48    


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [130]:
df_test = get_performance(actual_og=dict_test_results['actual'], 
                          preds_og=dict_test_results['preds'], 
                          dict_mapping=id2label,
                          avoid_labels=[12, 16, 19, 21, 23, 24])

100%|██████████| 5427/5427 [00:00<00:00, 677293.73it/s]

              precision    recall  f1-score   support

           0       0.61      0.63      0.62       503
           1       0.71      0.87      0.78       252
           2       0.45      0.43      0.44       197
           3       0.27      0.20      0.23       284
           4       0.34      0.29      0.32       316
           5       0.30      0.21      0.25       113
           6       0.31      0.28      0.30       137
           7       0.46      0.51      0.48       231
           8       0.51      0.34      0.41        73
           9       0.31      0.13      0.19       126
          10       0.33      0.22      0.27       216
          11       0.51      0.42      0.46        84
          13       0.38      0.33      0.35        84
          14       0.65      0.60      0.62        73
          15       0.83      0.86      0.84       288
          17       0.50      0.47      0.48       115
          18       0.63      0.75      0.68       169
          20       0.53    




In [134]:
df_test = get_performance(actual_og=dict_test_results['actual'], 
                          preds_og=dict_test_results['preds'], 
                          dict_mapping=id2label,
                          avoid_labels=[8, 11, 12, 13, 14, 16, 19, 21, 23, 24, 26])

100%|██████████| 5427/5427 [00:00<00:00, 458329.73it/s]

              precision    recall  f1-score   support

           0       0.63      0.65      0.64       489
           1       0.72      0.88      0.79       251
           2       0.48      0.45      0.46       191
           3       0.30      0.20      0.24       274
           4       0.36      0.30      0.33       304
           5       0.31      0.22      0.26       110
           6       0.32      0.29      0.31       133
           7       0.49      0.53      0.51       221
           9       0.33      0.15      0.20       117
          10       0.34      0.23      0.28       205
          15       0.84      0.87      0.85       285
          17       0.55      0.49      0.52       110
          18       0.66      0.75      0.70       169
          20       0.57      0.55      0.56       113
          22       0.43      0.11      0.17        93
          25       0.48      0.49      0.48        90
          27       0.59      0.70      0.64      1558

    accuracy              




In [103]:
dict_roberta = {}
for each_ in [x.split('      ') for x in df_test.split('\n')[2:-5]]:
    dict_roberta[id2label[int(each_[1])]] = [float(each_[-2]) , int(each_[-1])]

In [60]:
baseline_results = open('goemotions_results.txt').read()

In [63]:
dict_baseline = {}
for each_ in baseline_results.split('\n'):
    dict_baseline[each_.split(' ')[0]] = each_.split(' ')[-1]

In [132]:
labels_to_avoid = []
list_macro_f1_score = []
for each_key in dict_baseline:
    print('=='*30)
    print('Emotion:', each_key)
    print('=='*30)
    print('Baseline:', dict_baseline[each_key])
    print('RoBERTa:', dict_roberta[each_key][0])
    print('Support:', dict_roberta[each_key][1])
    if dict_roberta[each_key][1]<100:
        labels_to_avoid.append(each_key)
        print('Added.')

Emotion: admiration
Baseline: 0.65
RoBERTa: 0.62
Support: 504
Emotion: amusement
Baseline: 0.80
RoBERTa: 0.78
Support: 252
Emotion: anger
Baseline: 0.47
RoBERTa: 0.44
Support: 197
Emotion: annoyance
Baseline: 0.34
RoBERTa: 0.22
Support: 286
Emotion: approval
Baseline: 0.36
RoBERTa: 0.31
Support: 318
Emotion: caring
Baseline: 0.39
RoBERTa: 0.24
Support: 114
Emotion: confusion
Baseline: 0.37
RoBERTa: 0.29
Support: 139
Emotion: curiosity
Baseline: 0.54
RoBERTa: 0.48
Support: 233
Emotion: desire
Baseline: 0.49
RoBERTa: 0.41
Support: 74
Added.
Emotion: disappointment
Baseline: 0.28
RoBERTa: 0.18
Support: 127
Emotion: disapproval
Baseline: 0.39
RoBERTa: 0.26
Support: 220
Emotion: disgust
Baseline: 0.45
RoBERTa: 0.45
Support: 84
Added.
Emotion: embarrassment
Baseline: 0.43
RoBERTa: 0.2
Support: 30
Added.
Emotion: excitement
Baseline: 0.34
RoBERTa: 0.35
Support: 84
Added.
Emotion: fear
Baseline: 0.60
RoBERTa: 0.59
Support: 74
Added.
Emotion: gratitude
Baseline: 0.86
RoBERTa: 0.84
Support: 288


In [133]:
[key for key,val in id2label.items() if val in labels_to_avoid]

[8, 11, 12, 13, 14, 16, 19, 21, 23, 24, 26]

In [147]:
print(labels_to_avoid)

['desire', 'disgust', 'embarrassment', 'excitement', 'fear', 'grief', 'nervousness', 'pride', 'relief', 'remorse', 'surprise']


In [108]:
# When the cut-off for support is around 50
[key for key,val in id2label.items() if val in labels_to_avoid]

[12, 16, 19, 21, 23, 24]

In [None]:
model.module

In [144]:
def save_model(model, tokenizer, out_dir):
    # Take care of distributed/parallel training
    os.mkdir(out_dir)
    model_to_save = model.module if hasattr(model, 'module') else model  
    model_to_save.save_pretrained(out_dir)
    tokenizer.save_pretrained(out_dir)

In [145]:
save_model(model=model, 
           tokenizer=tokenizer,
           out_dir='roberta_goEmotion')