We have a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise, see https://aclanthology.org/D18-1404/. The train, val, and test datasets were merged to one. We created our own dataset of emotions for validating my model. The transfer learning was used for the task. The linear layer was added to BERT(the training of the linear layer didn't perform separately because it it led to a degradation of the model). 

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import math

import numpy as np
import matplotlib.pyplot as plt

import torch.nn as nn

import random
import torch

def set_random_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True

set_random_seed(42)

In [4]:
!pip install -q transformers
!pip install -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m73.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 KB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 KB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import transformers as ppb
from datasets import load_dataset
import warnings
warnings.filterwarnings('ignore')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
train_df = pd.read_csv('/content/drive/MyDrive/train.txt', sep=';')

In [15]:
train_df

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
19995,i just keep feeling like someone is being unki...,anger
19996,im feeling a little cranky negative after this...,anger
19997,i feel that i am useful to my people and that ...,joy
19998,im feeling more comfortable with derby i feel ...,joy


Lets analyze the lenght of the sentences.

In [16]:
train_df['text'].map(lambda x: len(x.split())).sort_values()

4997      2
13316     2
9355      2
17632     2
4150      2
         ..
46       64
7222     64
9618     64
9626     64
6322     66
Name: text, Length: 20000, dtype: int64

In [None]:
labels = {'sadness':0,
          'joy':1,
          'love':2,
          'anger':3,
          'fear':4,
          'surprise':5}

In [None]:
train_df['emotion'] = train_df['emotion'].map(lambda x: labels[x])

In [None]:
train_df

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3
...,...,...
19995,i just keep feeling like someone is being unki...,3
19996,im feeling a little cranky negative after this...,3
19997,i feel that i am useful to my people and that ...,1
19998,im feeling more comfortable with derby i feel ...,1


In [None]:
train_df['emotion'].value_counts()

1    6761
0    5797
3    2709
4    2373
2    1641
5     719
Name: emotion, dtype: int64

The data is imbalanced, that  is why accuracy on the part of dataset is not correct. 

Initialization of the tokenizer and the model. Before sending the text to the model, it should be tokenized. See the tokenizer API for BERT on the website: https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertTokenizer



In [None]:
df = train_df

In [None]:
from transformers import BertModel



from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = [label for label in df['emotion']]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 128, truncation=True,
                                return_tensors="pt") for text in df['text']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

set_random_seed(42)

df_train, df_val, df_test = np.split(train_df.sample(frac=1, random_state=42), 
                                     [int(.999*len(df)), int(.9995*len(df))])

print(len(df_train),len(df_val), len(df_test))

class BertClassifier(nn.Module):

    def __init__(self, dropout=0.2):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(0.5)
        self.linear = nn.Linear(768, 6)

  

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)


        return linear_output

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

19980 10 10


Download my dataset. 

In [None]:
df_val = pd.read_csv('/content/drive/MyDrive/Neuro.csv', sep=';')

In [None]:
df_val

Unnamed: 0,text,emotion
0,I hate my neighbour.,3
1,I have a phobia of spiders.,4
2,I didn't expect her to do that.,5
3,I miss my parents.,0
4,I am so excited for this!,1
5,I will always be by your side.,2
6,My sister is so annoying.,3
7,I am scared to do that.,4
8,That was so unexpected!,5
9,I am so upset.,0


In [None]:
set_random_seed(42)

from torch.optim import AdamW
from tqdm import tqdm

def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=10, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=10)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
                  
EPOCHS = 7
model = BertClassifier()
LR = 1e-5
              
train(model, df_train, df_val, LR, EPOCHS)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1998/1998 [07:28<00:00,  4.46it/s]


Epochs: 1 | Train Loss:  0.043                 | Train Accuracy:  0.845                 | Val Loss:  0.089                 | Val Accuracy:  0.735


100%|██████████| 1998/1998 [07:26<00:00,  4.47it/s]


Epochs: 2 | Train Loss:  0.012                 | Train Accuracy:  0.942                 | Val Loss:  0.078                 | Val Accuracy:  0.755


100%|██████████| 1998/1998 [07:26<00:00,  4.47it/s]


Epochs: 3 | Train Loss:  0.010                 | Train Accuracy:  0.951                 | Val Loss:  0.086                 | Val Accuracy:  0.735


100%|██████████| 1998/1998 [07:26<00:00,  4.47it/s]


Epochs: 4 | Train Loss:  0.009                 | Train Accuracy:  0.956                 | Val Loss:  0.071                 | Val Accuracy:  0.796


100%|██████████| 1998/1998 [07:26<00:00,  4.47it/s]


Epochs: 5 | Train Loss:  0.007                 | Train Accuracy:  0.967                 | Val Loss:  0.074                 | Val Accuracy:  0.776


100%|██████████| 1998/1998 [07:26<00:00,  4.47it/s]


Epochs: 6 | Train Loss:  0.005                 | Train Accuracy:  0.978                 | Val Loss:  0.068                 | Val Accuracy:  0.796


100%|██████████| 1998/1998 [07:26<00:00,  4.47it/s]


Epochs: 7 | Train Loss:  0.004                 | Train Accuracy:  0.985                 | Val Loss:  0.059                 | Val Accuracy:  0.837


In [None]:
#torch.save(model, 'model.pth')

In [None]:
model = torch.load('/content/drive/MyDrive/model.pth',map_location ='cpu')

In [None]:
model.eval()

BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [None]:
X = []


def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=16)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)
              X.extend((np.array(output.cpu().detach().numpy()).astype("float").tolist()))

              acc = (output.argmax(dim=1) == test_label).sum().item()
              total_acc_test += acc
              #print(labels[output.argmax(dim=1).cpu().detach().numpy()[0]])

    #print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
   
    
    
    
#evaluate(model, df1)


Test our model!

In [None]:
sent = "Students of our university are clever"

labels = {0:'sadness',
          1:'joy',
          2:'love',
          3:'anger',
          4:'fear',
          5:'surprise'}

X = []
df_custom = pd.DataFrame({'text':[sent] , 'emotion':[0]})
evaluate(model, df_custom)
print(labels[np.array(*X).argmax()])

joy
