# BERT Model for Emotion Classification

This is a section of an undergraudate thesis titled "Emotion Based Music Recommendation Using Sentiment Analysis".

In [3]:
## Installs necessary dependencies into Google Colab.
!pip install transformers torch torchvision flask



# Access the Google drive address where the dataset files are stored and saves their paths.
from google.colab import drive
drive.mount('/content/drive')

train_file = '/content/drive/MyDrive/go_emotions/data/train.tsv'
test_file = '/content/drive/MyDrive/go_emotions/data/test.tsv'
dev_file = '/content/drive/MyDrive/go_emotions/data/dev.tsv'

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [4]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [5]:
def load_data(file_path: str):
    '''Load emotion data into dataframe.'''
    df = pd.read_csv(file_path, sep='\t', header=None, names=['text', 'emotion', 'emotion_id'])
    df['emotion'] = df['emotion'].apply(lambda x: int(x.split(',')[0]))
    return df

train_data = load_data(train_file)
test_data = load_data(test_file)
dev_data = load_data(dev_file)


In [6]:
class EmotionDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = str(self.data.loc[index, 'text'])
        emotion = int(self.data.loc[index, 'emotion'])
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            paddding=True,
            return_tensors='pt',
        )
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': emotion
        }

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = EmotionDataset(train_data, tokenizer, max_len=30)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
num_classes = 28  # 27 emotions + neutral
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)

model.to('cuda')

optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

for epoch in range(4):
    model.train()
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to('cuda')
        attention_mask = batch['attention_mask'].to('cuda')
        labels = batch['labels'].to('cuda')

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [10]:
model.eval()
test_dataset = EmotionDataset(test_data, tokenizer, max_len=30)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to('cuda')
        attention_mask = batch['attention_mask'].to('cuda')
        labels = batch['labels'].to('cuda')

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()

        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, average=None)
recall = recall_score(all_labels, all_preds, average=None)
f1 = f1_score(all_labels, all_preds, average=None)

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)


Accuracy: 0.5297586143357288
Precision: [0.70264317 0.70358306 0.51666667 0.26299694 0.30151844 0.43617021
 0.45945946 0.40482574 0.53703704 0.3164557  0.39310345 0.46268657
 0.46153846 0.35789474 0.64285714 0.83636364 0.23076923 0.41139241
 0.66489362 0.4        0.4587156  0.18181818 0.27083333 0.
 0.63636364 0.49090909 0.42105263 0.6084724 ]
Recall: [0.63293651 0.85714286 0.47208122 0.3006993  0.43710692 0.35964912
 0.24460432 0.64806867 0.39189189 0.19685039 0.25909091 0.36904762
 0.4        0.4047619  0.60810811 0.79861111 0.5        0.56034483
 0.73964497 0.375      0.41666667 0.25       0.11926606 0.
 0.76086957 0.5        0.34782609 0.59028643]
F1 Score: [0.66597077 0.77280859 0.4933687  0.28058728 0.35686778 0.39423077
 0.31924883 0.49834983 0.453125   0.24271845 0.31232877 0.41059603
 0.42857143 0.37988827 0.625      0.81705151 0.31578947 0.47445255
 0.70028011 0.38709677 0.43668122 0.21052632 0.1656051  0.
 0.69306931 0.49541284 0.38095238 0.59924147]


In [35]:
model.save_pretrained('/content/drive/MyDrive/emotion_model')

emotion_mapping = {
    0: "admiration",
    1: "amusement",
    2: "anger",
    3: "annoyance",
    4: "approval",
    5: "caring",
    6: "confusion",
    7: "curiosity",
    8: "desire",
    9: "disappointment",
    10: "disbelief",
    11: "disgust",
    12: "embarrassment",
    13: "excitement",
    14: "fear",
    15: "gratitude",
    16: "grief",
    17: "joy",
    18: "love",
    19: "nervousness",
    20: "optimism",
    21: "pride",
    22: "realization",
    23: "relief",
    24: "remorse",
    25: "sadness",
    26: "surprise",
    27: "neutral"
}



In [52]:
from transformers import BertForSequenceClassification

loaded_model = BertForSequenceClassification.from_pretrained('/content/drive/MyDrive/emotion_model')
loaded_model.to('cuda')

def predict_emotion(text):
  '''Predicts an emotion based on input text.'''
  inputs = tokenizer.encode_plus(
      text,
      None,
      add_special_tokens=True,
      padding=True,
      truncation=True,
      return_tensors='pt',
  )

  input_ids = inputs['input_ids'].to('cuda')
  attention_mask = inputs['attention_mask'].to('cuda')

  with torch.no_grad():
      outputs = loaded_model(input_ids, attention_mask=attention_mask)
      logits = outputs.logits
      predicted_emotion = torch.argmax(logits, dim=1).cpu().item()

  predicted_emotion_label = emotion_mapping[predicted_emotion]
  return predicted_emotion_label

predict_emotion("I am ")

'approval'