# BERT Model for Emotion Classification.

In [1]:
!pip install transformers torch pandas numpy scikit-learn

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m105.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m81.8 MB/s[0m eta [36m0:00:

In [2]:
# Import the necessary packages
import transformers
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report


from google.colab import drive

drive.mount('/content/drive')
train_file = '/content/drive/MyDrive/go_emotions/data/train.tsv'
test_file = '/content/drive/MyDrive/go_emotions/data/test.tsv'
dev_file = '/content/drive/MyDrive/go_emotions/data/dev.tsv'

Mounted at /content/drive


In [3]:


# Load the GoEmotions dataset from the files train.tsv, test.tsv and dev.tsv
train = pd.read_csv(train_file, sep='\t', names=['text', 'emotion', 'emotion_id'])
test = pd.read_csv(test_file, sep='\t', names=['text', 'emotion', 'emotion_id'])
dev = pd.read_csv(dev_file, sep='\t', names=['text', 'emotion', 'emotion_id'])

# For samples with multiple emotion labels, select only the first label as the target emotion
train['emotion'] = train['emotion'].str.split(',').str.get(0)
test['emotion'] = test['emotion'].str.split(',').str.get(0)
dev['emotion'] = dev['emotion'].str.split(',').str.get(0)

# Encode the text and emotion columns using the BertTokenizer and LabelEncoder classes
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
label_encoder = LabelEncoder()

train['input_ids'] = train['text'].apply(lambda x: tokenizer.encode(x, max_length=256, truncation=True, padding='max_length'))
test['input_ids'] = test['text'].apply(lambda x: tokenizer.encode(x, max_length=256, truncation=True, padding='max_length'))
dev['input_ids'] = dev['text'].apply(lambda x: tokenizer.encode(x, max_length=256, truncation=True, padding='max_length'))

train['attention_mask'] = train['input_ids'].apply(lambda x: [1 if i > 0 else 0 for i in x])
test['attention_mask'] = test['input_ids'].apply(lambda x: [1 if i > 0 else 0 for i in x])
dev['attention_mask'] = dev['input_ids'].apply(lambda x: [1 if i > 0 else 0 for i in x])

train['label'] = label_encoder.fit_transform(train['emotion'])
test['label'] = label_encoder.transform(test['emotion'])
dev['label'] = label_encoder.transform(dev['emotion'])

train_input_ids = torch.tensor(train['input_ids'].tolist())
test_input_ids = torch.tensor(test['input_ids'].tolist())
dev_input_ids = torch.tensor(dev['input_ids'].tolist())

train_attention_mask = torch.tensor(train['attention_mask'].tolist())
test_attention_mask = torch.tensor(test['attention_mask'].tolist())
dev_attention_mask = torch.tensor(dev['attention_mask'].tolist())

train_label = torch.tensor(train['label'].tolist())
test_label = torch.tensor(test['label'].tolist())
dev_label = torch.tensor(dev['label'].tolist())


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
# Create PyTorch datasets and data loaders for the train, test and dev sets
from torch.utils.data import TensorDataset, DataLoader

train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_label)
test_dataset = TensorDataset(test_input_ids, test_attention_mask, test_label)
dev_dataset = TensorDataset(dev_input_ids, dev_attention_mask, dev_label)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)
dev_loader = DataLoader(dev_dataset, batch_size=16)


In [5]:
# Load the pre-trained BERT model
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=28)
model.to('cuda') # move the model to GPU if available

# Define the loss function, optimizer and learning rate scheduler
from torch.nn import CrossEntropyLoss
from transformers import AdamW, get_linear_schedule_with_warmup

loss_fn = CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * 4)

# Train the model using a for loop over the number of epochs
for epoch in range(4):
    # Set the model to train mode
    model.train()
    # Initialize some variables to store the training loss and accuracy
    train_loss = 0.0
    train_correct = 0
    # Iterate over the batches in the train loader
    for batch in train_loader:
        # Move the input ids, attention masks and labels to GPU if available
        input_ids = batch[0].to('cuda')
        attention_mask = batch[1].to('cuda')
        labels = batch[2].to('cuda')
        # Clear any previously calculated gradients
        optimizer.zero_grad()
        # Feed the input ids and attention masks to the model and get the output logits
        outputs = model(input_ids, attention_mask)
        logits = outputs.logits
        # Calculate the loss value using the loss function and the output logits and labels
        loss = loss_fn(logits, labels)
        # Backpropagate the loss value
        loss.backward()
        # Clip any large gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update the model parameters
        optimizer.step()
        # Update the learning rate
        scheduler.step()
        # Accumulate the training loss over all batches
        train_loss += loss.item()
        # Accumulate the number of correct predictions over all batches
        train_correct += (torch.argmax(logits, dim=1) == labels).sum().item()
    # Calculate the average training loss and accuracy over the epoch
    train_loss = train_loss / len(train_loader)
    train_acc = train_correct / len(train_dataset)
    # Print the training loss and accuracy for each epoch
    print(f'Epoch {epoch + 1}: Train loss = {train_loss:.4f}, Train accuracy = {train_acc:.4f}')

    # Set the model to evaluation mode
    model.eval()
    # Initialize some variables to store the validation loss, accuracy and predictions
    val_loss = 0.0
    val_correct = 0
    val_preds = []
    val_labels = []
    # Iterate over the batches in the dev loader
    for batch in dev_loader:
        # Move the input ids, attention masks and labels to GPU if available
        input_ids = batch[0].to('cuda')
        attention_mask = batch[1].to('cuda')
        labels = batch[2].to('cuda')
        # Feed the input ids and attention masks to the model and get the output logits
        with torch.no_grad(): # disable gradient computation to speed up inference
            outputs = model(input_ids, attention_mask)
            logits = outputs.logits
        # Calculate the loss value using the loss function and the output logits and labels
        loss = loss_fn(logits, labels)
        # Accumulate the validation loss over all batches
        val_loss += loss.item()
        # Accumulate the number of correct predictions over all batches
        val_correct += (torch.argmax(logits, dim=1) == labels).sum().item()
        # Accumulate the predictions and labels over all batches
        val_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
        val_labels.extend(labels.cpu().numpy())
    # Calculate the average validation loss and accuracy over the epoch
    val_loss = val_loss / len(dev_loader)
    val_acc = val_correct / len(dev_dataset)
    # Print the validation loss and accuracy for each epoch
    print(f'Epoch {epoch + 1}: Validation loss = {val_loss:.4f}, Validation accuracy = {val_acc:.4f}')


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Train loss = 1.6252, Train accuracy = 0.5288
Epoch 1: Validation loss = 1.3960, Validation accuracy = 0.5759
Epoch 2: Train loss = 1.1785, Train accuracy = 0.6299
Epoch 2: Validation loss = 1.3961, Validation accuracy = 0.5691
Epoch 3: Train loss = 0.7908, Train accuracy = 0.7497
Epoch 3: Validation loss = 1.5492, Validation accuracy = 0.5686
Epoch 4: Train loss = 0.4684, Train accuracy = 0.8582
Epoch 4: Validation loss = 1.7667, Validation accuracy = 0.5562


In [6]:
# Test the model performance on the test set
# Initialize some variables to store the test loss, accuracy and predictions
test_loss = 0.0
test_correct = 0
test_preds = []
test_labels = []
# Iterate over the batches in the test loader
for batch in test_loader:
    # Move the input ids, attention masks and labels to GPU if available
    input_ids = batch[0].to('cuda')
    attention_mask = batch[1].to('cuda')
    labels = batch[2].to('cuda')
    # Feed the input ids and attention masks to the model and get the output logits
    with torch.no_grad(): # disable gradient computation to speed up inference
        outputs = model(input_ids, attention_mask)
        logits = outputs.logits
    # Calculate the loss value using the loss function and the output logits and labels
    loss = loss_fn(logits, labels)
    # Accumulate the test loss over all batches
    test_loss += loss.item()
    # Accumulate the number of correct predictions over all batches
    test_correct += (torch.argmax(logits, dim=1) == labels).sum().item()
    # Accumulate the predictions and labels over all batches
    test_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
    test_labels.extend(labels.cpu().numpy())
# Calculate the average test loss and accuracy over the epoch
test_loss = test_loss / len(test_loader)
test_acc = test_correct / len(test_dataset)
# Print the test loss and accuracy
print(f'Test loss = {test_loss:.4f}, Test accuracy = {test_acc:.4f}')
# Calculate and print the recall, precision and F1 score for each emotion label
print(classification_report(test_labels, test_preds, target_names=label_encoder.classes_))

Test loss = 1.7983, Test accuracy = 0.5454
              precision    recall  f1-score   support

           0       0.66      0.72      0.69       504
           1       0.76      0.84      0.80       252
          10       0.39      0.42      0.40       220
          11       0.45      0.39      0.42        84
          12       0.65      0.43      0.52        30
          13       0.39      0.46      0.42        84
          14       0.63      0.69      0.66        74
          15       0.87      0.80      0.83       288
          16       0.00      0.00      0.00         6
          17       0.55      0.48      0.51       116
          18       0.66      0.71      0.69       169
          19       0.33      0.31      0.32        16
           2       0.50      0.46      0.48       197
          20       0.46      0.49      0.48       120
          21       0.50      0.12      0.20         8
          22       0.21      0.15      0.17       109
          23       0.50      0.14     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
model.save_pretrained('/content/drive/MyDrive/bert-emotion-model') # save the model to your Google Drive
tokenizer.save_pretrained('/content/drive/MyDrive/bert-emotion-tokenizer') # save the tokenizer to your Google Drive

('/content/drive/MyDrive/bert-emotion-tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/bert-emotion-tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/bert-emotion-tokenizer/vocab.txt',
 '/content/drive/MyDrive/bert-emotion-tokenizer/added_tokens.json')