# Emotion Classification using Transformer Embeddings

##  Importing Libraries and Dependencies

In [1]:
from pathlib import Path

import pandas as pd
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
from tqdm import tqdm

##  Loading and Preprocessing the Dataset

In [2]:
df = pd.read_csv("Snappfood - Sentiment Analysis.csv", delimiter="\t")
df.drop(columns=["Unnamed: 0", "label_id"], inplace=True)
happy_mask = df["label"] == "HAPPY"
sad_mask = df["label"] == "SAD"
happy_comments = df["comment"][happy_mask].to_list()
happy_labels = [1 for c in happy_comments]
sad_comments = df["comment"][sad_mask].to_list()
sad_labels = [0 for c in sad_comments]

comments = [*happy_comments, *sad_comments]
labels = [*happy_labels, *sad_labels]

train_comments, test_comments, train_labels, test_labels = train_test_split(
    comments, labels, test_size=0.2, random_state=42
)

In [3]:
# # Load model from HuggingFace Hub
# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
# model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# # Mean Pooling - Take attention mask into account for correct averaging
# def mean_pooling(model_output, attention_mask):
#     token_embeddings = model_output[0] #First element of model_output contains all token embeddings
#     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
#     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# class Body:
#     def __init__(self, tokenizer, model):
#         self.tokenizer = tokenizer
#         self.model = model

#     def __call__(self, sentences):
#         # Tokenize sentences
#         encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

#         # Compute token embeddings
#         with torch.no_grad():
#             model_output = model(**encoded_input)

#         # Perform pooling. In this case, max pooling.
#         sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
#         return sentence_embeddings


# body = Body(tokenizer, model)

In [4]:
# train_embeddings = []
# for c in tqdm(train_comments):
#     embedding = body(c)
#     train_embeddings.append(embedding)

# test_embeddings = []
# for c in tqdm(test_comments):
#     embedding = body(c)
#     test_embeddings.append(embedding)

# torch.save(torch.cat(train_embeddings, dim=0), "train_embeddings.pt")
# torch.save(torch.cat(test_embeddings, dim=0), "test_embeddings.pt")

In [5]:
train_embeddings = torch.load("train_embeddings.pt")
test_embeddings = torch.load("test_embeddings.pt")

In [6]:
class SentimentAnalysisDatase(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        embedding = self.embeddings[idx]
        label = self.labels[idx]
        return embedding, label


train_dataset = SentimentAnalysisDatase(train_embeddings, train_labels)
test_dataset = SentimentAnalysisDatase(test_embeddings, test_labels)

batch_size = 512
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [13]:
device = torch.device("cuda")
head = nn.Sequential(
    nn.Linear(384, 128),
    nn.BatchNorm1d(128),
    nn.ReLU(),
    nn.Linear(128, 2),
)
head = head.to(device)
loss_function = nn.CrossEntropyLoss()

In [14]:
optimizer = optim.Adam(head.parameters(), lr=5e-4)

In [None]:
# embeddings, labels = next(iter(train_loader))
for epoch in range(50):
    loss_sum = 0.0
    counter = 0
    for embeddings, labels in (pbar := tqdm(train_loader, desc="Train")):
        embeddings = embeddings.to(device)
        labels = labels.to(device)
        logits = head(embeddings)
        loss = loss_function(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_sum += loss.item()
        counter += 1

        mean_loss = loss_sum / counter

        pbar.set_postfix({"mean_loss": mean_loss})

    loss_sum = 0.0
    counter = 0
    for embeddings, labels in (pbar := tqdm(test_loader, desc="Test")):
        embeddings = embeddings.to(device)
        labels = labels.to(device)
        with torch.no_grad():
            logits = head(embeddings)
        loss = loss_function(logits, labels)

        loss_sum += loss.item()
        counter += 1

        mean_loss = loss_sum / counter

        pbar.set_postfix({"mean_loss": mean_loss})

Train: 100%|██████████| 109/109 [00:00<00:00, 193.96it/s, mean_loss=0.397]
Test: 100%|██████████| 28/28 [00:00<00:00, 227.09it/s, mean_loss=0.364]
Train: 100%|██████████| 109/109 [00:00<00:00, 189.83it/s, mean_loss=0.358]
Test: 100%|██████████| 28/28 [00:00<00:00, 230.31it/s, mean_loss=0.354]
Train: 100%|██████████| 109/109 [00:00<00:00, 191.64it/s, mean_loss=0.346]
Test: 100%|██████████| 28/28 [00:00<00:00, 272.62it/s, mean_loss=0.351]
Train: 100%|██████████| 109/109 [00:00<00:00, 186.76it/s, mean_loss=0.338]
Test: 100%|██████████| 28/28 [00:00<00:00, 263.67it/s, mean_loss=0.347]
Train: 100%|██████████| 109/109 [00:00<00:00, 182.96it/s, mean_loss=0.331]
Test: 100%|██████████| 28/28 [00:00<00:00, 121.37it/s, mean_loss=0.349]
Train: 100%|██████████| 109/109 [00:00<00:00, 182.16it/s, mean_loss=0.324]
Test: 100%|██████████| 28/28 [00:00<00:00, 276.09it/s, mean_loss=0.349]
Train: 100%|██████████| 109/109 [00:00<00:00, 182.34it/s, mean_loss=0.32]
Test: 100%|██████████| 28/28 [00:00<00:00, 2

In [None]:
# Sentences we want sentence embeddings for
sentences = happy_comments[:3]


print("Sentence embeddings:")
print(sentence_embeddings)
print(sentence_embeddings.shape)