In [1]:
from datasets import load_dataset
import random
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from tokenizers import Tokenizer, models, trainers
from tokenizers.pre_tokenizers import Whitespace
import torch.nn as nn
import torch
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer

In [2]:
# Load the "dair-ai/emotion" dataset; its "train", "validation" and "test"
# splits are pulled out by name in the next cell.
emotions = load_dataset("dair-ai/emotion")

In [3]:
# Human-readable names of the six emotion classes.
# NOTE(review): presumably position i names class id i of the dataset --
# verify against the dataset's label feature before using it for reports.
labels = [
    "sadness",
    "joy",
    "love",
    "anger",
    "fear",
    "surprise",
]

In [4]:
# Pull the three predefined splits out of the loaded dataset in one go.
train_data, validation_data, test_data = (
    emotions[split_name] for split_name in ("train", "validation", "test")
)

In [5]:
# Tokenization
vocab_n = 5000
sequence_len = 64

# Reserved symbols. They are handed to the trainer as special tokens so that
# they get vocabulary ids of their own; without this, padding falls back to
# the default pad_id=0, which is also the id of a regular learned token.
pad_token = "[PAD]"
unk_token = "[UNK]"

# Initialize a tokenizer using BPE (Byte Pair Encoding); symbols that were
# never seen during training map to unk_token.
tokenizer = Tokenizer(models.BPE(unk_token=unk_token))
tokenizer.pre_tokenizer = Whitespace()
tokenizer_trainer = trainers.BpeTrainer(
    vocab_size=vocab_n,
    special_tokens=[pad_token, unk_token],
)
tokenizer.train_from_iterator(train_data["text"], trainer=tokenizer_trainer)

# Padding/truncation are configured after training so that the id actually
# assigned to the padding token can be looked up. Every encoding is padded or
# cut to exactly sequence_len ids.
tokenizer.enable_padding(
    pad_id=tokenizer.token_to_id(pad_token),
    pad_token=pad_token,
    length=sequence_len,
)
tokenizer.enable_truncation(max_length=sequence_len)






In [6]:
# Load a small and fast transformer model for sentence embeddings
model = SentenceTransformer("all-MiniLM-L6-v2")

# Generate sentence embeddings for the training texts
X_embeddings = model.encode(train_data["text"], show_progress_bar=True)

# Apply SMOTE on sentence embeddings (fixed random_state for repeatability)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_embeddings, train_data["label"])

# Convert to torch tensors for use in training
# NOTE(review): no later cell visible in this notebook reads X_tensor or
# y_tensor -- the CNN below is fitted on the token-id instances instead.
# Confirm whether the resampled data is meant to feed a model.
X_tensor = torch.tensor(X_resampled, dtype=torch.float32)
y_tensor = torch.tensor(y_resampled, dtype=torch.long)

# Cast to built-in ints so the printed dict reads {0: 5362, ...} regardless
# of how the installed NumPy version renders its scalar types.
class_ids, class_counts = np.unique(y_resampled, return_counts=True)
print(
    "After SMOTE class distribution:",
    {int(class_id): int(count) for class_id, count in zip(class_ids, class_counts)},
)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Batches: 100%|██████████| 500/500 [00:04<00:00, 123.99it/s]


After SMOTE class distribution: {np.int64(0): np.int64(5362), np.int64(1): np.int64(5362), np.int64(2): np.int64(5362), np.int64(3): np.int64(5362), np.int64(4): np.int64(5362), np.int64(5): np.int64(5362)}


In [7]:
def preprocess_text(text: str, tokenizer: Tokenizer):
    """ 
    Encode one text with the given tokenizer and wrap its token IDs in a tensor.

    Args:
        text, str: A single text instance.
        tokenizer, Tokenizer: Tokenizer whose `encode` result exposes `.ids`.
    Returns:
        Tensor: One-dimensional PyTorch tensor holding the token IDs.
    """
    encoding = tokenizer.encode(text)
    token_ids = encoding.ids
    return torch.tensor(token_ids)


def preprocess_label(label: int):
    """ 
    Wrap a class index in a PyTorch tensor.

    Args:
        label, int: Class index of one instance.
    Returns:
        Tensor: Zero-dimensional (scalar) PyTorch tensor containing the label
            index.
    """
    label_tensor = torch.tensor(label)
    return label_tensor


def preprocess(data: dict, tokenizer: Tokenizer):
    """ 
    Turn a dataset split into a list of (token-id tensor, label tensor) pairs.

    Args:
        data, dict: Mapping with a "text" and a "label" entry of equal length.
        tokenizer, Tokenizer: The tokenizer passed on to `preprocess_text`.
    Returns:
        list: One `(input_tensor, label_tensor)` tuple per instance, in the
            same order as the input data.
    """
    return [
        (preprocess_text(raw_text, tokenizer), preprocess_label(raw_label))
        for raw_text, raw_label in zip(data["text"], data["label"])
    ]

In [8]:
# Tokenize every split with the same tokenizer; each result is a list of
# (token-id tensor, label tensor) tuples.
train_instances, val_instances, test_instances = (
    preprocess(split, tokenizer)
    for split in (train_data, validation_data, test_data)
)

In [9]:
# Batching

def batching(instances: list, batch_size: int, shuffle: bool):
    """ 
    Batches input instances along the given size and returns list of batches.

    The caller's list is never reordered: when shuffling is requested, a
    shallow copy is shuffled instead, so the same instance list can safely be
    reused (e.g. for evaluation) after it was batched for training.

    Args:
        instances, list: List of instances, containing a tuple of two tensors 
            for each text as well as corresponding label.
        batch_size, int: Size for batches; the last batch may be smaller.
        shuffle, bool: If true, the instances will be shuffled before batching.
    Returns:
        list: List containing tuples `(batch_texts, batch_labels)` that 
            correspond to single batches; empty if `instances` is empty.
    Raises:
        ValueError: If `batch_size` is not a positive integer.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    ordered = instances
    if shuffle:
        # Shuffle a copy so the argument is not mutated as a side effect
        ordered = list(instances)
        random.shuffle(ordered)

    batches = []

    # We iterate through the instances with batch_size steps
    for start in range(0, len(ordered), batch_size):
        chunk = ordered[start : start + batch_size]

        # Stacking the instances with dim=0 (default value)
        batch_texts = torch.stack([instance[0] for instance in chunk])
        batch_labels = torch.stack([instance[1] for instance in chunk])

        batches.append((batch_texts, batch_labels))

    return batches

In [10]:
# CNN Network

class CNN_Classifier(nn.Module):
    """ 
    CNN for emotion classification (6 classes by default), consisting of an 
    embedding layer, two convolutional layers with different filter sizes, 
    different pooling sizes, as well as one linear output layer.
    """
    def __init__(self, vocab_size=None, embedding_dim=300, num_classes=6):
        """ 
        Builds the layers of the network.

        Args:
            vocab_size, int or None: Number of rows of the embedding table. 
                If None, the size is taken from the notebook-level 
                `tokenizer` (the original behaviour).
            embedding_dim, int: Dimension of the token embeddings.
            num_classes, int: Number of output classes.
        """
        super().__init__()

        if vocab_size is None:
            # Backward-compatible default: rely on the global tokenizer
            vocab_size = tokenizer.get_vocab_size()

        # We can implement embeddings as a simple lookup-table for given word 
        # indices
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # One-dimensional convolution-layer with embedding_dim input channels 
        # and 100 output channels as well as kernel size of 3; 
        # padding="same" keeps the sequence length unchanged
        self.conv_1 = nn.Conv1d(embedding_dim, 100, 3, padding="same")

        # Pooling with a one-dimensional sliding window of length 3, 
        # reducing in this fashion the sequence length 
        self.pool_1 = nn.MaxPool1d(3)

        # The input will be the reduced number of maximum picks from the
        # previous operation; the dimension of those picks is the same as the
        # output channel size from self.conv_1. We apply a different filter of 
        # size 5.
        self.conv_2 = nn.Conv1d(100, 50, 5, padding="same")

        # Pooling with window size of 5
        self.pool_2 = nn.MaxPool1d(5)

        # Final fully connected linear layer from the 50 output channels to the
        # output classes
        self.linear_layer = nn.Linear(50, num_classes)

    def forward(self, x):
        """ 
        Defining the forward pass of an input batch x.

        Args:
            x, tensor: Batch of token-id sequences, [batch_size, seq_len].
        Returns:
            y, tensor: Logits of the final layer, [batch_size, num_classes].
        """
        # After the lookup the tensor has the shape 
        # [batch_size, seq_len, emb_dim]
        x = self.embedding(x)

        # nn.Conv1d() expects the channels before the sequence dimension, so
        # we switch the order to [batch_size, emb_dim, seq_len]
        x = x.permute(0, 2, 1)

        # ReLU around the first convolution; output shape:
        # [batch_size, 100, seq_len]
        x = nn.functional.relu(self.conv_1(x))

        # Applying max pooling of size 3 means that the output length of the 
        # sequence is shrunk to seq_len//3
        x = self.pool_1(x)

        # Output of the following layer: [batch_size, 50, seq_len//3]
        x = nn.functional.relu(self.conv_2(x))

        # Shrinking the sequence length by a factor of 5
        x = self.pool_2(x)

        # At this point we have a tensor with 3 dimensions; however, the final 
        # layer requires an input of size [batch_size x 50]. To get this value 
        # we aggregate over the remaining positions and continue with the mean
        x = x.mean(dim=-1)

        # In this fashion, the linear layer can be used to make predictions
        y = self.linear_layer(x)

        return y
    
    def fit(self, train_instances, val_instances, epochs, batch_size):
        """ 
        Gradient based fitting method with Adam optimization and automatic 
        evaluation (macro F1 score) after each epoch.

        Args:
            train_instances, list: List of instance tuples.
            val_instances, list: List of instance tuples.
            epochs, int: Number of training epochs.
            batch_size, int: Number of batch size.
        """
        optimizer = torch.optim.Adam(self.parameters())

        for epoch in range(epochs):
            # The per-epoch evaluation below goes through predict(), which 
            # puts the module into eval mode; switch back to training mode 
            # at the start of every epoch, not just the first one
            self.train()

            train_batches = batching(
                train_instances,
                batch_size=batch_size,
                shuffle=True)
            
            for inputs, labels in tqdm(train_batches):
                optimizer.zero_grad()
                outputs = self(inputs)
                loss = nn.functional.cross_entropy(outputs, labels)
                loss.backward()
                optimizer.step()
            
            train_f1 = self.evaluate2(train_instances, batch_size=batch_size)
            val_f1 = self.evaluate2(val_instances, batch_size=batch_size)

            print(f"Epoch {epoch + 1} train F1 score: {train_f1}, validation F1 score: {val_f1}")

    def predict(self, input):
        """ 
        To make inferences from the model.

        Switches the module to eval mode (and leaves it there) and runs the 
        forward pass without recording gradients.

        Args:
            input, tensor: Batch of token-id sequences, [batch_size, seq_len].
        Returns:
            tensor: Index of the highest-scoring class per instance, 
                [batch_size].
        """
        self.eval()

        # Inference only: no autograd graph is needed
        with torch.no_grad():
            outputs = self(input)

        return torch.argmax(outputs, dim=-1)

    def _gold_and_predicted(self, instances, batch_size):
        """ 
        Runs the model over all instances in order and collects the labels.

        Args:
            instances, list: List of instance tuples.
            batch_size, int: Batch size.
        Returns:
            tuple: Two lists of plain ints, (true labels, predicted labels).
        """
        gold = []
        predicted = []

        for inputs, labels in batching(instances, batch_size=batch_size, shuffle=False):
            gold.extend(labels.tolist())
            predicted.extend(self.predict(inputs).tolist())

        return gold, predicted

    def evaluate(self, instances, batch_size):
        """ 
        Prints a full evaluation report (accuracy, macro precision/recall/F1, 
        confusion matrix, classification report) for the given instances.

        Args:
            instances, list: List of instance tuples.
            batch_size, int: Batch size.
        Returns:
            float: Macro F1 score for given instances.
        """
        y_test, y_pred = self._gold_and_predicted(instances, batch_size)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')
        cm = confusion_matrix(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        print("CNN Classifier:")
        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1 Score: {f1}")
        print(f"Confusion Matrix:\n{cm}")
        print(f"Classification Report:\n{report}")

        # Hand the score back so callers can store it, as the docstring states
        return f1
    
    def evaluate2(self, instances, batch_size):
        """ 
        To make evaluations against the gold standard (true labels) from the 
        data, without printing anything.

        Args:
            instances, list: List of instance tuples.
            batch_size, int: Batch size.
        Returns:
            float: Macro F1 score for given instances.
        """
        true, pred = self._gold_and_predicted(instances, batch_size)

        return f1_score(true, pred, average="macro")

In [11]:
# Seed the RNGs the training run draws from so that a fresh
# "Restart & Run All" reproduces the same scores: `random` drives the
# per-epoch shuffling in batching(), `torch` drives the weight initialisation.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

classifier = CNN_Classifier()
classifier.fit(train_instances, val_instances, epochs=50, batch_size=16)

100%|██████████| 1000/1000 [00:03<00:00, 273.56it/s]


Epoch 1 train F1 score: 0.5840321467773238, validation F1 score: 0.5377111231942294


100%|██████████| 1000/1000 [00:03<00:00, 258.44it/s]


Epoch 2 train F1 score: 0.9050560351203166, validation F1 score: 0.8365100276197261


100%|██████████| 1000/1000 [00:03<00:00, 255.00it/s]


Epoch 3 train F1 score: 0.9613028281910769, validation F1 score: 0.8645149751730302


100%|██████████| 1000/1000 [00:03<00:00, 253.20it/s]


Epoch 4 train F1 score: 0.9786472144147581, validation F1 score: 0.8673525036939628


100%|██████████| 1000/1000 [00:04<00:00, 238.85it/s]


Epoch 5 train F1 score: 0.9878496023146632, validation F1 score: 0.8694417226393752


100%|██████████| 1000/1000 [00:03<00:00, 254.98it/s]


Epoch 6 train F1 score: 0.9870641162103458, validation F1 score: 0.8594942471862733


100%|██████████| 1000/1000 [00:03<00:00, 254.36it/s]


Epoch 7 train F1 score: 0.9888874873001031, validation F1 score: 0.8571500915776241


100%|██████████| 1000/1000 [00:03<00:00, 256.88it/s]


Epoch 8 train F1 score: 0.9918405098529431, validation F1 score: 0.8588954133980483


100%|██████████| 1000/1000 [00:03<00:00, 268.20it/s]


Epoch 9 train F1 score: 0.9908891570732058, validation F1 score: 0.8639545599047805


100%|██████████| 1000/1000 [00:03<00:00, 268.35it/s]


Epoch 10 train F1 score: 0.9933817278344823, validation F1 score: 0.8564615767109344


100%|██████████| 1000/1000 [00:03<00:00, 252.71it/s]


Epoch 11 train F1 score: 0.9949796594581833, validation F1 score: 0.870765449584288


100%|██████████| 1000/1000 [00:04<00:00, 247.75it/s]


Epoch 12 train F1 score: 0.9942951235572225, validation F1 score: 0.8643489480998453


100%|██████████| 1000/1000 [00:03<00:00, 256.40it/s]


Epoch 13 train F1 score: 0.9919676054559731, validation F1 score: 0.8541470313276807


100%|██████████| 1000/1000 [00:03<00:00, 253.59it/s]


Epoch 14 train F1 score: 0.9944790660587429, validation F1 score: 0.867018533106604


100%|██████████| 1000/1000 [00:03<00:00, 257.01it/s]


Epoch 15 train F1 score: 0.9881898027772054, validation F1 score: 0.8536122854355829


100%|██████████| 1000/1000 [00:03<00:00, 257.26it/s]


Epoch 16 train F1 score: 0.9927611772552573, validation F1 score: 0.8548240288292498


100%|██████████| 1000/1000 [00:03<00:00, 261.79it/s]


Epoch 17 train F1 score: 0.9959265512967602, validation F1 score: 0.8709424518706799


100%|██████████| 1000/1000 [00:03<00:00, 263.37it/s]


Epoch 18 train F1 score: 0.996051155601941, validation F1 score: 0.8709094870506715


100%|██████████| 1000/1000 [00:03<00:00, 266.85it/s]


Epoch 19 train F1 score: 0.9960687531819308, validation F1 score: 0.8649695208685927


100%|██████████| 1000/1000 [00:03<00:00, 250.50it/s]


Epoch 20 train F1 score: 0.9953819485649739, validation F1 score: 0.8649127475143875


100%|██████████| 1000/1000 [00:03<00:00, 263.81it/s]


Epoch 21 train F1 score: 0.9949001192547658, validation F1 score: 0.8607965519014388


100%|██████████| 1000/1000 [00:03<00:00, 271.65it/s]


Epoch 22 train F1 score: 0.9961821549176739, validation F1 score: 0.8751131339353672


100%|██████████| 1000/1000 [00:03<00:00, 268.22it/s]


Epoch 23 train F1 score: 0.9952998434606041, validation F1 score: 0.8722814643208142


100%|██████████| 1000/1000 [00:03<00:00, 260.60it/s]


Epoch 24 train F1 score: 0.9959289916894402, validation F1 score: 0.8673770114273985


100%|██████████| 1000/1000 [00:03<00:00, 265.95it/s]


Epoch 25 train F1 score: 0.99505339143732, validation F1 score: 0.8570339816766998


100%|██████████| 1000/1000 [00:03<00:00, 258.08it/s]


Epoch 26 train F1 score: 0.9938829321279562, validation F1 score: 0.8629106631575051


100%|██████████| 1000/1000 [00:03<00:00, 254.48it/s]


Epoch 27 train F1 score: 0.9961773042107609, validation F1 score: 0.871163579144575


100%|██████████| 1000/1000 [00:03<00:00, 263.52it/s]


Epoch 28 train F1 score: 0.996762771627086, validation F1 score: 0.868653545375674


100%|██████████| 1000/1000 [00:03<00:00, 266.33it/s]


Epoch 29 train F1 score: 0.9969566753622373, validation F1 score: 0.8673296595292573


100%|██████████| 1000/1000 [00:03<00:00, 252.02it/s]


Epoch 30 train F1 score: 0.9932782704274903, validation F1 score: 0.8599982005455981


100%|██████████| 1000/1000 [00:03<00:00, 264.79it/s]


Epoch 31 train F1 score: 0.9965000086954842, validation F1 score: 0.8682760617192485


100%|██████████| 1000/1000 [00:03<00:00, 262.79it/s]


Epoch 32 train F1 score: 0.9954772724120097, validation F1 score: 0.8621542229081611


100%|██████████| 1000/1000 [00:03<00:00, 263.29it/s]


Epoch 33 train F1 score: 0.9966203049206568, validation F1 score: 0.855171565235405


100%|██████████| 1000/1000 [00:03<00:00, 256.72it/s]


Epoch 34 train F1 score: 0.9963607488750651, validation F1 score: 0.8552176568398254


100%|██████████| 1000/1000 [00:03<00:00, 262.38it/s]


Epoch 35 train F1 score: 0.995205728730542, validation F1 score: 0.8604900424224965


100%|██████████| 1000/1000 [00:03<00:00, 263.18it/s]


Epoch 36 train F1 score: 0.9948216335301702, validation F1 score: 0.8658225058991226


100%|██████████| 1000/1000 [00:03<00:00, 265.56it/s]


Epoch 37 train F1 score: 0.9951865895909102, validation F1 score: 0.8682974261665484


100%|██████████| 1000/1000 [00:03<00:00, 262.49it/s]


Epoch 38 train F1 score: 0.9966460110636985, validation F1 score: 0.862593854400253


100%|██████████| 1000/1000 [00:03<00:00, 263.70it/s]


Epoch 39 train F1 score: 0.9965167192028926, validation F1 score: 0.8641331129425177


100%|██████████| 1000/1000 [00:03<00:00, 265.68it/s]


Epoch 40 train F1 score: 0.9933028773441065, validation F1 score: 0.8652833212643394


100%|██████████| 1000/1000 [00:03<00:00, 264.78it/s]


Epoch 41 train F1 score: 0.9964453948052205, validation F1 score: 0.8728728313498055


100%|██████████| 1000/1000 [00:03<00:00, 257.50it/s]


Epoch 42 train F1 score: 0.9951471013050465, validation F1 score: 0.8654611949524087


100%|██████████| 1000/1000 [00:03<00:00, 264.60it/s]


Epoch 43 train F1 score: 0.9945664472878116, validation F1 score: 0.8635127287208743


100%|██████████| 1000/1000 [00:03<00:00, 262.03it/s]


Epoch 44 train F1 score: 0.996715064914178, validation F1 score: 0.8602342917983141


100%|██████████| 1000/1000 [00:03<00:00, 262.21it/s]


Epoch 45 train F1 score: 0.9969477435517954, validation F1 score: 0.8582006948062414


100%|██████████| 1000/1000 [00:03<00:00, 262.66it/s]


Epoch 46 train F1 score: 0.9968720556483932, validation F1 score: 0.8652546550746525


100%|██████████| 1000/1000 [00:03<00:00, 258.91it/s]


Epoch 47 train F1 score: 0.9967606738685437, validation F1 score: 0.8653764409039338


100%|██████████| 1000/1000 [00:03<00:00, 265.04it/s]


Epoch 48 train F1 score: 0.9969552809416294, validation F1 score: 0.8598880813855573


100%|██████████| 1000/1000 [00:04<00:00, 249.07it/s]


Epoch 49 train F1 score: 0.991440676892807, validation F1 score: 0.8478531742387924


100%|██████████| 1000/1000 [00:03<00:00, 252.80it/s]


Epoch 50 train F1 score: 0.9942998287670823, validation F1 score: 0.8743280720529613


In [12]:
# Final check on the held-out test split; evaluate() prints accuracy, macro
# precision/recall/F1, the confusion matrix and the classification report.
f1_test = classifier.evaluate(test_instances, batch_size=16)

CNN Classifier:
Accuracy: 0.8915
Precision: 0.8382884144946093
Recall: 0.8591981982533056
F1 Score: 0.8438163876927041
Confusion Matrix:
[[557   8   3  10   3   0]
 [  7 636  39   5   3   5]
 [  5  25 124   2   0   3]
 [ 14  12   3 240   5   1]
 [ 17   1   2  12 168  24]
 [  1   3   0   1   3  58]]
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.96      0.94       581
           1       0.93      0.92      0.92       695
           2       0.73      0.78      0.75       159
           3       0.89      0.87      0.88       275
           4       0.92      0.75      0.83       224
           5       0.64      0.88      0.74        66

    accuracy                           0.89      2000
   macro avg       0.84      0.86      0.84      2000
weighted avg       0.90      0.89      0.89      2000

