In [1]:
#Installing libraries
!pip install transformers datasets evaluate numpy pandas huggingface torch gensim tensorflow

from sklearn import preprocessing
import torch

def set_seed(seed = 0): # Set seed function
    random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    
from IPython.display import clear_output # To remove outputs once done
clear_output()

In [2]:
# Importing dataset into pandas dataframe, splitting into features and labels
import pandas as pd
import numpy as np
import re

textEmotion = pd.read_csv("text_emotion.csv")

## Data preparation ##
# Function for removing unwanted text patterns
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
    return input_txt

# Removing twitter handles
textEmotion['clean_textEmotion'] = np.vectorize(remove_pattern)(textEmotion['content'], "@[\w]*")

# Removing punctuations, numbers, special characters
textEmotion.clean_textEmotion = textEmotion.clean_textEmotion.str.replace("[^a-zA-Z#]", " ")

# Sliced out features and labels
features = textEmotion.loc[:, 'clean_textEmotion']
labels = textEmotion.loc[:, 'sentiment'] #Note that these are Series

  textEmotion.clean_textEmotion = textEmotion.clean_textEmotion.str.replace("[^a-zA-Z#]", " ")


In [3]:
# Train test splitting and encoding
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test =  train_test_split(features, labels, test_size=0.3, random_state=42) # splitting into training and test sets

In [4]:
## Tokenisation. Note that we use glove because using AutoTokenizer makes it complex 
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

#tokenising with word2vec
train_tokenised = X_train.apply(lambda x: word_tokenize(x) if type(x) == str else x) # for training set
test_tokenised = X_test.apply(lambda x: word_tokenize(x) if type(x) == str else x) # for test set
## NOTE THAT TOKENISATION OF FEATURES IS FINISHED AT THIS STAGE ##

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aldan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# installing torchtext to use glove for vectorising
!pip install torchtext
import torchtext

glove = torchtext.vocab.GloVe(name="6B", dim=50, max_vectors=20000)
clear_output()

In [6]:
# Vectorising train_tokenised, test_tokenised
glove.get_vecs_by_tokens(train_tokenised.to_numpy()[0], lower_case_backup=False)
glove.get_vecs_by_tokens(test_tokenised.to_numpy()[0], lower_case_backup=False)

# Embedding and padding for TRAINING data
train_embedded = train_tokenised.loc[train_tokenised.map(len)!=0].map(glove.get_vecs_by_tokens)

train_embedded.shape
#goal: torch tensor of shape (27940, 41, 50)

start_token = torch.rand((50,))
end_token = torch.rand((50,))

train_embedded_padded = torch.empty((train_embedded.shape[0], 41, 50))
for i in range(len(train_embedded)):
    train_embedded_vec = train_embedded.iloc[i]
    train_embedded_padded[i, 0, :] = start_token
    train_embedded_padded[i, 1:len(train_embedded_vec) + 1, :] = train_embedded_vec
    train_embedded_padded[i, len(train_embedded_vec) + 1, :] = end_token
    
# Embedding and padding for TEST data
test_embedded = test_tokenised.loc[test_tokenised.map(len)!=0].map(glove.get_vecs_by_tokens)

test_embedded_padded = torch.empty((test_embedded.shape[0], 41, 50))
for i in range(len(test_embedded)):
    test_embedded_vec = test_embedded.iloc[i]
    test_embedded_padded[i, 0, :] = start_token
    test_embedded_padded[i, 1:len(test_embedded_vec) + 1, :] = test_embedded_vec
    test_embedded_padded[i, len(test_embedded_vec) + 1, :] = end_token

In [7]:
# encoding labels using one-hot encoding - converting labels into binary matrix
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# reshaping both training and test labels
y_train_reshaped = y_train.to_numpy().reshape(-1, 1)
y_test_reshaped = y_test.to_numpy().reshape(-1, 1)

# fitting one-hot encoder on TRAINING labels
encoder.fit(y_train_reshaped)

# transforming TRAINING labels into binary matrix
y_train_encoded = encoder.transform(y_train_reshaped)
y_test_encoded = encoder.transform(y_test_reshaped)

### From this cell, we obtain the following encoded LABELS:
# y_train_encoded - for training set
# y_test_encoded - for test set

# To find out which index of the matrix corresponds to which emotion. Note 13 unique emotions
y_train_encoded.shape, y_test_encoded.shape

((28000, 13), (12000, 13))

In [8]:
from torch.utils.data import Dataset, DataLoader
class TextEmotionDataset(Dataset):
    def __init__(self, embeddings_tensor, labels_tensor):
        self.embeddings_tensor = embeddings_tensor
        self.labels_tensor = labels_tensor

    def __len__(self):
        return len(self.embeddings_tensor)

    def __getitem__(self, index):
        return self.embeddings_tensor[index], self.labels_tensor[index] # Retrieve the pre-embedded and padded sequence and its corresponding label

# Converting labels to tensors, .long used for CrossEntropyLoss
train_labels_tensor = torch.tensor(y_train_encoded).long()
test_labels_tensor = torch.tensor(y_test_encoded).long()

BATCH_SIZE = 128

# Creating TextEmotionDataset, DataLoader
train_dataset = TextEmotionDataset(train_embedded_padded, train_labels_tensor)
test_dataset = TextEmotionDataset(test_embedded_padded, test_labels_tensor)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# # Hyperparameters
# vocab_size = 20000  # Size of your vocabulary
# embedding_dim = 50  # Dimensions of your word vectors
hidden_dim = 128  # LSTM hidden dimensions
output_dim = 13  # Number of classes

# Flatten the first two dimensions to treat each word embedding as an entry in a list
flattened_embeddings = train_embedded_padded.view(-1, 50)

# Here you need to ensure that you only have unique rows corresponding to unique words
# This code assumes that each word's embedding in `flattened_embeddings` is unique
# which might not be the case in a real-world scenario
unique_embeddings, indices = torch.unique(flattened_embeddings, return_inverse=True, dim=0)

In [9]:
from torch import nn, optim
class SelfAttention(nn.Module):
    def __init__(self, input_dim):
        super(SelfAttention, self).__init__()
        self.input_dim = input_dim
        self.query = nn.Linear(input_dim, input_dim)
        self.key = nn.Linear(input_dim, input_dim)
        self.value = nn.Linear(input_dim, input_dim)
        self.softmax = nn.Softmax(dim=2)
        
    def forward(self, x):
        queries = self.query(x)
        keys = self.key(x)
        values = self.value(x)
        scores = torch.bmm(queries, keys.transpose(1, 2)) / (self.input_dim ** 0.5)
        attention = self.softmax(scores)
        weighted = torch.bmm(attention, values)
        return weighted

class RCNNAttention(nn.Module):
    def __init__(self, embedding_dim, lstm_size, hidden_size_linear):
        super(RCNNAttention, self).__init__()
        self.lstm = nn.LSTM(embedding_dim, lstm_size, batch_first=True, bidirectional=True)
        self.W = nn.Linear(embedding_dim + 2*lstm_size, hidden_size_linear)
        self.fc = nn.Linear(hidden_size_linear, 13)
        
        self.attention = SelfAttention(embedding_dim)

    def forward(self, embedded):
        output = self.attention(embedded)
        # x = |bs, seq_len, embedding_dim|
        output, _ = self.lstm(output)
        # output = |bs, seq_len, 2*hidden_size|
        output = torch.cat([output, embedded], 2)
        # output = |bs, seq_len, embedding_dim + 2*hidden_size|
        
        output = nn.functional.tanh(self.W(output)).transpose(1, 2)
        # output = |bs, seq_len, hidden_size_linear| -> |bs, hidden_size_linear, seq_len|
        output = nn.functional.max_pool1d(output, output.size(2)).squeeze(2)
        # output = |bs, hidden_size_linear|
        output = self.fc(output)
        # output = |bs, class_num|
        return output

model = RCNNAttention(50, hidden_dim, 100)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [10]:
# Example train function
def train(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss = 0
    
#     text, label = next(iter(iterator))
#     if True:
    for text, label in iterator:
        optimizer.zero_grad()
        label = label.to(dtype=torch.float32)
        predictions = model(text)
        loss = criterion(predictions, label)
        
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

num_epochs = 50

losses = []
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, criterion)
    losses.append(float(train_loss))
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}')
    if epoch == 20:
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        

Epoch: 01, Train Loss: 2.165
Epoch: 02, Train Loss: 2.160
Epoch: 03, Train Loss: 2.157
Epoch: 04, Train Loss: 2.153
Epoch: 05, Train Loss: 2.150
Epoch: 06, Train Loss: 2.147
Epoch: 07, Train Loss: 2.140
Epoch: 08, Train Loss: 2.140
Epoch: 09, Train Loss: 2.134
Epoch: 10, Train Loss: 2.129
Epoch: 11, Train Loss: 2.125
Epoch: 12, Train Loss: 2.117
Epoch: 13, Train Loss: 2.109
Epoch: 14, Train Loss: 2.107
Epoch: 15, Train Loss: 2.100
Epoch: 16, Train Loss: 2.112
Epoch: 17, Train Loss: 2.096
Epoch: 18, Train Loss: 2.096
Epoch: 19, Train Loss: 2.133
Epoch: 20, Train Loss: 2.110
Epoch: 21, Train Loss: 2.094
Epoch: 22, Train Loss: 2.039
Epoch: 23, Train Loss: 2.032
Epoch: 24, Train Loss: 2.025
Epoch: 25, Train Loss: 2.022
Epoch: 26, Train Loss: 2.018
Epoch: 27, Train Loss: 2.016
Epoch: 28, Train Loss: 2.013
Epoch: 29, Train Loss: 2.010
Epoch: 30, Train Loss: 2.009
Epoch: 31, Train Loss: 2.007
Epoch: 32, Train Loss: 2.006
Epoch: 33, Train Loss: 2.005
Epoch: 34, Train Loss: 2.003
Epoch: 35, Tra