In [1]:
import numpy as np
import random
import torch
import os
import spacy
from torchtext.vocab import GloVe, FastText
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, accuracy_score
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import csv

import csv

pos_set = []
neg_set = []
neutral_set = []
with open("/data/home/ayyoobmohd/DLNLP/Glove-and-Sentiments/data/Dataset0.csv", encoding='utf-8') as csvf:
    data = csv.DictReader(csvf)
    #data['Review'] = data['Review'].apply(lambda x:remove_punctuation(x))
    for rows in data:
        # Removing punctuations
        chars_to_remove = [ '+', '#', '¡', '§', '…','‘', '’', '¿', '«', '»', '¨', '%', '-', '“', '”', '--', '`', '~', '<', '>', '*', '{', '}', '^', '=', '_', '[', ']', '|', '- ', '/']
        
        review = rows['Review'].replace('<br />', " ", -1)
        review = review.replace('´', "'", -1)
        for char in chars_to_remove:
            review = review.replace(char, " ", -1)
        
        
        if rows['Label'] == 'positive':
            pos_set.append(rows['Review'])
        elif rows['Label'] == 'negative':
            neg_set.append(rows['Review'])
        else:
            neutral_set.append(rows['Review'])

2023-08-18 15:13:05.672897: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-18 15:13:06.790584: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-18 15:13:06.790839: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your 

In [2]:
def set_seed(seed = 42):
    '''
        For Reproducibility: Sets the seed of the entire notebook.
    '''

    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
    # Sets a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed(1)

In [3]:
from torchtext.data import get_tokenizer

# Downloads GloVe and FastText
global_vectors = GloVe(name='840B', dim=300)

# ----------- Text Preprocessing -----------
nlp = spacy.load("en_core_web_md")

data_set = []
vocab = []
tokenizer = get_tokenizer("basic_english")

for line in pos_set:

    # Tokenizes the input text into words
    tokens = tokenizer(line)

    data_set.append((tokens, 0))
    # Adds the extracted words to a list
    vocab.extend(tokens)


print("--- Positive Finished ---")

for line in neg_set:

    # Tokenizes the input text into words
    tokens = tokenizer(line)

    data_set.append((tokens, 1))
    # Adds the extracted words to a list
    vocab.extend(tokens)

print("--- Negative Finished ---")

for line in neutral_set:

    # Tokenizes the input text into words
    tokens = tokenizer(line)

    data_set.append((tokens, 2))
    # Adds the extracted words to a list
    vocab.extend(tokens)

print("--- Negative Finished ---")

--- Positive Finished ---
--- Negative Finished ---
--- Negative Finished ---


In [4]:
# Sorting the samples based on their sequence length
def sort_key(s):
    return len(s[0])
    
#data_set = sorted(data_set, key=sort_key)   # Sorting did not gave better result

In [5]:
# Stores all the unique words in the dataset and their frequencies
vocabulary = {}

# Calculates the frequency of each unique word in the vocabulary
for word in vocab:
    if word in vocabulary:
        vocabulary[word] += 1
    else:
        vocabulary[word] = 1

print("Number of unique words in the vocabulary: ", len(vocabulary))

# Stores the integer token for each unique word in the vocabulary
ids_vocab = {}

id = 0

# Assigns words in the vocabulary to integer tokens
for word, v in vocabulary.items():
    ids_vocab[word] = id
    id += 1

Number of unique words in the vocabulary:  9542


In [6]:
# Tokenization function
def tokenize(corpus, ids_vocab):
    """
        Converts words in the dataset to integer tokens
    """

    tokenized_corpus = []
    for line, sentiment in corpus:
        new_line = []
        for i, word in enumerate(line):
            if word in ids_vocab and (i == 0 or word != line[i-1]):
                new_line.append(ids_vocab[word])

        new_line = torch.Tensor(new_line).long()
        tokenized_corpus.append((new_line, sentiment))

    return tokenized_corpus

token_corpus = tokenize(data_set, ids_vocab)

In [7]:
# Loading the embedding matrix
emb_dim = 300

embeds = torch.zeros(len(ids_vocab) + 1, emb_dim)

n = 0
for token, idx in ids_vocab.items():
    embeds[idx] = global_vectors[token]

    if sum(embeds[idx]) == 0:
        embeds[idx] = torch.rand(300)
        n+=1
print(n)

1739


In [8]:
# Train-Valid split of 90-10
def split_indices(n, val_pct):

    # Determine size of Validation set
    n_val = int(val_pct * n)

    # Create random permutation of 0 to n-1
    idxs = np.random.permutation(n)
    #return np.sort(idxs[n_val:]), np.sort(idxs[:n_val])
    return idxs[n_val:], idxs[:n_val]

train_pos_indices, val_pos_indices = split_indices(len(pos_set), 0.1)
train_neg_indices, val_neg_indices = split_indices(len(neg_set), 0.1)
train_neutral_indices, val_neutral_indices = split_indices(len(neutral_set), 0.1)

# train_indices = np.concatenate((train_pos_indices, train_neg_indices+len(pos_set)-1))
# val_indices = np.concatenate((val_pos_indices, val_neg_indices+len(pos_set)-1))
train_indices = np.concatenate((train_pos_indices, train_neg_indices, train_neutral_indices))
val_indices = np.concatenate((val_pos_indices, val_neg_indices, train_neutral_indices))

In [9]:
from torch.nn.utils.rnn import pad_sequence

# ----------- Batching the data -----------
def collate_fn(instn):

    sentence = [x[0] for x in instn]

    # Pre padding
    sen_len = [len(x[0]) for x in instn]
    max_len = max(sen_len)

    padded_sent = torch.zeros(1, max_len)
    sentence_pad = [torch.cat((torch.zeros(max_len-len(x[0])), x[0]), dim=0) for x in instn]
    
    for i in sentence_pad:
        padded_sent = torch.cat((padded_sent, i.unsqueeze(dim=0)), dim=0)
    padded_sent = padded_sent[1:].long()

    # Post padding
    #padded_sent = pad_sequence(sentence, batch_first=True, padding_value=0)

    labels = torch.Tensor([x[1] for x in instn])

    return (padded_sent, labels)


batch_size = 128

train_sampler   = SubsetRandomSampler(train_indices)
train_loader    = DataLoader(token_corpus, batch_size, sampler=train_sampler, collate_fn=collate_fn)

val_sampler     = SubsetRandomSampler(val_indices)
val_loader      = DataLoader(token_corpus, batch_size, sampler=val_sampler, collate_fn=collate_fn)

In [10]:
# ----------- Model -----------
class BILSTM(nn.Module):
    
    def __init__(self, embeds):
        super().__init__()

        self.embeddings = nn.Embedding.from_pretrained(embeds, padding_idx=0)

        self.gru = nn.GRU(input_size = 300, hidden_size = 128, num_layers = 2, batch_first = True, bidirectional = True, dropout=0.5)

        self.lin1 = nn.Linear(256, 64)
        self.lin2 = nn.Linear(64, 3)

        self.lin3 = nn.Linear(256, 1)

    def forward(self, xb):

        xe = self.embeddings(xb)
        out, y = self.gru(xe)
        
        x = self.lin3(out).squeeze(dim=-1)
        x = torch.softmax(x, dim=-1).unsqueeze(dim=1)
        x = torch.bmm(x, out).squeeze(dim=1)              # Weighted average

        #x = torch.cat((x, y[2][ :, :], y[3][ :, :]), dim = 1) # Tried concatenating the representation with hidden units - got similar results
        x = self.lin1(x)
        x = F.relu(x)
        x = self.lin2(x)
        x = torch.sigmoid(x)
        return x

In [17]:
device = "cuda" if torch.cuda.is_available() else "cpu"


model = BILSTM(embeds)
model.to(device)
opt_c = torch.optim.AdamW(model.parameters(), lr = 0.001) # Same as Adam with weight decay = 0.001
# loss_fn_c = F.cross_entropy #Tried Cross Entropy with log_softmax output function - gave similar results
loss_fn_c = F.cross_entropy

# ----------- Main Training Loop -----------
max_epoch = 10

best_test_acc = 0
for ep in range(max_epoch):

    epoch_loss = 0

    model.train()

    for xb, yb in tqdm(train_loader):
        yb = yb.type(torch.LongTensor)
        xb = xb.to(device)
        yb = yb.to(device)
        
        y_hat = model(xb)
#         print(y_hat.shape)
#         print(yb.shape)
#         y_hat = torch.argmax(y_hat, dim =1)
#         print(y_hat.shape)
#         print(yb.shape)
        loss = loss_fn_c(y_hat ,yb)

        loss.backward()

        opt_c.step()

        opt_c.zero_grad()

        nn.utils.clip_grad_norm_(model.parameters(), 5)

        epoch_loss += float(loss)

    print("Epoch: ", ep+1, " Training Loss: ", epoch_loss/len(train_loader))


    #----------- Validation -----------

    val_labels = []
    val_pred = []

    model.eval()

    val_epoch_loss = 0

    with torch.no_grad():
        for xb, yb in tqdm(val_loader):
            yb = yb.type(torch.LongTensor)
            xb = xb.to(device)
            yb = yb.to(device)

            y_hat = model(xb)
#             y_hat = torch.argmax(y_hat, dim =1)
            loss = loss_fn_c(y_hat,yb)

            val_epoch_loss += float(loss)

            val_labels.extend(torch.round(yb).cpu().detach().numpy())
            val_pred.extend(y_hat.round().cpu().detach().numpy())

    print("Validation loss: ", val_epoch_loss/len(val_loader))
    #print("Validation accuracy: ", accuracy_score(val_labels, val_pred)*100)

    if ep > 5 and prev_val_loss - val_epoch_loss > 0.015:
        print("Saving Model")
        torch.save(model.state_dict(), "best_model.pt")
    
    prev_val_loss = val_epoch_loss

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 212.13it/s]


Epoch:  1  Training Loss:  0.9089754655443388


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 372.75it/s]


Validation loss:  0.9620582282543182


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 212.89it/s]


Epoch:  2  Training Loss:  0.8552538711449196


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 378.92it/s]


Validation loss:  0.9572440594434738


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 216.40it/s]


Epoch:  3  Training Loss:  0.8561467035063381


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 381.56it/s]


Validation loss:  0.9441723197698593


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 216.96it/s]


Epoch:  4  Training Loss:  0.846838603759634


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 378.97it/s]


Validation loss:  0.9134273618459702


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 216.15it/s]


Epoch:  5  Training Loss:  0.8084861553948501


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 376.17it/s]


Validation loss:  0.8684382528066635


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 217.50it/s]


Epoch:  6  Training Loss:  0.7834040255382143


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 380.70it/s]


Validation loss:  0.835868564248085


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 216.85it/s]


Epoch:  7  Training Loss:  0.7661753790131931


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 378.01it/s]


Validation loss:  0.8389383971691131


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 216.54it/s]


Epoch:  8  Training Loss:  0.7588861584663391


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 374.31it/s]


Validation loss:  0.7858832269906998
Saving Model


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 211.55it/s]


Epoch:  9  Training Loss:  0.7388864488437258


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 374.07it/s]


Validation loss:  0.7638770163059234
Saving Model


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 212.33it/s]


Epoch:  10  Training Loss:  0.7118914415096415


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 373.03it/s]


Validation loss:  0.7311125338077545
Saving Model


In [None]:
z = torch.randn(128,3)
print(z.shape)
y = torch.argmax(z, axis = 1)
y.shape

In [None]:
# Tokenization function
def tokenize_test(corpus, ids_vocab):
    """
        Converts words in the dataset to integer tokens
    """

    tokenized_corpus = []
    for line, sentiment, idx in corpus:
        new_line = []
        for i, word in enumerate(line):
            if word in ids_vocab and (i == 0 or word != line[i-1]):
                new_line.append(ids_vocab[word])

        new_line = torch.Tensor(new_line).long()
        tokenized_corpus.append((new_line, sentiment, idx))

    return tokenized_corpus


In [None]:
# -------- Text Preprocessing ----------

test_set = []
with open("./E0334 Assignment2 Test Dataset.csv", encoding='utf-8') as csvf:
    data = csv.DictReader(csvf)

    for idx, rows in enumerate(data):

        # Removing punctuations
        chars_to_remove = [ '+', '#', '¡', '§', '…','‘', '’', '¿', '«', '»', '¨', '%', '-', '“', '”', '--', '`', '~', '<', '>', '*', '{', '}', '^', '=', '_', '[', ']', '|', '- ', '/']
        
        review = rows['review'].replace('<br />', " ", -1)
        review = review.replace('´', "'", -1)
        for char in chars_to_remove:
            review = review.replace(char, " ", -1)

        tokens = tokenizer(review)

        if rows['sentiment'] == 'positive':
            test_set.append((tokens, 1, idx))
        else:
            test_set.append((tokens, 0, idx))

#test_set = sorted(test_set, key=sort_key)

# ----------- Batching the data -----------
def collate_fn_test(instn):

    sentence = [x[0] for x in instn]

    # Pre padding
    sen_len = [len(x[0]) for x in instn]
    max_len = max(sen_len)

    padded_sent = torch.zeros(1, max_len)
    sentence_pad = [torch.cat((torch.zeros(max_len-len(x[0])), x[0]), dim=0) for x in instn]
    
    for i in sentence_pad:
        padded_sent = torch.cat((padded_sent, i.unsqueeze(dim=0)), dim=0)
    padded_sent = padded_sent[1:].long()

    # Post padding
    #padded_sent = pad_sequence(sentence, batch_first=True, padding_value=0)

    labels = torch.Tensor([x[1] for x in instn])

    idx = torch.Tensor([x[2] for x in instn])

    return (padded_sent, labels, idx)

token_corpus_test = tokenize_test(test_set, ids_vocab)

test_loader      = DataLoader(token_corpus_test, batch_size, collate_fn=collate_fn_test)

In [None]:
model = BILSTM(embeds)
model.load_state_dict(torch.load("best_model.pt"))
model.to(device)

test_labels = []
test_pred = []

model.eval()

test_epoch_loss = 0

n = 0
# ---------- Testing ----------
with torch.no_grad():
    for xb, yb, idx in tqdm(test_loader):
        xb = xb.to(device)
        yb = yb.to(device)

        y_hat = model(xb)
        loss = loss_fn_c(y_hat.squeeze(), yb)

        test_epoch_loss += float(loss)

        test_labels.extend(torch.round(yb).cpu().detach().numpy())
        test_pred.extend(y_hat.round().cpu().detach().numpy())

        for i, v in enumerate(torch.round(yb).cpu().detach().numpy()):
            if v != y_hat.round().cpu().detach().numpy()[i]:
                print(test_set[int(idx[i])])
                n += 1
print(n, 79*128)
print("Test loss: ", test_epoch_loss/len(test_loader))
print("Test accuracy: ", accuracy_score(test_labels, test_pred)*100)

In [None]:
# Seed doesn't work in Jupyter notebook, to replicate my results, kindly, run it as .py file