###  Sentiment Analysis using a GRU-based encoder with GloVe embeddings

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from torch.utils.data import DataLoader, Dataset
print(torch.__version__)
print(torch.cuda.is_available())
# torch.cuda.get_device_name() raises when no CUDA device is available, so only
# query it on a GPU runtime; the printed text is unchanged when a GPU is present.
print('Using', torch.cuda.get_device_name() if torch.cuda.is_available() else 'CPU')

1.13.1+cu116
True
Using Tesla T4


In [None]:
# Mount Google Drive inside the Colab VM at /content/drive.
# Later cells read and write files under ./drive/MyDrive/IMDB (a relative path;
# presumably the working directory is /content - confirm).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def load_reviews(root_dir):
    """Read every review file under ``root_dir/pos`` and ``root_dir/neg``.

    Args:
        root_dir: Directory that contains the ``pos`` and ``neg`` sub-folders.

    Returns:
        A DataFrame with a ``review`` column (file contents, stripped of
        surrounding whitespace) and a ``label`` column (1 for ``pos``,
        0 for ``neg``).
    """
    # Collect plain tuples and build the frame once: appending with
    # df.loc[len(df)] re-allocates on every row and is quadratic overall.
    records = []
    for folder in ['pos', 'neg']:
        folder_path = os.path.join(root_dir, folder)
        label = 1 if folder == 'pos' else 0
        # os.listdir order is arbitrary; sorting keeps the row order reproducible.
        for file in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, file)
            # Explicit encoding so the result does not depend on the platform default.
            with open(file_path, 'r', encoding='utf-8') as f:
                records.append((f.read().strip(), label))
    return pd.DataFrame(records, columns=['review', 'label'])


root_dir = './drive/MyDrive/IMDB/train'
df_train = load_reviews(root_dir)

root_dir = './drive/MyDrive/IMDB/test'
df_test = load_reviews(root_dir)

In [None]:
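# One-off cache step: uncomment and run once after df_train/df_test have been built from
# the raw review folders. The next cell reads these two CSV files back.
# df_train.to_csv('./drive/MyDrive/IMDB/imdb_train.txt', header=True, index=False)
# df_test.to_csv('./drive/MyDrive/IMDB/imdb_test.txt', header=True, index=False)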

In [None]:
# Load the train/test tables from their CSV copies on Drive.
# These are the files the commented-out to_csv calls in the previous cell would write,
# so they must already exist for this cell to run.
df_train = pd.read_csv('./drive/MyDrive/IMDB/imdb_train.txt')
df_test = pd.read_csv('./drive/MyDrive/IMDB/imdb_test.txt')

In [None]:
from sklearn.model_selection import train_test_split

# Separate features from labels; .copy() keeps df_train/df_test intact so this
# cell can be re-run without reloading the data.
X_train = df_train.copy()
y_train = X_train.pop('label')
X_test = df_test.copy()
y_test = X_test.pop('label')
# Hold out 20% of the test split as a validation set, stratified by label.
# random_state pins the shuffle so the val/test membership (and therefore every
# reported metric) is the same on each run.
# NOTE(review): the validation data is carved out of the test split rather than
# the training split - confirm this is intended.
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test,
                                                test_size=0.2, stratify=y_test,
                                                random_state=42)

In [None]:
class IMDBDataset(Dataset):
    """Map-style dataset that pairs each review string with its label.

    Args:
        X: Table-like object whose ``'review'`` entry holds the review texts.
        y: Sequence of labels, aligned position-by-position with ``X``.
    """

    def __init__(self, X, y):
        # Stored as NumPy arrays, so items are looked up by position.
        self.X = np.array(X['review'])
        self.y = np.array(y)

    def __len__(self):
        """Number of reviews held by the dataset."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(text, label)`` pair stored at ``idx``."""
        position = np.array(idx)
        return self.X[position], self.y[position]

In [None]:
class Vocab():
    """Word-level vocabulary built from a dataset of ``(text, label)`` pairs.

    Index 0 is reserved for ``<PAD>`` and index 1 for ``<UNK>``; real tokens
    are numbered from 2 in order of decreasing frequency.

    Args:
        dataset: Indexable object with ``len()`` whose items are
            ``(text, label)`` pairs.
        max_size: Upper bound on the vocabulary size, special tokens included.
        min_freq: Minimum number of occurrences a token needs to be kept.
    """

    def __init__(self, dataset, max_size=float('inf'), min_freq=2):
        self.dataset = dataset
        self.max_size = max_size
        self.spacy_eng = spacy.load('en_core_web_sm')
        self.min_freq = min_freq
        self.itos = {0:'<PAD>', 1:'<UNK>'}
        self.stoi = {v:k for k, v in self.itos.items()}

    def __len__(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.itos)

    def tokenize(self, text):
        """Split ``text`` with the spaCy tokenizer and lower-case every token."""
        return [token.text.lower() for token in self.spacy_eng.tokenizer(text)]

    def numericalize(self, text):
        """Convert ``text`` to a list of indices; unknown tokens map to ``<UNK>``."""
        unk = self.stoi['<UNK>']
        return [self.stoi.get(token, unk) for token in self.tokenize(text)]

    def idx_to_token(self, numericalized):
        """Map a sequence of indices back to their token strings."""
        return [self.itos[num] for num in numericalized]

    def _count_tokens(self):
        """Count how often each token occurs across the whole dataset."""
        freqs = Counter()
        for i in range(len(self.dataset)):
            review, _ = self.dataset[i]
            freqs.update(self.tokenize(review))
        return freqs

    def _reset_to_specials(self):
        """Drop previously built entries so a rebuild cannot leave stale indices."""
        self.itos = {0:'<PAD>', 1:'<UNK>'}
        self.stoi = {v:k for k, v in self.itos.items()}

    def build_vocab(self):
        """Populate the vocabulary from token frequencies in ``self.dataset``.

        Tokens are added most-frequent first until ``max_size`` is reached or
        the remaining tokens occur fewer than ``min_freq`` times.
        """
        freqs = self._count_tokens()
        self._reset_to_specials()
        idx = len(self.itos)
        for w, count in freqs.most_common():
            # most_common() is sorted by decreasing count, so nothing further qualifies.
            if count < self.min_freq:
                break
            # Checked before inserting so that max_size <= 2 also stops the loop.
            if idx >= self.max_size:
                break
            self.itos[idx] = w
            self.stoi[w] = idx
            idx += 1

    def build_vocab_glove(self, dim=200, init_emb=None):
        """Populate the vocabulary with tokens that have a GloVe vector.

        Reads ``glove.6B.{dim}d.txt`` from the current working directory.

        Args:
            dim: Dimensionality of the GloVe file to read.
            init_emb: Optional mapping whose values are the embedding rows for
                the special tokens, in index order (``<PAD>`` then ``<UNK>``).
                When omitted or empty, all-zero rows are used.

        Returns:
            A tensor with one row per vocabulary entry, so row ``i`` is the
            vector of ``self.itos[i]``.

        Raises:
            ValueError: If ``init_emb`` is non-empty but does not provide
                exactly one row per special token.
        """
        freqs = self._count_tokens()
        self._reset_to_specials()
        n_special = len(self.itos)

        if init_emb:
            embeddings = list(init_emb.values())
            if len(embeddings) != n_special:
                raise ValueError(
                    f'init_emb must provide {n_special} rows (one per special token), '
                    f'got {len(embeddings)}'
                )
        else:
            # Without these rows the matrix would be shifted by two relative to the indices.
            embeddings = [[0.0] * dim for _ in range(n_special)]

        # Parse only vectors that can end up in the vocabulary instead of
        # materialising every line of the GloVe file as Python floats.
        wanted = {w for w, count in freqs.items() if count >= self.min_freq}
        glove_dict = {}
        with open(f'glove.6B.{dim}d.txt', 'rt', encoding='utf-8') as fi:
            for line in fi:
                parts = line.rstrip().split(' ')
                w = parts[0]
                if w in wanted and len(parts) > 1:
                    glove_dict[w] = [float(val) for val in parts[1:]]

        idx = n_special
        for w, count in freqs.most_common():
            if count < self.min_freq or idx >= self.max_size:
                break
            if w in glove_dict:
                self.itos[idx] = w
                self.stoi[w] = idx
                embeddings.append(glove_dict[w])
                idx += 1
        return torch.tensor(embeddings)

In [None]:
# Wrap each split in an IMDBDataset so it can be indexed as (review text, label) pairs.
train_dataset = IMDBDataset(X_train, y_train)
test_dataset = IMDBDataset(X_test, y_test)
val_dataset = IMDBDataset(X_val, y_val)

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2023-01-11 19:58:01--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-01-11 19:58:02--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-01-11 19:58:02--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’


2

In [None]:
# Vocabulary over the training reviews: at most 30000 entries, tokens seen at least twice.
vocab = Vocab(dataset=train_dataset, max_size=30000, min_freq=2)
# vocab.build_vocab()
# One all-zero 200-d row per entry; passed to build_vocab_glove as the initial embedding rows.
init_emb = {name: torch.zeros(200).tolist() for name in ('pad', 'unk')}
embeddings = vocab.build_vocab_glove(dim=200, init_emb=init_emb)
print('Vocab size:', len(vocab))
print('Embedding size:', embeddings.shape)

Vocab size: 30000
Embedding size: torch.Size([30000, 200])


In [None]:
from torch.nn.utils.rnn import pad_sequence

def numericalize_and_pad_text(batch, padding_idx, batch_first=False, max_seq_len=512):
    """Collate ``(text, label)`` pairs into padded index tensors.

    Uses the module-level ``vocab`` to turn each text into indices and moves
    the results to the module-level ``device``.

    Args:
        batch: Iterable of ``(text, label)`` pairs.
        padding_idx: Index used to fill positions past the end of a sequence.
        batch_first: Forwarded to ``pad_sequence``.
        max_seq_len: Sequences are cut to at most this many tokens.

    Returns:
        Tuple ``(padded_text, labels, lengths)``, all on ``device``.
        ``lengths`` holds the length of each sequence after truncation.
    """
    sequences, labels, lengths = [], [], []
    for text, label in batch:
        token_ids = vocab.numericalize(text)[:max_seq_len]  # truncate at max_seq_len
        sequences.append(torch.tensor(token_ids))  # pad_sequence expects a list of tensors
        lengths.append(len(token_ids))
        labels.append(label)

    padded_text = pad_sequence(sequences, batch_first=batch_first, padding_value=padding_idx)
    label_tensor = torch.tensor(labels)
    length_tensor = torch.tensor(lengths)
    return padded_text.to(device), label_tensor.to(device), length_tensor.to(device)

In [None]:
from functools import partial

batch_size = 128
max_seq_len = 512
# Bind the collate arguments that are identical for every split.
collate_fn = partial(
    numericalize_and_pad_text,
    padding_idx=vocab.stoi["<PAD>"],
    max_seq_len=max_seq_len,
    batch_first=False,
)


def _make_loader(dataset, shuffle):
    """Build a DataLoader that uses the shared batch size and collate function."""
    return DataLoader(dataset=dataset, shuffle=shuffle,
                      batch_size=batch_size, collate_fn=collate_fn)


# Only the training data is shuffled.
trainloader = _make_loader(train_dataset, shuffle=True)
testloader = _make_loader(test_dataset, shuffle=False)
valloader = _make_loader(val_dataset, shuffle=False)

In [None]:
# Sanity-check the shape of one batch. Pulling a single batch avoids tokenizing
# the whole training set and printing one line per batch.
batch = next(iter(trainloader))
print(batch[0].shape, batch[1].shape)

torch.Size([512, 128]) torch.Size([128])
torch.Size([512, 128]) torch.Size([128])
torch.Size([512, 128]) torch.Size([128])
torch.Size([512, 128]) torch.Size([128])
torch.Size([512, 128]) torch.Size([128])
torch.Size([512, 128]) torch.Size([128])
torch.Size([512, 128]) torch.Size([128])
torch.Size([512, 128]) torch.Size([128])
torch.Size([512, 128]) torch.Size([128])
torch.Size([512, 128]) torch.Size([128])
torch.Size([512, 128]) torch.Size([128])
torch.Size([512, 128]) torch.Size([128])
torch.Size([512, 128]) torch.Size([128])
torch.Size([512, 128]) torch.Size([128])
torch.Size([512, 128]) torch.Size([128])
torch.Size([512, 128]) torch.Size([128])
torch.Size([512, 128]) torch.Size([128])
torch.Size([512, 128]) torch.Size([128])
torch.Size([512, 128]) torch.Size([128])
torch.Size([512, 128]) torch.Size([128])
torch.Size([512, 128]) torch.Size([128])
torch.Size([512, 128]) torch.Size([128])
torch.Size([512, 128]) torch.Size([128])
torch.Size([512, 128]) torch.Size([128])
torch.Size([512,

In [None]:
class Model(nn.Module):
    """Bidirectional 3-layer GRU encoder with a linear head producing one logit per sequence.

    Args:
        len_vocab: Number of rows in the embedding table.
        emb_dim: Size of each embedding vector.
        hidden_dim: Hidden size of each GRU direction.
        padding_idx: Embedding index reserved for padding.
    """

    def __init__(self, len_vocab, emb_dim, hidden_dim, padding_idx):
        super().__init__()
        self.embed = nn.Embedding(len_vocab, emb_dim, padding_idx = padding_idx)
        self.gru_layers = nn.GRU(emb_dim, hidden_dim, num_layers=3, bidirectional=True, dropout=0.5)
        self.fc = nn.Linear(2*hidden_dim, 1)
        # Registered as a submodule so model.train()/model.eval() switch it on and off.
        # A Dropout created inside forward() is always in training mode, which left
        # dropout active during evaluation and inference.
        self.dropout = nn.Dropout(0.5)

    def forward(self, text, seq_lens):
        """Return one logit per sequence.

        Args:
            text: Index tensor of shape (seq_len, batch_size).
            seq_lens: Length of every sequence in the batch.

        Returns:
            Tensor of shape (batch_size, 1) holding raw (pre-sigmoid) logits.
        """
        embedding = self.dropout(self.embed(text))   # (seq_len, batch_size) ->  (seq_len, batch_size, emb_dim)
        # Pack padded sequences so the GRU skips the padding positions
        # (pack_padded_sequence needs the lengths on the CPU).
        packed_embedding = nn.utils.rnn.pack_padded_sequence(embedding, seq_lens.to('cpu'), enforce_sorted=False)
        # Only the final hidden states are used, so the per-step outputs are not unpacked.
        _, hidden_all_layers = self.gru_layers(packed_embedding) # hidden = (2*layers, batch_size, hidden_dim) [no. of directions=2]

        # The last two entries are the forward and backward states of the top layer.
        hidden_last_layer = self.dropout(torch.cat((hidden_all_layers[-2,:,:], hidden_all_layers[-1,:,:]), dim = 1)) # hidden = (batch_size, 2*hidden_dim)
        return self.fc(hidden_last_layer)

In [None]:
# Look up the special-token indices and build the classifier on the selected device.
padding_idx = vocab.stoi['<PAD>']
unknown_index = vocab.stoi['<UNK>']  # not referenced again in the cells visible here
model = Model(len_vocab=len(vocab), emb_dim=200,
                 hidden_dim=256, padding_idx=padding_idx).to(device)
print(model)

Model(
  (embed): Embedding(30000, 200, padding_idx=0)
  (gru_layers): GRU(200, 256, num_layers=3, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
)


In [None]:
# Initialise the embedding table with the matrix returned by vocab.build_vocab_glove.
# copy_ under no_grad updates the parameter in place without going through the
# legacy `.data` attribute, and the `with` block keeps the cell from dumping the tensor.
with torch.no_grad():
    model.embed.weight.copy_(embeddings.to(device))

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0715,  0.0935,  0.0237,  ...,  0.3362,  0.0306,  0.2558],
        ...,
        [ 0.9558, -0.0636,  0.4445,  ...,  0.3814,  0.2111,  0.2643],
        [ 0.0190, -0.0739, -0.5298,  ...,  0.6844,  0.2365,  0.2638],
        [-0.1377,  0.8168,  0.4333,  ...,  0.4585, -0.4849, -0.5137]],
       device='cuda:0')

In [None]:
# Adam over all model parameters. BCEWithLogitsLoss works on the raw logit that
# Model.forward returns from its linear layer (no sigmoid is applied inside the model).
lr=0.001
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_fn = nn.BCEWithLogitsLoss().to(device)

In [None]:
# Count only the parameters that have requires_grad set.
params = sum(param.numel() for param in model.parameters() if param.requires_grad)
print(f'Trainable parameters: {params:,}')

Trainable parameters: 9,069,441


In [None]:
def train():
    """Run one optimisation pass over ``trainloader``.

    Uses the module-level ``model``, ``optimizer``, ``loss_fn`` and ``trainloader``.

    Returns:
        Tuple ``(mean_loss, accuracy)`` over all training samples seen in the pass.
    """
    total_loss, total_correct, total_seen = 0.0, 0, 0
    model.train()
    for batch in trainloader:
        optimizer.zero_grad()
        text, label, seq_lens = batch

        preds = model(text, seq_lens).squeeze(1)
        loss = loss_fn(preds, label.float()) # label is int64 and preds is float, so need to typecast

        loss.backward()
        optimizer.step()

        # Accumulate per-sample totals so a smaller final batch is not weighted
        # like a full one. Assumes loss_fn averages over the batch, as
        # BCEWithLogitsLoss() does by default.
        n_samples = len(label)
        # detach(): the accuracy bookkeeping does not need to be part of the autograd graph.
        hard_preds = torch.round(torch.sigmoid(preds.detach()))
        total_loss += loss.item() * n_samples
        total_correct += (hard_preds == label).sum().item()
        total_seen += n_samples

    return total_loss / total_seen, total_correct / total_seen

In [None]:
def evaluate(loader):
    """Compute loss and accuracy of the module-level ``model`` on ``loader``.

    Args:
        loader: Iterable yielding ``(text, label, seq_lens)`` batches.

    Returns:
        Tuple ``(mean_loss, accuracy)`` over all samples in ``loader``.
    """
    total_loss, total_correct, total_seen = 0.0, 0, 0
    model.eval()
    with torch.no_grad():
        for batch in loader:
            text, label, seq_lens = batch
            preds = model(text, seq_lens).squeeze(1)
            loss = loss_fn(preds, label.float())

            # Weight every batch by its size: the last batch is usually smaller,
            # and averaging per-batch means would over-weight its samples.
            # Assumes loss_fn averages over the batch, as BCEWithLogitsLoss() does by default.
            n_samples = len(label)
            hard_preds = torch.round(torch.sigmoid(preds))
            total_loss += loss.item() * n_samples
            total_correct += (hard_preds == label).sum().item()
            total_seen += n_samples

    return total_loss / total_seen, total_correct / total_seen

In [None]:
import time

def time_epoch(start, end):
    """Split the interval between two timestamps into whole minutes and seconds.

    Args:
        start: Start timestamp in seconds.
        end: End timestamp in seconds.

    Returns:
        Tuple ``(mins, secs)`` of integers; fractional seconds are truncated.
    """
    elapsed = end - start
    whole_minutes = int(elapsed / 60)
    leftover_seconds = int(elapsed - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

# Without pre-trained embeddings

In [None]:
from IPython import display
# Per-epoch metrics table; it is cleared and re-rendered after every epoch.
res = pd.DataFrame(columns=['Epoch', 'Train Loss', 'Train Acc', 'Val Loss', 'Val Acc', 'Time'])
display.display(res)

# NOTE(review): when the notebook is run top to bottom, the earlier cell that copies
# `embeddings` into model.embed has already executed, so this run only starts from
# random embeddings if that cell is skipped - confirm the intended order.
best_val_loss = float('inf')
for epoch in range(8):

    start = time.time()
    train_loss, train_acc = train()
    val_loss, val_acc = evaluate(valloader)
    end = time.time()
    mins, secs = time_epoch(start, end)

    # Keep only the weights of the epoch with the lowest validation loss.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), './drive/MyDrive/IMDB/gru++.pth')

    # wait=True delays clearing until the new table is ready, so the output does not flicker.
    display.clear_output(wait=True)
    res.loc[len(res)] = [epoch+1, f'{train_loss:.3f}', f'{train_acc*100:.2f}%', f'{val_loss:.3f}', f'{val_acc*100:.2f}%', f'{mins}min {secs}s']
    display.display(res)

Unnamed: 0,Epoch,Train Loss,Train Acc,Val Loss,Val Acc,Time
0,1,0.639,62.82%,0.653,67.34%,1min 13s
1,2,0.476,77.00%,0.465,77.89%,1min 12s
2,3,0.368,83.75%,0.377,83.12%,1min 13s
3,4,0.298,87.41%,0.333,85.74%,1min 12s
4,5,0.261,89.27%,0.325,86.15%,1min 14s
5,6,0.221,91.02%,0.308,87.48%,1min 12s
6,7,0.196,92.07%,0.315,86.97%,1min 13s
7,8,0.175,93.11%,0.334,86.68%,1min 13s


In [None]:
# Restore the weights saved at the epoch with the lowest validation loss and score
# them on the held-out test loader. map_location lets a checkpoint written on a GPU
# runtime load on whatever `device` is active now (e.g. a CPU-only session).
model.load_state_dict(torch.load('./drive/MyDrive/IMDB/gru++.pth', map_location=device))
test_loss, test_acc = evaluate(testloader)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.303 | Test Acc: 87.64%


In [None]:
# Kept from the original cell; sentiment_analysis itself tokenizes through `vocab`.
spacy_eng = spacy.load('en_core_web_sm')

def sentiment_analysis(text):
    """Classify a single piece of text with the module-level ``model``.

    Args:
        text: Raw input string.

    Returns:
        A string of the form ``'<probability> => positive|negative'``, where the
        probability is the sigmoid of the model's logit rounded to 3 decimals.

    Raises:
        ValueError: If ``text`` yields no tokens (the packed GRU input cannot be empty).
    """
    model.eval()
    numericalized = vocab.numericalize(text)
    if not numericalized:
        raise ValueError('text must contain at least one token')
    inp_idx = torch.LongTensor(numericalized).unsqueeze(1).to(device)  # (seq_len, 1)
    seq_len = torch.LongTensor([len(numericalized)]).to(device)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        prob = torch.sigmoid(model(inp_idx, seq_len)).item()
    feel = 'positive' if prob > 0.5 else 'negative'
    return f'{round(prob, 3)} => {feel}'

# results are a little stochastic, might be a good idea to take an average
print(sentiment_analysis('Me neither! I am so annoyed because my laptop is new, and yet I have not been able to get this new OS to work.'))
print(sentiment_analysis('Good morning Bangalore, great to see a bright and sunny day here.'))

0.129 => negative
0.516 => positive


# GloVe embeddings

In [None]:
from IPython import display

# Start the GloVe experiment from a freshly initialised network. Without this, a
# top-to-bottom run would keep training the weights reloaded for the previous
# experiment, with the optimizer state left over from it.
model = Model(len_vocab=len(vocab), emb_dim=200,
              hidden_dim=256, padding_idx=padding_idx).to(device)
# Initialise the embedding table with the GloVe matrix built alongside the vocabulary.
with torch.no_grad():
    model.embed.weight.copy_(embeddings.to(device))
# The optimizer must be rebuilt so that it tracks the new model's parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Per-epoch metrics table; it is cleared and re-rendered after every epoch.
res = pd.DataFrame(columns=['Epoch', 'Train Loss', 'Train Acc', 'Val Loss', 'Val Acc', 'Time'])
display.display(res)

best_val_loss = float('inf')
for epoch in range(8):

    start = time.time()
    train_loss, train_acc = train()
    val_loss, val_acc = evaluate(valloader)
    end = time.time()
    mins, secs = time_epoch(start, end)

    # Keep only the weights of the epoch with the lowest validation loss.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), './drive/MyDrive/IMDB/gru_glove++.pth')

    display.clear_output(wait=True)
    res.loc[len(res)] = [epoch+1, f'{train_loss:.3f}', f'{train_acc*100:.2f}%', f'{val_loss:.3f}', f'{val_acc*100:.2f}%', f'{mins}min {secs}s']
    display.display(res)

Unnamed: 0,Epoch,Train Loss,Train Acc,Val Loss,Val Acc,Time
0,1,0.615,65.29%,0.612,75.12%,1min 15s
1,2,0.358,84.99%,0.312,86.76%,1min 12s
2,3,0.234,90.84%,0.297,88.09%,1min 13s
3,4,0.18,93.20%,0.361,87.64%,1min 13s
4,5,0.147,94.68%,0.364,87.48%,1min 15s
5,6,0.109,96.24%,0.395,86.58%,1min 13s
6,7,0.089,96.94%,0.427,85.23%,1min 13s
7,8,0.074,97.53%,0.486,85.92%,1min 15s


In [None]:
# Restore the GloVe run's best-validation checkpoint and score it on the test loader.
# map_location lets a checkpoint written on a GPU runtime load on whatever `device`
# is active now (e.g. a CPU-only session).
model.load_state_dict(torch.load('./drive/MyDrive/IMDB/gru_glove++.pth', map_location=device))
test_loss, test_acc = evaluate(testloader)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.302 | Test Acc: 87.97%


In [None]:
# Kept from the original cell; sentiment_analysis itself tokenizes through `vocab`.
spacy_eng = spacy.load('en_core_web_sm')

def sentiment_analysis(text):
    """Classify a single piece of text with the module-level ``model``.

    Args:
        text: Raw input string.

    Returns:
        A string of the form ``'<probability> => positive|negative'``, where the
        probability is the sigmoid of the model's logit rounded to 3 decimals.

    Raises:
        ValueError: If ``text`` yields no tokens (the packed GRU input cannot be empty).
    """
    model.eval()
    numericalized = vocab.numericalize(text)
    if not numericalized:
        raise ValueError('text must contain at least one token')
    inp_idx = torch.LongTensor(numericalized).unsqueeze(1).to(device)  # (seq_len, 1)
    seq_len = torch.LongTensor([len(numericalized)]).to(device)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        prob = torch.sigmoid(model(inp_idx, seq_len)).item()
    feel = 'positive' if prob > 0.5 else 'negative'
    return f'{round(prob, 3)} => {feel}'

# results are a little stochastic, might be a good idea to take an average
print(sentiment_analysis('Me neither! I am so annoyed because my laptop is new, and yet I have not been able to get this new OS to work.'))
print(sentiment_analysis('Good morning Bangalore, great to see a bright and sunny day here.'))

0.168 => negative
0.948 => positive
