In [1]:
import numpy as np
import pandas as pd
import torch
import torchtext
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pyprind

### The dataset can be downloaded through this link - https://www.kaggle.com/columbine/imdb-dataset-sentiment-analysis-in-csv-format

# Loading and preprocessing the data

In [None]:
class CreateDataset(torch.utils.data.Dataset):

    def __init__(self, root_dir, batch_size=32):
        self.root_dir = root_dir
        self.batch_size = batch_size
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.spacy = spacy.load("en_core_web_sm")

        self.TEXT = torchtext.data.Field(sequential=True, tokenize="spacy")
        self.LABEL = torchtext.data.LabelField(dtype=torch.long, sequential=False)

        self.initData()
        self.initEmbed()

        self.makeData()

    def initData(self):
        
        df_path = self.root_dir + 'imdb-dataset-sentiment-analysis-in-csv-format'

        self.train_data, self.valid_data, self.test_data = torchtext.data.TabularDataset.splits(
                        path=df_path, 
                        train="Train.csv", validation="Valid.csv", test="Test.csv", 
                        format="csv", 
                        skip_header=True, 
                        fields=[('Text', self.TEXT), ('Label', self.LABEL)])

    def initEmbed(self):
        
        embed_path = self.root_dir + 'glove6b300dtxt/glove.6B.300d.txt'

        self.TEXT.build_vocab(self.train_data,
                         vectors=torchtext.vocab.Vectors(embed_path), 
                         max_size=20000, 
                         min_freq=10)
        self.LABEL.build_vocab(self.train_data)

    def makeData(self):
        self.train_iterator, self.valid_iterator, self.test_iterator = torchtext.data.BucketIterator.splits(
                        (self.train_data, self.valid_data, self.test_data), 
                        sort_key=lambda x: len(x.Text), 
                        batch_size=self.batch_size,
                        device=self.device)

    def lengthData(self):
        return len(self.train_data), len(self.valid_data), len(self.test_data)
    
    def lengthVocab(self):
        return len(self.TEXT.vocab), len(self.LABEL.vocab)

    def freqLABEL(self):
        return self.LABEL.vocab.freqs

    def getData(self):
        return self.train_iterator, self.valid_iterator, self.test_iterator

    def getEmbeddings(self):
        return self.TEXT.vocab.vectors

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dataset = CreateDataset('../input/')
train_iterator, valid_iterator, test_iterator = dataset.getData()
pretrained_embeddings = dataset.getEmbeddings()
pretrained_embeddings.to(device)

100%|█████████▉| 399999/400000 [00:54<00:00, 7362.74it/s]


tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0466,  0.2132, -0.0074,  ...,  0.0091, -0.2099,  0.0539],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:0')

# Models

In [3]:
# RNN from linear layers. Since this model only takes one input we will have to run a loop to input the words one by one.
class RNN_fc(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(RNN_fc, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.i2h = nn.Linear(input_dim  +  hidden_dim, hidden_dim)
        self.i2o = nn.Linear(input_dim + hidden_dim, output_dim)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs, hidden):
        combined = torch.cat((inputs, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)    


# RNN using nn.Module ( got an accuracy of 86% on the test set, similar to LSTM but takes significantly lesser time to train.)
class RNN(torch.nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        
        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
        self.rnn = torch.nn.RNN(embedding_dim, hidden_dim)
        self.linear = torch.nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        
        embedded = self.embedding(text)
        
        output, hidden = self.rnn(embedded)
        output = torch.mean(output, dim = 0)  # taking the average of all the ouputs after each word. This significantly improved the accuracy compared to just taking the last output 
        out = self.linear(output)
        return out


# LSTM using nn.Module ( got an accuracy of 85% on the test set. I trained it for only 20 epohs and I think the accuracy would increase if we train it for longer)
class LSTM(torch.nn.Module): 
    def __init__(self, input_dim, embedding_dim, num_layers, hidden_dim, dropout = 0.2, bidirectional = False):
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.bidirectional = bidirectional
        self.dropout = torch.nn.Dropout(p=dropout)

        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
        self.embedding.load_state_dict({'weight': pretrained_embeddings})
        self.embedding.weight.requires_grad = False

        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim, 
                                         num_layers=num_layers,
                                         bidirectional=bidirectional,
                                         dropout=dropout)
        if bidirectional:
            self.linear = torch.nn.Linear(hidden_dim*num_layers*2, 2)
        else:
            self.linear = torch.nn.Linear(hidden_dim*num_layers, 2)
    def forward(self, text):
        embedded = self.embedding(text)
        lstm_out, (hidden, cell) = self.lstm(embedded)
        out = self.linear(self.dropout(torch.cat([cell[i,:, :] for i in range(cell.shape[0])], dim=1)))  # concatnating the cell state from all the layers and directions
        return out



# Hyperparameters

In [5]:
input_dim = dataset.lengthVocab()[0]
embedding_dim = 300
hidden_dim = 256
output_dim = 2
num_layers = 2
batch_size = 32
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTM(input_dim, embedding_dim, num_layers, hidden_dim, bidirectional = True)  # using a bidirectional 2-layer lstm
model.to(device)

LSTM(
  (dropout): Dropout(p=0.2, inplace=False)
  (embedding): Embedding(20002, 300)
  (lstm): LSTM(300, 256, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (linear): Linear(in_features=1024, out_features=2, bias=True)
)

In [6]:
optimizer = optim.SGD(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
model = model.to(device)
criterion = criterion.to(device)

In [8]:
def accuracy(preds, y):

    preds, ind= torch.max(F.softmax(preds, dim=-1), 1)
    correct = (ind == y).float()
    acc = correct.sum()/float(len(correct))
    return acc

def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    bar = pyprind.ProgBar(len(iterator), bar_char='█')
    for batch in iterator:
        
        optimizer.zero_grad()
                
        predictions = model(batch.Text).squeeze(0)

        loss = criterion(predictions, batch.Label)

        acc = accuracy(predictions, batch.Label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        bar.update()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:

            predictions = model(batch.Text).squeeze(0)
            
            loss = criterion(predictions, batch.Label)
            
            acc = accuracy(predictions, batch.Label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
            
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

# Training the LSTM model

In [9]:
epochs = 20
best_acc = 0
for epoch in range(epochs):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    if valid_acc > best_acc:
        torch.save(model.state_dict(), 'weights_lstm_sentiment.pth')
    print(f'Epoch: {epoch+1} \t Train Loss: {train_loss:.3f}  \t Train Acc: {train_acc*100:.2f}% \nVal. Loss: {valid_loss:.3f} \t Val. Acc: {valid_acc*100:.2f}% ')


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:29


Epoch: 1 	 Train Loss: 0.689  	 Train Acc: 54.02% 
Val. Loss: 0.686 	 Val. Acc: 53.74% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:30


Epoch: 2 	 Train Loss: 0.681  	 Train Acc: 57.61% 
Val. Loss: 0.678 	 Val. Acc: 56.55% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:29


Epoch: 3 	 Train Loss: 0.669  	 Train Acc: 60.03% 
Val. Loss: 0.664 	 Val. Acc: 59.65% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:30


Epoch: 4 	 Train Loss: 0.647  	 Train Acc: 62.47% 
Val. Loss: 0.634 	 Val. Acc: 64.23% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:29


Epoch: 5 	 Train Loss: 0.619  	 Train Acc: 65.48% 
Val. Loss: 0.578 	 Val. Acc: 69.53% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:28


Epoch: 6 	 Train Loss: 0.601  	 Train Acc: 68.09% 
Val. Loss: 0.544 	 Val. Acc: 75.10% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:30


Epoch: 7 	 Train Loss: 0.657  	 Train Acc: 60.43% 
Val. Loss: 0.686 	 Val. Acc: 53.03% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:31


Epoch: 8 	 Train Loss: 0.692  	 Train Acc: 53.72% 
Val. Loss: 0.680 	 Val. Acc: 55.06% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:34


Epoch: 9 	 Train Loss: 0.679  	 Train Acc: 57.06% 
Val. Loss: 0.663 	 Val. Acc: 60.43% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:32


Epoch: 10 	 Train Loss: 0.660  	 Train Acc: 60.43% 
Val. Loss: 0.669 	 Val. Acc: 59.36% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:31


Epoch: 11 	 Train Loss: 0.642  	 Train Acc: 63.84% 
Val. Loss: 0.651 	 Val. Acc: 63.95% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:35


Epoch: 12 	 Train Loss: 0.632  	 Train Acc: 64.80% 
Val. Loss: 0.572 	 Val. Acc: 71.95% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:32


Epoch: 13 	 Train Loss: 0.612  	 Train Acc: 67.00% 
Val. Loss: 0.536 	 Val. Acc: 73.71% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:33


Epoch: 14 	 Train Loss: 0.592  	 Train Acc: 69.10% 
Val. Loss: 0.660 	 Val. Acc: 63.38% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:30


Epoch: 15 	 Train Loss: 0.576  	 Train Acc: 69.80% 
Val. Loss: 0.453 	 Val. Acc: 80.25% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:33


Epoch: 16 	 Train Loss: 0.461  	 Train Acc: 79.66% 
Val. Loss: 0.420 	 Val. Acc: 81.53% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:33


Epoch: 17 	 Train Loss: 0.405  	 Train Acc: 82.54% 
Val. Loss: 0.388 	 Val. Acc: 82.76% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:30


Epoch: 18 	 Train Loss: 0.373  	 Train Acc: 84.25% 
Val. Loss: 0.345 	 Val. Acc: 85.49% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:35


Epoch: 19 	 Train Loss: 0.358  	 Train Acc: 84.95% 
Val. Loss: 0.342 	 Val. Acc: 85.67% 


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:31


Epoch: 20 	 Train Loss: 0.349  	 Train Acc: 85.47% 
Val. Loss: 0.340 	 Val. Acc: 85.51% 


In [10]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'Val. Loss: {test_loss:.3f} \t Val. Acc: {test_acc*100:.2f}% ')

Val. Loss: 0.358 	 Val. Acc: 84.75% 


In [11]:
model.load_state_dict(torch.load('./weights_lstm_sentiment.pth'))
train_loss, train_acc = evaluate(model, train_iterator, criterion)
print(f'train. Loss: {train_loss:.3f} \t train. Acc: {train_acc*100:.2f}% ')
val_loss, val_acc = evaluate(model, valid_iterator, criterion)
print(f'Val. Loss: {val_loss:.3f} \t Val. Acc: {val_acc*100:.2f}% ')
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'test. Loss: {test_loss:.3f} \t test. Acc: {test_acc*100:.2f}% ')

train. Loss: 0.353 	 train. Acc: 85.10% 
Val. Loss: 0.340 	 Val. Acc: 85.51% 
test. Loss: 0.358 	 test. Acc: 84.75% 
