## LSTM for sentiment analysis using the IMDB dataset

Preparing data

In [1]:
import torch #importing pytorch
from torchtext import data #pytorch library for preprocessing
import torch.nn.functional as F
# Fix the random seed so that runs are repeatable
SEED = 1234
# Seed PyTorch's random number generator so that its random draws are the same on every run
torch.manual_seed(SEED)
# Ask cuDNN to use deterministic algorithms (only matters when running on a GPU)
torch.backends.cudnn.deterministic = True
# Tokenize text with the spaCy tokenizer; include_lengths=True makes every batch carry the
# sequence lengths alongside the token indices, which packed padded sequences need
TEXT = data.Field(tokenize = 'spacy',include_lengths=True) 
# Field for the labels, stored as float tensors
LABEL = data.LabelField(dtype = torch.float)

In [2]:
from torchtext import datasets # to get torch dataset
# Load the IMDb dataset (downloading it if necessary) and take its canonical train/test splits as torchtext datasets
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [3]:
# Check the sizes of the train and test splits
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 25000
Number of testing examples: 25000


In [4]:
# Inspect the first training example as a dict of its fields
print(vars(train_data.examples[0]))

{'text': ['So', ',', 'neighbor', 'was', 'killing', 'neighbor', '.', 'Reminds', 'me', 'of', 'Iraq', '.', 'As', 'I', 'watched', 'the', 'American', 'flag', '(', '50', 'stars', 'in', '1864', '?', ')', 'being', 'dragged', 'behind', 'the', 'horse', ',', 'I', 'realized', 'why', 'burning', 'that', 'piece', 'of', 'red', 'white', 'and', 'blue', 'does', "n't", 'upset', 'me', 'as', 'much', 'as', 'our', 'destruction', '/', 'indifference', 'to', 'the', 'Bill', 'of', 'Rights', '.', 'I', "'m", 'a', 'Southerner', ',', 'and', 'must', 'have', 'some', 'historical', 'memory.<br', '/><br', '/>Watching', 'the', 'Tobey', 'McGuire', 'character', 'learn', 'to', 'respect', 'the', 'dignity', 'of', 'a', 'former', 'slave', ',', 'as', 'he', 'looks', 'at', 'the', 'scalps', 'of', 'blacks', 'and', 'Germans', '(', 'his', 'ethnic', 'background', ')', 'being', 'wagered', 'at', 'a', 'poker', 'game', '.....', 'was', 'interesting', '.', 'Many', 'twists', 'in', 'this', 'movie', '.', 'The', 'wife', ',', 'who', 'is', 'forced', 

In [5]:
import random
# Carve a validation set out of the training data (80% train / 20% validation).
# random.seed() returns None, so it must not be passed as random_state directly:
# seed the global RNG first, then hand its state to split() so the same
# training/validation partition is produced on every run.
random.seed(SEED)
train_data, valid_data = train_data.split(split_ratio=0.8, random_state=random.getstate())

In [6]:
# Check the sizes of the train, validation and test splits
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 20000
Number of validation examples: 5000
Number of testing examples: 25000


In [7]:
# Build the vocabularies from the training set only.
# For the text field, keep at most the MAX_VOCAB_SIZE most frequent tokens.
MAX_VOCAB_SIZE = 25_000 

TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

In [8]:
# Number of entries in the TEXT and LABEL vocabularies
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [9]:
# Show the 20 most common tokens in the vocabulary together with their frequencies
print(TEXT.vocab.freqs.most_common(20))

[('the', 232582), (',', 221092), ('.', 189740), ('and', 125326), ('a', 124882), ('of', 114980), ('to', 107347), ('is', 87551), ('in', 70313), ('I', 62454), ('it', 61401), ('that', 56774), ('"', 50544), ("'s", 49584), ('this', 48507), ('-', 42205), ('/><br', 40788), ('was', 40044), ('as', 34706), ('with', 34223)]


In [10]:
# The vocabulary can also be inspected directly through its stoi (string to int)
# and itos (int to string) attributes; here are the first ten itos entries.
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']


In [11]:
# Mapping from label string to integer index
print(LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7f739579dea0>, {'neg': 0, 'pos': 1})


In [12]:
# Create the iterators used by the training/evaluation loops. Each iteration
# yields a batch of examples that have been indexed and converted into tensors.
# BucketIterator batches together examples of similar length, which minimizes
# the amount of padding needed per batch.
# The tensors are placed on the GPU when one is available (CPU otherwise):
# the choice is captured in a torch.device that is passed to the iterators.

BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# sort_within_batch=True orders the examples inside each batch by length,
# as the packed padded sequences used by the model require
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,sort_within_batch=True, #for packed_padded_sequences
    device = device)

In [13]:
# Smoke-test the iterators: pull one batch from each split and show its tensor shapes.
# The number of rows of the text matrix depends on the longest document in that batch.
for heading, split_iterator in (
    ('Train', train_iterator),
    ('\nValid:', valid_iterator),
    ('\nTest:', test_iterator),
):
    print(heading)
    for batch in split_iterator:
        print(f'Text matrix size: {batch.text[0].size()}')
        print(f'Target vector size: {batch.label.size()}')
        break

Train
Text matrix size: torch.Size([511, 64])
Target vector size: torch.Size([64])

Valid:
Text matrix size: torch.Size([53, 64])
Target vector size: torch.Size([64])

Test:
Text matrix size: torch.Size([36, 64])
Target vector size: torch.Size([64])


Building our model

In [14]:
import torch.nn as nn # neural network library of pytorch
# Defining our RNN class as a sub-class of nn.Module 
class LSTM(nn.Module):
    """Single-layer LSTM text classifier: embedding -> LSTM -> linear layer.

    The linear layer is applied to the final hidden state of the LSTM, so the
    model produces ``output_dim`` values per sequence in the batch.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        """Create the layers of the network.

        Args:
            input_dim: number of rows of the embedding table (vocabulary size).
            embedding_dim: size of each token embedding.
            hidden_dim: size of the LSTM hidden state.
            output_dim: number of values produced per sequence.
        """
        # initialise the nn.Module machinery
        super().__init__()
        # embedding layer: token index in [0, input_dim) -> vector of size embedding_dim
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        # LSTM layer: input of size embedding_dim, hidden state of size hidden_dim
        self.rnn = nn.LSTM(embedding_dim, hidden_dim)
        # output layer: input of size hidden_dim, output of size output_dim
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text,text_length):
        """Compute one flat vector of logits for a batch of padded sequences.

        Args:
            text: integer tensor of token indices, [sentence len, batch size].
            text_length: true (unpadded) length of every sequence in the batch,
                as a tensor on any device or a list of ints. Sequences must be
                ordered by decreasing length, which is what
                ``pack_padded_sequence`` expects by default.

        Returns:
            The output of the linear layer flattened to one dimension
            (``batch size`` values when ``output_dim`` is 1).
        """
        # [sentence len, batch size] => [sentence len, batch size, embedding size]
        embedded = self.embedding(text)

        # pack_padded_sequence requires the lengths to live on the CPU, whereas
        # the batch iterator hands them over on the same device as the text.
        lengths_cpu = torch.as_tensor(text_length, dtype=torch.int64, device='cpu')
        # pack the padded batch so that the LSTM skips the padding positions
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, lengths_cpu)

        # hidden: [1, batch size, hidden size], the last hidden state of every sequence
        _, (hidden, _) = self.rnn(packed)

        # feed the last hidden state through the linear layer to produce the prediction
        return self.fc(hidden.squeeze(0)).view(-1)

In [15]:
# Hyperparameters used to instantiate the LSTM model
INPUT_DIM = len(TEXT.vocab)  # one embedding row per vocabulary entry
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1  # a single logit per review

model = LSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [16]:
def count_parameters(model):
    """Return the number of trainable scalars in ``model``.

    Only parameters with ``requires_grad`` set are counted, which makes the
    figure comparable across different models.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the number of trainable parameters, formatted with thousands separators
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,867,049 trainable parameters


Training our model

In [17]:

import torch.optim as optim #optimizer
# Adam optimizer over all model parameters, with a learning rate of 1e-4 (0.0001)
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [18]:
# Move the model to the selected device (GPU if available, otherwise CPU)
model = model.to(device)

In [54]:
from sklearn import metrics
def compute_accuracy_and_confusion_matrix(model, data_loader, device,test):
    """Evaluate ``model`` on every batch of ``data_loader`` and return its accuracy.

    The model is switched to evaluation mode and is left in that mode.

    Args:
        model: module called as ``model(text, text_lengths)`` that returns one
            logit per example.
        data_loader: iterable of batches exposing ``text`` (a
            ``(text, text_lengths)`` pair) and ``label``.
        device: not used by this function; kept so existing call sites keep working.
        test: when true, additionally print the confusion matrix over the
            labels ``[0, 1]`` (rows are true labels, columns are predictions).

    Returns:
        A 0-dim float tensor: the percentage of correctly classified examples
        over the whole of ``data_loader``.

    Raises:
        ValueError: if ``data_loader`` yields no examples.
    """
    model.eval()
    correct_pred, num_examples = 0, 0
    y_predicted = []
    y_true = []
    # no gradients are needed for evaluation
    with torch.no_grad():
        for batch_data in data_loader:
            text, text_lengths = batch_data.text
            labels = batch_data.label.long()
            logits = model(text, text_lengths)
            # a sigmoid output above 0.5 is read as the positive class (index 1)
            predicted_labels = (torch.sigmoid(logits) > 0.5).long()
            if test:
                # only needed for the confusion matrix; skipping it otherwise
                # avoids copying every prediction back to the host
                y_predicted.extend(predicted_labels.tolist())
                y_true.extend(labels.tolist())
            num_examples += labels.size(0)
            correct_pred += (predicted_labels == labels).sum()
    if num_examples == 0:
        raise ValueError("data_loader yielded no examples; accuracy is undefined")
    if test:
        print("Confusion Matrix")
        print(metrics.confusion_matrix(y_true, y_predicted,labels=[0,1]))
    return correct_pred.float()/num_examples * 100

In [56]:
import time 
# Start the wall-clock timer used by the "Time elapsed" messages below
start_time = time.time()
NUM_EPOCHS=15
for epoch in range(NUM_EPOCHS):
    # (re-)enter training mode: the accuracy helper called at the end of every epoch switches the model to eval mode
    model.train()
    for batch_idx, batch_data in enumerate(train_iterator):
        # batch_data.text is a (token indices, sequence lengths) pair because TEXT was built with include_lengths=True
        text,text_lengths = batch_data.text
        ### FORWARD AND BACK PROP
        logits = model(text,text_lengths)
        # binary cross-entropy computed from the raw logits (no explicit sigmoid is applied beforehand)
        cost = F.binary_cross_entropy_with_logits(logits, batch_data.label)
        # clear the gradients left over from the previous step before back-propagating
        optimizer.zero_grad()
        
        cost.backward()
        
        ### UPDATE MODEL PARAMETERS
        optimizer.step()
        
        ### LOGGING
        # report the cost of the current batch every 50 batches
        if not batch_idx % 50:
            print (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                   f'Batch {batch_idx:03d}/{len(train_iterator):03d} | '
                   f'Cost: {cost:.4f}')

    # end of epoch: report the accuracy over the full training and validation sets, with gradient tracking disabled
    with torch.set_grad_enabled(False):
        print(f'training accuracy: '
              f'{compute_accuracy_and_confusion_matrix(model, train_iterator, device,test=False):.2f}%'
              f'\nvalid accuracy: '
              f'{compute_accuracy_and_confusion_matrix(model, valid_iterator, device,test=False):.2f}%')
        
    # cumulative time since the start of training, in minutes
    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

Epoch: 001/015 | Batch 000/313 | Cost: 0.0016
Epoch: 001/015 | Batch 050/313 | Cost: 0.0039
Epoch: 001/015 | Batch 100/313 | Cost: 0.0599
Epoch: 001/015 | Batch 150/313 | Cost: 0.2715
Epoch: 001/015 | Batch 200/313 | Cost: 0.0066
Epoch: 001/015 | Batch 250/313 | Cost: 0.0054
Epoch: 001/015 | Batch 300/313 | Cost: 0.0899
training accuracy: 99.88%
valid accuracy: 85.72%
Time elapsed: 0.25 min
Epoch: 002/015 | Batch 000/313 | Cost: 0.0164
Epoch: 002/015 | Batch 050/313 | Cost: 0.0012
Epoch: 002/015 | Batch 100/313 | Cost: 0.0039
Epoch: 002/015 | Batch 150/313 | Cost: 0.0011
Epoch: 002/015 | Batch 200/313 | Cost: 0.0027
Epoch: 002/015 | Batch 250/313 | Cost: 0.0018
Epoch: 002/015 | Batch 300/313 | Cost: 0.0059
training accuracy: 99.92%
valid accuracy: 85.74%
Time elapsed: 0.50 min
Epoch: 003/015 | Batch 000/313 | Cost: 0.0037
Epoch: 003/015 | Batch 050/313 | Cost: 0.0020
Epoch: 003/015 | Batch 100/313 | Cost: 0.0018
Epoch: 003/015 | Batch 150/313 | Cost: 0.0035
Epoch: 003/015 | Batch 200/3

In [57]:
# Minutes elapsed since start_time was set at the top of the training cell, measured at the moment this cell runs
print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
# test=True makes the helper print the confusion matrix in addition to returning the accuracy
print(f'Test accuracy: {compute_accuracy_and_confusion_matrix(model, test_iterator, device,test=True):.2f}%')

Total Training Time: 3.73 min
Confusion Matrix
[[10723  1777]
 [ 1938 10562]]
Test accuracy: 85.14%


In [58]:
import spacy 
nlp = spacy.load('en')
def predict_sentiment(model, sentence):
    """Classify the sentiment of a single sentence with ``model``.

    The sentence is tokenized with the spaCy tokenizer, mapped to vocabulary
    indices through ``TEXT.vocab.stoi`` and fed to the model as a batch of one.

    Args:
        model: module called as ``model(text, text_lengths)`` that returns a
            single logit for a batch of one sequence.
        sentence: the raw text to classify.

    Returns:
        ``"Positive with score <p>"`` when the sigmoid of the logit is at least
        0.5, ``"Negative with score <p>"`` otherwise.

    Raises:
        ValueError: if ``sentence`` contains no tokens.
    """
    model.eval()
    tokenized = [token.text for token in nlp.tokenizer(sentence)]
    if not tokenized:
        # a zero-length sequence cannot be packed; fail early with a clear message
        raise ValueError("sentence must contain at least one token")
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    # [sentence len] => [sentence len, 1]: a batch holding a single sequence
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    # the sequence length stays on the CPU
    length_tensor = torch.LongTensor([len(indexed)])
    # inference only: no need to build the autograd graph
    with torch.no_grad():
        score = torch.sigmoid(model(tensor, length_tensor)).item()
    if score >= 0.5:
        return "Positive with score "+ str(score)
    else:
        return "Negative with score "+ str(score)
        

In [59]:
# Try the trained model on a hand-written review
predict_sentiment(model, "I really love this movie.This movie is so great!")

'Positive with score 0.9962443113327026'