In [1]:
import torch
from torchtext import data
from torchtext import datasets
import random

SEED = 1234

# Seed PyTorch (CPU and CUDA) so weight initialisation and dropout are repeatable.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# Reviews are tokenized with spaCy; labels are loaded as float tensors,
# which is what the BCE-with-logits loss used later in the notebook expects.
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(tensor_type=torch.FloatTensor)

train, test = datasets.IMDB.splits(TEXT, LABEL)

# `random.seed()` returns None, so writing `random_state=random.seed(SEED)`
# passed None and the split was reproducible only through the side effect of
# evaluating the argument. Seed explicitly, then hand torchtext the resulting
# generator state, which is the value `random_state` is documented to take.
random.seed(SEED)
train, valid = train.split(random_state=random.getstate())

In [2]:
# Build the text vocabulary from the training split only, capped via max_size,
# and attach the pre-trained "glove.6B.100d" vectors to it.
TEXT.build_vocab(
    train,
    max_size=25000,
    vectors="glove.6B.100d",
)

# The label vocabulary is likewise derived from the training split.
LABEL.build_vocab(train)

In [3]:
BATCH_SIZE = 64

# One iterator per split, all with the same batch size. Examples are keyed by
# their token count, and `repeat=False` makes each iterator stop after one pass.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test),
    batch_size=BATCH_SIZE,
    sort_key=lambda example: len(example.text),
    repeat=False,
)

## LSTM Model

In [4]:
import torch.nn as nn

class LSTM(nn.Module):
    """Sentence classifier: embedding -> multi-layer LSTM -> linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of units produced by the final linear layer.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM reads the sequence in both directions.
        dropout: dropout probability applied to the embeddings, between LSTM
            layers (only when there is more than one layer) and to the final
            hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        # The classifier input width depends on how many directions are concatenated.
        num_directions = 2 if bidirectional else 1

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Inter-layer dropout has no effect on a single-layer RNN (PyTorch warns about it),
        # so only request it when there are layers to put it between.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout if n_layers > 1 else 0.0)
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return logits of shape [batch size, output_dim] for token ids `x`."""

        #x = [sent len, batch size]

        embedded = self.dropout(self.embedding(x))

        #embedded = [sent len, batch size, emb dim]

        output, (hidden, cell) = self.rnn(embedded)

        #output = [sent len, batch size, hid dim * num directions]
        #hidden = [num layers * num directions, batch size, hid. dim]
        #cell = [num layers * num directions, batch size, hid. dim]

        if self.rnn.bidirectional:
            # the last two entries are the top layer's forward and backward final states
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # a unidirectional RNN only has one final state for the top layer
            hidden = hidden[-1, :, :]

        hidden = self.dropout(hidden)

        #hidden = [batch size, hid. dim * num directions]

        # No squeeze here: squeezing dim 0 would drop the batch axis whenever batch size is 1.
        return self.fc(hidden)

In [5]:
# Hyperparameters shared by the models in this notebook.
INPUT_DIM = len(TEXT.vocab)   # one embedding row per vocabulary entry
EMBEDDING_DIM = 100           # matches the 100-d GloVe vectors requested in build_vocab
HIDDEN_DIM = 256
OUTPUT_DIM = 1                # a single logit per review
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

model_lstm = LSTM(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
)

In [6]:
# Vector table that build_vocab attached to the vocabulary; print its size as a
# quick check that it has one row per vocabulary entry.
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.size())

torch.Size([25002, 100])


In [7]:
# Overwrite the LSTM's embedding table in place with the pre-trained vectors.
# `copy_` returns the destination tensor, which is what the cell displays.
model_lstm.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.1123,  0.3113,  0.3317,  ..., -0.4576,  0.6191,  0.5304],
        [ 0.0306, -0.0086,  0.1552,  ..., -0.9847,  0.4392,  0.3018],
        [ 0.3614,  0.1344,  0.0411,  ..., -0.1543, -1.0218, -0.5138]])

## Train LSTM

In [8]:
import torch.optim as optim

# Adam with its default settings, optimising every parameter of the LSTM model.
optimizer_lstm = optim.Adam(params=model_lstm.parameters())

In [9]:
# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Binary cross-entropy computed directly on logits, placed on the chosen device.
criterion = nn.BCEWithLogitsLoss().to(device)

model_lstm = model_lstm.to(device)

In [10]:
import torch.nn.functional as F

def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Args:
        preds: raw model outputs (logits); expected to have the same shape as `y`.
        y: ground-truth labels encoded as 0./1. floats.

    Returns:
        0-dim float tensor holding the fraction of elements predicted correctly.
    """

    # squash logits to probabilities and round to the closest integer (0 or 1);
    # torch.sigmoid is the supported replacement for the deprecated F.sigmoid
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division
    # divide by the total element count so the ratio is also right for inputs
    # with more than one dimension (len() would only count the first axis)
    acc = correct.sum() / correct.numel()
    return acc

In [11]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    Puts the model in training mode and takes one optimizer step per batch.

    Returns:
        Tuple `(loss, accuracy)`: the summed per-batch loss and accuracy, each
        divided by `len(iterator)`.
    """
    model.train()

    total_loss = 0
    total_acc = 0

    for batch in iterator:
        optimizer.zero_grad()

        # squeeze(1) removes the size-1 output dimension so the logits line up with batch.label
        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()
        total_acc += batch_acc.item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

In [12]:
def evaluate(model, iterator, criterion):
    """Measure loss and accuracy over `iterator` without updating the model.

    Switches the model to eval mode and processes every batch under
    `torch.no_grad()`.

    Returns:
        Tuple `(loss, accuracy)`: the summed per-batch loss and accuracy, each
        divided by `len(iterator)`.
    """
    model.eval()

    running_loss = 0
    running_acc = 0

    with torch.no_grad():
        for batch in iterator:
            # squeeze(1) removes the size-1 output dimension so the logits line up with the labels
            scores = model(batch.text).squeeze(1)
            targets = batch.label

            batch_loss = criterion(scores, targets)
            batch_acc = binary_accuracy(scores, targets)

            running_loss += batch_loss.item()
            running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [14]:
# Train the LSTM for a fixed number of epochs, validating after each one.
N_EPOCHS = 5

import time

# Wall-clock timer: the measured span covers training AND validation for all epochs.
start = time.time()

for epoch in range(N_EPOCHS):

    # one pass over the training split, then a no-grad pass over the validation split
    train_loss, train_acc = train(model_lstm, train_iterator, optimizer_lstm, criterion)
    valid_loss, valid_acc = evaluate(model_lstm, valid_iterator, criterion)
    
    # accuracies are fractions in [0, 1]; they are scaled to percentages for display only
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%')

end = time.time()

print("Training time for LSTM: ", end - start, " seconds.")

  return Variable(arr, volatile=not train)


Epoch: 01, Train Loss: 0.664, Train Acc: 60.56%, Val. Loss: 0.640, Val. Acc: 69.40%
Epoch: 02, Train Loss: 0.582, Train Acc: 70.09%, Val. Loss: 0.418, Val. Acc: 81.15%
Epoch: 03, Train Loss: 0.355, Train Acc: 85.48%, Val. Loss: 0.289, Val. Acc: 87.73%
Epoch: 04, Train Loss: 0.244, Train Acc: 90.74%, Val. Loss: 0.281, Val. Acc: 89.29%
Epoch: 05, Train Loss: 0.172, Train Acc: 93.79%, Val. Loss: 0.329, Val. Acc: 87.69%
Training time for LSTM:  1574.1675817966461  seconds.


In [15]:
# Score the trained LSTM on the held-out test split.
test_loss, test_acc = evaluate(
    model=model_lstm,
    iterator=test_iterator,
    criterion=criterion,
)

print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc*100:.2f}%')

  return Variable(arr, volatile=not train)


Test Loss: 0.426, Test Acc: 84.00%


## User Input to LSTM

In [16]:
import spacy
# Load the spaCy 'en' pipeline; the prediction helpers in this notebook only use its tokenizer.
nlp = spacy.load('en')

def predict_sentiment_LSTM(sentence):
    """Score a raw sentence with the trained LSTM model.

    Args:
        sentence: text to classify.

    Returns:
        Sigmoid of the model's logit as a Python float between 0 and 1.

    Raises:
        ValueError: if `sentence` yields no tokens (the RNN cannot process an
            empty sequence).
    """
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if not tokenized:
        raise ValueError("sentence must contain at least one token")
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    # shape [sent len, 1]: a batch that holds just this sentence
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)

    # The prediction must not depend on the mode the model was left in:
    # switch dropout off for the call, skip autograd, then restore the mode.
    was_training = model_lstm.training
    model_lstm.eval()
    try:
        with torch.no_grad():
            # torch.sigmoid replaces the deprecated F.sigmoid
            prediction = torch.sigmoid(model_lstm(tensor))
    finally:
        model_lstm.train(was_training)
    return prediction.item()

In [17]:
# Sanity check on an obviously negative review.
predict_sentiment_LSTM("This film is terrible")



0.07159094512462616

In [18]:
# Sanity check on an obviously positive review.
predict_sentiment_LSTM("This film is great")



0.9938889741897583

## GRU Model

In [19]:
import torch.nn as nn

class GRU(nn.Module):
    """Sentence classifier: embedding -> multi-layer GRU -> linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: GRU hidden size per direction.
        output_dim: number of units produced by the final linear layer.
        n_layers: number of stacked GRU layers.
        bidirectional: whether the GRU reads the sequence in both directions.
        dropout: dropout probability applied to the embeddings, between GRU
            layers (only when there is more than one layer) and to the final
            hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        # The classifier input width depends on how many directions are concatenated.
        num_directions = 2 if bidirectional else 1

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Inter-layer dropout has no effect on a single-layer RNN (PyTorch warns about it),
        # so only request it when there are layers to put it between.
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers,
                          bidirectional=bidirectional,
                          dropout=dropout if n_layers > 1 else 0.0)
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return logits of shape [batch size, output_dim] for token ids `x`."""

        #x = [sent len, batch size]

        embedded = self.dropout(self.embedding(x))

        #embedded = [sent len, batch size, emb dim]

        # unlike nn.LSTM, nn.GRU returns only a hidden state (there is no cell state)
        output, hidden = self.rnn(embedded)

        #output = [sent len, batch size, hid dim * num directions]
        #hidden = [num layers * num directions, batch size, hid. dim]

        if self.rnn.bidirectional:
            # the last two entries are the top layer's forward and backward final states
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # a unidirectional RNN only has one final state for the top layer
            hidden = hidden[-1, :, :]

        hidden = self.dropout(hidden)

        #hidden = [batch size, hid. dim * num directions]

        # No squeeze here: squeezing dim 0 would drop the batch axis whenever batch size is 1.
        return self.fc(hidden)

In [20]:
# Same hyperparameters as the LSTM so the two architectures are compared like for like.
model_gru = GRU(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
)

In [21]:
# Overwrite the GRU's embedding table in place with the pre-trained vectors.
# `copy_` returns the destination tensor, which is what the cell displays.
model_gru.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.1123,  0.3113,  0.3317,  ..., -0.4576,  0.6191,  0.5304],
        [ 0.0306, -0.0086,  0.1552,  ..., -0.9847,  0.4392,  0.3018],
        [ 0.3614,  0.1344,  0.0411,  ..., -0.1543, -1.0218, -0.5138]])

## Train GRU

In [22]:
# Adam with its default settings, optimising every parameter of the GRU model.
optimizer_gru = optim.Adam(params=model_gru.parameters())

In [23]:
# Move the GRU to the same device the loss function was placed on.
model_gru = model_gru.to(device)

In [25]:
# Train the GRU for a fixed number of epochs, validating after each one.
N_EPOCHS = 5

# Wall-clock timer: the measured span covers training AND validation for all epochs.
start = time.time()

for epoch in range(N_EPOCHS):

    # one pass over the training split, then a no-grad pass over the validation split
    train_loss, train_acc = train(model_gru, train_iterator, optimizer_gru, criterion)
    valid_loss, valid_acc = evaluate(model_gru, valid_iterator, criterion)
    
    # accuracies are fractions in [0, 1]; they are scaled to percentages for display only
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%')
    
end = time.time()

print("Training time for GRU: ", end - start, " seconds.")

  return Variable(arr, volatile=not train)


Epoch: 01, Train Loss: 0.652, Train Acc: 59.87%, Val. Loss: 0.459, Val. Acc: 78.74%
Epoch: 02, Train Loss: 0.343, Train Acc: 85.37%, Val. Loss: 0.344, Val. Acc: 84.54%
Epoch: 03, Train Loss: 0.211, Train Acc: 91.65%, Val. Loss: 0.239, Val. Acc: 90.48%
Epoch: 04, Train Loss: 0.155, Train Acc: 94.31%, Val. Loss: 0.248, Val. Acc: 90.32%
Epoch: 05, Train Loss: 0.110, Train Acc: 96.13%, Val. Loss: 0.269, Val. Acc: 90.17%
Training time for GRU:  1368.0845170021057  seconds.


In [26]:
# Score the trained GRU on the held-out test split.
test_loss, test_acc = evaluate(
    model=model_gru,
    iterator=test_iterator,
    criterion=criterion,
)

print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc*100:.2f}%')

  return Variable(arr, volatile=not train)


Test Loss: 0.327, Test Acc: 87.21%


## User Input for GRU

In [27]:
def predict_sentiment_GRU(sentence):
    """Score a raw sentence with the trained GRU model.

    Args:
        sentence: text to classify.

    Returns:
        Sigmoid of the model's logit as a Python float between 0 and 1.

    Raises:
        ValueError: if `sentence` yields no tokens (the RNN cannot process an
            empty sequence).
    """
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if not tokenized:
        raise ValueError("sentence must contain at least one token")
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    # shape [sent len, 1]: a batch that holds just this sentence
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)

    # The prediction must not depend on the mode the model was left in:
    # switch dropout off for the call, skip autograd, then restore the mode.
    was_training = model_gru.training
    model_gru.eval()
    try:
        with torch.no_grad():
            # torch.sigmoid replaces the deprecated F.sigmoid
            prediction = torch.sigmoid(model_gru(tensor))
    finally:
        model_gru.train(was_training)
    return prediction.item()

In [28]:
# Sanity check on an obviously negative review.
predict_sentiment_GRU("This film is terrible")



0.006208060774952173

In [29]:
# Sanity check on an obviously positive review.
predict_sentiment_GRU("This film is great")



0.991594135761261

## Conclusion

The GRU model performed better than the LSTM model in terms of both time consumption and training/validation results. GRU took 1368 seconds to train and validate, while LSTM took a bit more time: 1574 seconds for training and validation. In addition, GRU has higher validation accuracy than LSTM in all five epochs, and higher training accuracy in every epoch except the first (59.87% vs. 60.56%).

Also, GRU has higher test accuracy (87.21%) and lower test loss (0.327) than LSTM which has 84% test accuracy and 0.426 test loss. So for this data set, GRU seems to be a more suitable model.

When testing these two models on the same user input, both models gave high scores to a positive comment, and the two scores are very close to each other (GRU: 0.9915 vs. LSTM: 0.9939). For the negative comment, however, GRU seems to be more sensitive because the score it gave (0.0062) is significantly lower than that from LSTM (0.0716), although they both managed to detect the negative sentiment behind the comment.

