In [1]:
import torch
from torchtext import data
from torchtext import datasets
import random

SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

TEXT = data.Field(tokenize='spacy') # The tokenization should be done using spacy tokenizor
LABEL = data.LabelField(tensor_type=torch.FloatTensor) 

train, test = datasets.IMDB.splits(TEXT, LABEL) # Separate the data into train and test sets

train, valid = train.split(random_state=random.seed(SEED)) # Further separate the traning set into training set and validation set

The first update, is the addition of pre-trained word embeddings. These vectors have been trained on corpuses of billions of tokens. Now, instead of having our word embeddings initialized randomly, they are initialized with these pre-trained vectors, where words that appear in similar contexts appear nearby in this vector space.

The first step to using these is to specify the vectors and download them, which is passed as an argument to build_vocab. The glove is the algorithm used to calculate the vectors, go here for more. 6B indicates these vectors were trained on 6 billion tokens. 100d indicates these vectors are 100-dimensional.

In [2]:
TEXT.build_vocab(train, max_size=25000, vectors="glove.6B.100d") # Download the pretrained word embeddings
LABEL.build_vocab(train)

As before, we create the iterators.

In [3]:
BATCH_SIZE = 64  # This is the number of examples the iterator returns when called. These examples are similar in length

# create iterator for train, valid, and test set
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test), 
    batch_size=BATCH_SIZE, 
    sort_key=lambda x: len(x.text), 
    repeat=False)

To use an LSTM instead of the standard RNN, we use nn.LSTM instead of nn.RNN on line 8. Also note on line 20 the LSTM returns the output and a tuple of the final hidden state and the final cell state, whereas the standard RNN only returned the output and final hidden state.

As the final hidden state of our LSTM has both a forward and a backward component, which are concatenated together, the size of the input to the nn.Linear layer is twice that of the hidden dimension size.

Implementing bidirectionality and adding additional layers are done by passing values for the num_layers and bidirectional arguments for the RNN/LSTM.

Dropout is implemented by initializing an nn.Dropout layer (the argument is the probability of dropout for each neuron) and using it within the forward method after each layer we want to apply dropout to. Note: never use dropout on the input or output layers (x or  fc in this case), you only ever want to use dropout on intermediate layers. The LSTM has a dropout argument which adds dropout on the connections between hidden states in one layer to hidden states in the next layer.

# LSTM

In [4]:
import torch.nn as nn
# This is the class for the lstm model
class LSTM(nn.Module):
    #initialize the lstm model instance
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        # These are the three layers of the rnn
        # embedding transform a one hot vector into a dense embedding vector
        # the LSTM layer take the dense embedding vector and the previous hidden state to produce the next hidden state
        # the Linear layer transform the hidden state to the correct dimension
        # Dropout is implemented by initializing an nn.Dropout layer 
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        self.dropout = nn.Dropout(dropout)
    #The forward method is called when we feed examples into our model
    def forward(self, x):
        
        #x = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(x))
        
        #embedded = [sent len, batch size, emb dim]
        
        output, (hidden, cell) = self.rnn(embedded)
        
        #output = [sent len, batch size, hid dim * num directions]
        #hidden = [num layers * num directions, batch size, hid. dim]
        #cell = [num layers * num directions, batch size, hid. dim]
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
                
        #hidden [batch size, hid. dim * num directions]
            
        return self.fc(hidden.squeeze(0))

In [5]:
# Initialize the parameters
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
# create the lstm_model instance
lstm_model = LSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

In [6]:
#retreive the embeddings
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)

torch.Size([25002, 100])


In [7]:
#copy the pre-trained word embeddings we loaded earlier into the embedding layer of our model
lstm_model.embedding.weight.data.copy_(pretrained_embeddings)


tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.4096, -0.5753,  0.1126,  ...,  0.4092,  0.1856,  0.1066],
        [ 0.2110, -0.2472,  0.6508,  ..., -0.1627,  0.4507, -1.1627],
        [-0.2379, -0.1095,  0.4314,  ...,  0.6665,  0.3200,  0.8872]])

In [8]:
import torch.optim as optim
#We use Adam as our optimizer. Adam adapts the learning rate for each parameter, 
#giving parameters that are updated more frequently lower learning rates 
#and parameters that are updated infrequently higher learning rates
optimizer = optim.Adam(lstm_model.parameters())

In [9]:
#This loss combines a Sigmoid layer and the BCELoss in one single class
criterion = nn.BCEWithLogitsLoss()
#choose whether to use cpu or gpu
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

lstm_model = lstm_model.to(device)
criterion = criterion.to(device)

In [10]:
import torch.nn.functional as F
#Calculate prediction accuracy
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(F.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

In [11]:
# This is used to train our model
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()# This is to ensure dropout is turned on while training
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)#get prediction
        
        loss = criterion(predictions, batch.label)#get current loss
        
        acc = binary_accuracy(predictions, batch.label)#get current accuracy
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()#cumulate loss
        epoch_acc += acc.item()#cumulate accuracy
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator) # return loss and accuracy

In [12]:
#after the model is trained, we use this function to test our model
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()# to make sure dropout is turned off
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)#get prediction
            
            loss = criterion(predictions, batch.label)#get loss
            
            acc = binary_accuracy(predictions, batch.label)#get accuracy

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)# return loss and accuracy

## In the following code we train the lstm model. 

In [13]:

N_EPOCHS = 5
import time
st=time.time()
for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(lstm_model, train_iterator, optimizer, criterion)# train the model
    valid_loss, valid_acc = evaluate(lstm_model, valid_iterator, criterion)#test the model
    
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%')
ed=time.time()


  return Variable(arr, volatile=not train)


Epoch: 01, Train Loss: 0.676, Train Acc: 58.68%, Val. Loss: 0.669, Val. Acc: 62.38%
Epoch: 02, Train Loss: 0.649, Train Acc: 61.48%, Val. Loss: 0.692, Val. Acc: 50.88%
Epoch: 03, Train Loss: 0.665, Train Acc: 60.66%, Val. Loss: 0.657, Val. Acc: 63.02%
Epoch: 04, Train Loss: 0.575, Train Acc: 71.02%, Val. Loss: 0.525, Val. Acc: 75.49%
Epoch: 05, Train Loss: 0.426, Train Acc: 81.68%, Val. Loss: 0.413, Val. Acc: 84.79%


In [14]:
print("The time taken for lstm model is:"+str(ed-st))# get a record of time

The time taken for lstm model is:1573.9865515232086


In [16]:
#test our model
#we can see good improvement
test_loss, test_acc = evaluate(lstm_model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc*100:.2f}%')

  return Variable(arr, volatile=not train)


Test Loss: 0.481, Test Acc: 82.13%


We can now use our model to predict the sentiment of any sentence we give it. As it has been trained on movie reviews, the sentences provided should also be movie reviews.

Our predict_sentiment function does a few things:

1. tokenizes the sentence, i.e. splits it from a raw string into a list of tokens
2. indexes the tokens by converting them into their integer representation from our vocabulary
3. converts the indexes, which are a Python list into a PyTorch tensor
4. add a batch dimension by unsqueezeing
5. squashes the output prediction from a real number between 0 and 1 with the sigmoid function
6. converts the tensor holding a single value into an integer with the item() method
We are expecting reviews with a negative sentiment to return a value close to 0 and positive reviews to return a value close to 1.

In [19]:
import spacy
nlp = spacy.load('en')
# The explaination is in the above block
def predict_sentiment_lstm(sentence):
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    prediction = F.sigmoid(lstm_model(tensor))
    return prediction.item()

In [20]:
predict_sentiment_lstm("This film is terrible")#we can see the value is close to 0, as what we expect



0.24923060834407806

In [21]:
predict_sentiment_lstm("This film is great")#we can see the value is close to 1, as what we expect



0.8164269924163818

# GRU

In [22]:
import torch.nn as nn
# This is almost the same as the LSTM class with a couple changes
class GRU(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Here we use GRU as our rnn layer
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(x))
        
        #embedded = [sent len, batch size, emb dim]
        
        output, hidden = self.rnn(embedded)# here we deleted the cell since the output of GRU is a tuple
        
        #output = [sent len, batch size, hid dim * num directions]
        #hidden = [num layers * num directions, batch size, hid. dim]
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
                
        #hidden [batch size, hid. dim * num directions]
            
        return self.fc(hidden.squeeze(0))

In [23]:

#cerate a GRU model instance
gru_model = GRU(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

In [24]:
gru_model.embedding.weight.data.copy_(pretrained_embeddings)# copy the pretrained embeddings

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.4096, -0.5753,  0.1126,  ...,  0.4092,  0.1856,  0.1066],
        [ 0.2110, -0.2472,  0.6508,  ..., -0.1627,  0.4507, -1.1627],
        [-0.2379, -0.1095,  0.4314,  ...,  0.6665,  0.3200,  0.8872]])

In [25]:
import torch.optim as optim
#We use Adam as our optimizer. Adam adapts the learning rate for each parameter, 
#giving parameters that are updated more frequently lower learning rates 
#and parameters that are updated infrequently higher learning rates
gru_optimizer = optim.Adam(gru_model.parameters())

In [26]:

# use gpu to run the training
gru_model = gru_model.to(device)
criterion = criterion.to(device)

## In the following code we train the GRU model. 

In [27]:
N_EPOCHS = 5
import time
st=time.time()
for epoch in range(N_EPOCHS):

    train_loss1, train_acc1 = train(gru_model, train_iterator, gru_optimizer, criterion)# train the model
    valid_loss1, valid_acc1 = evaluate(gru_model, valid_iterator, criterion)# test the model
    
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss1:.3f}, Train Acc: {train_acc1*100:.2f}%, Val. Loss: {valid_loss1:.3f}, Val. Acc: {valid_acc1*100:.2f}%')
ed=time.time()
print(ed-st)

  return Variable(arr, volatile=not train)


Epoch: 01, Train Loss: 0.633, Train Acc: 62.24%, Val. Loss: 0.408, Val. Acc: 80.12%
Epoch: 02, Train Loss: 0.329, Train Acc: 85.92%, Val. Loss: 0.276, Val. Acc: 88.90%
Epoch: 03, Train Loss: 0.222, Train Acc: 91.29%, Val. Loss: 0.245, Val. Acc: 90.28%
Epoch: 04, Train Loss: 0.152, Train Acc: 94.13%, Val. Loss: 0.245, Val. Acc: 90.49%
Epoch: 05, Train Loss: 0.117, Train Acc: 95.70%, Val. Loss: 0.251, Val. Acc: 90.29%
1368.4315190315247


In [29]:
print("The time taken for lstm model is:"+str(ed-st))

The time taken for lstm model is:1368.4315190315247


In [28]:
#test the gru model
test_loss1, test_acc1 = evaluate(gru_model, test_iterator, criterion)

print(f'Test Loss: {test_loss1:.3f}, Test Acc: {test_acc1*100:.2f}%')

  return Variable(arr, volatile=not train)


Test Loss: 0.302, Test Acc: 88.20%


In [31]:
# it is the same as the predict_sentiment_lstm function
def predict_sentiment_gru(sentence):
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    prediction = F.sigmoid(gru_model(tensor))
    return prediction.item()

In [32]:
predict_sentiment_gru("This film is terrible")



0.012454232200980186

In [33]:
predict_sentiment_gru("This film is great")



0.9792103171348572

# compare GRU and LSTM

first of all, the time taken to train gru is 1368 seconds and for lstm it is 1573 seconds. The time to train GRU is faster.
Second, the test accuracy of gru is 88.2%, which is greater than lstm's 82.13%.
Third, using predict_sentiment, we can se that the output numbers from gru are closer to 0 and 1 than that of lstm.

To conclude, Gru is not only more effecient to train, but also has a better accuracy.