# 1 - Simple Sentiment Analysis

In this series we'll be building a *machine learning* model to detect sentiment (i.e. detect if a sentence is positive or negative) using PyTorch and TorchText. This will be done on movie reviews using the IMDb dataset.

In this first notebook, we'll start very simple to understand the general concepts, whilst further notebooks will build on this knowledge.

We'll be using a **recurrent neural network** (RNN) which reads a sequence of words, and for each word (sometimes called a _step_) will output a _hidden state_. We then feed that hidden state back into the RNN together with the next word in the sentence, and repeat until the final word has been fed into the RNN. This final hidden state will then be used to predict the sentiment of the sentence.

![](https://i.imgur.com/VedY9iG.png)

## Preparing Data



In [1]:
import torch
from torchtext import data

# TEXT describes how the review text is processed: each review is tokenized with spaCy.
TEXT = data.Field(tokenize='spacy')
# LABEL describes the sentiment label. It is stored as a float tensor because the
# loss used later in this notebook (BCEWithLogitsLoss) works on float targets.
LABEL = data.LabelField(sequential=False, tensor_type=torch.FloatTensor)

In [2]:
from torchtext import datasets

# Load the IMDb reviews as a train split and a test split, processed with the TEXT and LABEL fields.
train, test = datasets.IMDB.splits(TEXT, LABEL)

In [3]:
print('len(train)', len(train))

len(train) 25000


In [4]:
print('train.fields', train.fields)

train.fields {'text': <torchtext.data.field.Field object at 0x7f63ed58c780>, 'label': <torchtext.data.field.LabelField object at 0x7f63ed58c710>}


In [5]:
print('vars(train[0])', vars(train[0]))

vars(train[0]) {'text': ['elvira', 'mistress', 'of', 'the', 'dark', 'is', 'one', 'of', 'my', 'fav', 'movies,', 'it', 'has', 'every', 'thing', 'you', 'would', 'want', 'in', 'a', 'film,', 'like', 'great', 'one', 'liners,', 'sexy', 'star', 'and', 'a', 'Outrageous', 'story!', 'if', 'you', 'have', 'not', 'seen', 'it,', 'you', 'are', 'missing', 'out', 'on', 'one', 'of', 'the', 'greatest', 'films', 'made.', 'i', "can't", 'wait', 'till', 'her', 'new', 'movie', 'comes', 'out!'], 'label': 'pos'}


In [6]:
# Carve a validation set out of the training data. `train` is rebound to the
# smaller split, so re-running this cell splits the already-reduced set again.
# NOTE(review): no random_state is passed, so the split may differ between runs — confirm.
train, valid = train.split()

In [7]:
print('len(train)', len(train))
print('len(valid)', len(valid))
print('len(test)', len(test))

len(train) 17500
len(valid) 7500
len(test) 25000


In [8]:
# Build the vocabularies from the training split only, so that no tokens from
# the validation or test data influence the token-to-index mapping.
# max_size caps how many distinct tokens the TEXT vocabulary keeps.
TEXT.build_vocab(train, max_size=25000)
LABEL.build_vocab(train)

In [9]:
# Print the vocabulary sizes. TEXT reports 25002 rather than 25000 — presumably
# the two extra entries are torchtext's special <unk> and <pad> tokens (TODO confirm).
print('len(TEXT.vocab)', len(TEXT.vocab))
print('len(LABEL.vocab)', len(LABEL.vocab))

len(TEXT.vocab) 25002
len(LABEL.vocab) 2


In [10]:
print(TEXT.vocab.freqs.most_common(100))

[('the', 200172), ('a', 108127), ('and', 106433), ('of', 99579), ('to', 92133), ('is', 71636), ('in', 60019), ('I', 46381), ('that', 44914), ('this', 39894), ('it', 37997), ('/><br', 35251), ('was', 32575), ('as', 29712), ('with', 28885), ('for', 28549), ('The', 23679), ('but', 23518), ('movie', 21506), ('on', 21341), ('are', 19802), ('his', 19204), ('film', 18973), ('have', 18926), ('not', 18269), ('be', 17899), ('you', 17610), ('he', 15175), ('by', 14940), ('at', 14900), ('one', 14343), ('an', 14273), ('from', 13439), ('who', 13170), ('like', 12641), ('all', 12530), ('they', 12342), ('so', 11469), ('has', 11435), ('about', 11396), ('just', 11388), ('or', 11272), ('her', 11025), ('out', 10060), ('some', 9858), ('very', 9094), ('more', 9025), ('This', 8532), ('would', 8311), ('what', 8172), ('when', 8127), ('good', 8086), ('only', 7845), ('if', 7751), ('their', 7724), ('had', 7645), ('It', 7602), ('really', 7555), ("it's", 7503), ('which', 7421), ('up', 7391), ('even', 7316), ('can', 7

In [11]:
print(LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7f6393c51400>, {'neg': 0, 'pos': 1})


In [12]:
# Make one iterator per split, each yielding batches of 64 examples.
# sort_key compares examples by the number of tokens in their text, and
# sort_within_batch=True orders each batch by that key.
# NOTE(review): presumably this groups similar-length reviews to reduce padding — confirm.
# NOTE(review): repeat=False presumably makes each iterator stop after one pass
# over its data, so a `for` loop over it covers exactly one epoch — confirm.
train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train, valid, test), 
    batch_size=64, 
    sort_key=lambda x: len(x.text), 
    sort_within_batch=True, 
    repeat=False)

## Build the Model

Updates to model:
- Use a special RNN architecture called _LSTM_
- Increase the number of _layers_ of the RNN/LSTM
- The RNN/LSTM is _bi-directional_
- Added regularization in the form of _dropout_

In [13]:
import torch.nn as nn

class RNN(nn.Module):
    """Multi-layer (optionally bidirectional) LSTM sentence classifier.

    Token indices are embedded, passed through an LSTM, and the final hidden
    state of the top LSTM layer is projected by a linear layer to `output_dim`
    raw scores (logits). Dropout is applied to the embeddings and to the final
    hidden state.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state (per direction).
        output_dim: number of output scores per sequence.
        n_layers: number of stacked LSTM layers.
        bidirectional: if True, run the LSTM in both directions and classify
            from the concatenated forward/backward final states.
        dropout: dropout probability used between LSTM layers, on the
            embeddings and on the final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        # remembered so forward() knows how to read the final hidden state
        self.bidirectional = bidirectional

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim*2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return logits of shape [batch size, output_dim].

        Args:
            x: LongTensor of token indices, shape [sentence length, batch size]
                (the LSTM is built without batch_first).
        """
        embedded = self.dropout(self.embedding(x))

        # hidden: [n_layers * n_directions, batch size, hidden_dim]
        _, (hidden, _) = self.rnn(embedded)

        if self.bidirectional:
            # the last two entries are the top layer's forward and backward states
            final_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # only one direction: the last entry is the top layer's final state
            final_hidden = hidden[-1, :, :]

        # no squeeze here: the batch dimension must survive even for a batch of one
        return self.fc(self.dropout(final_hidden))

In [14]:
# Two-layer bidirectional LSTM classifier that emits a single score per review.
model = RNN(
    vocab_size=len(TEXT.vocab),
    embedding_dim=128,
    hidden_dim=256,
    output_dim=1,
    n_layers=2,
    bidirectional=True,
    dropout=0.5,
)

## Check the Model

In [15]:
# Dummy batch of token indices for a smoke test of the model: 10 time steps x 32 sequences
# (the output shape printed below is [32, 1], i.e. one score per sequence).
x = torch.ones(10, 32, dtype=torch.long)

In [16]:
model(x).shape

torch.Size([32, 1])

## Train the Model

In [17]:
import torch.optim as optim

# Binary cross-entropy computed directly on the model's raw scores (logits);
# the model's forward() does not apply a sigmoid itself.
criterion = nn.BCEWithLogitsLoss()
# Adam over all model parameters, with the optimizer's default settings.
optimizer = optim.Adam(model.parameters())

In [18]:
# Use the GPU when CUDA is actually usable, otherwise fall back to the CPU.
# is_available must be *called*: the bare function object is always truthy,
# which would select 'cuda' even on a machine without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model and the loss module onto the selected device.
model.to(device)
criterion.to(device)

BCEWithLogitsLoss()

In [19]:
import torch.nn.functional as F

def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Args:
        preds: raw model outputs (logits), one per example.
        y: ground-truth labels (0. or 1.), same shape as preds.

    Returns:
        A scalar tensor holding the fraction of predictions that match y.
    """

    # squash the logits to probabilities, then round to the closest class (0 or 1);
    # torch.sigmoid is used instead of the deprecated F.sigmoid
    rounded_preds = torch.round(torch.sigmoid(preds))
    # convert the boolean matches to float so the division below is a float division
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

In [20]:
def train(model, iterator, optimizer, criterion):
    """Run one training pass over `iterator` and update the model's weights.

    Each batch must expose `.text` (model input) and `.label` (targets).

    Returns:
        (mean loss per batch, mean accuracy per batch) as Python floats.
    """
    # training mode, so dropout is active
    model.train()

    total_loss = 0
    total_acc = 0

    for batch in iterator:
        # gradients accumulate by default, so clear them for every batch
        optimizer.zero_grad()

        # the model returns [batch size, 1]; drop the trailing dim to match the labels
        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()
        total_acc += batch_acc.item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

In [21]:
def evaluate(model, iterator, optimizer, criterion):
    """Measure loss and accuracy over `iterator` without updating the model.

    `optimizer` is accepted for symmetry with `train` but is not used here.
    Each batch must expose `.text` (model input) and `.label` (targets).

    Returns:
        (mean loss per batch, mean accuracy per batch) as Python floats.
    """
    # evaluation mode, so dropout is switched off
    model.eval()

    total_loss = 0
    total_acc = 0

    # no gradients are needed when only measuring performance
    with torch.no_grad():
        for batch in iterator:
            # the model returns [batch size, 1]; drop the trailing dim to match the labels
            logits = model(batch.text).squeeze(1)
            total_loss += criterion(logits, batch.label).item()
            total_acc += binary_accuracy(logits, batch.label).item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

In [22]:
# Run 10 epochs: each epoch makes one training pass over train_iter and then
# measures performance on valid_iter (evaluate performs no weight updates).
for epoch in range(10):

    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iter, optimizer, criterion)
    
    # only the accuracies are reported; the losses are computed but not shown
    print(f'Epoch: {epoch+1}, Train Acc: {train_acc*100:.2f}%, Val. Acc: {valid_acc*100:.2f}%')

  return Variable(arr, volatile=not train)


Epoch: 1, Train Acc: 53.22%, Val. Acc: 58.03%
Epoch: 2, Train Acc: 64.11%, Val. Acc: 70.57%
Epoch: 3, Train Acc: 72.43%, Val. Acc: 77.12%
Epoch: 4, Train Acc: 79.70%, Val. Acc: 81.24%
Epoch: 5, Train Acc: 83.71%, Val. Acc: 81.09%
Epoch: 6, Train Acc: 85.21%, Val. Acc: 84.86%
Epoch: 7, Train Acc: 87.77%, Val. Acc: 85.52%
Epoch: 8, Train Acc: 89.01%, Val. Acc: 86.71%
Epoch: 9, Train Acc: 89.95%, Val. Acc: 88.22%
Epoch: 10, Train Acc: 91.08%, Val. Acc: 88.34%


In [23]:
# Final check on the test split, which was not used for training, validation
# or building the vocabulary in this notebook.
test_loss, test_acc = evaluate(model, test_iter, optimizer, criterion)

print(f'Test Acc: {test_acc*100:.2f}%')

  return Variable(arr, volatile=not train)


Test Acc: 88.25%


## Evaluate the Results

In [24]:
#TODO: add confusion matrix

## User Input

In [25]:
import spacy
nlp = spacy.load('en')

def predict_sentiment(sentence):
    """Return the model's sigmoid output for `sentence` as a float in [0, 1].

    With the label vocabulary shown earlier in this notebook ('neg': 0,
    'pos': 1), values near 1 mean positive and values near 0 mean negative.

    Args:
        sentence: the raw text to classify.
    """
    # make sure dropout is disabled, regardless of which cell ran last
    model.eval()

    # tokenize with spaCy, then map each token to its vocabulary index
    tokens = [tok.text for tok in nlp.tokenizer(sentence)]
    indices = [TEXT.vocab.stoi[t] for t in tokens]

    # shape [sentence length, 1]: a batch containing this single sentence
    tensor = torch.LongTensor(indices).to(device)
    tensor = tensor.unsqueeze(1)

    # inference only, so skip building the autograd graph
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor))
    return prediction.item()

predict_sentiment("This film is terrible")

0.005002327263355255

In [26]:
predict_sentiment("This film is amazing!!!")

0.9970943927764893