In [1]:
import numpy as np
import pandas as pd
import torch
from torchtext.legacy import data
from torchtext.legacy import datasets
import random
import torch.optim as optim

  from .autonotebook import tqdm as notebook_tqdm


### Basic tokenisation with spacy
To process the raw text we use spaCy's default English model (`en_core_web_sm`) as the tokenizer (instead of the default `str.split` function), and we build a vocabulary from the resulting set of words.

In [2]:
SEED = 1234
# Seed every random number generator the notebook has imported, not just
# torch, so that "Restart Kernel -> Run All" starts from the same state.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# TEXT tokenizes the reviews with spaCy; include_lengths = True makes each
# batch carry the sequence lengths alongside the token tensor.
TEXT = data.Field(tokenize = 'spacy',
                  tokenizer_language = 'en_core_web_sm',
                  include_lengths = True)
# Labels are produced as floats, the dtype the loss used later expects.
LABEL = data.LabelField(dtype = torch.float)

### Dataset
For demonstration I will use the IMDB movie review dataset, which is available in the torchtext library
and has been used for sentiment analysis in many publications.
In order to detect overfitting we have to split the dataset into 3 parts:
* training set - used for training the ANN
* validation set - used for validation during training
* test set - held-out data used to check the model after training (this data is not used for training or validation)

### Strategy
Our goal is to get the prediction score as a continuous value from -1 to 1, where -1 is negative, 0 is neutral and 1 is positive. This dataset has only negative and positive labels, but we can train the model as a binary classifier that returns 0 for negative and 1 for positive sentiment, and then map the output of the ANN to the interval (-1, 1). An alternative solution is to train a multiclass classifier and use the probability of each class to compute the final score, but a binary classifier is a good solution for this dataset (we don't have neutral labels).

In [3]:
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
# `random.seed()` returns None, so passing its result as `random_state` only
# worked through the side effect of seeding the global RNG. Seed explicitly
# and hand the resulting generator state to `split` so the intent is visible.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state = random.getstate())

### Dataset labels sanity check
Here we can see that the ratio of positive to negative labels in each dataset is around 1.0

In [4]:
def check_balance(ds, name):
    """Print how many positive and negative examples `ds` contains.

    Args:
        ds: iterable of examples exposing a `label` attribute. Examples whose
            label equals 'pos' count as positive, everything else as negative.
        name: dataset name used as the prefix of the printed report.

    The report ends with the positive/negative ratio. If there are no
    negative examples the ratio is printed as `inf` (or `nan` for an empty
    dataset) instead of raising ZeroDivisionError.
    """
    positive = 0
    negative = 0
    for row in ds:
        if row.label == 'pos':
            positive += 1
        else:
            negative += 1

    if negative:
        balance = positive / negative
    elif positive:
        balance = float('inf')
    else:
        balance = float('nan')
    print(name + " has %s/%s positive/negative samples" % (positive, negative), 'balance=%.02f' % balance)
    
# Report the class balance of every split, in the same order as before.
for split_data, split_name in ((train_data, "train set"),
                               (test_data, "test set"),
                               (valid_data, "valid set")):
    check_balance(split_data, split_name)

train set has 8690/8810 positive/negative samples balance=0.99
test set has 12500/12500 positive/negative samples balance=1.00
valid set has 3810/3690 positive/negative samples balance=1.03


### Vocabulary
In the beginning I was thinking about word2vec embeddings, but currently GloVe embeddings are much more advanced.

In [5]:
MAX_VOCAB_SIZE = 25_000

# Vocabulary options: cap the vocabulary size, attach the 100-dimensional
# GloVe vectors, and pass torch.Tensor.normal_ as the initialiser for words
# that have no pretrained vector.
vocab_options = dict(
    max_size=MAX_VOCAB_SIZE,
    vectors="glove.6B.100d",
    unk_init=torch.Tensor.normal_,
)
TEXT.build_vocab(train_data, **vocab_options)

# The label vocabulary is built from the training split only as well.
LABEL.build_vocab(train_data)

### Create training pipeline
In order to train our model I will create iterators over the train, validation and test datasets.
In addition I will try to train all models with CUDA on my old NVIDIA card

In [6]:
BATCH_SIZE = 64
# Fall back to the CPU when no CUDA device is available, so the notebook
# still runs (more slowly) on machines without a GPU.
DEVICE_NAME = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(DEVICE_NAME)

# One iterator per split, all placed on `device`. sort_within_batch = True
# orders each batch by length, which the packed-sequence call in the model
# relies on.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device)

### Model
In this case we have a more advanced model:
* GloVe embedding as input (vector size=100)
* Recurrent neural network (LSTM) layer
* Linear layer for hidden layer+output layer
* Dropout layer for regularisation - randomly zeroing some elements of the input tensor with probability P using samples from a Bernoulli distribution. This is a well-known and effective regularisation technique

Important:
Unfortunately my local PC is a bit old and it would take tons of time to train a really good model, so I artificially pruned/simplified the model in order to debug the pipeline and get some results in less than 5-6 hours.
You can see that the hidden layer size is reduced to 16 (on a more powerful PC it could be 128, 256 or even more)

Remark:
If I set the hidden layer size to 256, I get around 4.5 million parameters; the pruned/simplified model has about 2.5 million parameters

Remark 2:
I was trying to run bigger hidden layer size, but I was always getting cuda memory errors

Remark 3:
In addition it would be interesting to add acoustic data from audio inference in order to use both text and audio data for sentiment analysis (not implemented in this version)

In [7]:
import torch.nn as nn

class RNN(nn.Module):
    """LSTM sentiment classifier: embedding -> (bi)LSTM -> dropout -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the LSTM hidden state per direction.
        output_dim: number of output logits.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM also reads the sequence backwards.
        dropout: dropout probability applied to the embeddings, between
            stacked LSTM layers, and to the final hidden state.
        pad_idx: vocabulary index of the padding token.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           # Inter-layer dropout has nothing to act on with a
                           # single layer and only triggers a warning there.
                           dropout=dropout if n_layers > 1 else 0.0)
        # The classifier sees one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return one row of `output_dim` logits per sequence in the batch.

        Args:
            text: tensor of token indices in the time-major layout
                (seq_len, batch), since `batch_first` is not set anywhere.
            text_lengths: length of each sequence in the batch. Packing uses
                the default `enforce_sorted`, so lengths must be descending.
        """
        embedded = self.dropout(self.embedding(text))
        # pack_padded_sequence needs the lengths on the CPU.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'))
        # Only the final hidden state is used; the per-step outputs are not
        # needed, so they are no longer unpacked.
        _, (hidden, _) = self.rnn(packed_embedded)
        if self.rnn.bidirectional:
            # The last two entries are the top layer's forward and backward states.
            final_hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else:
            final_hidden = hidden[-1,:,:]
        return self.fc(self.dropout(final_hidden))

In [8]:
# Model hyper-parameters.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 16  # reduced from 256 to fit the available hardware
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# Keyword arguments make the mapping from constant to constructor parameter explicit.
model = RNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

In [9]:
# Count only the parameters that will receive gradient updates.
trainable_sizes = [param.numel() for param in model.parameters() if param.requires_grad]
total_params = sum(trainable_sizes)
print(f'Total trainable params={total_params}')

Total trainable params=2521737


### GloVe embeddings
Here we have to load the GloVe embeddings into the model and check that they are not empty

In [10]:
pretrained_embeddings = TEXT.vocab.vectors
# Fail fast if the vectors are missing or do not line up with the embedding
# layer they are about to be copied into.
assert pretrained_embeddings is not None, "GloVe vectors were not loaded into the vocabulary"
assert pretrained_embeddings.shape == (INPUT_DIM, EMBEDDING_DIM), pretrained_embeddings.shape
print(pretrained_embeddings.shape)

torch.Size([25002, 100])


In [11]:
# Initialise the embedding layer in place with the pretrained vectors.
embedding_weights = model.embedding.weight.data
embedding_weights.copy_(pretrained_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.0817,  0.5434,  0.6970,  ..., -0.6586,  0.0563,  0.3244],
        [ 0.3190, -0.1413, -0.3953,  ...,  0.6556, -0.0397,  0.1782],
        [ 0.2340, -0.4945, -0.1938,  ...,  0.0156,  1.0351,  0.8970]])

In [12]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
# Reset the rows of the unknown and padding tokens to all zeros.
for special_idx in (UNK_IDX, PAD_IDX):
    model.embedding.weight.data[special_idx] = torch.zeros(EMBEDDING_DIM)
print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.0817,  0.5434,  0.6970,  ..., -0.6586,  0.0563,  0.3244],
        [ 0.3190, -0.1413, -0.3953,  ...,  0.6556, -0.0397,  0.1782],
        [ 0.2340, -0.4945, -0.1938,  ...,  0.0156,  1.0351,  0.8970]])


### Training
For this experiment I will keep the binary cross-entropy loss function with a sigmoid layer, but instead of the SGD optimiser I will use an adaptive learning rate optimiser (Adam), which is more efficient than default SGD

In [13]:
# Move the model to the target device before building the optimizer, so the
# optimizer is constructed over parameters that already live where they
# will be trained (the order recommended by the torch.optim documentation).
model = model.to(device)
# BCEWithLogitsLoss applies the sigmoid itself, so the model outputs raw logits.
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)
optimizer = optim.Adam(model.parameters())

### Training functions
In order to do training and validation I will need some auxiliary functions
* accuracy calculation
* training loop
* evaluation loop (which will use accuracy function)

In [14]:
def binary_accuracy(preds, y):
    """Return the fraction of predictions that match the labels.

    `preds` holds raw logits: they are passed through a sigmoid and rounded
    to 0/1 before being compared element-wise with `y`. The result is a
    tensor holding the share of matching entries.
    """
    hits = (torch.sigmoid(preds).round() == y).float()
    return hits.sum() / len(hits)

In [15]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    Each batch must expose `text` as a `(tokens, lengths)` pair and `label`.
    Returns `(mean_loss, mean_accuracy)`, both averaged over the number of
    batches in `iterator`.
    """
    model.train()
    running_loss = 0
    running_acc = 0

    for batch in iterator:
        optimizer.zero_grad()
        tokens, lengths = batch.text
        # The model emits one logit per example in a trailing dimension of size 1.
        logits = model(tokens, lengths).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [16]:
def evaluate(model, iterator, criterion):
    """Score `model` on `iterator` without updating its weights.

    Each batch must expose `text` as a `(tokens, lengths)` pair and `label`.
    Returns `(mean_loss, mean_accuracy)`, both averaged over the number of
    batches in `iterator`.
    """
    model.eval()
    running_loss = 0
    running_acc = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in iterator:
            tokens, lengths = batch.text
            logits = model(tokens, lengths).squeeze(1)
            batch_loss = criterion(logits, batch.label)
            batch_acc = binary_accuracy(logits, batch.label)
            running_loss += batch_loss.item()
            running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

### Main training loop
Here you can see a limited number of epochs (5) and a CUDA allocator setting (`max_split_size_mb:500`), chosen in order to be able to get a model in a reasonable amount of time

In [17]:
import os
import time

# NOTE(review): this allocator option is set after earlier cells have already
# moved the model to `device`; it may need to be set before the first CUDA
# call to have any effect - confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:500"
MODEL_OUTPUT = 'sentiment_lstm_glove.pt'
N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    # Time one full train + validation pass.
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()

    # Split the wall-clock duration into whole minutes and leftover seconds.
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))

    # Checkpoint only when the validation loss improves, so the file on disk
    # always holds the best epoch seen so far.
    improved = valid_loss < best_valid_loss
    if improved:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_OUTPUT)

    print(
        f'Epoch: {epoch+1:02} | Epoch Time: {elapsed_mins}m {elapsed_secs}s',
        f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%',
        f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%',
        sep='\n',
    )

Epoch: 01 | Epoch Time: 0m 45s
	Train Loss: 0.688 | Train Acc: 53.56%
	 Val. Loss: 0.635 |  Val. Acc: 64.55%
Epoch: 02 | Epoch Time: 0m 45s
	Train Loss: 0.586 | Train Acc: 69.76%
	 Val. Loss: 0.542 |  Val. Acc: 74.34%
Epoch: 03 | Epoch Time: 0m 44s
	Train Loss: 0.497 | Train Acc: 76.87%
	 Val. Loss: 0.443 |  Val. Acc: 81.11%
Epoch: 04 | Epoch Time: 0m 44s
	Train Loss: 0.418 | Train Acc: 82.00%
	 Val. Loss: 0.431 |  Val. Acc: 81.55%
Epoch: 05 | Epoch Time: 0m 45s
	Train Loss: 0.426 | Train Acc: 81.83%
	 Val. Loss: 0.476 |  Val. Acc: 80.96%


In [18]:
import pickle

# Persist the TEXT field, which holds the vocabulary built earlier, so that
# it can be reloaded outside this notebook.
with open('embeddings.pickle', 'wb') as pickle_file:
    pickle.dump(TEXT, pickle_file)

### Final results
In the end I can load the model and evaluate it on the hidden dataset

In [19]:
# Reload the best checkpoint written by the training loop. map_location
# places the stored tensors on the current device, so a checkpoint saved on
# a GPU can also be loaded on a CPU-only machine.
model.load_state_dict(torch.load(MODEL_OUTPUT, map_location=device))
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.446 | Test Acc: 80.56%


### Conclusion
For the advanced version with LSTM, GloVe embeddings, and adaptive learning rate optimization we got about 80.5% accuracy, which is significantly better than the baseline model. Given the limited CPU/memory of my PC, I consider 80.5% a good result for the interview exercise.

### Prod inference
Before we start the segmentation pipeline we have to check potential prod code for the web service.
I will cover this topic in this section. For a baseline we can simply filter out all non-alphanumeric characters and split the line into words

In [20]:
#import spacy
import re
#nlp = spacy.load('en_core_web_sm')

def predict_sentiment(model, sentence):
    """Return the model's sigmoid output (between 0 and 1) for `sentence`.

    Args:
        model: the trained classifier; it is switched to eval mode.
        sentence: raw input text.

    Returns:
        A Python float; values closer to 1 mean more positive sentiment.
    """
    model.eval()
    # NOTE(review): the training data was tokenized with spaCy (see TEXT),
    # while this baseline strips every non-alphanumeric character and splits
    # on spaces, so "It's" becomes the single token "Its" here. Consider
    # reusing the Field's tokenizer for train/inference parity.
    st = re.sub(r'[^A-Za-z0-9 ]+', '', sentence)
    # split() without an argument drops the empty tokens that split(' ')
    # produced for repeated, leading or trailing spaces. An input with no
    # tokens falls back to a single unknown token, because packing rejects
    # zero-length sequences.
    tokenized = st.split() or [TEXT.unk_token]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    # Add the batch dimension: (seq_len,) -> (seq_len, 1).
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()

In [21]:
# Smoke test: a clearly positive sentence should score above 0.5.
predict_sentiment(model, "It's a lovely day")

0.8141034245491028

In [22]:
# Smoke test: a negative sentence should score below 0.5.
predict_sentiment(model, "I hate brokkoli")

0.291689395904541

### Score mapping
The original goal of the exercise is to have a continuous value from -1 to 1, but we have a continuous value from 0 to 1. We can map the interval from 0 to 1 to the interval from -1 to 1.

In [24]:
def sentiment_score(text):
    """Map the classifier's 0..1 output for `text` onto the range -1..1.

    Uses the notebook-level `model`; 0 maps to -1, 0.5 to 0 and 1 to 1.
    """
    probability = predict_sentiment(model, text)
    return probability * 2.0 - 1.0

### Sanity checks
After all experiments it's good to run simple checks and validate that the model returns reasonable results

In [29]:
# Example of a negative sentence: the mapped score is expected to be below 0.
sentiment_score("I hate terribale brokkoli")

-0.5232957899570465

In [26]:
# Example of a neutral sentence: the mapped score is expected to be close to 0.
sentiment_score("I will read a book")

-0.07505971193313599

In [27]:
# Example of a positive sentence: the mapped score is expected to be above 0.
sentiment_score("It's a lovely day")

0.6282068490982056