In [2]:
import torch
from torchtext.legacy import data
import random
from torchtext.legacy import datasets

SEED = 1234

# Seed every random number generator the notebook relies on up front:
# Python's `random` (used for the train/validation split) and PyTorch.
random.seed(SEED)
torch.manual_seed(SEED)
# Ask cuDNN to pick deterministic algorithms so GPU runs are repeatable.
torch.backends.cudnn.deterministic = True

  from .autonotebook import tqdm as notebook_tqdm


### Experiment
In this notebook I compare different text preprocessing and segmentation approaches in order to get the best score.
To do this I prepared the following experiment:
1. Freeze the sentiment model
2. Implement multiple text preprocessing pipelines
3. Calculate the accuracy score with the frozen model and the different pipelines
4. Get the best result for web service

### Dataset
Here we load 1000 samples (500 positive and 500 negative reviews) from a validation split of the IMDB training data and use them as the benchmark dataset

In [3]:
# Field definitions used to read IMDB: spaCy tokenization with the
# `en_core_web_sm` model; labels are stored as floats.
texts = data.Field(tokenize = 'spacy',
                  tokenizer_language = 'en_core_web_sm',
                  include_lengths = True)
labels = data.LabelField(dtype = torch.float)

train_data, test_data = datasets.IMDB.splits(texts, labels)

# `random.seed()` returns None, so passing its result as `random_state` only
# worked through the seeding side effect. Seed explicitly and hand the actual
# generator state to `split` so the intent is visible and the split is
# reproducible.
random.seed(SEED)
_, valid_data = train_data.split(random_state = random.getstate())

# Re-join the tokens into plain text and map the string label to 1 (pos) / 0 (neg).
validation = [(' '.join(v.text), 1 if v.label=='pos' else 0) for v in valid_data]

# Number of reviews kept per class for the benchmark set.
SAMPLES_PER_CLASS = 500

positive_tests = []
negative_tests = []
for (text, label) in validation:
    if label == 1 and len(positive_tests) < SAMPLES_PER_CLASS:
        positive_tests.append((text, label))
    elif label == 0 and len(negative_tests) < SAMPLES_PER_CLASS:
        negative_tests.append((text, label))
    # Stop scanning as soon as both buckets are full.
    if len(positive_tests) >= SAMPLES_PER_CLASS and len(negative_tests) >= SAMPLES_PER_CLASS:
        break

In [5]:
# No separate balance check is needed: the selection loop caps each class at
# 500 reviews, so the two counts below show how many were actually collected.
for subset in (positive_tests, negative_tests):
    print(len(subset))

500
500


In [20]:
# Benchmark set: all selected positive reviews followed by all selected
# negative ones, as (text, label) pairs.
balanced_validation = [*positive_tests, *negative_tests]
len(balanced_validation)

1000

### Load trained model
Here I load the model from the previous notebook. To do this, the following steps are needed:
* Load pickled embeddings
* Define the model class (copy/paste from previous notebook)
* Load trained model weights

In [6]:
import pickle
# Restore the pickled object holding the vocabulary used by the model.
# NOTE: unpickling can execute arbitrary code, so only load a file that was
# produced by a trusted source.
with open('embeddings.pickle', 'rb') as embeddings_file:
    TEXT = pickle.load(embeddings_file)

In [7]:
import torch.nn as nn

class RNN(nn.Module):
    """LSTM sentiment classifier: embedding -> (multi-layer) LSTM -> linear head.

    The final hidden state of the last LSTM layer is fed to the linear layer.
    For a bidirectional LSTM the last forward and last backward hidden states
    are concatenated; for a unidirectional LSTM only the last layer's hidden
    state is used.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        """Build the layers.

        Args:
            vocab_size: number of rows in the embedding table.
            embedding_dim: size of each token embedding.
            hidden_dim: LSTM hidden size per direction.
            output_dim: size of the output produced by the linear head.
            n_layers: number of stacked LSTM layers.
            bidirectional: whether the LSTM runs in both directions.
            dropout: dropout probability, used inside the LSTM and on the
                embeddings / final hidden state.
            pad_idx: index of the padding token in the vocabulary.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        # The head sees one hidden vector per direction, so its input width
        # must follow the `bidirectional` flag instead of always being doubled.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return raw (pre-sigmoid) scores of shape (batch, output_dim).

        Args:
            text: LongTensor of token indices shaped (seq_len, batch).
            text_lengths: 1-D tensor with the true length of every sequence in
                the batch. With the default `pack_padded_sequence` settings
                the lengths must be sorted in decreasing order.
        """
        embedded = self.dropout(self.embedding(text))
        # pack_padded_sequence requires the lengths on the CPU.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'))
        # Only the final hidden state is needed; the per-step outputs and the
        # cell state are discarded.
        _, (hidden, _) = self.rnn(packed_embedded)
        if self.rnn.bidirectional:
            # The last two entries are the top layer's forward and backward states.
            final_hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else:
            final_hidden = hidden[-1,:,:]
        return self.fc(self.dropout(final_hidden))

In [8]:
# Hyper-parameters of the network. They have to describe the same architecture
# as the checkpoint that is loaded afterwards.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
# NOTE(review): the original trailing comment "#256" suggests a larger hidden
# size was used at some point - confirm 16 matches the saved weights.
HIDDEN_DIM = 16
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# Keyword arguments make the mapping from constant to constructor parameter explicit.
model = RNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

In [10]:
MODEL_OUTPUT = 'sentiment_lstm_glove.pt'
# Inference in this notebook runs on the CPU. `map_location` remaps tensors
# that were saved from a GPU so the checkpoint also loads on CPU-only hosts.
model.load_state_dict(torch.load(MODEL_OUTPUT, map_location=torch.device('cpu')))

<All keys matched successfully>

### Benchmarks
For the benchmarks I prepared a modified version of the `predict_sentiment` function: it takes an additional argument, the preprocessing function that turns raw text into tokens. In addition, we switch the device to CPU, because production inference should not run on a GPU (to save resources).

In [11]:
DEVICE_NAME = 'cpu'
device = torch.device(DEVICE_NAME)

def predict_sentiment(model, sentence, proc_func):
    """Score a single piece of text with the sentiment model.

    Args:
        model: network called as ``model(tokens, lengths)``; it is switched to
            evaluation mode before the forward pass.
        sentence: raw text to score.
        proc_func: callable turning ``sentence`` into a sequence of tokens.

    Returns:
        The sigmoid of the model output as a Python float in [0, 1].
    """
    model.eval()
    
    tokenized = proc_func(sentence)
    # Map tokens to vocabulary indices.
    # NOTE(review): presumably unknown tokens resolve to the <unk> index via
    # the vocabulary's default lookup - confirm against the pickled vocab.
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]
    
    tensor = torch.LongTensor(indexed).to(device)
    # Add a batch dimension of size 1: (seq_len,) -> (seq_len, 1).
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)
    # Pure inference: do not build an autograd graph.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()

# Methods
The first two methods are simple: a naive version based on a regular expression, and an intermediate version based on spaCy tokenisation.

In [12]:
import re
import spacy
# Load the small English spaCy pipeline once; it is reused below for
# tokenization (`nlp.tokenizer`) and for sentence segmentation (`nlp(text).sents`).
nlp = spacy.load('en_core_web_sm')

def naive_preprocessing(text):
    """Tokenize ``text`` with a regex clean-up followed by a split on spaces.

    Every character that is not an ASCII letter, a digit or a space is
    removed, then the remainder is split on the single-space character.
    Because the split is on a literal ' ', leading, trailing or repeated
    spaces produce empty-string tokens, and an empty input yields ``['']``.
    """
    return re.sub(r'[^A-Za-z0-9 ]+', '', text).split(' ')

def spacy_tokenizer(text):
    """Tokenize ``text`` with the spaCy tokenizer and return the token strings."""
    tokens = []
    for token in nlp.tokenizer(text):
        tokens.append(token.text)
    return tokens

In [13]:
 # Smoke test: score a short sentence using the regex-based pipeline.
 predict_sentiment(model, "This film is great", naive_preprocessing)

0.7301585674285889

### Segmentation
The advanced approach is to segment the text first: split it into multiple utterances (sentences) and run inference on each one. We then take the mean of the per-sentence scores and treat it as the final result.

Remark:
As an additional idea, we could train an extra model that returns the final score based on all sentences, but this is not implemented here.

In [15]:
def segmented_sentiment(text, proc_func):
    """Classify ``text`` by averaging per-sentence sentiment scores.

    The text is split into sentences with spaCy, every sentence is scored with
    ``predict_sentiment`` using ``proc_func`` as the tokenizer, and the mean
    score is rounded to the nearest integer.

    Args:
        text: raw text to classify.
        proc_func: callable turning a sentence into a sequence of tokens.

    Returns:
        The rounded mean score as an int (built-in ``round``, so a mean of
        exactly 0.5 rounds to 0).

    Raises:
        ValueError: if spaCy finds no sentence in ``text`` (e.g. empty input),
            instead of failing with a division by zero.
    """
    # Named `doc` so the local does not shadow the imported `data` module name.
    doc = nlp(text)
    scores = [predict_sentiment(model, str(sent), proc_func) for sent in doc.sents]
    if not scores:
        raise ValueError("segmented_sentiment: text contains no sentences to score")
    return round(sum(scores) / float(len(scores)))

### Benchmark
For benchmark we have 4 pipelines:
* naive approach with regular expression
* spacy tokenisation for all text
* segmentation + regular expression + avg
* segmentation + spacy tokenisation + avg

The best result will be used in the web service

In [24]:
from sklearn.metrics import accuracy_score
def benchmark_tokenisation():
    """Print the accuracy of each whole-text tokenization pipeline.

    Every review in ``balanced_validation`` is scored in one pass with
    ``predict_sentiment``; the score is rounded to a 0/1 label and compared
    with the expected label. One line per pipeline is printed.
    """
    expected = [sample[1] for sample in balanced_validation]
    for pipeline in (naive_preprocessing, spacy_tokenizer):
        predicted = []
        for sample in balanced_validation:
            predicted.append(round(predict_sentiment(model, sample[0], pipeline)))
        print(pipeline.__name__, accuracy_score(expected, predicted))

In [25]:
# Evaluate both whole-text pipelines on the balanced benchmark set.
benchmark_tokenisation()

naive_preprocessing 0.81
spacy_tokenizer 0.81


In [27]:
def benchmark_segmentation():
    """Print the accuracy of each tokenizer when combined with segmentation.

    Every review in ``balanced_validation`` is classified with
    ``segmented_sentiment`` (per-sentence scoring followed by averaging) and
    compared with the expected label. One line per pipeline is printed.
    """
    expected = [sample[1] for sample in balanced_validation]
    for pipeline in (naive_preprocessing, spacy_tokenizer):
        predicted = []
        for sample in balanced_validation:
            predicted.append(round(segmented_sentiment(sample[0], pipeline)))
        print("segment + " + pipeline.__name__, accuracy_score(expected, predicted))

In [28]:
# Evaluate both segmentation-based pipelines on the balanced benchmark set.
benchmark_segmentation()

segment + naive_preprocessing 0.798
segment + spacy_tokenizer 0.831


### Conclusion
As we can see, segmentation + spaCy tokenization + averaging performs better than the other pipelines: 0.831 accuracy versus 0.81 for whole-text tokenization, a gain of about 2 percentage points obtained purely from the preprocessing pipeline.