In [20]:
import numpy as np
import pandas as pd
import torch.nn as nn
import torch
from torchtext.legacy import data
from torchtext.legacy import datasets
import random
import torch.optim as optim

### Basic tokenisation with spacy
To process the raw text we will use spaCy's small English model (`en_core_web_sm`) as the tokenizer (instead of the default `str.split` function) and build a vocabulary from the resulting set of tokens.

In [3]:
# Single seed reused throughout the notebook (torch here, `random` for the train/valid split).
SEED=1234
torch.manual_seed(SEED)
# Ask cuDNN to use deterministic algorithms so GPU runs are repeatable.
torch.backends.cudnn.deterministic = True
# Field for the review text: tokenised with the spaCy `en_core_web_sm` model.
TEXT = data.Field(tokenize = 'spacy', tokenizer_language = 'en_core_web_sm')
# Field for the sentiment label, stored as float because the loss used later
# in this notebook (BCEWithLogitsLoss) takes float targets.
LABEL = data.LabelField(dtype = torch.float)

### Dataset
For demonstration I will use the IMDB movie review dataset, which is available in the torchtext library
and has been used for sentiment analysis in many publications.
In order to detect overfitting we have to split the dataset into 3 parts:
* training set - used to train the ANN
* validation set - used for validation during training
* test set - held-out data used to check the model after training (this data is not used for training or validation)

### Strategy
Our goal is to get the prediction score as a continuous value from -1 to 1, where -1 is negative, 0 is neutral and 1 is positive. In this dataset we have only negative and positive labels, but we can train the model as a binary classifier that returns 0 for negative and 1 for positive sentiment, and then map the output of the ANN to the interval (-1; 1), e.g. $score = 2\,\sigma(logit) - 1$. An alternative solution is to train a multiclass classifier and use the probability of each class to compute the final score. However, a binary classifier is a good solution for this dataset (we don't have neutral labels).

In [4]:
# Load the IMDB reviews through the TEXT/LABEL fields; torchtext provides the
# official train/test partition.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# Carve a validation set out of the training data (default split ratio).
# `random.seed()` returns None, so it must not be used as the `random_state`
# value itself: seed the generator first, then pass its state explicitly.
# This yields the same split as before, but no longer relies on a side effect.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state = random.getstate())

### Dataset labels sanity check
Here we can see that the ratio of positive to negative labels in each dataset is around 1.0

In [29]:
def check_balance(ds, name):
    """Print how many positive and negative examples a dataset contains.

    Args:
        ds: iterable of examples; each example must expose a ``label``
            attribute. The label ``'pos'`` counts as positive, anything else
            as negative.
        name: human-readable dataset name used as the prefix of the message.

    The printed ``balance`` is ``positive / negative``. When there are no
    negative examples the ratio is reported as ``inf`` (or ``nan`` for an
    empty dataset) instead of raising ``ZeroDivisionError``.
    """
    positive = 0
    negative = 0
    for row in ds:
        if row.label == 'pos':
            positive += 1
        else:
            negative += 1

    if negative:
        balance = positive / negative
    elif positive:
        # Only positive samples: the ratio is unbounded.
        balance = float('inf')
    else:
        # Empty dataset: the ratio is undefined.
        balance = float('nan')

    print(name + " has %s/%s positive/negative samples" % (positive, negative), 'balance=%.02f' % balance)
    
# Report the class balance of every split, in the same order as before:
# train, test, validation.
for split, split_name in (
    (train_data, "train set"),
    (test_data, "test set"),
    (valid_data, "valid set"),
):
    check_balance(split, split_name)

train set has 8690/8810 positive/negative samples balance=0.99
test set has 12500/12500 positive/negative samples balance=1.00
valid set has 3810/3690 positive/negative samples balance=1.03


### Vocabulary
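For the baseline I will use a plain vocabulary built from the training set (capped at 25,000 tokens) together with a default, randomly initialised torch embedding layer (no pre-trained word vectors)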

In [5]:
# Upper bound on the number of distinct tokens kept in the text vocabulary.
MAX_VOCAB_SIZE = 25_000

# Both vocabularies are built from the training split only, so nothing from the
# validation/test data leaks into the token or label mappings.
TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

### Create training pipeline
In order to train our model I will create iterators over the train, validation and test datasets.
In addition, I will try to train all models with CUDA on my old NVIDIA card.

In [6]:
# Number of reviews per mini-batch.
BATCH_SIZE = 64

# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without a usable CUDA device (hard-coding 'cuda' fails there).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split; every batch is created directly on `device`.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    device = device)

### Model
Here we will have 3 layers:
* an embedding layer as input
* a default recurrent neural network layer (`nn.RNN`)
* a linear layer acting as the output layer on top of the final hidden state

Important:
Unfortunately my local PC is a bit old and it would take tons of time to train a really good model, so I artificially pruned/simplified the model in order to debug the pipeline and get some results in less than 5-6 hours.
You can see that the hidden layer size is reduced to 16 (on a more powerful PC it could be 128, 256 or even more)

Remark:
If I set the hidden layer size to 256, I get around 4.5 million parameters; the result of the pruning/simplification is 2.5 million parameters

Remark2:
I tried to run with a bigger hidden layer size, but I always got CUDA out-of-memory errors

In [8]:
class RNN(nn.Module):
    """Baseline sentiment model: embedding -> single-layer RNN -> linear head.

    Args:
        input_dim: number of rows of the embedding table (vocabulary size).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the recurrent hidden state.
        output_dim: number of values produced per example.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        # Attribute names are part of the saved state_dict keys; keep them stable.
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, text):
        """Return one row of ``output_dim`` raw scores per example in the batch.

        ``text`` holds token indices with the sequence dimension first
        (``nn.RNN`` is created with its default ``batch_first=False``).
        """
        token_vectors = self.embedding(text)
        all_states, last_state = self.rnn(token_vectors)
        final_hidden = last_state.squeeze(0)
        # Sanity check: the last time step of the output sequence must be the
        # final hidden state returned by the RNN.
        assert torch.equal(all_states[-1, :, :], final_hidden)
        return self.fc(final_hidden)

In [9]:
# Model hyperparameters.
INPUT_DIM = len(TEXT.vocab)   # one embedding row per vocabulary entry
EMBEDDING_DIM = 100
HIDDEN_DIM = 16               # deliberately small, see the remarks above
OUTPUT_DIM = 1                # a single score per review

model = RNN(
    input_dim=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
)

In [11]:
# Count the elements of every parameter tensor that receives gradients.
total_params = sum(
    weights.numel()
    for weights in model.parameters()
    if weights.requires_grad
)
print(f'Total trainable params={total_params}')

Total trainable params=2502105


### Training
For the baseline I used the SGD optimiser and the binary cross-entropy loss with a built-in sigmoid layer (`BCEWithLogitsLoss`)

In [12]:
# Move the model (and the loss module) to the target device *before* building
# the optimizer, as recommended by the torch.optim documentation, so the
# optimizer is guaranteed to reference the parameters that are actually trained.
model = model.to(device)
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

# Plain SGD baseline; BCEWithLogitsLoss applies the sigmoid internally, so the
# model outputs raw logits.
optimizer = optim.SGD(model.parameters(), lr=1e-3)

### Training functions
In order to do training and validation I will need some auxiliary functions:
* accuracy calculation
* training loop (which also uses the accuracy function)
* evaluation loop (which will use accuracy function)

In [13]:
def binary_accuracy(preds, y):
    """Return the fraction of predictions that match the targets.

    Args:
        preds: raw model outputs (logits); they are passed through a sigmoid
            and rounded to 0/1 before the comparison.
        y: target tensor compared element-wise against the rounded predictions.

    Returns:
        A scalar tensor: number of matches divided by ``len`` of the batch.
    """
    predicted_labels = torch.sigmoid(preds).round()
    hits = predicted_labels.eq(y).float()
    return hits.sum() / len(hits)

In [14]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    Args:
        model: network to train; put into training mode here.
        iterator: yields batches exposing ``text`` and ``label``; must support ``len``.
        optimizer: optimizer stepped once per batch.
        criterion: loss applied to the squeezed model output and the labels.

    Returns:
        ``(loss, accuracy)``, each the sum of the per-batch values divided by
        the number of batches.
    """
    model.train()
    running_loss = 0
    running_acc = 0

    for batch in iterator:
        # Gradients accumulate by default, so reset them for every batch.
        optimizer.zero_grad()
        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [15]:
def evaluate(model, iterator, criterion):
    """Measure loss and accuracy over ``iterator`` without updating the model.

    Args:
        model: network to evaluate; put into evaluation mode here.
        iterator: yields batches exposing ``text`` and ``label``; must support ``len``.
        criterion: loss applied to the squeezed model output and the labels.

    Returns:
        ``(loss, accuracy)``, each the sum of the per-batch values divided by
        the number of batches.
    """
    model.eval()
    running_loss = 0
    running_acc = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            running_loss += criterion(logits, batch.label).item()
            running_acc += binary_accuracy(logits, batch.label).item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

### Main training loop
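Here you can see a limited number of epochs (5) and a CUDA caching-allocator setting (`max_split_size_mb:500`), chosen in order to get a model in a reasonable amount of time. Note that `max_split_size_mb` does not cap GPU memory at 500 MB: it only prevents the allocator from splitting blocks larger than 500 MB, which can reduce fragmentation-related out-of-memory errors.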

In [17]:
import os
import time

# NOTE(review): this allocator option is set after the model has already been
# moved to the device in an earlier cell, so it may have no effect in this
# session -- confirm, and consider setting it before the first CUDA call.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:500"
MODEL_OUTPUT = 'sentiment_baseline.pt'
N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    epoch_started = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    # Wall-clock duration of the epoch, split into whole minutes and seconds.
    elapsed_time = time.time() - epoch_started
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))

    # Keep only the weights with the lowest validation loss seen so far.
    if best_valid_loss > valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_OUTPUT)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {elapsed_mins}m {elapsed_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 18s
	Train Loss: 0.694 | Train Acc: 49.58%
	 Val. Loss: 0.697 |  Val. Acc: 49.94%
Epoch: 02 | Epoch Time: 0m 17s
	Train Loss: 0.693 | Train Acc: 50.23%
	 Val. Loss: 0.697 |  Val. Acc: 49.10%
Epoch: 03 | Epoch Time: 0m 17s
	Train Loss: 0.694 | Train Acc: 50.27%
	 Val. Loss: 0.697 |  Val. Acc: 49.00%
Epoch: 04 | Epoch Time: 0m 17s
	Train Loss: 0.693 | Train Acc: 50.38%
	 Val. Loss: 0.696 |  Val. Acc: 49.07%
Epoch: 05 | Epoch Time: 0m 18s
	Train Loss: 0.693 | Train Acc: 50.47%
	 Val. Loss: 0.696 |  Val. Acc: 49.16%


### Final results
In the end I can load the best saved model and evaluate it on the held-out test dataset

In [18]:
# Restore the checkpoint with the best validation loss. `map_location` remaps
# the stored tensors onto the current device, so a checkpoint written on a GPU
# can still be loaded on a CPU-only machine (and vice versa).
model.load_state_dict(torch.load(MODEL_OUTPUT, map_location=device))
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.678 | Test Acc: 59.29%


### Conclusion
We got 59% accuracy on the test set, which is acceptable for a first attempt, but very bad as a final result. Note that the training and validation accuracy stayed around 50% in every epoch. I will use this "boilerplate" for multiple experiments. In addition, we can see that the loss function is not decreasing much, so we need to work on data quality and maybe the embeddings