In [1]:
import collections
import datasets
import functools
import mininlp
import pytorch_lightning as pl
import random
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tqdm

In [2]:
seed = 1234

_ = pl.seed_everything(seed)

Global seed set to 1234


## Loading the Dataset

In [3]:
train_data, test_data = datasets.load_dataset('imdb', split=['train', 'test'])

Reusing dataset imdb (/home/ben/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3)


In [4]:
def get_train_valid_split(train_data, valid_ratio=0.2, shuffle=True):
    data = train_data.train_test_split(test_size=valid_ratio, shuffle=shuffle)
    train_data = data['train']
    valid_data = data['test']
    return train_data, valid_data

In [5]:
valid_ratio = 0.2
shuffle = True

train_data, valid_data = get_train_valid_split(train_data, valid_ratio, shuffle)

Loading cached split indices for dataset at /home/ben/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3/cache-c7753a32c7c1dfde.arrow and /home/ben/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3/cache-69d48dab60f93b72.arrow


## Initializing the Tokenizer

In [6]:
nlp = spacy.load('en_core_web_sm')

def spacy_tokenize(s: str, nlp: spacy.lang):
    return [t.text for t in nlp.tokenizer(s)]

In [7]:
_spacy_tokenize = functools.partial(spacy_tokenize, nlp=nlp)

In [8]:
tokenizer = mininlp.tokenizer.Tokenizer(_spacy_tokenize)

## Building the Vocabulary

In [9]:
field = 'text'

counter = mininlp.vocab.build_vocab_counter(train_data, field, tokenizer)

In [10]:
min_freq = 6
max_size = 30_000

vocab = mininlp.vocab.Vocab(counter, min_freq, max_size)

## Creating the DataLoader

In [11]:
text_transforms = mininlp.transforms.sequential_transforms(tokenizer.tokenize,
                                                           vocab.stoi,
                                                           mininlp.transforms.to_longtensor)

In [12]:
label_transforms = mininlp.transforms.sequential_transforms(mininlp.transforms.to_longtensor)

In [13]:
train_dataset = mininlp.dataset.TextClassificationDataset(train_data, text_transforms, label_transforms)

In [14]:
valid_dataset = mininlp.dataset.TextClassificationDataset(valid_data, text_transforms, label_transforms)

In [15]:
test_dataset = mininlp.dataset.TextClassificationDataset(test_data, text_transforms, label_transforms)

In [16]:
pad_idx = vocab.stoi(vocab.pad_token)

In [17]:
collator = mininlp.collator.TextClassificationCollator(pad_idx)

In [18]:
batch_size = 256

In [19]:
train_loader = torch.utils.data.DataLoader(train_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=True, 
                                           collate_fn=collator.collate,
                                           num_workers=torch.get_num_threads())

In [20]:
valid_loader = torch.utils.data.DataLoader(valid_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=False, 
                                           collate_fn=collator.collate,
                                           num_workers=torch.get_num_threads())

In [21]:
test_loader = torch.utils.data.DataLoader(train_dataset, 
                                          batch_size=batch_size, 
                                          shuffle=False, 
                                          collate_fn=collator.collate,
                                          num_workers=torch.get_num_threads())

## Creating the NBOW model

In [22]:
class NBOW(pl.LightningModule):
    def __init__(self, input_dim: int, emb_dim: int, output_dim: int, pad_idx: int):
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx = pad_idx)
        self.fc = nn.Linear(emb_dim, output_dim)
        
    def forward(self, text: torch.LongTensor) -> torch.FloatTensor:
        #text = [seq len, batch size]
        embedded = self.embedding(text)
        # embedded = [seq len, batch size, emb dim]
        pooled = embedded.mean(0)
        # pooled = [batch size, emb dim]
        prediction = self.fc(pooled)
        # prediction = [batch size, output dim]
        return prediction
    
    def training_step(self, batch, batch_idx):
        text, labels = batch
        predictions = self.forward(text)
        loss = F.cross_entropy(predictions, labels)
        acc = mininlp.utils.calculate_accuracy(predictions, labels)
        return {'loss': loss, 'acc': acc}
    
    def training_epoch_end(self, training_step_outputs):
        loss, acc = self.calculate_metrics(training_step_outputs)
        print(f'Epoch: {self.current_epoch:2}')
        print(f'  Train Loss: {loss:.3f} | Train Acc: {acc*100:.2f}%')
    
    def validation_step(self, batch, batch_idx):
        text, labels = batch
        predictions = self.forward(text)
        loss = F.cross_entropy(predictions, labels)
        acc = mininlp.utils.calculate_accuracy(predictions, labels)
        self.log('valid_loss', loss)
        return {'loss': loss, 'acc': acc}
        
    def validation_epoch_end(self, validation_step_outputs):
        loss, acc = self.calculate_metrics(validation_step_outputs)
        print(f'  Valid Loss: {loss:.3f} | Valid Acc: {acc*100:.2f}%')
        
    def test_step(self, batch, batch_idx):
        text, labels = batch
        predictions = self.forward(text)
        loss = F.cross_entropy(predictions, labels)
        acc = mininlp.utils.calculate_accuracy(predictions, labels)
        return {'loss': loss, 'acc': acc}
        
    def test_epoch_end(self, test_step_outputs):
        loss, acc = self.calculate_metrics(test_step_outputs)
        print(f'Test Loss: {loss:.3f} | Test Acc: {acc*100:.2f}%')
        
    def configure_optimizers(self):
        return optim.Adam(self.parameters())
    
    def calculate_metrics(self, step_outputs):
        loss = torch.mean(torch.stack([x['loss'] for x in step_outputs]))
        acc = torch.mean(torch.stack([x['acc'] for x in step_outputs]))
        return loss, acc

In [23]:
input_dim = len(vocab)
emb_dim = 100
output_dim = 2

model = NBOW(input_dim, emb_dim, output_dim, pad_idx)

In [24]:
early_stopping_callback = pl.callbacks.EarlyStopping(monitor='valid_loss',
                                                     mode='min',
                                                     patience=0)

In [25]:
checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor='valid_loss',
                                                   mode='min')

In [26]:
trainer = pl.Trainer(max_epochs=10,
                     gpus=-1,
                     callbacks=[early_stopping_callback,
                                checkpoint_callback],
                     deterministic=True,
                     num_sanity_val_steps=0,
                     progress_bar_refresh_rate=0)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


In [27]:
_ = trainer.fit(model, train_loader, valid_loader)


  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 2.8 M 
1 | fc        | Linear    | 202   
----------------------------------------
2.8 M     Trainable params
0         Non-trainable params
2.8 M     Total params


Epoch:  0
  Train Loss: 0.689 | Train Acc: 58.89%
  Valid Loss: 0.685 | Valid Acc: 63.93%
Epoch:  1
  Train Loss: 0.675 | Train Acc: 68.34%
  Valid Loss: 0.665 | Valid Acc: 70.68%
Epoch:  2
  Train Loss: 0.647 | Train Acc: 73.36%
  Valid Loss: 0.631 | Valid Acc: 74.27%
Epoch:  3
  Train Loss: 0.605 | Train Acc: 76.33%
  Valid Loss: 0.589 | Valid Acc: 76.96%
Epoch:  4
  Train Loss: 0.562 | Train Acc: 79.28%
  Valid Loss: 0.548 | Valid Acc: 79.05%
Epoch:  5
  Train Loss: 0.519 | Train Acc: 81.99%
  Valid Loss: 0.510 | Valid Acc: 80.96%
Epoch:  6
  Train Loss: 0.480 | Train Acc: 84.12%
  Valid Loss: 0.477 | Valid Acc: 82.44%
Epoch:  7
  Train Loss: 0.444 | Train Acc: 85.72%
  Valid Loss: 0.449 | Valid Acc: 83.84%
Epoch:  8
  Train Loss: 0.412 | Train Acc: 86.88%
  Valid Loss: 0.424 | Valid Acc: 84.95%
Epoch:  9
  Train Loss: 0.385 | Train Acc: 88.02%
  Valid Loss: 0.404 | Valid Acc: 85.73%


In [28]:
_ = trainer.test(test_dataloaders=test_loader, verbose=False)

Test Loss: 0.369 | Test Acc: 88.47%


In [29]:
def predict(sentence, text_transforms, model):
    model.eval()
    tensor = text_transforms(sentence).unsqueeze(-1).to(model.device)
    prediction = model(tensor)
    probabilities = nn.functional.softmax(prediction, dim=-1)
    pos_probability = probabilities.squeeze(0)[-1].item()
    return pos_probability

In [30]:
sentence = 'the absolute worst movie of all time.'

predict(sentence, text_transforms, model)

5.7020774768545834e-09

In [31]:
sentence = 'one of the greatest films i have ever seen in my life.'

predict(sentence, text_transforms, model)

1.0

In [32]:
sentence = "i thought it was going to be one of the greatest films i have ever seen in my life, \
but it was actually the absolute worst movie of all time."

predict(sentence, text_transforms, model)

0.9619628190994263

In [33]:
sentence = "i thought it was going to be the absolute worst movie of all time, \
but it was actually one of the greatest films i have ever seen in my life."

predict(sentence, text_transforms, model)

0.9619628190994263