In [15]:

from typing import Dict

import numpy as np
import torch
import torch.optim as optim
from allennlp.data.dataset_readers.stanford_sentiment_tree_bank import \
    StanfordSentimentTreeBankDatasetReader
from allennlp.data.iterators import BucketIterator
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask
from allennlp.training.metrics import CategoricalAccuracy, F1Measure
from allennlp.training.trainer import Trainer

from predictors import SentenceClassifierPredictor
EMBEDDING_DIM = 128
HIDDEN_DIM = 128

In [4]:
reader = StanfordSentimentTreeBankDatasetReader()

train_dataset = reader.read('data/stanfordSentimentTreebank/trees/train.txt')
dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

8544it [00:02, 3664.44it/s]
1101it [00:00, 2886.73it/s]


In [5]:
vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                  min_count={'tokens': 3})

100%|██████████| 9645/9645 [00:00<00:00, 36274.35it/s]


In [8]:
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_DIM)
# BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
# not for labels, which are used unchanged as the answer of the sentence classification
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

In [9]:
# Model in AllenNLP represents a model that is trained.
class LstmClassifier(Model):
    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        # We need the embeddings to convert word IDs to their vector representations
        self.word_embeddings = word_embeddings

        # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
        # (usually a sequence of embedded word vectors), processes it, and returns it as a single
        # vector. Oftentimes, this is an RNN-based architecture (e.g., LSTM or GRU), but
        # AllenNLP also supports CNNs and other simple architectures (for example,
        # just averaging over the input vectors).
        self.encoder = encoder

        # After converting a sequence of vectors to a single vector, we feed it into
        # a fully-connected linear layer to reduce the dimension to the total number of labels.
        self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                          out_features=vocab.get_vocab_size('labels'))
        self.accuracy = CategoricalAccuracy()

        # We use the cross-entropy loss because this is a classification task.
        # Note that PyTorch's CrossEntropyLoss combines softmax and log likelihood loss,
        # which makes it unnecessary to add a separate softmax layer.
        self.loss_function = torch.nn.CrossEntropyLoss()

    # Instances are fed to forward after batching.
    # Fields are passed through arguments with the same name.
    def forward(self,
                tokens: Dict[str, torch.Tensor],
                label: torch.Tensor = None) -> torch.Tensor:
        # In deep NLP, when sequences of tensors in different lengths are batched together,
        # shorter sequences get padded with zeros to make them of equal length.
        # Masking is the process to ignore extra zeros added by padding
        mask = get_text_field_mask(tokens)

        # Forward pass
        embeddings = self.word_embeddings(tokens)
        encoder_out = self.encoder(embeddings, mask)
        logits = self.hidden2tag(encoder_out)

        # In AllenNLP, the output of forward() is a dictionary.
        # Your output dictionary must contain a "loss" key for your model to be trained.
        output = {"logits": logits}
        if label is not None:
            self.accuracy(logits, label)
            output["loss"] = self.loss_function(logits, label)

        return output

In [10]:
lstm = PytorchSeq2VecWrapper(
    torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

model = LstmClassifier(word_embeddings, lstm, vocab)

In [11]:
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
iterator.index_with(vocab)

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=dev_dataset,
                  patience=10,
                  num_epochs=20)

trainer.train()

loss: 1.5788 ||: 100%|██████████| 267/267 [00:16<00:00, 15.71it/s]
loss: 1.5757 ||: 100%|██████████| 35/35 [00:00<00:00, 59.71it/s]
loss: 1.5656 ||: 100%|██████████| 267/267 [00:15<00:00, 16.80it/s]
loss: 1.5723 ||: 100%|██████████| 35/35 [00:00<00:00, 57.55it/s]
loss: 1.5588 ||: 100%|██████████| 267/267 [00:15<00:00, 16.99it/s]
loss: 1.5677 ||: 100%|██████████| 35/35 [00:00<00:00, 67.73it/s]
loss: 1.5384 ||: 100%|██████████| 267/267 [00:15<00:00, 16.72it/s]
loss: 1.5582 ||: 100%|██████████| 35/35 [00:00<00:00, 66.28it/s]
loss: 1.4750 ||: 100%|██████████| 267/267 [00:16<00:00, 14.35it/s]
loss: 1.5267 ||: 100%|██████████| 35/35 [00:00<00:00, 53.23it/s]
loss: 1.3601 ||: 100%|██████████| 267/267 [00:14<00:00, 20.11it/s]
loss: 1.4941 ||: 100%|██████████| 35/35 [00:00<00:00, 78.50it/s] 
loss: 1.2145 ||: 100%|██████████| 267/267 [00:14<00:00, 18.45it/s]
loss: 1.4841 ||: 100%|██████████| 35/35 [00:00<00:00, 75.86it/s]
loss: 1.0755 ||: 100%|██████████| 267/267 [00:14<00:00, 18.42it/s]
loss: 1.

{'training_duration': '00:04:22',
 'training_start_epoch': 0,
 'training_epochs': 15,
 'epoch': 15,
 'training_loss': 0.582126122847032,
 'validation_loss': 2.244421570641654,
 'best_epoch': 6,
 'best_validation_loss': 1.4841412782669068}

In [17]:
tokens = ['This', 'is', 'the', 'best', 'movie', 'ever', '!']
predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
logits = predictor.predict('This is the best movie ever!')['logits']
label_id = np.argmax(logits)

print(model.vocab.get_token_from_index(label_id, 'labels'))

4
