<a href="https://colab.research.google.com/github/ankit1khare/Sentiment_Analysis_is_fun_with_Allen_NLP/blob/master/Sentiment_ALLEN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Getting the resources
# Clone the helper repository that provides SentenceClassifierPredictor and move
# predictors.py up one level so that `realworldnlp.predictors` resolves to
# ./realworldnlp/predictors.py when imported from the notebook's working directory.
!git clone https://github.com/mhagiwara/realworldnlp.git
!mv ./realworldnlp/realworldnlp/predictors.py ./realworldnlp

# Download and unpack the Stanford Sentiment Treebank trees (train/dev/test).
# The archive name must match the one passed to unzip below.
!wget https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip
!unzip trainDevTestTrees_PTB.zip

In [0]:
from typing import Dict

import numpy as np
import torch
import torch.optim as optim
from allennlp.data.dataset_readers.stanford_sentiment_tree_bank import \
    StanfordSentimentTreeBankDatasetReader
from allennlp.data.iterators import BucketIterator
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.training.trainer import Trainer

from realworldnlp.predictors import SentenceClassifierPredictor

# Size of each token embedding vector; also used as the input size of the LSTM encoder.
EMBEDDING_DIM = 128
# Size of the LSTM hidden state.
HIDDEN_DIM = 128

# Model in AllenNLP represents a model that is trained.
# NOTE(review): re-executing this cell in the same kernel may fail if the registry
# rejects an already-registered name -- confirm against the installed AllenNLP version.
@Model.register("lstm_classify") #use a name different from class for the model
class LstmClassifier(Model):
    """Sentence classifier: embed tokens, encode them to one vector, project to label logits.

    The pipeline is ``word_embeddings -> encoder -> linear layer``. When gold labels
    are supplied, the model also computes a cross-entropy loss and updates a
    categorical-accuracy metric.
    """

    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 vocab: Vocabulary) -> None:
        """Store the sub-modules and build the output projection.

        Args:
            word_embeddings: Embedder applied to the ``tokens`` text field.
            encoder: Sequence-to-vector encoder; its ``get_output_dim()`` sets the
                input width of the final linear layer.
            vocab: Vocabulary; the size of its ``'labels'`` namespace sets the
                number of output classes.
        """
        super().__init__(vocab)
        # We need the embeddings to convert word IDs to their vector representations
        self.word_embeddings = word_embeddings

        # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
        # (usually a sequence of embedded word vectors), processes it, and returns a single
        # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
        # AllenNLP also supports CNNs and other simple architectures (for example,
        # just averaging over the input vectors).
        self.encoder = encoder

        # After converting a sequence of vectors to a single vector, we feed it into
        # a fully-connected linear layer to reduce the dimension to the total number of labels.
        self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                          out_features=vocab.get_vocab_size('labels'))
        self.accuracy = CategoricalAccuracy()

        # We use the cross entropy loss because this is a classification task.
        # Note that PyTorch's CrossEntropyLoss combines softmax and log likelihood loss,
        # which makes it unnecessary to add a separate softmax layer.
        self.loss_function = torch.nn.CrossEntropyLoss()

    # Instances are fed to forward after batching.
    # Fields are passed through arguments with the same name.
    def forward(self,
                tokens: Dict[str, torch.Tensor],
                label: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        """Run the classifier on a batch.

        Args:
            tokens: Indexed text field for the batch, as consumed by the embedder
                and by ``get_text_field_mask``.
            label: Optional gold labels. When given, the accuracy metric is updated
                and a loss is added to the output.

        Returns:
            A dict with ``"logits"`` and, only when ``label`` is provided, ``"loss"``.
        """
        # In deep NLP, when sequences of tensors in different lengths are batched together,
        # shorter sequences get padded with zeros to make them equal length.
        # Masking is the process to ignore extra zeros added by padding
        mask = get_text_field_mask(tokens)

        # Forward pass
        embeddings = self.word_embeddings(tokens)
        encoder_out = self.encoder(embeddings, mask)
        logits = self.hidden2tag(encoder_out)

        # In AllenNLP, the output of forward() is a dictionary.
        # Your output dictionary must contain a "loss" key for your model to be trained.
        output = {"logits": logits}
        if label is not None:
            self.accuracy(logits, label)
            output["loss"] = self.loss_function(logits, label)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return the accumulated accuracy as ``{"accuracy": value}``.

        Args:
            reset: Forwarded to the metric's ``get_metric``.
        """
        return {"accuracy": self.accuracy.get_metric(reset)}


In [0]:
def main(train_path: str = './trees/train.txt',
         dev_path: str = './trees/dev.txt',
         num_epochs: int = 10) -> None:
    """Train the LSTM sentiment classifier and print a prediction for a sample sentence.

    Args:
        train_path: Path to the training trees file.
        dev_path: Path to the validation trees file.
        num_epochs: Number of training epochs passed to the trainer.

    Calling ``main()`` with no arguments uses the same paths and epoch count
    that were previously hard-coded.
    """
    reader = StanfordSentimentTreeBankDatasetReader()

    train_dataset = reader.read(train_path)
    dev_dataset = reader.read(dev_path)

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    lstm = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, lstm, vocab)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    # Batches are sorted by the number of tokens in the "tokens" field.
    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      num_epochs=num_epochs)

    trainer.train()

    # Demo: classify one hand-written sentence with the freshly trained model.
    tokens = ['This', 'is', 'the', 'worst', 'movie', 'ever', '!']

    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    logits = predictor.predict(tokens)['logits']
    label_id = np.argmax(logits)

    # Legend for the label values; each entry starts at column 0 of its own line.
    print("0 - very negative\n1 - negative\n2 - neutral\n3 - positive\n4 - very positive\n")
    # Map the arg-max index back to its label string via the 'labels' namespace.
    print("Sentiment level: {}".format(model.vocab.get_token_from_index(label_id, 'labels')))


In [0]:
# tokens = ['This', 'is', 'the', 'best', 'movie', 'ever', '!']
# # model = torch.load(PATH)
# # model.eval()

# # reader = StanfordSentimentTreeBankDatasetReader()
# # train_dataset = reader.read('./trees/train.txt')
# # dev_dataset = reader.read('./trees/dev.txt')
  
# predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
# logits = predictor.predict(tokens)['logits']
# label_id = np.argmax(logits)

# print(model.vocab.get_token_from_index(label_id, 'labels'))


In [47]:
# Entry point: runs the whole pipeline in main() (training followed by the
# sample prediction), so executing this cell starts a full training run.
if __name__ == '__main__':
    main()

0it [00:00, ?it/s]01/19/2019 18:25:26 - INFO - allennlp.data.dataset_readers.stanford_sentiment_tree_bank -   Reading instances from lines in file at: ./trees/train.txt
8544it [00:01, 4334.71it/s]
0it [00:00, ?it/s]01/19/2019 18:25:28 - INFO - allennlp.data.dataset_readers.stanford_sentiment_tree_bank -   Reading instances from lines in file at: ./trees/dev.txt
1101it [00:00, 4616.91it/s]
01/19/2019 18:25:28 - INFO - allennlp.data.vocabulary -   Fitting token dictionary from dataset.
100%|██████████| 9645/9645 [00:00<00:00, 48429.53it/s]
01/19/2019 18:25:28 - INFO - allennlp.training.trainer -   Beginning training.
01/19/2019 18:25:28 - INFO - allennlp.training.trainer -   Epoch 0/9
01/19/2019 18:25:28 - INFO - allennlp.training.trainer -   Peak CPU memory usage MB: 517.616
01/19/2019 18:25:28 - INFO - allennlp.training.trainer -   GPU 0 memory usage MB: 11
01/19/2019 18:25:28 - INFO - allennlp.training.trainer -   Training
accuracy: 0.2577, loss: 1.5797 ||: 100%|██████████| 267/267 [0

0 - very negative
 1 - negative
 2 - neutral
 3 - positive
 4 - very positive

Sentiment level: 0
