<a href="https://colab.research.google.com/github/baluvanan/AllenNLP/blob/main/Sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup 

In [1]:
!pip install allennlp==1.0.0
!pip install allennlp-models==1.0.0
!git clone https://github.com/mhagiwara/realworldnlp.git
%cd realworldnlp

Collecting allennlp==1.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/2c/49/bf0ec241496a82c9dd2f0b6ff6f8156b6b2b72b849df8c00a4f2bcf61485/allennlp-1.0.0-py3-none-any.whl (473kB)
[K     |▊                               | 10kB 11.1MB/s eta 0:00:01[K     |█▍                              | 20kB 15.7MB/s eta 0:00:01[K     |██                              | 30kB 13.7MB/s eta 0:00:01[K     |██▊                             | 40kB 11.3MB/s eta 0:00:01[K     |███▌                            | 51kB 8.7MB/s eta 0:00:01[K     |████▏                           | 61kB 8.5MB/s eta 0:00:01[K     |████▉                           | 71kB 9.1MB/s eta 0:00:01[K     |█████▌                          | 81kB 8.4MB/s eta 0:00:01[K     |██████▎                         | 92kB 8.6MB/s eta 0:00:01[K     |███████                         | 102kB 8.7MB/s eta 0:00:01[K     |███████▋                        | 112kB 8.7MB/s eta 0:00:01[K     |████████▎                       | 122kB 8.

In [14]:
import torch
import torch.optim as optim

# Constants

In [6]:
EMBEDDING_DIM = 128
HIDDEN_DIM = 128

# Data setup

In [4]:
from allennlp_models.classification.dataset_readers.stanford_sentiment_tree_bank import \
    StanfordSentimentTreeBankDatasetReader

reader = StanfordSentimentTreeBankDatasetReader()
train_dataset = reader.read('https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/train.txt')
dev_dataset = reader.read('https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/dev.txt')


0it [00:00, ?it/s]
  0%|          | 0/2160058 [00:00<?, ?B/s][A
  2%|▏         | 43008/2160058 [00:00<00:06, 330310.71B/s][A
  9%|▉         | 199680/2160058 [00:00<00:04, 422780.65B/s][A
100%|██████████| 2160058/2160058 [00:00<00:00, 4690184.31B/s]
8544it [00:03, 2715.78it/s]
0it [00:00, ?it/s]
  0%|          | 0/280825 [00:00<?, ?B/s][A
 20%|██        | 56320/280825 [00:00<00:00, 488038.25B/s][A
100%|██████████| 280825/280825 [00:00<00:00, 1124682.56B/s]
1101it [00:01, 795.47it/s]


In [15]:
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper

vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                  min_count={'tokens': 3})
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

encoder = PytorchSeq2VecWrapper(
    torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

100%|██████████| 9645/9645 [00:00<00:00, 52696.11it/s]


In [11]:
vocab.

['from_instances', 'from_files', 'extend', 'empty']

# Model 

In [18]:
from allennlp.models import Model
from typing import Dict

@Model.register("lstm_classifier")
class LstmClassifier(Model):
    def __init__(self,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 vocab: Vocabulary,
                 positive_label: str = '4') -> None:
        super().__init__(vocab)
        # We need the embeddings to convert word IDs to their vector representations
        self.embedder = embedder

        self.encoder = encoder

        # After converting a sequence of vectors to a single vector, we feed it into
        # a fully-connected linear layer to reduce the dimension to the total number of labels.
        self.linear = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                      out_features=vocab.get_vocab_size('labels'))

        # Monitor the metrics - we use accuracy, as well as prec, rec, f1 for 4 (very positive)
        positive_index = vocab.get_token_index(positive_label, namespace='labels')
        self.accuracy = CategoricalAccuracy()
        self.f1_measure = F1Measure(positive_index)

        # We use the cross entropy loss because this is a classification task.
        # Note that PyTorch's CrossEntropyLoss combines softmax and log likelihood loss,
        # which makes it unnecessary to add a separate softmax layer.
        self.loss_function = torch.nn.CrossEntropyLoss()

    # Instances are fed to forward after batching.
    # Fields are passed through arguments with the same name.
    def forward(self,
                tokens: Dict[str, torch.Tensor],
                label: torch.Tensor = None) -> torch.Tensor:
        # In deep NLP, when sequences of tensors in different lengths are batched together,
        # shorter sequences get padded with zeros to make them equal length.
        # Masking is the process to ignore extra zeros added by padding
        mask = get_text_field_mask(tokens)

        # Forward pass
        embeddings = self.embedder(tokens)
        encoder_out = self.encoder(embeddings, mask)
        logits = self.linear(encoder_out)

        # In AllenNLP, the output of forward() is a dictionary.
        # Your output dictionary must contain a "loss" key for your model to be trained.
        output = {"logits": logits}
        if label is not None:
            self.accuracy(logits, label)
            self.f1_measure(logits, label)
            output["loss"] = self.loss_function(logits, label)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        precision, recall, f1_measure = self.f1_measure.get_metric(reset)
        return {'accuracy': self.accuracy.get_metric(reset),
                'precision': precision,
                'recall': recall,
                'f1_measure': f1_measure}

# Training

In [28]:
from allennlp.training.metrics import CategoricalAccuracy, F1Measure
from allennlp.training.trainer import GradientDescentTrainer

from allennlp.nn.util import get_text_field_mask

model = LstmClassifier(word_embeddings, encoder, vocab)

In [21]:
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

In [26]:
from allennlp.data import DataLoader
from allennlp.data.samplers import BucketBatchSampler

train_dataset.index_with(vocab)
dev_dataset.index_with(vocab)

train_data_loader = DataLoader(train_dataset,
                         batch_sampler=BucketBatchSampler(
                             train_dataset,
                             batch_size=32,
                             sorting_keys=["tokens"]))
dev_data_loader = DataLoader(dev_dataset,
                         batch_sampler=BucketBatchSampler(
                             dev_dataset,
                             batch_size=32,
                             sorting_keys=["tokens"]))

In [29]:
trainer = GradientDescentTrainer(
    model=model,
    optimizer=optimizer,
    data_loader=train_data_loader,
    validation_data_loader=dev_data_loader,
    patience=10,
    num_epochs=20)

trainer.train()

unable to check gpu_memory_mb() due to occasional failure, continuing
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/allennlp/common/util.py", line 415, in gpu_memory_mb
    encoding="utf-8",
  File "/usr/lib/python3.7/subprocess.py", line 411, in check_output
    **kwargs).stdout
  File "/usr/lib/python3.7/subprocess.py", line 512, in run
    output=stdout, stderr=stderr)
subprocess.CalledProcessError: Command '['nvidia-smi', '--query-gpu=memory.used', '--format=csv,nounits,noheader']' returned non-zero exit status 9.

  0%|          | 0/267 [00:00<?, ?it/s][A
accuracy: 0.2500, precision: 0.0000, recall: 0.0000, f1_measure: 0.0000, loss: 1.6116, reg_loss: 0.0000 ||:   0%|          | 1/267 [00:00<02:00,  2.20it/s][A
accuracy: 0.2500, precision: 0.0000, recall: 0.0000, f1_measure: 0.0000, loss: 1.6098, reg_loss: 0.0000 ||:   1%|          | 3/267 [00:00<01:30,  2.92it/s][A
accuracy: 0.2625, precision: 0.0000, recall: 0.0000, f1_measure: 0.0000, loss:

{'best_epoch': 7,
 'best_validation_accuracy': 0.329700272479564,
 'best_validation_f1_measure': 0.36086955666542053,
 'best_validation_loss': 1.5126733166830881,
 'best_validation_precision': 0.28135591745376587,
 'best_validation_recall': 0.5030303001403809,
 'best_validation_reg_loss': 0.0,
 'epoch': 16,
 'peak_worker_0_memory_MB': 714.38,
 'training_accuracy': 0.7621722846441947,
 'training_duration': '0:04:36.212998',
 'training_epochs': 16,
 'training_f1_measure': 0.8233962059020996,
 'training_loss': 0.7178696200865485,
 'training_precision': 0.8010278940200806,
 'training_recall': 0.8470497131347656,
 'training_reg_loss': 0.0,
 'training_start_epoch': 0,
 'training_worker_0_memory_MB': 714.364,
 'validation_accuracy': 0.3369663941871026,
 'validation_f1_measure': 0.3664596378803253,
 'validation_loss': 1.8222380706242152,
 'validation_precision': 0.3757961690425873,
 'validation_recall': 0.35757574439048767,
 'validation_reg_loss': 0.0}