In [None]:
import logging
# Grab the root logger and lower its threshold to DEBUG so that messages
# emitted by libraries through the logging module are shown in the notebook.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

from collections import Counter

In [2]:
import pickle
import torch
import torch.optim as optim

In [3]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


In [4]:
if device.type == 'cuda':
    # Report the name of GPU 0 and how much of its memory this process holds.
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    # Values are reported in bytes by PyTorch; convert to GiB for display.
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    # torch.cuda.memory_cached() is deprecated; memory_reserved() is its
    # replacement. The printed label is kept unchanged.
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Tesla K80
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [5]:
import csv
import numpy as np
from typing import Dict, List, Tuple

from allennlp.common.file_utils import cached_path
from allennlp.data import DataLoader
from allennlp.data.fields import TextField, SequenceLabelField
from allennlp.data.instance import Instance
from allennlp.data.samplers import BucketBatchSampler
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers.token import Token
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.training.metrics import CategoricalAccuracy, SpanBasedF1Measure
from allennlp.training.trainer import GradientDescentTrainer
from allennlp.data.dataset_readers.dataset_reader import DatasetReader, AllennlpDataset
from overrides import overrides

In [6]:
# Dimensionality of the token embeddings; also used as the LSTM input size.
EMBEDDING_SIZE = 128
# Hidden size passed to the torch.nn.LSTM encoder.
HIDDEN_SIZE = 128

In [7]:
class NERDatasetReader(DatasetReader):
    """Dataset reader for a CSV-formatted NER corpus, yielding one instance per sentence.

    Every data row is expected to have four columns. The second column is used
    as the token and the fourth as its label (see ``_convert_sentence``). A row
    whose first column is non-empty marks the first token of a new sentence.
    """

    def __init__(self, token_indexers: Dict[str, TokenIndexer]=None, lazy=False):
        """
        Args:
            token_indexers: Indexers applied to the token text field. Defaults to
                a single ``SingleIdTokenIndexer`` registered under ``'tokens'``.
            lazy: Forwarded unchanged to ``DatasetReader.__init__``.
        """
        super().__init__(lazy=lazy)
        self.token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}

    @overrides
    def text_to_instance(self, tokens: List[Token], labels: List[str]=None):
        """Builds an ``Instance`` with a ``'tokens'`` field and, if given, a ``'labels'`` field.

        Args:
            tokens: Tokens of one sentence.
            labels: One label per token. When omitted (or empty), the instance
                carries no ``'labels'`` field.

        Returns:
            The assembled ``Instance``.
        """
        fields = {}

        text_field = TextField(tokens, self.token_indexers)
        fields['tokens'] = text_field
        if labels:
            fields['labels'] = SequenceLabelField(labels, text_field)

        return Instance(fields)

    def _convert_sentence(self, rows: List[Tuple[str]]) -> Tuple[List[Token], List[str]]:
        """Given a list of rows, returns tokens and labels."""
        # Each row unpacks into four columns; only the 2nd (token) and the
        # 4th (label) are used.
        _, tokens, _, labels = zip(*rows)
        tokens = [Token(t) for t in tokens]

        # NOTE: the original dataset seems to confuse gpe with geo, and the distinction
        # seems arbitrary. Here we replace both with 'gpe'
        labels = [label.replace('geo', 'gpe') for label in labels]
        return tokens, labels

    def _rows_to_instance(self, rows: List[Tuple[str]]):
        """Converts the buffered rows of one sentence into an ``Instance``."""
        tokens, labels = self._convert_sentence(rows)
        return self.text_to_instance(tokens, labels)

    @overrides
    def _read(self, file_path: str):
        """Yields one ``Instance`` per sentence found in the CSV at ``file_path``.

        Args:
            file_path: Path or URL of the CSV file; resolved through ``cached_path``.
        """
        file_path = cached_path(file_path)
        sentence = []
        with open(file_path, mode='r', encoding='utf-8', errors='ignore') as csv_file:
            # Skip the header line. The default keeps an empty file from raising
            # StopIteration inside this generator (which would become RuntimeError).
            next(csv_file, None)
            reader = csv.reader(csv_file)

            for row in reader:
                # csv.reader yields [] for blank lines; skip them rather than
                # failing on row[0].
                if not row:
                    continue

                # A non-empty first column starts a new sentence: flush the
                # rows buffered so far before starting the next buffer.
                if row[0] and sentence:
                    yield self._rows_to_instance(sentence)

                    sentence = [row]
                else:
                    sentence.append(row)

            # Flush the final sentence, which has no following marker row.
            if sentence:
                yield self._rows_to_instance(sentence)

In [8]:
class LstmTagger(Model):
    """Sequence tagger: embed the tokens, encode them, then project every position to label logits."""

    def __init__(self,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 vocab: Vocabulary) -> None:
        """
        Args:
            embedder: Turns the indexed token field into embeddings.
            encoder: Sequence-to-sequence encoder applied to the embeddings.
            vocab: Vocabulary; its ``'labels'`` namespace sizes the output layer.
        """
        super().__init__(vocab)
        self.embedder = embedder
        self.encoder = encoder

        # Linear projection from the encoder output to one score per label.
        encoder_dim = encoder.get_output_dim()
        num_labels = vocab.get_vocab_size('labels')
        self.hidden2labels = torch.nn.Linear(in_features=encoder_dim,
                                             out_features=num_labels)

        # Metrics updated on every forward pass that receives gold labels.
        self.accuracy = CategoricalAccuracy()
        self.f1 = SpanBasedF1Measure(vocab, tag_namespace='labels')

    def forward(self,
                tokens: Dict[str, torch.Tensor],
                labels: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        """Computes label logits and, when ``labels`` is given, the loss.

        Returns:
            A dict with ``'logits'``; ``'loss'`` is added only when ``labels``
            is not ``None``, in which case both metrics are updated as well.
        """
        mask = get_text_field_mask(tokens)
        encoded = self.encoder(self.embedder(tokens), mask)
        logits = self.hidden2labels(encoded)

        if labels is None:
            return {'logits': logits}

        for metric in (self.accuracy, self.f1):
            metric(logits, labels, mask)
        loss = sequence_cross_entropy_with_logits(logits, labels, mask)
        return {'logits': logits, 'loss': loss}

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Returns accuracy plus overall span precision, recall and F1.

        Args:
            reset: Passed through to both metrics' ``get_metric``.
        """
        span_scores = self.f1.get_metric(reset)
        metrics = {'accuracy': self.accuracy.get_metric(reset)}
        metrics['prec'] = span_scores['precision-overall']
        metrics['rec'] = span_scores['recall-overall']
        metrics['f1'] = span_scores['f1-measure-overall']
        return metrics

In [9]:
# Read the entity-annotated corpus CSV from this URL with the default
# NERDatasetReader configuration.
reader = NERDatasetReader()
dataset = reader.read('https://s3.amazonaws.com/realworldnlpbook/data/entity-annotated-corpus/ner_dataset.csv')

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [10]:
def predict(tokens: List[str], model: LstmTagger) -> List[str]:
    """Tags a tokenized sentence with ``model`` and returns one label string per token.

    Args:
        tokens: The words of a single sentence.
        model: Tagger whose vocabulary maps label ids back to strings.

    Returns:
        The highest-scoring label for each input token, in order.
    """
    indexers = {'tokens': SingleIdTokenIndexer()}
    text_field = TextField([Token(word) for word in tokens], indexers)
    output = model.forward_on_instance(Instance({'tokens': text_field}))

    # Pick the best-scoring label id along axis 1 of the logits.
    best_ids = np.argmax(output['logits'], axis=1)

    decoded = []
    for best_id in best_ids:
        decoded.append(model.vocab.get_token_from_index(best_id, 'labels'))
    return decoded

In [11]:
# Hold out every instance whose index is divisible by 10 as the dev set;
# all remaining instances form the training set.
train_dataset = AllennlpDataset([inst for i, inst in enumerate(dataset) if i % 10 != 0])
dev_dataset = AllennlpDataset([inst for i, inst in enumerate(dataset) if i % 10 == 0])

In [12]:
# Build the vocabulary from both splits, so entries that appear only in the
# dev set are included as well.
vocab = Vocabulary.from_instances(train_dataset + dev_dataset)

HBox(children=(FloatProgress(value=0.0, max=47959.0), HTML(value='')))




In [13]:
# Embedding table sized to the 'tokens' vocabulary namespace, with
# EMBEDDING_SIZE dimensions per entry.
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_SIZE)
# Register it under the "tokens" key, the same name used for the token
# indexer in the dataset reader.
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

In [14]:
# Bidirectional, batch-first LSTM taking EMBEDDING_SIZE-dimensional inputs,
# wrapped so it can be passed to LstmTagger as its encoder.
lstm = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(EMBEDDING_SIZE, HIDDEN_SIZE, bidirectional=True, batch_first=True))

In [15]:
# Assemble the tagger from the embedder, the encoder and the vocabulary.
model = LstmTagger(word_embeddings, lstm, vocab)

# Attach the vocabulary to both splits so their instances can be indexed.
train_dataset.index_with(vocab)
dev_dataset.index_with(vocab)


def _bucketed_loader(split, batch_size=16):
    """Returns a DataLoader over ``split`` that batches via a BucketBatchSampler keyed on "tokens"."""
    sampler = BucketBatchSampler(split,
                                 batch_size=batch_size,
                                 sorting_keys=["tokens"])
    return DataLoader(split, batch_sampler=sampler)


train_data_loader = _bucketed_loader(train_dataset)
dev_data_loader = _bucketed_loader(dev_dataset)

# Adam with its default hyper-parameters over all model parameters.
optimizer = optim.Adam(model.parameters())



In [16]:
# Train the tagger on the training loader, with the dev loader supplied for
# validation.
# NOTE(review): patience equals num_epochs, so early stopping presumably
# never triggers within this run — confirm that this is intended.
# NOTE(review): no device is passed here and the model is never moved to
# `device` in this notebook; confirm whether GPU training was intended.
trainer = GradientDescentTrainer(
    model=model,
    optimizer=optimizer,
    data_loader=train_data_loader,
    validation_data_loader=dev_data_loader,
    patience=10,
    num_epochs=10)

trainer.train()

HBox(children=(FloatProgress(value=0.0, max=2698.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2698.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2698.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2698.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2698.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2698.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2698.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2698.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2698.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2698.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=300.0), HTML(value='')))




{'best_epoch': 1,
 'peak_worker_0_memory_MB': 924.472,
 'peak_gpu_0_memory_MB': 11,
 'training_duration': '0:29:12.683499',
 'training_start_epoch': 0,
 'training_epochs': 9,
 'epoch': 9,
 'training_accuracy': 0.9946135223474193,
 'training_prec': 0.9599348051563195,
 'training_rec': 0.9646228522080939,
 'training_f1': 0.9622731188544787,
 'training_loss': 0.014742819040041922,
 'training_reg_loss': 0.0,
 'training_worker_0_memory_MB': 924.472,
 'training_gpu_0_memory_MB': 11,
 'validation_accuracy': 0.9658073140456411,
 'validation_prec': 0.7929292929292929,
 'validation_rec': 0.8168281306063868,
 'validation_f1': 0.8047013078825941,
 'validation_loss': 0.14975996050285176,
 'validation_reg_loss': 0.0,
 'best_validation_accuracy': 0.9706674305356632,
 'best_validation_prec': 0.8318417755503849,
 'best_validation_rec': 0.8337818442770003,
 'best_validation_f1': 0.8328106800465408,
 'best_validation_loss': 0.0901502693273748,
 'best_validation_reg_loss': 0.0}

In [23]:
# Tag a sample sentence with the trained model and print each word as
# token/label, separated by spaces.
tokens = ["Amazon", "has", "built", "Kindle", "to", "rule", "the", "market", "in", "the", "US"]
labels = predict(tokens, model)
print(' '.join('{}/{}'.format(token, label) for token, label in zip(tokens, labels)))

Amazon/B-org has/O built/O Kindle/O to/O rule/O the/O market/O in/O the/O US/B-gpe
