In [1]:
import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

from collections import Counter

In [2]:
import pickle
import torch
import torch.optim as optim

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


In [4]:
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')

Tesla K80
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [5]:
from typing import Dict

import numpy as np
import torch
import torch.optim as optim


from allennlp.data import DataLoader
from allennlp.data.samplers import BucketBatchSampler
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.training.trainer import GradientDescentTrainer
from allennlp_models.structured_prediction.dataset_readers.universal_dependencies import UniversalDependenciesDatasetReader

from predictor import UniversalPOSPredictor

In [6]:
EMBEDDING_SIZE = 128
HIDDEN_SIZE = 128

In [7]:
class LstmTagger(Model):
    def __init__(self,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.embedder = embedder
        self.encoder = encoder
        self.linear = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                      out_features=vocab.get_vocab_size('pos'))
        self.accuracy = CategoricalAccuracy()

    def forward(self,
                words: Dict[str, torch.Tensor],
                pos_tags: torch.Tensor = None,
                **args) -> Dict[str, torch.Tensor]:
        mask = get_text_field_mask(words)

        embeddings = self.embedder(words)
        encoder_out = self.encoder(embeddings, mask)
        tag_logits = self.linear(encoder_out)

        output = {"tag_logits": tag_logits}
        if pos_tags is not None:
            self.accuracy(tag_logits, pos_tags, mask)
            output["loss"] = sequence_cross_entropy_with_logits(tag_logits, pos_tags, mask)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {"accuracy": self.accuracy.get_metric(reset)}

In [8]:
reader = UniversalDependenciesDatasetReader()

train_dataset = reader.read('https://s3.amazonaws.com/realworldnlpbook/data/ud-treebanks-v2.3/UD_English-EWT/en_ewt-ud-train.conllu')
dev_dataset = reader.read('https://s3.amazonaws.com/realworldnlpbook/data/ud-treebanks-v2.3/UD_English-EWT/en_ewt-ud-dev.conllu')

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Your label namespace was 'pos'. We recommend you use a namespace ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by default to your vocabulary.  See documentation for `non_padded_namespaces` parameter in Vocabulary.





HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [9]:
vocab = Vocabulary.from_instances(train_dataset + dev_dataset)

train_dataset.index_with(vocab)
dev_dataset.index_with(vocab)

HBox(children=(FloatProgress(value=0.0, max=14545.0), HTML(value='')))




In [10]:
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_SIZE)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

In [20]:
encoder = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(EMBEDDING_SIZE, HIDDEN_SIZE, bidirectional=True, batch_first=True))


In [21]:
model = LstmTagger(word_embeddings, encoder, vocab) 
optimizer = optim.Adam(model.parameters())

train_dataset.index_with(vocab)
dev_dataset.index_with(vocab)

# iterator = BucketIterator(batch_size=16, sorting_keys=[("words", "num_tokens")])
train_data_loader = DataLoader(
    train_dataset,
    batch_sampler=BucketBatchSampler(
        train_dataset,
        batch_size=32,
        sorting_keys=["words"]
    )
)

dev_data_loader = DataLoader(
    train_dataset,
    batch_sampler=BucketBatchSampler(
        dev_dataset,
        batch_size=32,
        sorting_keys=["words"]
    )
)

In [22]:
trainer = GradientDescentTrainer(
    model=model,
    optimizer=optimizer,
    data_loader=train_data_loader,
    validation_data_loader=dev_data_loader,
    patience=10,
    num_epochs=10)
trainer.train()

HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=63.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=63.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=63.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=63.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=63.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=63.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=63.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=63.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=63.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=392.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=63.0), HTML(value='')))




{'best_epoch': 9,
 'peak_worker_0_memory_MB': 789.272,
 'peak_gpu_0_memory_MB': 11,
 'training_duration': '0:03:25.387396',
 'training_start_epoch': 0,
 'training_epochs': 9,
 'epoch': 9,
 'training_accuracy': 0.9983038834714177,
 'training_loss': 0.00563032939747672,
 'training_reg_loss': 0.0,
 'training_worker_0_memory_MB': 789.272,
 'training_gpu_0_memory_MB': 11,
 'validation_accuracy': 0.9993721274832358,
 'validation_loss': 0.0035438241939696057,
 'validation_reg_loss': 0.0,
 'best_validation_accuracy': 0.9993721274832358,
 'best_validation_loss': 0.0035438241939696057,
 'best_validation_reg_loss': 0.0}

In [23]:
predictor = UniversalPOSPredictor(model, reader)
tokens = ['What', 'do', 'you', 'think', '?']
logits = predictor.predict(tokens)['tag_logits']
tag_ids = np.argmax(logits, axis=-1)

[vocab.get_token_from_index(tag_id, 'pos') for tag_id in tag_ids]

['PRON', 'AUX', 'PRON', 'VERB', 'PUNCT']

In [28]:
print(len(logits))

5


In [26]:
vocab.get_vocab_size('pos')

19

In [14]:
vocab

Vocabulary with namespaces:  tokens, Size: 21337 || pos, Size: 19 || head_tags, Size: 49 || Non Padded Namespaces: {'*tags', '*labels'}

In [17]:
vocab.get_vocab_size('head_tags')

49

In [19]:
vocab.save_to_files("overthere")