In [0]:
#!pip install allennlp==1.0.0rc5
#!pip install allennlp-models==1.0.0rc5
#!pip freeze | grep allennlp

In [0]:
import torch
# Device index handed to AllenNLP: GPU 0 when CUDA is usable, otherwise -1.
cuda_device = 0 if torch.cuda.is_available() else -1

In [0]:
from allennlp.data.dataset_readers import Conll2003DatasetReader
from allennlp.data.token_indexers import PretrainedTransformerMismatchedIndexer
from allennlp.data.vocabulary import Vocabulary

# Name of the pretrained transformer checkpoint; the same constant is passed
# to the token embedder later in the notebook so indexer and embedder agree.
BERT_MODEL = 'bert-base-cased'
# Token indexer built for that checkpoint; handed to the dataset readers below.
indexer = PretrainedTransformerMismatchedIndexer(model_name=BERT_MODEL)

In [0]:
from typing import Dict, List, Sequence, Iterable
import itertools
import logging

from overrides import overrides

from allennlp.common.checks import ConfigurationError
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.dataset_readers.dataset_utils import to_bioul
from allennlp.data.fields import TextField, SequenceLabelField, Field, MetadataField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

logger = logging.getLogger(__name__)


def _is_divider(line: str) -> bool:
    empty_line = line.strip() == ""
    if empty_line:
        return True
    else:
        first_token = line.split()[0]
        if first_token == "-DOCSTART-":
            return True
        else:
            return False

class GeniaDatasetReader(Conll2003DatasetReader):
    """Dataset reader for two-column files: one ``token tag`` pair per line.

    Sentences are separated by divider lines (blank lines or ``-DOCSTART-``
    lines, as decided by `_is_divider`). Everything except `_read` is
    inherited from `Conll2003DatasetReader`.
    """

    def _read(self, file_path: str) -> Iterable[Instance]:
        """Yield one `Instance` per sentence found in `file_path`.

        # Parameters

        file_path : `str`
            Local path or URL of the data file; URLs are resolved through
            `cached_path`.

        # Raises

        ValueError
            If a sentence chunk does not have exactly two whitespace-separated
            columns (the unpacking into tokens and tags fails).
        """
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)

            # Group into alternative divider / sentence chunks.
            for is_divider, lines in itertools.groupby(data_file, _is_divider):
                # Ignore the divider chunks, so that `lines` corresponds to the words
                # of a single sentence.
                if is_divider:
                    continue

                rows = [line.strip().split() for line in lines]
                # unzipping trick returns tuples, but our Fields need lists
                columns = [list(column) for column in zip(*rows)]
                tokens_, ner_tags = columns
                # TextField requires `Token` objects
                tokens = [Token(token) for token in tokens_]

                # The tags must be passed by keyword: the parent's
                # `text_to_instance` takes `pos_tags` and `chunk_tags` before
                # `ner_tags`, so a positional second argument would be treated
                # as POS tags and the instance would carry no "tags" field.
                yield self.text_to_instance(tokens, ner_tags=ner_tags)



In [5]:
# Which corpus to train on: 'conll' (CoNLL-2003 files) or 'genia' (two-column IOB2 files).
dataset_type = 'conll'

# Both branches share the same token indexer under the 'tokens' key.
# NOTE(review): in the 'conll' branch `dev_dataset` is read from 'test.txt' —
# confirm that evaluating on the test split during training is intended.
if dataset_type == 'conll':
    reader = Conll2003DatasetReader(token_indexers={'tokens': indexer})
    train_dataset = reader.read('train.txt')
    dev_dataset = reader.read('test.txt')
elif dataset_type == 'genia':
    reader = GeniaDatasetReader(token_indexers={'tokens': indexer})
    train_dataset = reader.read('./Genia4ERtask1.iob2')
    dev_dataset = reader.read('./Genia4EReval1.iob2')
else:
    # Fail here instead of with a NameError on `train_dataset` in a later cell.
    raise ValueError(
        f"Unknown dataset_type {dataset_type!r}; expected 'conll' or 'genia'"
    )

14041it [00:01, 10837.25it/s]
3453it [00:00, 8314.39it/s]


In [6]:
# Build the vocabulary from the training instances only; the dev set is not consulted.
vocab = Vocabulary.from_instances(train_dataset.instances)

100%|██████████| 14041/14041 [00:00<00:00, 109655.47it/s]


In [0]:
# Point both datasets at the shared vocabulary built from the training data.
train_dataset.index_with(vocab)
dev_dataset.index_with(vocab)

In [0]:
from allennlp.modules.token_embedders import PretrainedTransformerMismatchedEmbedder

# Transformer embedder created from the same checkpoint name (BERT_MODEL) as the indexer.
embedder = PretrainedTransformerMismatchedEmbedder(model_name=BERT_MODEL)

In [0]:
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder

# Register the embedder under 'tokens', the same key used for `token_indexers`
# when the dataset reader was constructed.
text_field_embedder = BasicTextFieldEmbedder({'tokens': embedder})

In [0]:
from allennlp.models import SimpleTagger
from allennlp.modules.seq2seq_encoders import PassThroughEncoder
from allennlp_models.tagging import CrfTagger

# Pass-through encoder whose input dimension matches the embedder's output dimension.
seq2seq_encoder = PassThroughEncoder(input_dim=embedder.get_output_dim())

tagger = CrfTagger(
    text_field_embedder=text_field_embedder,
    vocab=vocab,
    encoder=seq2seq_encoder,
    calculate_span_f1=True,
    label_encoding='IOB1',
    dropout=0.2,
)

# `cuda_device` is -1 when no GPU is available. Calling `.cuda()` in that case
# raises, so only move the model when a real device index was selected; on a
# CPU-only machine the model simply stays where it was created.
if cuda_device >= 0:
    tagger = tagger.cuda(device=cuda_device)

In [11]:
import torch.optim as optim
from allennlp.training.learning_rate_schedulers import ReduceOnPlateauLearningRateScheduler

from allennlp.data.dataloader import DataLoader
from allennlp.training import GradientDescentTrainer

# Training hyper-parameters.
NUM_EPOCHS = 1
N_MICRO_BATCH = 1  # forwarded to the trainer as num_gradient_accumulation_steps

optimizer = optim.Adam(tagger.parameters(), lr=1e-5)
# Shuffle the training set every epoch so batches are not drawn in file order;
# the validation loader keeps its fixed order.
train_data_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_data_loader = DataLoader(dataset=dev_dataset, batch_size=100)
lr_scheduler = ReduceOnPlateauLearningRateScheduler(optimizer, patience=1)

# NOTE(review): with NUM_EPOCHS = 1 neither the scheduler patience nor the
# trainer's early-stopping patience can take effect; they matter only once
# NUM_EPOCHS is raised.
trainer = GradientDescentTrainer(
    model=tagger,
    optimizer=optimizer,
    data_loader=train_data_loader,
    validation_data_loader=val_data_loader,
    num_epochs=NUM_EPOCHS,
    cuda_device=cuda_device,
    learning_rate_scheduler=lr_scheduler,
    patience=5,
    num_gradient_accumulation_steps=N_MICRO_BATCH)

metrics = trainer.train()

accuracy: 0.9295, accuracy3: 0.9738, precision-overall: 0.6855, recall-overall: 0.6373, f1-measure-overall: 0.6606, loss: 110.1791, reg_loss: 0.0000 ||: 100%|██████████| 439/439 [05:09<00:00,  1.42it/s]
accuracy: 0.9772, accuracy3: 0.9928, precision-overall: 0.8656, recall-overall: 0.8780, f1-measure-overall: 0.8718, loss: 107.6890, reg_loss: 0.0000 ||: 100%|██████████| 35/35 [00:37<00:00,  1.07s/it]


In [0]:
from allennlp.predictors import SentenceTaggerPredictor
import numpy as np
from seqeval.metrics import f1_score

def predict_from_instance(model, reader, data, evaluate = False):
    """Tag a single instance with `model` and optionally score it.

    Parameters
    ----------
    model :
        Trained tagging model; its `vocab` is used to map label ids back to
        tag strings (namespace ``'labels'``).
    reader :
        Dataset reader passed to `SentenceTaggerPredictor`.
    data :
        The instance to tag. When `evaluate` is true it must also carry a
        ``'tags'`` field holding the gold labels.
    evaluate : bool
        If true, compute the seqeval F1 of the prediction against the gold tags.

    Returns
    -------
    dict
        ``{'predicted_tags': [...], 'f1-score': float or None}``; the score is
        ``None`` when `evaluate` is false.
    """
    predictor = SentenceTaggerPredictor(model, reader)
    # Run the model on the instance itself. Re-joining the tokens into a string
    # and calling `predict` would re-tokenize the sentence, which can change the
    # number of tokens and misalign predictions with the gold labels.
    tag_logits = predictor.predict_instance(data)['logits']

    # NOTE(review): argmax over per-token logits ignores any transition scores a
    # CRF layer would apply — confirm this is the intended decoding.
    tag_ids = np.argmax(tag_logits, axis=-1)
    # Use the vocabulary of the model that was passed in, not a global model.
    predicted_tags = [model.vocab.get_token_from_index(int(i), 'labels') for i in tag_ids]

    score = None
    if evaluate:
        gold_tags = list(data['tags'].labels)
        # seqeval expects (y_true, y_pred), each given as a list of sequences.
        score = f1_score([gold_tags], [predicted_tags])
    return {'predicted_tags': predicted_tags, 'f1-score': score}