import logging
from typing import Dict, List, Iterable, Tuple, Any
from overrides import overrides
from pytorch_pretrained_bert.tokenization import BertTokenizer
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import Field, TextField, SequenceLabelField, MetadataField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.dataset_readers.dataset_utils import Ontonotes, OntonotesSentence
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
def _convert_tags_to_wordpiece_tags(tags: List[str], offsets: List[int]) -> List[str]:
"""
Converts a series of BIO tags to account for a wordpiece tokenizer,
extending/modifying BIO tags where appropriate to deal with words which
are split into multiple wordpieces by the tokenizer.
This is only used if you pass a `bert_model_name` to the dataset reader below.
Parameters
----------
tags : `List[str]`
        The BIO formatted tags for the original tokens, to be converted to wordpiece-level BIO tags.
offsets : `List[int]`
The wordpiece offsets.
Returns
-------
The new BIO tags.
"""
    # Account for the fact that the offsets are with respect to the
    # additional [CLS] token at the start.
offsets = [x - 1 for x in offsets]
new_tags = []
j = 0
for i, offset in enumerate(offsets):
tag = tags[i]
is_o = tag == "O"
is_start = True
while j < offset:
if is_o:
new_tags.append("O")
elif tag.startswith("I"):
new_tags.append(tag)
elif is_start and tag.startswith("B"):
new_tags.append(tag)
is_start = False
elif tag.startswith("B"):
_, label = tag.split("-", 1)
new_tags.append("I-" + label)
j += 1
# Add O tags for cls and sep tokens.
return ["O"] + new_tags + ["O"]
def _convert_verb_indices_to_wordpiece_indices(verb_indices: List[int], offsets: List[int]): # pylint: disable=invalid-name
"""
    Converts binary verb indicators to account for a wordpiece tokenizer,
    extending/modifying the indicators where appropriate to deal with words
    which are split into multiple wordpieces by the tokenizer.
This is only used if you pass a `bert_model_name` to the dataset reader below.
Parameters
----------
verb_indices : `List[int]`
The binary verb indicators, 0 for not a verb, 1 for verb.
offsets : `List[int]`
The wordpiece offsets.
Returns
-------
The new verb indices.
"""
    # Account for the fact that the offsets are with respect to the
    # additional [CLS] token at the start.
offsets = [x - 1 for x in offsets]
j = 0
new_verb_indices = []
for i, offset in enumerate(offsets):
indicator = verb_indices[i]
while j < offset:
new_verb_indices.append(indicator)
j += 1
# Add 0 indicators for cls and sep tokens.
return [0] + new_verb_indices + [0]
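
# An illustrative sketch (not part of the original file), using the same offsets
# as in the example above, with "loves" marked as the verb:
#
#     >>> _convert_verb_indices_to_wordpiece_indices([0, 1, 0], [2, 3, 6])
#     [0, 0, 1, 0, 0, 0, 0]
#
# The indicator for each word is simply repeated over all of its wordpieces,
# with 0 indicators added for [CLS] and [SEP].
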
@DatasetReader.register("srl")
class SrlReader(DatasetReader):
"""
This DatasetReader is designed to read in the English OntoNotes v5.0 data
for semantic role labelling. It returns a dataset of instances with the
following fields:
tokens : ``TextField``
The tokens in the sentence.
verb_indicator : ``SequenceLabelField``
A sequence of binary indicators for whether the word is the verb for this frame.
tags : ``SequenceLabelField``
A sequence of Propbank tags for the given verb in a BIO format.
Parameters
----------
    token_indexers : ``Dict[str, TokenIndexer]``, optional
        We use this to define the input representation for the text. See :class:`TokenIndexer`.
        Default is ``{"tokens": SingleIdTokenIndexer()}``.
    domain_identifier : ``str``, (default = None)
A string denoting a sub-domain of the Ontonotes 5.0 dataset to use. If present, only
conll files under paths containing this domain identifier will be processed.
bert_model_name : ``Optional[str]``, (default = None)
The BERT model to be wrapped. If you specify a bert_model here, then we will
assume you want to use BERT throughout; we will use the bert tokenizer,
and will expand your tags and verb indicators accordingly. If not,
the tokens will be indexed as normal with the token_indexers.
Returns
-------
A ``Dataset`` of ``Instances`` for Semantic Role Labelling.
"""
def __init__(self,
token_indexers: Dict[str, TokenIndexer] = None,
domain_identifier: str = None,
lazy: bool = False,
bert_model_name: str = None) -> None:
super().__init__(lazy)
self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
self._domain_identifier = domain_identifier
if bert_model_name is not None:
self.bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name)
self.lowercase_input = "uncased" in bert_model_name
else:
self.bert_tokenizer = None
self.lowercase_input = False
def _wordpiece_tokenize_input(self, tokens: List[str]) -> Tuple[List[str], List[int]]:
"""
Convert a list of tokens to wordpiece tokens and offsets, as well as adding
        BERT CLS and SEP tokens to the beginning and end of the sentence.
"""
word_piece_tokens: List[str] = []
offsets = []
cumulative = 0
for token in tokens:
if self.lowercase_input:
token = token.lower()
word_pieces = self.bert_tokenizer.wordpiece_tokenizer.tokenize(token)
cumulative += len(word_pieces)
offsets.append(cumulative)
word_piece_tokens.extend(word_pieces)
wordpieces = ["[CLS]"] + word_piece_tokens + ["[SEP]"]
offsets = [x + 1 for x in offsets]
return wordpieces, offsets
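
    # An illustrative sketch (not part of the original file): assuming the reader
    # was constructed with bert_model_name="bert-base-uncased" and the tokenizer
    # splits "wordpieces" into something like ["word", "##piece", "##s"], then
    # self._wordpiece_tokenize_input(["Bert", "loves", "wordpieces"]) would return
    #
    #     (["[CLS]", "bert", "loves", "word", "##piece", "##s", "[SEP]"], [2, 3, 6])
    #
    # i.e. each offset is the cumulative wordpiece count for the corresponding
    # original token, shifted by one to account for the leading [CLS].
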
@overrides
def _read(self, file_path: str):
# if `file_path` is a URL, redirect to the cache
file_path = cached_path(file_path)
ontonotes_reader = Ontonotes()
logger.info("Reading SRL instances from dataset files at: %s", file_path)
if self._domain_identifier is not None:
logger.info("Filtering to only include file paths containing the %s domain", self._domain_identifier)
for sentence in self._ontonotes_subset(ontonotes_reader, file_path, self._domain_identifier):
tokens = [Token(t) for t in sentence.words]
if not sentence.srl_frames:
# Sentence contains no predicates.
tags = ["O" for _ in tokens]
verb_label = [0 for _ in tokens]
yield self.text_to_instance(tokens, verb_label, tags)
else:
for (_, tags) in sentence.srl_frames:
verb_indicator = [1 if label[-2:] == "-V" else 0 for label in tags]
yield self.text_to_instance(tokens, verb_indicator, tags)
@staticmethod
def _ontonotes_subset(ontonotes_reader: Ontonotes,
file_path: str,
domain_identifier: str) -> Iterable[OntonotesSentence]:
"""
Iterates over the Ontonotes 5.0 dataset using an optional domain identifier.
If the domain identifier is present, only examples which contain the domain
identifier in the file path are yielded.
"""
for conll_file in ontonotes_reader.dataset_path_iterator(file_path):
if domain_identifier is None or f"/{domain_identifier}/" in conll_file:
yield from ontonotes_reader.sentence_iterator(conll_file)
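
    # For illustration (hypothetical paths, not part of the original file): with
    # domain_identifier="bc", a file such as
    # ".../annotations/bc/cctv/00/cctv_0001.gold_conll" would be yielded because
    # its path contains "/bc/", while files under other sub-domains (e.g. "/nw/",
    # "/wb/") would be skipped.
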
def text_to_instance(self, # type: ignore
tokens: List[Token],
verb_label: List[int],
tags: List[str] = None) -> Instance:
"""
We take `pre-tokenized` input here, along with a verb label. The verb label should be a
one-hot binary vector, the same length as the tokens, indicating the position of the verb
to find arguments for.
"""
# pylint: disable=arguments-differ
metadata_dict: Dict[str, Any] = {}
if self.bert_tokenizer is not None:
wordpieces, offsets = self._wordpiece_tokenize_input([t.text for t in tokens])
new_verbs = _convert_verb_indices_to_wordpiece_indices(verb_label, offsets)
metadata_dict["offsets"] = offsets
# In order to override the indexing mechanism, we need to set the `text_id`
# attribute directly. This causes the indexing to use this id.
text_field = TextField([Token(t, text_id=self.bert_tokenizer.vocab[t]) for t in wordpieces],
token_indexers=self._token_indexers)
verb_indicator = SequenceLabelField(new_verbs, text_field)
else:
text_field = TextField(tokens, token_indexers=self._token_indexers)
verb_indicator = SequenceLabelField(verb_label, text_field)
fields: Dict[str, Field] = {}
fields['tokens'] = text_field
fields['verb_indicator'] = verb_indicator
if all([x == 0 for x in verb_label]):
verb = None
verb_index = None
else:
verb_index = verb_label.index(1)
verb = tokens[verb_index].text
metadata_dict["words"] = [x.text for x in tokens]
metadata_dict["verb"] = verb
metadata_dict["verb_index"] = verb_index
if tags:
if self.bert_tokenizer is not None:
new_tags = _convert_tags_to_wordpiece_tags(tags, offsets)
fields['tags'] = SequenceLabelField(new_tags, text_field)
else:
fields['tags'] = SequenceLabelField(tags, text_field)
metadata_dict["gold_tags"] = tags
fields["metadata"] = MetadataField(metadata_dict)
return Instance(fields)
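
# A minimal usage sketch (assumptions: a "bert-base-uncased" model/vocabulary is
# available to the BERT tokenizer, and the example sentence, verb label and tags
# are made up; not part of the original file):
#
#     reader = SrlReader(bert_model_name="bert-base-uncased")
#     instance = reader.text_to_instance(
#         tokens=[Token(t) for t in ["Bert", "loves", "wordpieces"]],
#         verb_label=[0, 1, 0],
#         tags=["B-ARG0", "B-V", "B-ARG1"],
#     )
#     # instance.fields now holds "tokens", "verb_indicator", "tags" and "metadata".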