From 76a137ae7be37094c53dff0cf55dc1ad9cc849e1 Mon Sep 17 00:00:00 2001 From: Brendan Roof Date: Fri, 5 Apr 2019 18:15:54 -0700 Subject: [PATCH 01/20] Always return a minimum padding size of 1. --- allennlp/data/fields/text_field.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/allennlp/data/fields/text_field.py b/allennlp/data/fields/text_field.py index 7f01e0509c1..3ad2af6248b 100644 --- a/allennlp/data/fields/text_field.py +++ b/allennlp/data/fields/text_field.py @@ -122,6 +122,10 @@ def get_padding_lengths(self) -> Dict[str, int]: padding_keys = {key for d in lengths for key in d.keys()} for padding_key in padding_keys: padding_lengths[padding_key] = max(x[padding_key] if padding_key in x else 0 for x in lengths) + + # Set minimum padding length to handle empty list fields. + for padding_key in padding_lengths: + padding_lengths[padding_key] = max(padding_lengths[padding_key], 1) return padding_lengths @overrides From 0855be4d8a47ce720058462fd53d2b614dceef87 Mon Sep 17 00:00:00 2001 From: Brendan Roof Date: Tue, 9 Apr 2019 18:40:27 -0700 Subject: [PATCH 02/20] add repro --- allennlp/tests/data/fields/list_field_test.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/allennlp/tests/data/fields/list_field_test.py b/allennlp/tests/data/fields/list_field_test.py index 9670164703c..11902d19b16 100644 --- a/allennlp/tests/data/fields/list_field_test.py +++ b/allennlp/tests/data/fields/list_field_test.py @@ -189,3 +189,18 @@ def test_sequence_methods(self): assert len(list_field) == 3 assert list_field[1] == self.field2 assert [f for f in list_field] == [self.field1, self.field2, self.field3] + + def test_2660_repro(self): + from allennlp.data.token_indexers.single_id_token_indexer import SingleIdTokenIndexer + token_indexers = {"tokens": SingleIdTokenIndexer()} + from allennlp.data.tokenizers.word_tokenizer import WordTokenizer + tokenizer = WordTokenizer() + tokens = tokenizer.tokenize("Foo") + from allennlp.data.fields.text_field import TextField + from allennlp.data.fields.list_field import ListField + text_field = TextField(tokens, token_indexers) + list_field = ListField([text_field.empty_field()]) + fields = {'list': list_field} + from allennlp.data.instance import Instance + instance = Instance(fields) + instance.as_tensor_dict() From 2dedf915bf14ff190978cc584e4053a90ef5566f Mon Sep 17 00:00:00 2001 From: Brendan Roof Date: Tue, 9 Apr 2019 18:43:30 -0700 Subject: [PATCH 03/20] Move min padding length setting to list_field.py --- allennlp/data/fields/list_field.py | 5 +++++ allennlp/data/fields/text_field.py | 3 --- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/allennlp/data/fields/list_field.py b/allennlp/data/fields/list_field.py index 1f41881b33f..8063e1bba06 100644 --- a/allennlp/data/fields/list_field.py +++ b/allennlp/data/fields/list_field.py @@ -68,6 +68,11 @@ def get_padding_lengths(self) -> Dict[str, int]: # when we construct the dictionary from the list of fields, we add something to the # name, and we remove it when padding the list of fields. padding_lengths['list_' + key] = max(x[key] if key in x else 0 for x in field_lengths) + + # Set minimum padding length to handle empty list fields. 
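+        # (Without this clamp, a ListField holding only empty fields reports a
+        # padding length of 0, so the resulting tensors have a zero dimension and
+        # common operations such as embedding lookups fail; clamping to 1 keeps the
+        # list as a minimally sized, fully padded tensor that can simply be masked.)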
+ for padding_key in padding_lengths: + padding_lengths[padding_key] = max(padding_lengths[padding_key], 1) + return padding_lengths @overrides diff --git a/allennlp/data/fields/text_field.py b/allennlp/data/fields/text_field.py index 3ad2af6248b..72719b82321 100644 --- a/allennlp/data/fields/text_field.py +++ b/allennlp/data/fields/text_field.py @@ -123,9 +123,6 @@ def get_padding_lengths(self) -> Dict[str, int]: for padding_key in padding_keys: padding_lengths[padding_key] = max(x[padding_key] if padding_key in x else 0 for x in lengths) - # Set minimum padding length to handle empty list fields. - for padding_key in padding_lengths: - padding_lengths[padding_key] = max(padding_lengths[padding_key], 1) return padding_lengths @overrides From 405da9716d15ea3d703018d3c2210445bc14c46f Mon Sep 17 00:00:00 2001 From: Brendan Roof Date: Tue, 9 Apr 2019 18:49:16 -0700 Subject: [PATCH 04/20] Move min padding length setting to list_field.py --- allennlp/tests/data/fields/list_field_test.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/allennlp/tests/data/fields/list_field_test.py b/allennlp/tests/data/fields/list_field_test.py index 11902d19b16..b8fde722747 100644 --- a/allennlp/tests/data/fields/list_field_test.py +++ b/allennlp/tests/data/fields/list_field_test.py @@ -4,7 +4,9 @@ from allennlp.common.testing import AllenNlpTestCase from allennlp.data import Token, Vocabulary from allennlp.data.fields import TextField, LabelField, ListField, IndexField, SequenceLabelField +from allennlp.data.instance import Instance from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer +from allennlp.data.tokenizers.word_tokenizer import WordTokenizer class TestListField(AllenNlpTestCase): @@ -191,16 +193,11 @@ def test_sequence_methods(self): assert [f for f in list_field] == [self.field1, self.field2, self.field3] def test_2660_repro(self): - from allennlp.data.token_indexers.single_id_token_indexer import SingleIdTokenIndexer token_indexers = {"tokens": SingleIdTokenIndexer()} - from allennlp.data.tokenizers.word_tokenizer import WordTokenizer tokenizer = WordTokenizer() tokens = tokenizer.tokenize("Foo") - from allennlp.data.fields.text_field import TextField - from allennlp.data.fields.list_field import ListField text_field = TextField(tokens, token_indexers) list_field = ListField([text_field.empty_field()]) fields = {'list': list_field} - from allennlp.data.instance import Instance instance = Instance(fields) instance.as_tensor_dict() From 01e4cf7d5f65f94cd5709cfdaf0b5558f268e81b Mon Sep 17 00:00:00 2001 From: Brendan Roof Date: Tue, 9 Apr 2019 20:13:46 -0700 Subject: [PATCH 05/20] Add test --- allennlp/tests/data/fields/list_field_test.py | 153 +++++++++++++++++- 1 file changed, 150 insertions(+), 3 deletions(-) diff --git a/allennlp/tests/data/fields/list_field_test.py b/allennlp/tests/data/fields/list_field_test.py index b8fde722747..79064062756 100644 --- a/allennlp/tests/data/fields/list_field_test.py +++ b/allennlp/tests/data/fields/list_field_test.py @@ -1,12 +1,21 @@ # pylint: disable=no-self-use,invalid-name +import json +import torch +from typing import Iterator, List, Dict + import numpy from allennlp.common.testing import AllenNlpTestCase -from allennlp.data import Token, Vocabulary -from allennlp.data.fields import TextField, LabelField, ListField, IndexField, SequenceLabelField +from allennlp.data import Token, Vocabulary, DatasetReader +from allennlp.data.fields import TextField, LabelField, ListField, IndexField, SequenceLabelField, 
SpanField from allennlp.data.instance import Instance from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer from allennlp.data.tokenizers.word_tokenizer import WordTokenizer +from allennlp.models import Model +from allennlp.modules import TextFieldEmbedder, Embedding +from allennlp.modules.span_extractors import EndpointSpanExtractor +from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder +from allennlp.nn.util import get_text_field_mask, batched_index_select class TestListField(AllenNlpTestCase): @@ -192,7 +201,7 @@ def test_sequence_methods(self): assert list_field[1] == self.field2 assert [f for f in list_field] == [self.field1, self.field2, self.field3] - def test_2660_repro(self): + def test_empty_list_can_be_tensorized(self): token_indexers = {"tokens": SingleIdTokenIndexer()} tokenizer = WordTokenizer() tokens = tokenizer.tokenize("Foo") @@ -201,3 +210,141 @@ def test_2660_repro(self): fields = {'list': list_field} instance = Instance(fields) instance.as_tensor_dict() + + class SalienceReader(DatasetReader): + def __init__(self): + super().__init__(lazy=False) + self._token_indexers = {"tokens": SingleIdTokenIndexer()} + self._tokenizer = WordTokenizer() + + def _read(self, file_path: str) -> Iterator[Instance]: + with open(file_path) as file: + for line in file: + doc = json.loads(line) + yield self.text_to_instance(body=doc['body'], + entity_name=doc['entity_name'], + entity_mentions=doc['entity_mentions']) + + @classmethod + def _is_same_token_sequence(cls, seq1: List[Token], seq2: List[Token]): + """ + Utility function to check if two token sequences are identical. + """ + for t1, t2 in zip(seq1, seq2): + if t1.text != t2.text: + return False + return True + + def text_to_instance(self, + body: str, + entity_name: str, + entity_mentions: List[str]) -> Instance: + """ + Generates an instance based on a body of text, an entity with a + given name (which need not be in the body) and series of entity + mentions. The mentions will be matched against the text to generate + mention spans. + """ + + fields = {} + + body_tokens = self._tokenizer.tokenize(body) + fields['body'] = TextField(body_tokens, self._token_indexers) + + EMPTY_TEXT = fields['body'].empty_field() + EMPTY_SPAN = SpanField(-1, -1, EMPTY_TEXT) + + def get_entity_spans(mentions): + spans = [] + for mention in mentions: + mention_tokens = self._tokenizer.tokenize(mention) + for start_index in range(0, len(body_tokens) - len(mention_tokens) + 1): + selected_tokens = body_tokens[start_index:start_index + len(mention_tokens)] + if self._is_same_token_sequence(selected_tokens, mention_tokens): + spans.append(SpanField(start_index, + start_index + len(mention_tokens) - 1, + fields['body'])) + # Empty lists fields are actually non-empty list fields full of padding. + if not spans: + spans.append(EMPTY_SPAN) + return ListField(spans) + + fields['entity_name'] = TextField(self._tokenizer.tokenize(entity_name), self._token_indexers) + fields['entity_spans'] = get_entity_spans(entity_mentions) + + return Instance(fields) + + class SalienceModel(Model): + """ + An unsupervised baseline model for salience based on textual similarity. + """ + def __init__(self, vocab: Vocabulary) -> None: + super().__init__(vocab) + # In the real model this is pretrained. 
+ token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), + embedding_dim=30) + self._embedder = BasicTextFieldEmbedder({"tokens": token_embedding}) + + @classmethod + def fixed_length_embedding(cls, mask, embedded_tokens): + """ + Create a very simple fixed length embedding of a sequence by + concatenating the first and last embedded tokens. + """ + sequence_lengths = mask.sum(dim=1) + # size: + embedded_first_tokens = embedded_tokens[:,0,:] + # size: + indices = sequence_lengths - 1 + # size: + embedded_last_tokens = batched_index_select(embedded_tokens, indices) + # size: + return torch.cat((embedded_first_tokens, embedded_last_tokens), 1) + + def forward(self, + body: Dict[str, torch.LongTensor], + entity_name: Dict[str, torch.LongTensor], + entity_spans: torch.Tensor) -> Dict[str, torch.Tensor]: + + # Embed body + + # size: + body_mask = get_text_field_mask(body) + # size: + embedded_body_tokens = self._embedder(body) + # size: + embedded_body = self.fixed_length_embedding(body_mask, embedded_body_tokens) + + # Embed name (in isolation) + + # size: + name_mask = get_text_field_mask(entity_name) + # size: + embedded_name_tokens = self._embedder(entity_name) + # size: + embedded_name = self.fixed_length_embedding(name_mask, embedded_name_tokens) + + # Extract embedded spans from the body + + extractor = EndpointSpanExtractor(input_dim=embedded_body_tokens.size(-1)) + # size: + span_mask = (entity_spans[:, :, 0] >= 0).long() + # size: + embedded_spans = extractor(embedded_body_tokens, entity_spans, span_indices_mask=span_mask) + + # size: + name_match_score = torch.cosine_similarity(embedded_body, embedded_name) + + # size: + transposed_embedded_spans = embedded_spans.transpose(1, 2) + # Note: Real model normalizes to give cosine similarity. + # size: + span_match_scores = torch.matmul(embedded_body, transposed_embedded_spans) + # size: + masked_span_match_scores = span_match_scores * span_mask + # Aggregate with max to get single score + # size: + span_match_score = masked_span_match_scores.max(dim=-1)[0].squeeze(-1) + + # Combine name match and span match scores. + return {'score': name_match_score + span_match_score} From 9592d403f12b62a177c8ad3e0076adfc063f61c4 Mon Sep 17 00:00:00 2001 From: Brendan Roof Date: Wed, 10 Apr 2019 16:20:57 -0700 Subject: [PATCH 06/20] Separate test. 
Fixture data --- allennlp/tests/end_to_end/__init__.py | 0 allennlp/tests/end_to_end/empty_list_test.py | 168 ++++++++++++++++++ .../tests/fixtures/end_to_end/sample.json | 2 + 3 files changed, 170 insertions(+) create mode 100644 allennlp/tests/end_to_end/__init__.py create mode 100644 allennlp/tests/end_to_end/empty_list_test.py create mode 100644 allennlp/tests/fixtures/end_to_end/sample.json diff --git a/allennlp/tests/end_to_end/__init__.py b/allennlp/tests/end_to_end/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/allennlp/tests/end_to_end/empty_list_test.py b/allennlp/tests/end_to_end/empty_list_test.py new file mode 100644 index 00000000000..9bbc847f082 --- /dev/null +++ b/allennlp/tests/end_to_end/empty_list_test.py @@ -0,0 +1,168 @@ +# pylint: disable=no-self-use,invalid-name +import json +import torch +from typing import Iterator, List, Dict + +import numpy + +from allennlp.common.testing import AllenNlpTestCase +from allennlp.data import Token, Vocabulary, DatasetReader +from allennlp.data.fields import TextField, LabelField, ListField, IndexField, SequenceLabelField, SpanField +from allennlp.data.instance import Instance +from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer +from allennlp.data.tokenizers.word_tokenizer import WordTokenizer +from allennlp.models import Model +from allennlp.modules import TextFieldEmbedder, Embedding +from allennlp.modules.span_extractors import EndpointSpanExtractor +from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder +from allennlp.nn.util import get_text_field_mask, batched_index_select + +class SalienceReader(DatasetReader): + def __init__(self): + super().__init__(lazy=False) + self._token_indexers = {"tokens": SingleIdTokenIndexer()} + self._tokenizer = WordTokenizer() + + def _read(self, file_path: str) -> Iterator[Instance]: + with open(file_path) as file: + for line in file: + doc = json.loads(line) + yield self.text_to_instance(body=doc['body'], + entity_name=doc['entity_name'], + entity_mentions=doc['entity_mentions']) + + @classmethod + def _is_same_token_sequence(cls, seq1: List[Token], seq2: List[Token]): + """ + Utility function to check if two token sequences are identical. + """ + for t1, t2 in zip(seq1, seq2): + if t1.text != t2.text: + return False + return True + + def text_to_instance(self, + body: str, + entity_name: str, + entity_mentions: List[str]) -> Instance: + """ + Generates an instance based on a body of text, an entity with a + given name (which need not be in the body) and series of entity + mentions. The mentions will be matched against the text to generate + mention spans. + """ + + fields = {} + + body_tokens = self._tokenizer.tokenize(body) + fields['body'] = TextField(body_tokens, self._token_indexers) + + EMPTY_TEXT = fields['body'].empty_field() + EMPTY_SPAN = SpanField(-1, -1, EMPTY_TEXT) + + def get_entity_spans(mentions): + spans = [] + for mention in mentions: + mention_tokens = self._tokenizer.tokenize(mention) + for start_index in range(0, len(body_tokens) - len(mention_tokens) + 1): + selected_tokens = body_tokens[start_index:start_index + len(mention_tokens)] + if self._is_same_token_sequence(selected_tokens, mention_tokens): + spans.append(SpanField(start_index, + start_index + len(mention_tokens) - 1, + fields['body'])) + # Empty lists fields are actually non-empty list fields full of padding. 
+ if not spans: + spans.append(EMPTY_SPAN) + return ListField(spans) + + fields['entity_name'] = TextField(self._tokenizer.tokenize(entity_name), self._token_indexers) + fields['entity_spans'] = get_entity_spans(entity_mentions) + + return Instance(fields) + +class SalienceModel(Model): + """ + An unsupervised baseline model for salience based on textual similarity. + """ + def __init__(self, vocab: Vocabulary) -> None: + super().__init__(vocab) + # In the real model this is pretrained. + token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), + embedding_dim=30) + self._embedder = BasicTextFieldEmbedder({"tokens": token_embedding}) + + @classmethod + def fixed_length_embedding(cls, mask, embedded_tokens): + """ + Create a very simple fixed length embedding of a sequence by + concatenating the first and last embedded tokens. + """ + sequence_lengths = mask.sum(dim=1) + # size: + embedded_first_tokens = embedded_tokens[:,0,:] + # size: + indices = sequence_lengths - 1 + # size: + embedded_last_tokens = batched_index_select(embedded_tokens, indices) + # size: + return torch.cat((embedded_first_tokens, embedded_last_tokens), 1) + + def forward(self, + body: Dict[str, torch.LongTensor], + entity_name: Dict[str, torch.LongTensor], + entity_spans: torch.Tensor) -> Dict[str, torch.Tensor]: + + # Embed body + + # size: + body_mask = get_text_field_mask(body) + # size: + embedded_body_tokens = self._embedder(body) + # size: + embedded_body = self.fixed_length_embedding(body_mask, embedded_body_tokens) + + # Embed name (in isolation) + + # size: + name_mask = get_text_field_mask(entity_name) + # size: + embedded_name_tokens = self._embedder(entity_name) + # size: + embedded_name = self.fixed_length_embedding(name_mask, embedded_name_tokens) + + # Extract embedded spans from the body + + extractor = EndpointSpanExtractor(input_dim=embedded_body_tokens.size(-1)) + # size: + span_mask = (entity_spans[:, :, 0] >= 0).long() + # size: + embedded_spans = extractor(embedded_body_tokens, entity_spans, span_indices_mask=span_mask) + + # size: + name_match_score = torch.cosine_similarity(embedded_body, embedded_name) + + # size: + transposed_embedded_spans = embedded_spans.transpose(1, 2) + # Note: Real model normalizes to give cosine similarity. + # size: + span_match_scores = torch.matmul(embedded_body, transposed_embedded_spans) + # size: + masked_span_match_scores = span_match_scores * span_mask + # Aggregate with max to get single score + # size: + span_match_score = masked_span_match_scores.max(dim=-1)[0].squeeze(-1) + + # Combine name match and span match scores. 
+ return {'score': name_match_score + span_match_score} + + +class EmptyListTest(AllenNlpTestCase): + def test_empty_list_can_be_tensorized(self): + token_indexers = {"tokens": SingleIdTokenIndexer()} + tokenizer = WordTokenizer() + tokens = tokenizer.tokenize("Foo") + text_field = TextField(tokens, token_indexers) + list_field = ListField([text_field.empty_field()]) + fields = {'list': list_field} + instance = Instance(fields) + instance.as_tensor_dict() diff --git a/allennlp/tests/fixtures/end_to_end/sample.json b/allennlp/tests/fixtures/end_to_end/sample.json new file mode 100644 index 00000000000..e0928d32141 --- /dev/null +++ b/allennlp/tests/fixtures/end_to_end/sample.json @@ -0,0 +1,2 @@ +{"body": "This is a test.", "entity_name": "exam", "entity_mentions": ["test", "quiz"]} +{"body": "The dog went on a walk.", "entity_name": "animal", "entity_mentions": ["hound", "puppy"]} From 7de91249fe16bf88155d5882228bd29d8044ba74 Mon Sep 17 00:00:00 2001 From: Brendan Roof Date: Wed, 10 Apr 2019 16:31:25 -0700 Subject: [PATCH 07/20] Revert test --- allennlp/tests/data/fields/list_field_test.py | 163 +----------------- 1 file changed, 2 insertions(+), 161 deletions(-) diff --git a/allennlp/tests/data/fields/list_field_test.py b/allennlp/tests/data/fields/list_field_test.py index 79064062756..9670164703c 100644 --- a/allennlp/tests/data/fields/list_field_test.py +++ b/allennlp/tests/data/fields/list_field_test.py @@ -1,21 +1,10 @@ # pylint: disable=no-self-use,invalid-name -import json -import torch -from typing import Iterator, List, Dict - import numpy from allennlp.common.testing import AllenNlpTestCase -from allennlp.data import Token, Vocabulary, DatasetReader -from allennlp.data.fields import TextField, LabelField, ListField, IndexField, SequenceLabelField, SpanField -from allennlp.data.instance import Instance +from allennlp.data import Token, Vocabulary +from allennlp.data.fields import TextField, LabelField, ListField, IndexField, SequenceLabelField from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer -from allennlp.data.tokenizers.word_tokenizer import WordTokenizer -from allennlp.models import Model -from allennlp.modules import TextFieldEmbedder, Embedding -from allennlp.modules.span_extractors import EndpointSpanExtractor -from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder -from allennlp.nn.util import get_text_field_mask, batched_index_select class TestListField(AllenNlpTestCase): @@ -200,151 +189,3 @@ def test_sequence_methods(self): assert len(list_field) == 3 assert list_field[1] == self.field2 assert [f for f in list_field] == [self.field1, self.field2, self.field3] - - def test_empty_list_can_be_tensorized(self): - token_indexers = {"tokens": SingleIdTokenIndexer()} - tokenizer = WordTokenizer() - tokens = tokenizer.tokenize("Foo") - text_field = TextField(tokens, token_indexers) - list_field = ListField([text_field.empty_field()]) - fields = {'list': list_field} - instance = Instance(fields) - instance.as_tensor_dict() - - class SalienceReader(DatasetReader): - def __init__(self): - super().__init__(lazy=False) - self._token_indexers = {"tokens": SingleIdTokenIndexer()} - self._tokenizer = WordTokenizer() - - def _read(self, file_path: str) -> Iterator[Instance]: - with open(file_path) as file: - for line in file: - doc = json.loads(line) - yield self.text_to_instance(body=doc['body'], - entity_name=doc['entity_name'], - entity_mentions=doc['entity_mentions']) - - @classmethod - def _is_same_token_sequence(cls, seq1: 
List[Token], seq2: List[Token]): - """ - Utility function to check if two token sequences are identical. - """ - for t1, t2 in zip(seq1, seq2): - if t1.text != t2.text: - return False - return True - - def text_to_instance(self, - body: str, - entity_name: str, - entity_mentions: List[str]) -> Instance: - """ - Generates an instance based on a body of text, an entity with a - given name (which need not be in the body) and series of entity - mentions. The mentions will be matched against the text to generate - mention spans. - """ - - fields = {} - - body_tokens = self._tokenizer.tokenize(body) - fields['body'] = TextField(body_tokens, self._token_indexers) - - EMPTY_TEXT = fields['body'].empty_field() - EMPTY_SPAN = SpanField(-1, -1, EMPTY_TEXT) - - def get_entity_spans(mentions): - spans = [] - for mention in mentions: - mention_tokens = self._tokenizer.tokenize(mention) - for start_index in range(0, len(body_tokens) - len(mention_tokens) + 1): - selected_tokens = body_tokens[start_index:start_index + len(mention_tokens)] - if self._is_same_token_sequence(selected_tokens, mention_tokens): - spans.append(SpanField(start_index, - start_index + len(mention_tokens) - 1, - fields['body'])) - # Empty lists fields are actually non-empty list fields full of padding. - if not spans: - spans.append(EMPTY_SPAN) - return ListField(spans) - - fields['entity_name'] = TextField(self._tokenizer.tokenize(entity_name), self._token_indexers) - fields['entity_spans'] = get_entity_spans(entity_mentions) - - return Instance(fields) - - class SalienceModel(Model): - """ - An unsupervised baseline model for salience based on textual similarity. - """ - def __init__(self, vocab: Vocabulary) -> None: - super().__init__(vocab) - # In the real model this is pretrained. - token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), - embedding_dim=30) - self._embedder = BasicTextFieldEmbedder({"tokens": token_embedding}) - - @classmethod - def fixed_length_embedding(cls, mask, embedded_tokens): - """ - Create a very simple fixed length embedding of a sequence by - concatenating the first and last embedded tokens. 
- """ - sequence_lengths = mask.sum(dim=1) - # size: - embedded_first_tokens = embedded_tokens[:,0,:] - # size: - indices = sequence_lengths - 1 - # size: - embedded_last_tokens = batched_index_select(embedded_tokens, indices) - # size: - return torch.cat((embedded_first_tokens, embedded_last_tokens), 1) - - def forward(self, - body: Dict[str, torch.LongTensor], - entity_name: Dict[str, torch.LongTensor], - entity_spans: torch.Tensor) -> Dict[str, torch.Tensor]: - - # Embed body - - # size: - body_mask = get_text_field_mask(body) - # size: - embedded_body_tokens = self._embedder(body) - # size: - embedded_body = self.fixed_length_embedding(body_mask, embedded_body_tokens) - - # Embed name (in isolation) - - # size: - name_mask = get_text_field_mask(entity_name) - # size: - embedded_name_tokens = self._embedder(entity_name) - # size: - embedded_name = self.fixed_length_embedding(name_mask, embedded_name_tokens) - - # Extract embedded spans from the body - - extractor = EndpointSpanExtractor(input_dim=embedded_body_tokens.size(-1)) - # size: - span_mask = (entity_spans[:, :, 0] >= 0).long() - # size: - embedded_spans = extractor(embedded_body_tokens, entity_spans, span_indices_mask=span_mask) - - # size: - name_match_score = torch.cosine_similarity(embedded_body, embedded_name) - - # size: - transposed_embedded_spans = embedded_spans.transpose(1, 2) - # Note: Real model normalizes to give cosine similarity. - # size: - span_match_scores = torch.matmul(embedded_body, transposed_embedded_spans) - # size: - masked_span_match_scores = span_match_scores * span_mask - # Aggregate with max to get single score - # size: - span_match_score = masked_span_match_scores.max(dim=-1)[0].squeeze(-1) - - # Combine name match and span match scores. - return {'score': name_match_score + span_match_score} From 702d8ba296872895e27189583bd1d08436e619b9 Mon Sep 17 00:00:00 2001 From: Brendan Roof Date: Wed, 10 Apr 2019 16:32:19 -0700 Subject: [PATCH 08/20] whitespace change --- allennlp/data/fields/text_field.py | 1 - 1 file changed, 1 deletion(-) diff --git a/allennlp/data/fields/text_field.py b/allennlp/data/fields/text_field.py index 72719b82321..7f01e0509c1 100644 --- a/allennlp/data/fields/text_field.py +++ b/allennlp/data/fields/text_field.py @@ -122,7 +122,6 @@ def get_padding_lengths(self) -> Dict[str, int]: padding_keys = {key for d in lengths for key in d.keys()} for padding_key in padding_keys: padding_lengths[padding_key] = max(x[padding_key] if padding_key in x else 0 for x in lengths) - return padding_lengths @overrides From 10e2d2b28fb7e87d1e304da2296207f07b61243e Mon Sep 17 00:00:00 2001 From: Brendan Roof Date: Thu, 11 Apr 2019 17:30:37 -0700 Subject: [PATCH 09/20] make working --- allennlp/tests/end_to_end/empty_list_test.py | 42 ++++++++++++++------ 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/allennlp/tests/end_to_end/empty_list_test.py b/allennlp/tests/end_to_end/empty_list_test.py index 9bbc847f082..959e9e32122 100644 --- a/allennlp/tests/end_to_end/empty_list_test.py +++ b/allennlp/tests/end_to_end/empty_list_test.py @@ -1,22 +1,24 @@ # pylint: disable=no-self-use,invalid-name import json -import torch from typing import Iterator, List, Dict -import numpy +import torch +from allennlp.common import Params from allennlp.common.testing import AllenNlpTestCase from allennlp.data import Token, Vocabulary, DatasetReader -from allennlp.data.fields import TextField, LabelField, ListField, IndexField, SequenceLabelField, SpanField +from allennlp.data.fields import TextField, 
ListField, SpanField from allennlp.data.instance import Instance -from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer +from allennlp.data.iterators import BasicIterator +from allennlp.data.token_indexers import SingleIdTokenIndexer from allennlp.data.tokenizers.word_tokenizer import WordTokenizer from allennlp.models import Model -from allennlp.modules import TextFieldEmbedder, Embedding +from allennlp.modules import Embedding from allennlp.modules.span_extractors import EndpointSpanExtractor from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder from allennlp.nn.util import get_text_field_mask, batched_index_select + class SalienceReader(DatasetReader): def __init__(self): super().__init__(lazy=False) @@ -86,9 +88,10 @@ class SalienceModel(Model): """ def __init__(self, vocab: Vocabulary) -> None: super().__init__(vocab) - # In the real model this is pretrained. - token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), - embedding_dim=30) + params = Params({"embedding_dim": 50, + "pretrained_file": "s3://allennlp/datasets/glove/glove.6B.50d.txt.gz", + "trainable": False}) + token_embedding = Embedding.from_params(vocab, params) self._embedder = BasicTextFieldEmbedder({"tokens": token_embedding}) @classmethod @@ -111,7 +114,6 @@ def forward(self, body: Dict[str, torch.LongTensor], entity_name: Dict[str, torch.LongTensor], entity_spans: torch.Tensor) -> Dict[str, torch.Tensor]: - # Embed body # size: @@ -139,15 +141,15 @@ def forward(self, embedded_spans = extractor(embedded_body_tokens, entity_spans, span_indices_mask=span_mask) # size: - name_match_score = torch.cosine_similarity(embedded_body, embedded_name) + name_match_score = torch.nn.functional.cosine_similarity(embedded_body, embedded_name) # size: transposed_embedded_spans = embedded_spans.transpose(1, 2) # Note: Real model normalizes to give cosine similarity. # size: - span_match_scores = torch.matmul(embedded_body, transposed_embedded_spans) + span_match_scores = torch.bmm(embedded_body.unsqueeze(1), transposed_embedded_spans).squeeze(1) # size: - masked_span_match_scores = span_match_scores * span_mask + masked_span_match_scores = span_match_scores * span_mask.float() # Aggregate with max to get single score # size: span_match_score = masked_span_match_scores.max(dim=-1)[0].squeeze(-1) @@ -155,7 +157,6 @@ def forward(self, # Combine name match and span match scores. 
return {'score': name_match_score + span_match_score} - class EmptyListTest(AllenNlpTestCase): def test_empty_list_can_be_tensorized(self): token_indexers = {"tokens": SingleIdTokenIndexer()} @@ -166,3 +167,18 @@ def test_empty_list_can_be_tensorized(self): fields = {'list': list_field} instance = Instance(fields) instance.as_tensor_dict() + + def test_end_to_end(self): + reader = SalienceReader() + dataset = reader.read(self.FIXTURES_ROOT / 'end_to_end' / 'sample.json') + vocab = Vocabulary.from_instances(dataset) + model = SalienceModel(vocab) + model.eval() + iterator = BasicIterator(batch_size=2) + iterator.index_with(vocab) + batch = next(iterator(dataset, shuffle=False)) + results = model.forward(**batch)["score"] + # For the sample data: + # {"body": "This is a test.", "entity_name": "exam", "entity_mentions": ["test", "quiz"]} + # {"body": "The dog went on a walk.", "entity_name": "animal", "entity_mentions": ["hound", "puppy"]} + assert results[0] > results[1] From ac376136f949d944dc9b5b23304a8de7b2cabe9f Mon Sep 17 00:00:00 2001 From: Brendan Roof Date: Wed, 24 Apr 2019 18:29:36 -0700 Subject: [PATCH 10/20] Works. Need to review tests --- allennlp/tests/end_to_end/empty_list_test.py | 106 ++++++++++++------- 1 file changed, 67 insertions(+), 39 deletions(-) diff --git a/allennlp/tests/end_to_end/empty_list_test.py b/allennlp/tests/end_to_end/empty_list_test.py index 959e9e32122..76f54168462 100644 --- a/allennlp/tests/end_to_end/empty_list_test.py +++ b/allennlp/tests/end_to_end/empty_list_test.py @@ -3,18 +3,18 @@ from typing import Iterator, List, Dict import torch +from torch.nn import Module from allennlp.common import Params from allennlp.common.testing import AllenNlpTestCase from allennlp.data import Token, Vocabulary, DatasetReader -from allennlp.data.fields import TextField, ListField, SpanField +from allennlp.data.fields import TextField, ListField from allennlp.data.instance import Instance from allennlp.data.iterators import BasicIterator from allennlp.data.token_indexers import SingleIdTokenIndexer from allennlp.data.tokenizers.word_tokenizer import WordTokenizer from allennlp.models import Model from allennlp.modules import Embedding -from allennlp.modules.span_extractors import EndpointSpanExtractor from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder from allennlp.nn.util import get_text_field_mask, batched_index_select @@ -50,8 +50,9 @@ def text_to_instance(self, """ Generates an instance based on a body of text, an entity with a given name (which need not be in the body) and series of entity - mentions. The mentions will be matched against the text to generate - mention spans. + mentions. The mentions will be matched against the text. In the real + model we generate spans, but for this repro we return them as + TextFields. 
""" fields = {} @@ -60,42 +61,27 @@ def text_to_instance(self, fields['body'] = TextField(body_tokens, self._token_indexers) EMPTY_TEXT = fields['body'].empty_field() - EMPTY_SPAN = SpanField(-1, -1, EMPTY_TEXT) - def get_entity_spans(mentions): - spans = [] + def get_matching_entities(mentions): + matched_mention = [] for mention in mentions: mention_tokens = self._tokenizer.tokenize(mention) for start_index in range(0, len(body_tokens) - len(mention_tokens) + 1): selected_tokens = body_tokens[start_index:start_index + len(mention_tokens)] if self._is_same_token_sequence(selected_tokens, mention_tokens): - spans.append(SpanField(start_index, - start_index + len(mention_tokens) - 1, - fields['body'])) + matched_mention.append(TextField(selected_tokens, self._token_indexers)) # Empty lists fields are actually non-empty list fields full of padding. - if not spans: - spans.append(EMPTY_SPAN) - return ListField(spans) + if not matched_mention: + matched_mention.append(EMPTY_TEXT) + return ListField(matched_mention) fields['entity_name'] = TextField(self._tokenizer.tokenize(entity_name), self._token_indexers) - fields['entity_spans'] = get_entity_spans(entity_mentions) + fields['entity_mentions'] = get_matching_entities(entity_mentions) return Instance(fields) -class SalienceModel(Model): - """ - An unsupervised baseline model for salience based on textual similarity. - """ - def __init__(self, vocab: Vocabulary) -> None: - super().__init__(vocab) - params = Params({"embedding_dim": 50, - "pretrained_file": "s3://allennlp/datasets/glove/glove.6B.50d.txt.gz", - "trainable": False}) - token_embedding = Embedding.from_params(vocab, params) - self._embedder = BasicTextFieldEmbedder({"tokens": token_embedding}) - - @classmethod - def fixed_length_embedding(cls, mask, embedded_tokens): +class FixedLengthEmbedding(Module): + def forward(cls, mask, embedded_tokens): """ Create a very simple fixed length embedding of a sequence by concatenating the first and last embedded tokens. @@ -105,21 +91,39 @@ def fixed_length_embedding(cls, mask, embedded_tokens): embedded_first_tokens = embedded_tokens[:,0,:] # size: indices = sequence_lengths - 1 + # size: + zeros = torch.zeros_like(indices) + # Handle empty lists. Caller responsible for masking. + # size: + adjusted_indices = torch.stack((indices, zeros), dim=1).max(dim=1)[0] # size: - embedded_last_tokens = batched_index_select(embedded_tokens, indices) + embedded_last_tokens = batched_index_select(embedded_tokens, adjusted_indices) # size: return torch.cat((embedded_first_tokens, embedded_last_tokens), 1) +class SalienceModel(Model): + """ + An unsupervised baseline model for salience based on textual similarity. 
+ """ + def __init__(self, vocab: Vocabulary) -> None: + super().__init__(vocab) + params = Params({"embedding_dim": 50, + "pretrained_file": "s3://allennlp/datasets/glove/glove.6B.50d.txt.gz", + "trainable": False}) + token_embedding = Embedding.from_params(vocab, params) + self.embedder = BasicTextFieldEmbedder({"tokens": token_embedding}) + self.fixed_length_embedding = FixedLengthEmbedding() + def forward(self, body: Dict[str, torch.LongTensor], entity_name: Dict[str, torch.LongTensor], - entity_spans: torch.Tensor) -> Dict[str, torch.Tensor]: + entity_mentions: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: # Embed body # size: body_mask = get_text_field_mask(body) # size: - embedded_body_tokens = self._embedder(body) + embedded_body_tokens = self.embedder(body) # size: embedded_body = self.fixed_length_embedding(body_mask, embedded_body_tokens) @@ -128,17 +132,26 @@ def forward(self, # size: name_mask = get_text_field_mask(entity_name) # size: - embedded_name_tokens = self._embedder(entity_name) + embedded_name_tokens = self.embedder(entity_name) # size: embedded_name = self.fixed_length_embedding(name_mask, embedded_name_tokens) # Extract embedded spans from the body - extractor = EndpointSpanExtractor(input_dim=embedded_body_tokens.size(-1)) - # size: - span_mask = (entity_spans[:, :, 0] >= 0).long() + # size: + mentions_mask = get_text_field_mask(entity_mentions, num_wrapping_dims=1) + # size: + embedded_mentions_tokens = self.embedder(entity_mentions) + # size: [] + embedded_spans_list = [] + for i in range(mentions_mask.size(1)): + embedded_spans_tmp = self.fixed_length_embedding( + mentions_mask[:, i, :], + embedded_mentions_tokens[:, i, :, :] + ).unsqueeze(1) + embedded_spans_list.append(embedded_spans_tmp) # size: - embedded_spans = extractor(embedded_body_tokens, entity_spans, span_indices_mask=span_mask) + embedded_spans = torch.cat(embedded_spans_list, 1) # size: name_match_score = torch.nn.functional.cosine_similarity(embedded_body, embedded_name) @@ -149,7 +162,7 @@ def forward(self, # size: span_match_scores = torch.bmm(embedded_body.unsqueeze(1), transposed_embedded_spans).squeeze(1) # size: - masked_span_match_scores = span_match_scores * span_mask.float() + masked_span_match_scores = span_match_scores * (mentions_mask[:, :, 0] != 0).float() # Aggregate with max to get single score # size: span_match_score = masked_span_match_scores.max(dim=-1)[0].squeeze(-1) @@ -164,11 +177,26 @@ def test_empty_list_can_be_tensorized(self): tokens = tokenizer.tokenize("Foo") text_field = TextField(tokens, token_indexers) list_field = ListField([text_field.empty_field()]) - fields = {'list': list_field} + fields = {'list': list_field, 'bar': TextField(tokenizer.tokenize("BAR"), token_indexers)} instance = Instance(fields) + vocab = Vocabulary.from_instances([instance]) + instance.index_fields(vocab) instance.as_tensor_dict() - def test_end_to_end(self): + # A batch with entirely empty lists. + def test_end_to_end_broken_without_fix(self): + reader = SalienceReader() + dataset = reader.read(self.FIXTURES_ROOT / 'end_to_end' / 'sample.json')[1:] + vocab = Vocabulary.from_instances(dataset) + model = SalienceModel(vocab) + model.eval() + iterator = BasicIterator(batch_size=2) + iterator.index_with(vocab) + batch = next(iterator(dataset, shuffle=False)) + model.forward(**batch)["score"] + + # A mixed batch with some empty lists. 
+ def test_end_to_end_works_in_master(self): reader = SalienceReader() dataset = reader.read(self.FIXTURES_ROOT / 'end_to_end' / 'sample.json') vocab = Vocabulary.from_instances(dataset) From 4c391e776b6b9f772d15425ae70029180ff4a488 Mon Sep 17 00:00:00 2001 From: Brendan Roof Date: Thu, 25 Apr 2019 17:57:41 -0700 Subject: [PATCH 11/20] Stop using glove --- allennlp/tests/end_to_end/empty_list_test.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/allennlp/tests/end_to_end/empty_list_test.py b/allennlp/tests/end_to_end/empty_list_test.py index 76f54168462..3edbad2233b 100644 --- a/allennlp/tests/end_to_end/empty_list_test.py +++ b/allennlp/tests/end_to_end/empty_list_test.py @@ -107,10 +107,13 @@ class SalienceModel(Model): """ def __init__(self, vocab: Vocabulary) -> None: super().__init__(vocab) - params = Params({"embedding_dim": 50, - "pretrained_file": "s3://allennlp/datasets/glove/glove.6B.50d.txt.gz", - "trainable": False}) - token_embedding = Embedding.from_params(vocab, params) + # Dummy weights + weight = torch.ones(vocab.get_vocab_size(), 10) + token_embedding = Embedding( + num_embeddings=vocab.get_vocab_size(), + embedding_dim=10, + weight=weight, + trainable=False) self.embedder = BasicTextFieldEmbedder({"tokens": token_embedding}) self.fixed_length_embedding = FixedLengthEmbedding() From 06dfd4c32da52efacf2ccf40628848099391a867 Mon Sep 17 00:00:00 2001 From: Brendan Roof Date: Thu, 25 Apr 2019 18:11:39 -0700 Subject: [PATCH 12/20] Fix types and lint. --- allennlp/tests/end_to_end/empty_list_test.py | 21 +++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/allennlp/tests/end_to_end/empty_list_test.py b/allennlp/tests/end_to_end/empty_list_test.py index 3edbad2233b..5cca8c536f8 100644 --- a/allennlp/tests/end_to_end/empty_list_test.py +++ b/allennlp/tests/end_to_end/empty_list_test.py @@ -1,13 +1,12 @@ # pylint: disable=no-self-use,invalid-name import json -from typing import Iterator, List, Dict +from typing import Iterator, List, Dict, Any, MutableMapping import torch from torch.nn import Module -from allennlp.common import Params from allennlp.common.testing import AllenNlpTestCase -from allennlp.data import Token, Vocabulary, DatasetReader +from allennlp.data import Token, Vocabulary, DatasetReader, Field from allennlp.data.fields import TextField, ListField from allennlp.data.instance import Instance from allennlp.data.iterators import BasicIterator @@ -43,10 +42,11 @@ def _is_same_token_sequence(cls, seq1: List[Token], seq2: List[Token]): return False return True - def text_to_instance(self, + def text_to_instance(self, # type: ignore body: str, entity_name: str, entity_mentions: List[str]) -> Instance: + # pylint: disable=arguments-differ """ Generates an instance based on a body of text, an entity with a given name (which need not be in the body) and series of entity @@ -55,7 +55,7 @@ def text_to_instance(self, TextFields. """ - fields = {} + fields: MutableMapping[str, Field[Any]] = {} body_tokens = self._tokenizer.tokenize(body) fields['body'] = TextField(body_tokens, self._token_indexers) @@ -81,14 +81,15 @@ def get_matching_entities(mentions): return Instance(fields) class FixedLengthEmbedding(Module): - def forward(cls, mask, embedded_tokens): + def forward(self, mask, embedded_tokens): + # pylint: disable=arguments-differ """ Create a very simple fixed length embedding of a sequence by concatenating the first and last embedded tokens. 
""" sequence_lengths = mask.sum(dim=1) # size: - embedded_first_tokens = embedded_tokens[:,0,:] + embedded_first_tokens = embedded_tokens[:, 0, :] # size: indices = sequence_lengths - 1 # size: @@ -117,10 +118,12 @@ def __init__(self, vocab: Vocabulary) -> None: self.embedder = BasicTextFieldEmbedder({"tokens": token_embedding}) self.fixed_length_embedding = FixedLengthEmbedding() - def forward(self, + def forward(self, # type: ignore body: Dict[str, torch.LongTensor], entity_name: Dict[str, torch.LongTensor], entity_mentions: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + # pylint: disable=arguments-differ + # Embed body # size: @@ -196,7 +199,7 @@ def test_end_to_end_broken_without_fix(self): iterator = BasicIterator(batch_size=2) iterator.index_with(vocab) batch = next(iterator(dataset, shuffle=False)) - model.forward(**batch)["score"] + model.forward(**batch) # A mixed batch with some empty lists. def test_end_to_end_works_in_master(self): From e7bee8a65d7975c9c35e9e15c657440a5a2811c6 Mon Sep 17 00:00:00 2001 From: Brendan Roof Date: Fri, 3 May 2019 19:50:54 -0700 Subject: [PATCH 13/20] fixes --- allennlp/tests/data/fields/list_field_test.py | 71 ++++++++++- allennlp/tests/end_to_end/empty_list_test.py | 114 ------------------ 2 files changed, 70 insertions(+), 115 deletions(-) diff --git a/allennlp/tests/data/fields/list_field_test.py b/allennlp/tests/data/fields/list_field_test.py index 9670164703c..620a1da13cf 100644 --- a/allennlp/tests/data/fields/list_field_test.py +++ b/allennlp/tests/data/fields/list_field_test.py @@ -1,11 +1,39 @@ # pylint: disable=no-self-use,invalid-name +from typing import Dict + +import torch + import numpy from allennlp.common.testing import AllenNlpTestCase -from allennlp.data import Token, Vocabulary +from allennlp.data import Token, Vocabulary, Instance from allennlp.data.fields import TextField, LabelField, ListField, IndexField, SequenceLabelField +from allennlp.data.iterators import BasicIterator from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer +from allennlp.data.tokenizers import WordTokenizer +from allennlp.models import Model +from allennlp.modules import Embedding +from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder + +class DummyModel(Model): + """ + Performs a common operation (embedding) that won't work on an empty tensor. + Returns an arbitrary loss. 
+ """ + def __init__(self, vocab: Vocabulary) -> None: + super().__init__(vocab) + weight = torch.ones(vocab.get_vocab_size(), 10) + token_embedding = Embedding( + num_embeddings=vocab.get_vocab_size(), + embedding_dim=10, + weight=weight, + trainable=False) + self.embedder = BasicTextFieldEmbedder({"words": token_embedding}) + def forward(self, # type: ignore + list: Dict[str, torch.LongTensor]) -> Dict[str, torch.Tensor]: + self.embedder(list) + return {"loss": 1.0} class TestListField(AllenNlpTestCase): def setUp(self): @@ -39,6 +67,17 @@ def setUp(self): self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1) self.empty_sequence_label_field = self.sequence_label_field.empty_field() + tokenizer = WordTokenizer() + tokens = tokenizer.tokenize("Foo") + text_field = TextField(tokens, self.word_indexer) + empty_list_field = ListField([text_field.empty_field()]) + empty_fields = {'list': empty_list_field} + self.empty_instance = Instance(empty_fields) + + non_empty_list_field = ListField([text_field]) + non_empty_fields = {'list': non_empty_list_field} + self.non_empty_instance = Instance(non_empty_fields) + super(TestListField, self).setUp() def test_get_padding_lengths(self): @@ -189,3 +228,33 @@ def test_sequence_methods(self): assert len(list_field) == 3 assert list_field[1] == self.field2 assert [f for f in list_field] == [self.field1, self.field2, self.field3] + + def test_empty_list_can_be_tensorized(self): + tokenizer = WordTokenizer() + tokens = tokenizer.tokenize("Foo") + text_field = TextField(tokens, self.word_indexer) + list_field = ListField([text_field.empty_field()]) + fields = {'list': list_field, 'bar': TextField(tokenizer.tokenize("BAR"), self.word_indexer)} + instance = Instance(fields) + instance.index_fields(self.vocab) + instance.as_tensor_dict() + + def test_batch_with_some_empty_lists_works(self): + dataset = [self.empty_instance, self.non_empty_instance] + + model = DummyModel(self.vocab) + model.eval() + iterator = BasicIterator(batch_size=2) + iterator.index_with(self.vocab) + batch = next(iterator(dataset, shuffle=False)) + model.forward(**batch) + + def test_batch_of_entirely_empty_lists_works(self): + dataset = [self.empty_instance, self.empty_instance] + + model = DummyModel(self.vocab) + model.eval() + iterator = BasicIterator(batch_size=2) + iterator.index_with(self.vocab) + batch = next(iterator(dataset, shuffle=False)) + model.forward(**batch) diff --git a/allennlp/tests/end_to_end/empty_list_test.py b/allennlp/tests/end_to_end/empty_list_test.py index 5cca8c536f8..f2c2588b716 100644 --- a/allennlp/tests/end_to_end/empty_list_test.py +++ b/allennlp/tests/end_to_end/empty_list_test.py @@ -102,117 +102,3 @@ def forward(self, mask, embedded_tokens): # size: return torch.cat((embedded_first_tokens, embedded_last_tokens), 1) -class SalienceModel(Model): - """ - An unsupervised baseline model for salience based on textual similarity. 
- """ - def __init__(self, vocab: Vocabulary) -> None: - super().__init__(vocab) - # Dummy weights - weight = torch.ones(vocab.get_vocab_size(), 10) - token_embedding = Embedding( - num_embeddings=vocab.get_vocab_size(), - embedding_dim=10, - weight=weight, - trainable=False) - self.embedder = BasicTextFieldEmbedder({"tokens": token_embedding}) - self.fixed_length_embedding = FixedLengthEmbedding() - - def forward(self, # type: ignore - body: Dict[str, torch.LongTensor], - entity_name: Dict[str, torch.LongTensor], - entity_mentions: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: - # pylint: disable=arguments-differ - - # Embed body - - # size: - body_mask = get_text_field_mask(body) - # size: - embedded_body_tokens = self.embedder(body) - # size: - embedded_body = self.fixed_length_embedding(body_mask, embedded_body_tokens) - - # Embed name (in isolation) - - # size: - name_mask = get_text_field_mask(entity_name) - # size: - embedded_name_tokens = self.embedder(entity_name) - # size: - embedded_name = self.fixed_length_embedding(name_mask, embedded_name_tokens) - - # Extract embedded spans from the body - - # size: - mentions_mask = get_text_field_mask(entity_mentions, num_wrapping_dims=1) - # size: - embedded_mentions_tokens = self.embedder(entity_mentions) - # size: [] - embedded_spans_list = [] - for i in range(mentions_mask.size(1)): - embedded_spans_tmp = self.fixed_length_embedding( - mentions_mask[:, i, :], - embedded_mentions_tokens[:, i, :, :] - ).unsqueeze(1) - embedded_spans_list.append(embedded_spans_tmp) - # size: - embedded_spans = torch.cat(embedded_spans_list, 1) - - # size: - name_match_score = torch.nn.functional.cosine_similarity(embedded_body, embedded_name) - - # size: - transposed_embedded_spans = embedded_spans.transpose(1, 2) - # Note: Real model normalizes to give cosine similarity. - # size: - span_match_scores = torch.bmm(embedded_body.unsqueeze(1), transposed_embedded_spans).squeeze(1) - # size: - masked_span_match_scores = span_match_scores * (mentions_mask[:, :, 0] != 0).float() - # Aggregate with max to get single score - # size: - span_match_score = masked_span_match_scores.max(dim=-1)[0].squeeze(-1) - - # Combine name match and span match scores. - return {'score': name_match_score + span_match_score} - -class EmptyListTest(AllenNlpTestCase): - def test_empty_list_can_be_tensorized(self): - token_indexers = {"tokens": SingleIdTokenIndexer()} - tokenizer = WordTokenizer() - tokens = tokenizer.tokenize("Foo") - text_field = TextField(tokens, token_indexers) - list_field = ListField([text_field.empty_field()]) - fields = {'list': list_field, 'bar': TextField(tokenizer.tokenize("BAR"), token_indexers)} - instance = Instance(fields) - vocab = Vocabulary.from_instances([instance]) - instance.index_fields(vocab) - instance.as_tensor_dict() - - # A batch with entirely empty lists. - def test_end_to_end_broken_without_fix(self): - reader = SalienceReader() - dataset = reader.read(self.FIXTURES_ROOT / 'end_to_end' / 'sample.json')[1:] - vocab = Vocabulary.from_instances(dataset) - model = SalienceModel(vocab) - model.eval() - iterator = BasicIterator(batch_size=2) - iterator.index_with(vocab) - batch = next(iterator(dataset, shuffle=False)) - model.forward(**batch) - - # A mixed batch with some empty lists. 
- def test_end_to_end_works_in_master(self): - reader = SalienceReader() - dataset = reader.read(self.FIXTURES_ROOT / 'end_to_end' / 'sample.json') - vocab = Vocabulary.from_instances(dataset) - model = SalienceModel(vocab) - model.eval() - iterator = BasicIterator(batch_size=2) - iterator.index_with(vocab) - batch = next(iterator(dataset, shuffle=False)) - results = model.forward(**batch)["score"] - # For the sample data: - # {"body": "This is a test.", "entity_name": "exam", "entity_mentions": ["test", "quiz"]} - # {"body": "The dog went on a walk.", "entity_name": "animal", "entity_mentions": ["hound", "puppy"]} - assert results[0] > results[1] From 86bc77c17393298454d99068a5576e2a6714b3ae Mon Sep 17 00:00:00 2001 From: Brendan Roof Date: Fri, 3 May 2019 19:51:36 -0700 Subject: [PATCH 14/20] delete --- allennlp/tests/end_to_end/empty_list_test.py | 104 ------------------- 1 file changed, 104 deletions(-) delete mode 100644 allennlp/tests/end_to_end/empty_list_test.py diff --git a/allennlp/tests/end_to_end/empty_list_test.py b/allennlp/tests/end_to_end/empty_list_test.py deleted file mode 100644 index f2c2588b716..00000000000 --- a/allennlp/tests/end_to_end/empty_list_test.py +++ /dev/null @@ -1,104 +0,0 @@ -# pylint: disable=no-self-use,invalid-name -import json -from typing import Iterator, List, Dict, Any, MutableMapping - -import torch -from torch.nn import Module - -from allennlp.common.testing import AllenNlpTestCase -from allennlp.data import Token, Vocabulary, DatasetReader, Field -from allennlp.data.fields import TextField, ListField -from allennlp.data.instance import Instance -from allennlp.data.iterators import BasicIterator -from allennlp.data.token_indexers import SingleIdTokenIndexer -from allennlp.data.tokenizers.word_tokenizer import WordTokenizer -from allennlp.models import Model -from allennlp.modules import Embedding -from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder -from allennlp.nn.util import get_text_field_mask, batched_index_select - - -class SalienceReader(DatasetReader): - def __init__(self): - super().__init__(lazy=False) - self._token_indexers = {"tokens": SingleIdTokenIndexer()} - self._tokenizer = WordTokenizer() - - def _read(self, file_path: str) -> Iterator[Instance]: - with open(file_path) as file: - for line in file: - doc = json.loads(line) - yield self.text_to_instance(body=doc['body'], - entity_name=doc['entity_name'], - entity_mentions=doc['entity_mentions']) - - @classmethod - def _is_same_token_sequence(cls, seq1: List[Token], seq2: List[Token]): - """ - Utility function to check if two token sequences are identical. - """ - for t1, t2 in zip(seq1, seq2): - if t1.text != t2.text: - return False - return True - - def text_to_instance(self, # type: ignore - body: str, - entity_name: str, - entity_mentions: List[str]) -> Instance: - # pylint: disable=arguments-differ - """ - Generates an instance based on a body of text, an entity with a - given name (which need not be in the body) and series of entity - mentions. The mentions will be matched against the text. In the real - model we generate spans, but for this repro we return them as - TextFields. 
- """ - - fields: MutableMapping[str, Field[Any]] = {} - - body_tokens = self._tokenizer.tokenize(body) - fields['body'] = TextField(body_tokens, self._token_indexers) - - EMPTY_TEXT = fields['body'].empty_field() - - def get_matching_entities(mentions): - matched_mention = [] - for mention in mentions: - mention_tokens = self._tokenizer.tokenize(mention) - for start_index in range(0, len(body_tokens) - len(mention_tokens) + 1): - selected_tokens = body_tokens[start_index:start_index + len(mention_tokens)] - if self._is_same_token_sequence(selected_tokens, mention_tokens): - matched_mention.append(TextField(selected_tokens, self._token_indexers)) - # Empty lists fields are actually non-empty list fields full of padding. - if not matched_mention: - matched_mention.append(EMPTY_TEXT) - return ListField(matched_mention) - - fields['entity_name'] = TextField(self._tokenizer.tokenize(entity_name), self._token_indexers) - fields['entity_mentions'] = get_matching_entities(entity_mentions) - - return Instance(fields) - -class FixedLengthEmbedding(Module): - def forward(self, mask, embedded_tokens): - # pylint: disable=arguments-differ - """ - Create a very simple fixed length embedding of a sequence by - concatenating the first and last embedded tokens. - """ - sequence_lengths = mask.sum(dim=1) - # size: - embedded_first_tokens = embedded_tokens[:, 0, :] - # size: - indices = sequence_lengths - 1 - # size: - zeros = torch.zeros_like(indices) - # Handle empty lists. Caller responsible for masking. - # size: - adjusted_indices = torch.stack((indices, zeros), dim=1).max(dim=1)[0] - # size: - embedded_last_tokens = batched_index_select(embedded_tokens, adjusted_indices) - # size: - return torch.cat((embedded_first_tokens, embedded_last_tokens), 1) - From 243c18f6fa979be97078f8c2707f957d98af91c0 Mon Sep 17 00:00:00 2001 From: Brendan Roof Date: Fri, 3 May 2019 19:52:17 -0700 Subject: [PATCH 15/20] delete old --- allennlp/tests/fixtures/end_to_end/sample.json | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 allennlp/tests/fixtures/end_to_end/sample.json diff --git a/allennlp/tests/fixtures/end_to_end/sample.json b/allennlp/tests/fixtures/end_to_end/sample.json deleted file mode 100644 index e0928d32141..00000000000 --- a/allennlp/tests/fixtures/end_to_end/sample.json +++ /dev/null @@ -1,2 +0,0 @@ -{"body": "This is a test.", "entity_name": "exam", "entity_mentions": ["test", "quiz"]} -{"body": "The dog went on a walk.", "entity_name": "animal", "entity_mentions": ["hound", "puppy"]} From 86791911de3cf5f6d0dd9c8204987afb49796664 Mon Sep 17 00:00:00 2001 From: Brendan Roof Date: Fri, 3 May 2019 19:56:23 -0700 Subject: [PATCH 16/20] Add comment --- allennlp/tests/data/fields/list_field_test.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/allennlp/tests/data/fields/list_field_test.py b/allennlp/tests/data/fields/list_field_test.py index 620a1da13cf..5bacc1c6854 100644 --- a/allennlp/tests/data/fields/list_field_test.py +++ b/allennlp/tests/data/fields/list_field_test.py @@ -249,6 +249,12 @@ def test_batch_with_some_empty_lists_works(self): batch = next(iterator(dataset, shuffle=False)) model.forward(**batch) + # This use case may seem a bit peculiar. It's intended for situations where + # you have sparse inputs that are used as additional features for some + # prediction, and they are sparse enough that they can be empty for some + # cases. 
It would be silly to try to handle these as None in your model; it + # makes a whole lot more sense to just have a minimally-sized tensor that + # gets entirely masked and has no effect on the rest of the model. def test_batch_of_entirely_empty_lists_works(self): dataset = [self.empty_instance, self.empty_instance] From 99f36fa59d15892c02f0643330a002fca56e8c2c Mon Sep 17 00:00:00 2001 From: Brendan Roof Date: Fri, 3 May 2019 19:58:55 -0700 Subject: [PATCH 17/20] lint --- allennlp/tests/data/fields/list_field_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/allennlp/tests/data/fields/list_field_test.py b/allennlp/tests/data/fields/list_field_test.py index 5bacc1c6854..29bed81c82e 100644 --- a/allennlp/tests/data/fields/list_field_test.py +++ b/allennlp/tests/data/fields/list_field_test.py @@ -1,4 +1,4 @@ -# pylint: disable=no-self-use,invalid-name +# pylint: disable=no-self-use,invalid-name,arguments-differ from typing import Dict import torch @@ -31,8 +31,8 @@ def __init__(self, vocab: Vocabulary) -> None: self.embedder = BasicTextFieldEmbedder({"words": token_embedding}) def forward(self, # type: ignore - list: Dict[str, torch.LongTensor]) -> Dict[str, torch.Tensor]: - self.embedder(list) + list_tensor: Dict[str, torch.LongTensor]) -> Dict[str, torch.Tensor]: + self.embedder(list_tensor) return {"loss": 1.0} class TestListField(AllenNlpTestCase): From 3106214c834544526f2a789535a6da694d8b980e Mon Sep 17 00:00:00 2001 From: Brendan Roof Date: Fri, 3 May 2019 20:00:45 -0700 Subject: [PATCH 18/20] imports --- allennlp/tests/data/fields/list_field_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/allennlp/tests/data/fields/list_field_test.py b/allennlp/tests/data/fields/list_field_test.py index 29bed81c82e..03e37a7faf7 100644 --- a/allennlp/tests/data/fields/list_field_test.py +++ b/allennlp/tests/data/fields/list_field_test.py @@ -1,9 +1,8 @@ # pylint: disable=no-self-use,invalid-name,arguments-differ from typing import Dict -import torch - import numpy +import torch from allennlp.common.testing import AllenNlpTestCase from allennlp.data import Token, Vocabulary, Instance @@ -15,6 +14,7 @@ from allennlp.modules import Embedding from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder + class DummyModel(Model): """ Performs a common operation (embedding) that won't work on an empty tensor. 
From 053f666d8790c2c68e0708c72282bc995ae5634c Mon Sep 17 00:00:00 2001 From: Brendan Roof Date: Fri, 3 May 2019 20:01:45 -0700 Subject: [PATCH 19/20] delete --- allennlp/tests/end_to_end/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 allennlp/tests/end_to_end/__init__.py diff --git a/allennlp/tests/end_to_end/__init__.py b/allennlp/tests/end_to_end/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 From 6ef9f5cc5c65fc1a475e775b304d1fbde3821322 Mon Sep 17 00:00:00 2001 From: Brendan Roof Date: Fri, 3 May 2019 20:19:00 -0700 Subject: [PATCH 20/20] fix --- allennlp/tests/data/fields/list_field_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/allennlp/tests/data/fields/list_field_test.py b/allennlp/tests/data/fields/list_field_test.py index 03e37a7faf7..9dcd983869e 100644 --- a/allennlp/tests/data/fields/list_field_test.py +++ b/allennlp/tests/data/fields/list_field_test.py @@ -71,11 +71,11 @@ def setUp(self): tokens = tokenizer.tokenize("Foo") text_field = TextField(tokens, self.word_indexer) empty_list_field = ListField([text_field.empty_field()]) - empty_fields = {'list': empty_list_field} + empty_fields = {'list_tensor': empty_list_field} self.empty_instance = Instance(empty_fields) non_empty_list_field = ListField([text_field]) - non_empty_fields = {'list': non_empty_list_field} + non_empty_fields = {'list_tensor': non_empty_list_field} self.non_empty_instance = Instance(non_empty_fields) super(TestListField, self).setUp()