sequence_tagging.py
from typing import Dict, List
import logging

from overrides import overrides

from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import TextField, SequenceLabelField, MetadataField, Field
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

logger = logging.getLogger(__name__)

DEFAULT_WORD_TAG_DELIMITER = "###"


@DatasetReader.register("sequence_tagging")
class SequenceTaggingDatasetReader(DatasetReader):
    """
    Reads instances from a pretokenised file where each line is in the following format:

    ```
    WORD###TAG [TAB] WORD###TAG [TAB] ..... \n
    ```

    and converts it into a `Dataset` suitable for sequence tagging. You can also specify
    alternative delimiters in the constructor.

    Registered as a `DatasetReader` with name "sequence_tagging".

    # Parameters

    word_tag_delimiter: `str`, optional (default=`"###"`)
        The text that separates each WORD from its TAG.
    token_delimiter: `str`, optional (default=`None`)
        The text that separates each WORD-TAG pair from the next pair. If `None`
        then the line will just be split on whitespace.
    token_indexers: `Dict[str, TokenIndexer]`, optional (default=`{"tokens": SingleIdTokenIndexer()}`)
        We use this to define the input representation for the text. See :class:`TokenIndexer`.
        Note that the `output` tags will always correspond to single token IDs based on how they
        are pre-tokenised in the data file.
    """

    def __init__(
        self,
        word_tag_delimiter: str = DEFAULT_WORD_TAG_DELIMITER,
        token_delimiter: str = None,
        token_indexers: Dict[str, TokenIndexer] = None,
        **kwargs,
    ) -> None:
        super().__init__(
            manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs
        )
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self._word_tag_delimiter = word_tag_delimiter
        self._token_delimiter = token_delimiter

    @overrides
    def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)
            for line in self.shard_iterable(data_file):
                line = line.strip("\n")

                # skip blank lines
                if not line:
                    continue

                tokens_and_tags = [
                    pair.rsplit(self._word_tag_delimiter, 1)
                    for pair in line.split(self._token_delimiter)
                ]
                tokens = [Token(token) for token, tag in tokens_and_tags]
                tags = [tag for token, tag in tokens_and_tags]
                yield self.text_to_instance(tokens, tags)

    def text_to_instance(  # type: ignore
        self, tokens: List[Token], tags: List[str] = None
    ) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        fields: Dict[str, Field] = {}
        sequence = TextField(tokens)
        fields["tokens"] = sequence
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
        if tags is not None:
            fields["tags"] = SequenceLabelField(tags, sequence)
        return Instance(fields)

    @overrides
    def apply_token_indexers(self, instance: Instance) -> None:
        instance.fields["tokens"]._token_indexers = self._token_indexers  # type: ignore
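

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module). It shows how the
# reader might be exercised on a small WORD###TAG file. The path
# "example_tags.txt" and the sample sentence format are hypothetical, and an
# AllenNLP 2.x installation is assumed.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Suppose each line of "example_tags.txt" looks like:
    #   The###DET dog###NOUN barks###VERB
    reader = SequenceTaggingDatasetReader(word_tag_delimiter="###")
    for instance in reader.read("example_tags.txt"):
        # Token indexers are applied lazily because the reader uses manual
        # multiprocess sharding, so attach them before the instance is indexed.
        reader.apply_token_indexers(instance)
        print(instance["metadata"]["words"], instance["tags"].labels)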