This repository has been archived by the owner on Dec 16, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2.2k
/
single_id_token_indexer.py
118 lines (100 loc) · 5.14 KB
/
single_id_token_indexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from typing import Dict, List, Optional, Any
import itertools
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.tokenizers import Token
from allennlp.data.token_indexers.token_indexer import TokenIndexer, IndexedTokenList
# Sentinel used as the default for `SingleIdTokenIndexer.default_value`, so the
# indexer can tell "caller supplied no default" apart from any real string a
# caller might legitimately pass. Compared with `is`, never `==`.
_DEFAULT_VALUE = "THIS IS A REALLY UNLIKELY VALUE THAT HAS TO BE A STRING"
@TokenIndexer.register("single_id")
class SingleIdTokenIndexer(TokenIndexer):
    """
    This :class:`TokenIndexer` represents tokens as single integers.

    Registered as a `TokenIndexer` with name "single_id".

    # Parameters

    namespace : `Optional[str]`, optional (default=`"tokens"`)
        We will use this namespace in the :class:`Vocabulary` to map strings to indices. If you
        explicitly pass in `None` here, we will skip indexing and vocabulary lookups. This means
        that the `feature_name` you use must correspond to an integer value (like `text_id`, for
        instance, which gets set by some tokenizers, such as when using byte encoding).
    lowercase_tokens : `bool`, optional (default=`False`)
        If `True`, we will call `token.lower()` before getting an index for the token from the
        vocabulary.
    start_tokens : `Optional[List[str]]`, optional (default=`None`)
        These are prepended to the tokens provided to `tokens_to_indices`.
    end_tokens : `Optional[List[str]]`, optional (default=`None`)
        These are appended to the tokens provided to `tokens_to_indices`.
    feature_name : `str`, optional (default=`"text"`)
        We will use the :class:`Token` attribute with this name as input. This is potentially
        useful, e.g., for using NER tags instead of (or in addition to) surface forms as your inputs
        (passing `ent_type_` here would do that). If you use a non-default value here, you almost
        certainly want to also change the `namespace` parameter, and you might want to give a
        `default_value`.
    default_value : `str`, optional
        When you want to use a non-default `feature_name`, you sometimes want to have a default
        value to go with it, e.g., in case you don't have an NER tag for a particular token, for
        some reason. This value will get used if we don't find a value in `feature_name`. If this
        is not given, we will crash if a token doesn't have a value for the given `feature_name`, so
        that you don't get weird, silent errors by default.
    token_min_padding_length : `int`, optional (default=`0`)
        See :class:`TokenIndexer`.
    """

    def __init__(
        self,
        namespace: Optional[str] = "tokens",
        lowercase_tokens: bool = False,
        # NOTE: explicit `Optional[...]` — `None` defaults with a bare `List[str]`
        # annotation are implicit-Optional, which PEP 484 disallows.
        start_tokens: Optional[List[str]] = None,
        end_tokens: Optional[List[str]] = None,
        feature_name: str = "text",
        default_value: str = _DEFAULT_VALUE,
        token_min_padding_length: int = 0,
    ) -> None:
        super().__init__(token_min_padding_length)
        self.namespace = namespace
        self.lowercase_tokens = lowercase_tokens
        # Wrap the raw strings as `Token`s once, so `tokens_to_indices` can chain
        # them with the input tokens uniformly.
        self._start_tokens = [Token(st) for st in (start_tokens or [])]
        self._end_tokens = [Token(et) for et in (end_tokens or [])]
        self._feature_name = feature_name
        self._default_value = default_value

    def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]):
        # With `namespace=None` there is no vocabulary to build, so counting is a no-op.
        if self.namespace is not None:
            text = self._get_feature_value(token)
            if self.lowercase_tokens:
                text = text.lower()
            counter[self.namespace][text] += 1

    def tokens_to_indices(
        self, tokens: List[Token], vocabulary: Vocabulary
    ) -> IndexedTokenList:
        """
        Convert `tokens` (with any configured start/end tokens added) into a dict
        with a single "tokens" key mapping to a list of integer ids.
        """
        indices: List[int] = []

        for token in itertools.chain(self._start_tokens, tokens, self._end_tokens):
            text = self._get_feature_value(token)
            if self.namespace is None:
                # No vocabulary: the feature value itself must already be an int
                # (e.g. `text_id` set by a byte-encoding tokenizer).
                # We could have a check here that `text` is an int; not sure it's worth it.
                indices.append(text)  # type: ignore
            else:
                if self.lowercase_tokens:
                    text = text.lower()
                indices.append(vocabulary.get_token_index(text, self.namespace))

        return {"tokens": indices}

    def get_empty_token_list(self) -> IndexedTokenList:
        return {"tokens": []}

    def _get_feature_value(self, token: Token) -> str:
        """
        Read the configured attribute off `token`, falling back to `default_value`
        if it is `None`; raise if the attribute is missing and no default was given.
        """
        text = getattr(token, self._feature_name)
        if text is None:
            # Identity check against the module-level sentinel distinguishes
            # "no default supplied" from any real default string.
            if self._default_value is not _DEFAULT_VALUE:
                text = self._default_value
            else:
                raise ValueError(
                    f"{token} did not have attribute {self._feature_name}. If you "
                    "want to ignore this kind of error, give a default value in the "
                    "constructor of this indexer."
                )
        return text

    def _to_params(self) -> Dict[str, Any]:
        # Serialize back to constructor-compatible params (inverse of `__init__`).
        return {
            "namespace": self.namespace,
            "lowercase_tokens": self.lowercase_tokens,
            "start_tokens": [t.text for t in self._start_tokens],
            "end_tokens": [t.text for t in self._end_tokens],
            "feature_name": self._feature_name,
            "default_value": self._default_value,
            "token_min_padding_length": self._token_min_padding_length,
        }