spacy_tokenizer.py
from typing import List, Optional

import spacy
from spacy.tokens import Doc

from allennlp.common.util import get_spacy_model
from allennlp.data.tokenizers.token_class import Token
from allennlp.data.tokenizers.tokenizer import Tokenizer


@Tokenizer.register("spacy")
class SpacyTokenizer(Tokenizer):
"""
A `Tokenizer` that uses spaCy's tokenizer. It's fast and reasonable - this is the
recommended `Tokenizer`. By default it will return allennlp Tokens,
which are small, efficient NamedTuples (and are serializable). If you want
to keep the original spaCy tokens, pass keep_spacy_tokens=True. Note that we leave one particular piece of
post-processing for later: the decision of whether or not to lowercase the token. This is for
two reasons: (1) if you want to make two different casing decisions for whatever reason, you
won't have to run the tokenizer twice, and more importantly (2) if you want to lowercase words
for your word embedding, but retain capitalization in a character-level representation, we need
to retain the capitalization here.
Registered as a `Tokenizer` with name "spacy", which is currently the default.
# Parameters
    language : `str`, optional, (default=`"en_core_web_sm"`)
        Spacy model name.
    pos_tags : `bool`, optional, (default=`True`)
        If `True`, performs POS tagging with the spacy model on the tokens.
        Generally used in conjunction with
        :class:`~allennlp.data.token_indexers.pos_tag_indexer.PosTagIndexer`.
    parse : `bool`, optional, (default=`False`)
        If `True`, performs dependency parsing with the spacy model on the tokens.
        Generally used in conjunction with
        :class:`~allennlp.data.token_indexers.dep_label_indexer.DepLabelIndexer`.
    ner : `bool`, optional, (default=`False`)
        If `True`, performs named entity recognition with the spacy model on the tokens.
        Generally used in conjunction with
        :class:`~allennlp.data.token_indexers.ner_tag_indexer.NerTagIndexer`.
    keep_spacy_tokens : `bool`, optional, (default=`False`)
        If `True`, will preserve the spacy token objects. By default we copy spacy tokens
        into our own class instead, because spacy's Cython tokens can't be pickled.
    split_on_spaces : `bool`, optional, (default=`False`)
        If `True`, will split by spaces without performing tokenization.
        Used when your data is already tokenized, but you want to perform POS tagging,
        NER, or parsing on the tokens.
    start_tokens : `Optional[List[str]]`, optional, (default=`None`)
        If given, these tokens will be added to the beginning of every string we tokenize.
    end_tokens : `Optional[List[str]]`, optional, (default=`None`)
        If given, these tokens will be added to the end of every string we tokenize.
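
    # Example

    A minimal usage sketch (the sentence below is just an illustration):

    ```python
    tokenizer = SpacyTokenizer(pos_tags=True)
    tokens = tokenizer.tokenize("AllenNLP wraps spaCy.")
    print([(token.text, token.pos_) for token in tokens])
    ```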
"""

    def __init__(
        self,
        language: str = "en_core_web_sm",
        pos_tags: bool = True,
        parse: bool = False,
        ner: bool = False,
        keep_spacy_tokens: bool = False,
        split_on_spaces: bool = False,
        start_tokens: Optional[List[str]] = None,
        end_tokens: Optional[List[str]] = None,
    ) -> None:
        # Save these for use later in the _to_params method
        self._language = language
        self._pos_tags = pos_tags
        self._parse = parse
        self._ner = ner
        self._split_on_spaces = split_on_spaces

        self.spacy = get_spacy_model(self._language, self._pos_tags, self._parse, self._ner)
        if self._split_on_spaces:
            self.spacy.tokenizer = _WhitespaceSpacyTokenizer(self.spacy.vocab)

        self._keep_spacy_tokens = keep_spacy_tokens
        # Copy before reversing so we don't mutate the caller's list.
        self._start_tokens = list(start_tokens or [])
        # We reverse the tokens here because we're going to insert them with `insert(0)` later;
        # this makes sure they show up in the right order.
        self._start_tokens.reverse()
        # Compare the major version numerically; a plain string comparison would
        # misclassify a hypothetical "10.0" release.
        self._is_version_3 = int(spacy.__version__.split(".")[0]) >= 3
        self._end_tokens = end_tokens or []

    def _sanitize(self, tokens: List[spacy.tokens.Token]) -> List[Token]:
        """
        Converts spaCy tokens to allennlp tokens. Is a no-op if
        `keep_spacy_tokens` is `True`.
        """
        if not self._keep_spacy_tokens:
            tokens = [
                Token(
                    token.text,
                    token.idx,
                    token.idx + len(token.text),
                    token.lemma_,
                    token.pos_,
                    token.tag_,
                    token.dep_,
                    token.ent_type_,
                )
                for token in tokens
            ]
        for start_token in self._start_tokens:
            tokens.insert(0, Token(start_token, 0))
        for end_token in self._end_tokens:
            tokens.append(Token(end_token, -1))
        return tokens

    def batch_tokenize(self, texts: List[str]) -> List[List[Token]]:
        # spaCy 3.x removed the `n_threads` argument to `pipe` in favor of
        # `n_process`, so we dispatch on the installed major version.
        if self._is_version_3:
            return [
                self._sanitize(_remove_spaces(tokens))
                for tokens in self.spacy.pipe(texts, n_process=-1)
            ]
        else:
            return [
                self._sanitize(_remove_spaces(tokens))
                for tokens in self.spacy.pipe(texts, n_threads=-1)
            ]

    def tokenize(self, text: str) -> List[Token]:
        # This works because our Token class matches spacy's.
        return self._sanitize(_remove_spaces(self.spacy(text)))

    def _to_params(self):
        return {
            "type": "spacy",
            "language": self._language,
            "pos_tags": self._pos_tags,
            "parse": self._parse,
            "ner": self._ner,
            "keep_spacy_tokens": self._keep_spacy_tokens,
            "split_on_spaces": self._split_on_spaces,
            # `_start_tokens` was reversed in `__init__`, so undo that here to make
            # the parameters round-trip correctly.
            "start_tokens": list(reversed(self._start_tokens)),
            "end_tokens": self._end_tokens,
        }
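
# Note: `_to_params` lets allennlp's `FromParams` machinery serialize this tokenizer.
# A hedged round-trip sketch (assumes allennlp's `Params` class):
#
#     from allennlp.common.params import Params
#     tokenizer = Tokenizer.from_params(Params({"type": "spacy", "pos_tags": True}))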


class _WhitespaceSpacyTokenizer:
    """
    Spacy doesn't assume that text is tokenised. Sometimes this
    is annoying, like when you have gold data which is pre-tokenised,
    but Spacy's tokenisation doesn't match the gold. This can be used
    as follows:

        nlp = spacy.load("en_core_web_md")
        # hack to replace tokenizer with a whitespace tokenizer
        nlp.tokenizer = _WhitespaceSpacyTokenizer(nlp.vocab)
        ... use nlp("here is some text") as normal.
    """

    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(" ")
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)


def _remove_spaces(tokens: List[spacy.tokens.Token]) -> List[spacy.tokens.Token]:
    return [token for token in tokens if not token.is_space]
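

if __name__ == "__main__":
    # A quick smoke-test sketch, not part of the original module; it assumes the
    # `en_core_web_sm` model has been downloaded
    # (`python -m spacy download en_core_web_sm`).
    tokenizer = SpacyTokenizer(split_on_spaces=True, start_tokens=["<s>"], end_tokens=["</s>"])
    tokens = tokenizer.tokenize("a pre-tokenised sentence .")
    print([token.text for token in tokens])  # ['<s>', 'a', 'pre-tokenised', 'sentence', '.', '</s>']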