classification_transformer.py
"""
Functions for wrapping a sequence classification transformer in a spaCy pipeline.
"""
from typing import List, Callable, Iterable, Dict, Optional, Union
from pathlib import Path
import warnings
from spacy.language import Language
from spacy import util
from spacy.pipeline.pipe import deserialize_config
from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy_transformers import Transformer
from spacy_transformers.layers.transformer_model import forward, set_pytorch_transformer
from spacy_transformers.data_classes import (
FullTransformerBatch,
WordpieceBatch,
)
from spacy_transformers.annotation_setters import null_annotation_setter
from spacy_transformers.util import registry, huggingface_tokenize
from thinc.api import (
get_current_ops,
CupyOps,
Model,
Config,
)
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from ..utils import softmax
DEFAULT_CONFIG_STR = """
[classification_transformer]
max_batch_items = 4096
doc_extension_attribute = "clf_trf_data"
[classification_transformer.set_extra_annotations]
@annotation_setters = "spacy-transformers.null_annotation_setter.v1"
[classification_transformer.model]
@architectures = "dacy.ClassificationTransformerModel.v1"
name = "roberta-base"
tokenizer_config = {"use_fast": true}
num_labels = 2
[classification_transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 128
stride = 96
"""
DEFAULT_CONFIG = Config().from_str(DEFAULT_CONFIG_STR)
@Language.factory(
"classification_transformer",
default_config=DEFAULT_CONFIG["classification_transformer"],
)
def make_classification_transformer(
nlp: Language,
name: str,
model: Model[List[Doc], FullTransformerBatch],
set_extra_annotations: Callable[[List[Doc], FullTransformerBatch], None],
max_batch_items: int,
doc_extension_attribute: str,
):
"""
Construct a Transformer component, which lets you plug a model from the
Huggingface transformers library into spaCy so you can use it in your
pipeline. One or more subsequent spaCy components can use the transformer
outputs as features in its model, with gradients backpropagated to the single
shared weights.
Args:
nlp (Language): a SpaCy text processing pipeline
name (str): The desired name of the component
model (Model[List[Doc], FullTransformerBatch]):
A thinc Model object wrapping the transformer. Usually you will want to use the TransformerModel layer for this.
set_extra_annotations (Callable[[List[Doc], FullTransformerBatch], None]):
A callback to set additional information onto the batch of `Doc` objects.
The doc._.clf_trf_data attribute is set prior to calling the callback. By default, no additional annotations are set.
max_batch_items (int): Max batch size
doc_extension_attribute (str): Your desired doc extension
Returns:
Your ClassificationTransformer component
"""
return ClassificationTransformer(
nlp.vocab,
model,
set_extra_annotations,
max_batch_items=max_batch_items,
name=name,
doc_extension_attribute=doc_extension_attribute,
)
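# A minimal usage sketch for the factory above. The blank Danish pipeline, the
# "roberta-base" checkpoint and the label count are illustrative assumptions,
# not values required by this module:
#
#     import spacy
#
#     nlp = spacy.blank("da")
#     nlp.add_pipe(
#         "classification_transformer",
#         config={"model": {"name": "roberta-base", "num_labels": 2}},
#     )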
@registry.architectures.register("dacy.ClassificationTransformerModel.v1")
def ClassificationTransformerModel(
name: str, get_spans: Callable, tokenizer_config: dict, num_labels: int
) -> Model[List[Doc], FullTransformerBatch]:
"""
Args:
get_spans (Callable[[List[Doc]], List[Span]]):
A function to extract spans from the batch of Doc objects.
This is used to manage long documents, by cutting them into smaller
sequences before running the transformer. The spans are allowed to
overlap, and you can also omit sections of the Doc if they are not
relevant.
tokenizer_config (dict): Settings to pass to the transformers tokenizer.
"""
return Model(
"classification_transformer",
forward,
init=init,
layers=[],
dims={"nO": None},
attrs={
"tokenizer": None,
"get_spans": get_spans,
"name": name,
"tokenizer_config": tokenizer_config,
"num_labels": num_labels,
"set_transformer": set_pytorch_transformer,
"has_transformer": False,
"flush_cache_chance": 0.0,
},
)
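# Sketch of building the model layer directly, outside of a config file. The
# "roberta-base" name and the span getter are assumptions used only for
# illustration; configure_strided_spans is assumed to back the
# "spacy-transformers.strided_spans.v1" entry used in DEFAULT_CONFIG_STR:
#
#     from spacy_transformers.span_getters import configure_strided_spans
#
#     model = ClassificationTransformerModel(
#         name="roberta-base",
#         get_spans=configure_strided_spans(window=128, stride=96),
#         tokenizer_config={"use_fast": True},
#         num_labels=2,
#     )
#     model.initialize()  # runs `init` below, which downloads the weights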
class ClassificationTransformer(Transformer):
""""""
def __init__(
self,
vocab: Vocab,
model: Model[List[Doc], FullTransformerBatch],
set_extra_annotations: Callable = null_annotation_setter,
*,
name: str = "classification_transformer",
max_batch_items: int = 128 * 32, # Max size of padded batch
        doc_extension_attribute: str,
):
super().__init__(
vocab=vocab,
model=model,
set_extra_annotations=set_extra_annotations,
name=name,
max_batch_items=max_batch_items,
)
install_extensions(doc_extension_attribute)
self.doc_extension_attribute = doc_extension_attribute
def from_disk(
self,
path: Union[str, Path],
*,
num_labels: int,
exclude: Iterable[str] = tuple(),
) -> "Transformer":
"""Load the pipe from disk. For more see:
https://spacy.io/api/transformer#from_disk
Args:
path (str): Path to a directory.
exclude (Iterable[str]): String names of serialization fields to exclude.
num_labels (int): Number of labels of the models. Required for reading the model into memory.
Return:
(Transformer): The loaded object.
"""
def load_model(p):
p = Path(p).absolute()
tokenizer, transformer = huggingface_classification_from_pretrained(
p, self.model.attrs["tokenizer_config"], num_labels=num_labels
)
self.model.attrs["tokenizer"] = tokenizer
self.model.attrs["set_transformer"](self.model, transformer)
deserialize = {
"vocab": self.vocab.from_disk,
"cfg": lambda p: self.cfg.update(deserialize_config(p)),
"model": load_model,
}
util.from_disk(path, deserialize, exclude)
return self
def set_annotations(
self, docs: Iterable[Doc], predictions: FullTransformerBatch
) -> None:
"""
Assign the extracted features to the Doc objects. By default, the
TransformerData object is written to the doc._.trf_data attribute. Your
set_extra_annotations callback is then called, if provided. For more see
https://spacy.io/api/pipe#set_annotations
Args:
docs (Iterable[Doc]): The documents to modify.
predictions (FullTransformerBatch): A batch of activations.
"""
doc_data = list(predictions.doc_data)
for doc, data in zip(docs, doc_data):
setattr(doc._, self.doc_extension_attribute, data)
self.set_extra_annotations(docs, predictions)
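# Rough end-to-end sketch for the component above. The path, the label count
# and the example text are placeholders:
#
#     pipe = nlp.get_pipe("classification_transformer")
#     pipe.from_disk("path/to/component_dir", num_labels=2)
#     doc = nlp("en meget positiv sætning")
#     doc._.clf_trf_data  # TransformerData written by set_annotations (default attribute name)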
def init(model: Model, X=None, Y=None):
if model.attrs["has_transformer"]:
return
name = model.attrs["name"]
tok_cfg = model.attrs["tokenizer_config"]
num_labels = model.attrs["num_labels"]
tokenizer, transformer = huggingface_classification_from_pretrained(
name, tok_cfg, num_labels
)
model.attrs["tokenizer"] = tokenizer
model.attrs["set_transformer"](model, transformer)
# Call the model with a batch of inputs to infer the width
texts = ["hello world", "foo bar"]
token_data = huggingface_tokenize(model.attrs["tokenizer"], texts)
wordpieces = WordpieceBatch.from_batch_encoding(token_data)
model.layers[0].initialize(X=wordpieces)
tensors = model.layers[0].predict(wordpieces)
def huggingface_classification_from_pretrained(
source: Union[Path, str], config: Dict, num_labels: int
):
"""
Create a Huggingface transformer model from pretrained weights. Will
download the model if it is not already downloaded.
Args:
source (Union[str, Path]): The name of the model or a path to it, such as
'bert-base-cased'.
config (dict): Settings to pass to the tokenizer.
"""
if hasattr(source, "absolute"):
str_path = str(source.absolute())
else:
str_path = source
tokenizer = AutoTokenizer.from_pretrained(str_path, **config)
transformer = AutoModelForSequenceClassification.from_pretrained(
str_path, num_labels=num_labels
)
ops = get_current_ops()
if isinstance(ops, CupyOps):
transformer.cuda()
return tokenizer, transformer
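# Example call (hedged: "roberta-base" is only an illustrative checkpoint; any
# Huggingface sequence-classification model name or local path should work):
#
#     tokenizer, transformer = huggingface_classification_from_pretrained(
#         "roberta-base", config={"use_fast": True}, num_labels=2
#     )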
def make_classification_getter(category: str, labels: list, doc_extension: str):
    """Create getters for the `{category}_prop` and `{category}` Doc extensions."""
def prop_getter(doc) -> dict:
trf_data = getattr(doc._, doc_extension)
if trf_data.tensors:
return {
"prop": softmax(trf_data.tensors[0][0]).round(decimals=3),
"labels": labels,
}
else:
            warnings.warn(
                "The tensors from the transformer forward pass are empty. This is "
                "likely caused by an empty input string, so the model will return None."
            )
return {
"prop": None,
"labels": labels,
}
def label_getter(doc) -> Optional[str]:
prop = getattr(doc._, f"{category}_prop")
if prop["prop"] is not None:
return labels[int(prop["prop"].argmax())]
else:
return None
return prop_getter, label_getter
def install_extensions(doc_extension_attribute: str) -> None:
if not Doc.has_extension(doc_extension_attribute):
Doc.set_extension(doc_extension_attribute, default=None)
def install_classification_extensions(
category: str,
labels: list,
doc_extension: str,
force: bool,
):
    """Register `{category}_prop` and `{category}` getter extensions on Doc."""
prop_getter, label_getter = make_classification_getter(
category, labels, doc_extension
)
Doc.set_extension(f"{category}_prop", getter=prop_getter, force=force)
Doc.set_extension(category, getter=label_getter, force=force)
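# A sketch of wiring the classification extensions together. The category
# name, labels and example text are hypothetical; DaCy's own pipelines supply
# their own values:
#
#     install_classification_extensions(
#         category="sentiment",
#         labels=["negative", "positive"],
#         doc_extension="clf_trf_data",
#         force=True,
#     )
#     doc = nlp("jeg er glad")
#     doc._.sentiment_prop  # {"prop": array([...]), "labels": ["negative", "positive"]}
#     doc._.sentiment       # the label with the highest probability, e.g. "positive"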