-
Notifications
You must be signed in to change notification settings - Fork 268
/
tagger.py
134 lines (108 loc) · 3.58 KB
/
tagger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
"""
Model API.
"""
from ....imports import *
from .. import metrics
class Tagger(object):
    """A model API that tags an input sentence.

    Attributes:
        model: Model. Produces per-word class probabilities via `predict`.
        preprocessor: Transformer. Preprocessing data for feature extraction
            and mapping predicted label indices back to tag strings.
        tokenizer: Tokenize input sentence. Default tokenizer is `str.split`.
    """

    def __init__(self, model, preprocessor, tokenizer=str.split):
        self.model = model
        self.preprocessor = preprocessor
        self.tokenizer = tokenizer

    def predict_proba(self, text):
        """Probability estimates.

        The returned estimates for all classes are ordered by the
        label of classes.

        Args:
            text : string, the input text.

        Returns:
            y : array-like, shape = [num_words, num_classes]
            Returns the probability of the word for each class in the model,

        Raises:
            TypeError: If `text` is not a string.
        """
        # Raise explicitly rather than `assert`: assertions are stripped
        # when Python runs with the -O flag, which would silently skip
        # this validation in optimized mode.
        if not isinstance(text, str):
            raise TypeError('text must be a string, got {!r}'.format(type(text)))
        words = self.tokenizer(text)
        X = self.preprocessor.transform([words])
        y = self.model.predict(X)
        y = y[0]  # reduce batch dimension.
        return y

    def _get_prob(self, pred):
        # Per-word confidence: the highest class probability for each word.
        prob = np.max(pred, -1)
        return prob

    def _get_tags(self, pred):
        # Map predicted class indices/probabilities back to tag strings.
        tags = self.preprocessor.inverse_transform([pred])
        tags = tags[0]  # reduce batch dimension
        return tags

    def _build_response(self, sent, tags, prob):
        """Assemble the pretty response dict for `analyze` from tags/probs."""
        words = self.tokenizer(sent)
        res = {"words": words, "entities": []}
        chunks = metrics.get_entities(tags)
        for chunk_type, chunk_start, chunk_end in chunks:
            chunk_end += 1  # make the end offset exclusive for slicing.
            entity = {
                "text": " ".join(words[chunk_start:chunk_end]),
                "type": chunk_type,
                # Entity score is the mean of its words' confidences.
                "score": float(np.average(prob[chunk_start:chunk_end])),
                "beginOffset": chunk_start,
                "endOffset": chunk_end,
            }
            res["entities"].append(entity)
        return res

    def analyze(self, text):
        """Analyze text and return pretty format.

        Args:
            text: string, the input text.

        Returns:
            res: dict.

        Examples:
            >>> text = 'President Obama is speaking at the White House.'
            >>> model.analyze(text)
            {
                "words": [
                    "President",
                    "Obama",
                    "is",
                    "speaking",
                    "at",
                    "the",
                    "White",
                    "House."
                ],
                "entities": [
                    {
                        "beginOffset": 1,
                        "endOffset": 2,
                        "score": 1,
                        "text": "Obama",
                        "type": "PER"
                    },
                    {
                        "beginOffset": 6,
                        "endOffset": 8,
                        "score": 1,
                        "text": "White House.",
                        "type": "ORG"
                    }
                ]
            }
        """
        pred = self.predict_proba(text)
        tags = self._get_tags(pred)
        prob = self._get_prob(pred)
        res = self._build_response(text, tags, prob)
        return res

    def predict(self, text):
        """Predict using the model.

        Args:
            text: string, the input text.

        Returns:
            tags: list, shape = (num_words,)
            Returns predicted values.
        """
        pred = self.predict_proba(text)
        tags = self._get_tags(pred)
        return tags