-
Notifications
You must be signed in to change notification settings - Fork 21
/
myspacy.py
394 lines (315 loc) · 11.5 KB
/
myspacy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
"""Get words and tokens from a plain text with the help of Spacy.
This module supposes that you have installed Spacy and the necessary
language modules.
To get [Spacy](https://spacy.io), do
``` sh
pip install spacy
```
The English language module can then be installed by
``` sh
python -m spacy download en_core_web_sm
```
You can install many more [language models](https://spacy.io/usage/models).
"""
import re
from ..capable import CheckImport
from ..core.helpers import console
# Table of languages supported here, one per line:
# "<ISO 639-1 code> <Spacy model family> <human-readable language name>".
# The concrete model name is assembled elsewhere as f"{code}_{family}_sm",
# i.e. always the small ("sm") variant of the model.
# The last entry, `xx`, is the multi-language fallback model.
LANG_MODELS = """
ca core_news Catalan
da core_news Danish
de core_news German
el core_news Greek
en core_web English
es core_news Spanish
fi core_news Finnish
fr core_news French
hr core_news Croatian
it core_news Italian
ja core_news Japanese
ko core_news Korean
lt core_news Lithuanian
mk core_news Macedonian
nb core_news Norwegian (Bokmål)
nl core_news Dutch
pl core_news Polish
pt core_news Portuguese
ro core_news Romanian
ru core_news Russian
sv core_news Swedish
uk core_news Ukrainian
zh core_news Chinese
xx ent_wiki multi-language
""".strip().split("\n")
"""Languages and their associated Spacy models."""
class Spacy(CheckImport):
    """Wrapper around a Spacy NLP pipeline.

    Loads (and, if necessary, downloads) a Spacy language model, runs a text
    through it, and exposes the resulting tokens, sentences, and named
    entities as plain tuples.
    """

    def __init__(self, lang=None, parser=False):
        """Sets up an NLP (Natural Language Processing) pipeline.

        The pipeline is tied to a particular language model, which you can pass
        as a parameter, provided you have installed it.

        For now, we use Spacy in a fairly trivial way: only tokenisation and
        sentence detection. We do not need the parser for this.

        Parameters
        ----------
        lang: string, optional xx
            The language to be used; Spacy may need to download it, if so, it will
            happen automatically.
            If the language is not supported by Spacy, we switch to the multi-language
            called `xx`.
            See `tf.tools.myspacy.LANG_MODELS` about the language models that Spacy supports.
        parser: boolean, optional False
            If True, the tagger, morphologizer, and lemmatizer pipeline
            components are enabled in addition to tokenisation and sentence
            detection.
        """
        # check that the spacy package and its download helper are importable
        super().__init__("spacy", "spacyd")
        if self.importOK(hint=True):
            # bind the lazily imported module and download function to
            # module-level names so all methods can use them
            global spacy
            global download
            (spacy, download) = self.importGet()
        else:
            # spacy is not available: leave the object non-functional
            return

        # build lookup tables: language code => model name / language name
        langModels = {}
        languages = {}
        self.canTag = False
        self.canMorph = False
        self.canLemma = False

        for spec in LANG_MODELS:
            # maxsplit=2 keeps multi-word language names (e.g. "Norwegian (Bokmål)") intact
            (lng, model, language) = spec.split(maxsplit=2)
            langModels[lng] = f"{lng}_{model}_sm"
            languages[lng] = language

        self.langModels = langModels
        self.languages = languages

        prevLang = None
        targetLang = lang
        loaded = False
        i = 0  # iteration counter; incremented but not otherwise used

        # Try to load the model for the requested language; if it is missing,
        # try to download it and retry; if that fails, fall back to the
        # multi-language model `xx`. Comparing targetLang with prevLang
        # prevents an endless loop once we are already trying `xx`.
        while True:
            i += 1
            targetModel = langModels.get(targetLang, None)
            targetLanguage = languages.get(targetLang, None)

            if targetModel is None:
                # unknown (or unspecified) language: switch to `xx`
                (prevLang, targetLang) = (targetLang, "xx")
                targetModel = langModels[targetLang]
                targetLanguage = languages[targetLang]

                if prevLang is None:
                    console("No language specified")
                else:
                    console(
                        f"No language model for {prevLang} supported by Spacy.\n"
                    )
                console(
                    f"Switching to the {targetLanguage} model"
                )
                if targetLang == prevLang:
                    # we were already on `xx`: give up
                    break
                else:
                    continue

            try:
                nlp = spacy.load(targetModel)
                loaded = True
                break
            except Exception:
                # model not installed locally: download it, then retry the
                # load on the next loop iteration
                console(f"Language model {targetModel} not installed. Downloading ...")

                try:
                    console(f"Downloading {targetModel} ...")
                    download(targetModel)
                except Exception:
                    # download failed: fall back to `xx` unless we were
                    # already trying `xx`
                    console(f"Could not download {targetModel} ...")
                    (prevLang, targetLang) = (targetLang, "xx")

                    if targetLang == prevLang:
                        break
                    else:
                        continue

        # report which language we ended up with and the parser setting
        console(f"NLP with language model {targetLang} {parser}")

        if loaded:
            try:
                # NOTE(review): nesting reconstructed — presumably both
                # disable_pipe calls apply only when the parser is not wanted;
                # disable_pipe raises for absent components, hence the
                # blanket except
                if not parser:
                    nlp.disable_pipe("parser")
                    nlp.disable_pipe("sentencizer")
            except Exception:
                pass

            # sentence detection via the lightweight "senter" component;
            # record whether this model supports it
            try:
                nlp.enable_pipe("senter")
                self.canSentence = True
            except Exception:
                self.canSentence = False
                console("This language does not support sentence boundary detection")

            if parser:
                # probe the optional components one by one and record which
                # of them this model supports
                try:
                    nlp.enable_pipe("tagger")
                    self.canTag = True
                    console("This language supports tagging")
                except Exception:
                    self.canTag = False
                    console("This language does not supports tagging")

                try:
                    nlp.enable_pipe("morphologizer")
                    self.canMorph = True
                    console("This language supports morphologizing")
                except Exception:
                    self.canMorph = False
                    console("This language does not supports morphologizing")

                try:
                    nlp.enable_pipe("lemmatizer")
                    self.canLemma = True
                    console("This language supports lemmatizing")
                except Exception:
                    self.canLemma = False
                    console("This language does not supports lemmatizing")
        else:
            console("Cannot load (language data) to get Spacy working")
            nlp = None

        self.nlp = nlp  # the loaded pipeline, or None if loading failed
        self.doc = None  # the processed document, set by `read()`

    def read(self, text):
        """Process a plain text.

        A text is ingested and tokenised. Sentences are detected.

        This may require quite some processing time, think of 30 seconds for 200,000
        words on a decent laptop.

        The resulting document is stored on the object (`self.doc`) for the
        `getTokens()`, `getSentences()`, and `getEntities()` methods.

        Parameters
        ----------
        text: string
            The complete, raw text.
        """
        if not self.importOK():
            return

        nText = len(text)
        nlp = self.nlp

        if nlp is None:
            console("The NLP pipeline is not functioning")
            return

        # raise Spacy's length limit to the size of this text so that
        # arbitrarily long inputs are accepted
        nlp.max_length = nText
        doc = nlp(text)
        self.doc = doc

    def getTokens(self):
        """Get the resulting tokens.

        A token is represented as a tuple consisting of

        * *start*: first character position that the token occupies in the text.
          Character positions start at 0.
        * *end*: last character position that the token occupies in the text
          *plus one*.
        * *text*: text of the token, **excluding any trailing white-space**.
        * *space*: any white-space behind the token, if present, otherwise
          the empty string.
        * *pos*: part-of-speech, if the pipeline supports it, otherwise None.
        * *morph*: morphology string, if the pipeline supports it, otherwise None.
        * *lemma*: lower-cased lemma, if the pipeline supports it, otherwise None.

        !!! note "End position and space"
            If there is a space behind the token, it will not add to the end position
            of the token. So the start and end positions of the tokens reflect
            where the tokens themselves are, and spaces do not belong to the tokens.

        Returns
        -------
        list
            All tokens as tuples.
        """
        doc = self.doc

        if doc is None:
            console("No results available from the NLP pipeline")
            return []

        canTag = self.canTag
        canMorph = self.canMorph
        canLemma = self.canLemma
        result = []

        for token in doc:
            start = token.idx
            text = token.text
            space = token.whitespace_
            end = start + len(text)
            # prefer the coarse pos_ when morphology is available,
            # else the fine-grained tag_, else nothing
            pos = token.pos_ if canMorph else token.tag_ if canTag else None
            morph = str(token.morph) if canMorph else None
            lemma = token.lemma_.strip().lower() if canLemma else None
            result.append((start, end, text, space, pos, morph, lemma))

        return result

    def getSentences(self):
        """Get the resulting sentences.

        A sentence is represented as a tuple consisting of

        * *start*: first character position that the sentence occupies in the text.
          Character positions start at 0.
        * *end*: last character position that the sentence occupies in the text
          *plus one*.
        * *text*: text of the sentence.

        Returns
        -------
        list
            All sentences as tuples.
        """
        if not self.importOK():
            return []

        doc = self.doc

        if doc is None:
            console("No results available from the NLP pipeline")
            return []

        if not self.canSentence:
            console("No sentence results available from the NLP pipeline")
            return []

        result = []
        # a "sentence" consisting solely of .?! and white-space is skipped
        whiteRe = re.compile(r"^[.?!\s]*$", re.S)
        # newline runs next to a non-word character are stripped from the text
        spuriousNlBefore = re.compile(r"\n+(\W)")
        spuriousNlAfter = re.compile(r"(\W)\n+")

        for s in doc.sents:
            text = s.text.strip("\n")

            if whiteRe.match(text):
                continue

            # character positions are derived from the first and last token
            tokenStart = doc[s.start]
            tokenEnd = doc[s.end - 1]
            sentStart = tokenStart.idx
            sentEnd = tokenEnd.idx + len(tokenEnd.text)
            text = spuriousNlBefore.sub(r"\1", text)
            text = spuriousNlAfter.sub(r"\1", text)
            result.append((sentStart, sentEnd, text))

        return result

    def getEntities(self):
        """Get the resulting named entities.

        A named entity is represented as a tuple consisting of

        * *start*: first character position that the entity occupies in the text.
          Character positions start at 0.
        * *end*: last character position that the entity occupies in the text
          *plus one*.
        * *text*: text of the entity.
        * *kind*: kind of the entity.

        Returns
        -------
        list
            All entities as tuples.
        """
        if not self.importOK():
            return []

        doc = self.doc

        if doc is None:
            console("No results available from the NLP pipeline")
            return []

        if not hasattr(doc, "ents"):
            console("No entity results available from the NLP pipeline")
            return []

        result = []

        for ent in doc.ents:
            start = ent.start_char
            end = ent.end_char
            text = ent.text
            kind = ent.label_
            result.append((start, end, text, kind))

        return result
def nlpOutput(text, lang="en", ner=False, parser=False):
    """Runs the Spacy NLP pipeline and delivers the results.

    Parameters
    ----------
    text: string
        The complete, raw text.
    lang: string, optional en
        The language to be used; its model should be installed; see
        `tf.tools.myspacy` for how to get language models.
    ner: boolean, optional False
        Whether to include named entities in the output.
    parser: boolean, optional False
        Whether to run the NLP parser.

    Returns
    -------
    tuple
        `tokens`: the token list as tuples
        `sentences`: the sentence list as tuples
        `entities`: the entity list as tuples, only if `ner=True`

        Tokens are tuples (start, end, text, after).
        Sentences are tuples (start, end, text).
        Entities are tuples (start, end, text, kind).
    """
    engine = Spacy(lang=lang, parser=parser)
    engine.read(text)

    # assemble the three result lists; entities only when requested
    pieces = (
        engine.getTokens(),
        engine.getSentences(),
        engine.getEntities() if ner else None,
    )
    # drop the entities slot when it was not requested
    return tuple(piece for piece in pieces if piece is not None)