/
tokenizer.py
361 lines (295 loc) · 11.8 KB
/
tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
"""
emoji.tokenizer
~~~~~~~~~~~~~~~
Components for detecting and tokenizing emoji in strings.
"""
from typing import List, NamedTuple, Dict, Optional, Set, Union, Iterator, Any

from emoji import unicode_codes
# Public names of this module.
__all__ = [
    'EmojiMatch', 'EmojiMatchZWJ', 'EmojiMatchZWJNonRGI', 'Token',
    'tokenize', 'filter_tokens',
]

# The zero-width joiner character (U+200D).
_ZWJ = '\u200D'

# Cache for the search tree; stays ``None`` until get_search_tree()
# builds it on first use.
_SEARCH_TREE: Optional[Dict[str, Any]] = None
class EmojiMatch:
    """
    A single "recommended for general interchange" (RGI) emoji that was
    found in a string, together with its position and its data entry.
    """

    __slots__ = ('emoji', 'start', 'end', 'data')

    def __init__(self, emoji: str, start: int,
                 end: int, data: Union[Dict[str, Any], None]):
        #: The emoji substring
        self.emoji = emoji
        #: The start index of the match in the string
        self.start = start
        #: The end index of the match in the string
        self.end = end
        #: The entry from :data:`EMOJI_DATA` for this emoji or ``None`` if the emoji is non-RGI
        self.data = data

    def data_copy(self) -> Dict[str, Any]:
        """
        Build a new dict holding the position of this match under the keys
        ``match_start`` and ``match_end``.

        When :attr:`data` is set (and not empty) the returned dict is a
        shallow copy of it extended by those two keys; otherwise it
        contains only the two keys.

        :returns: A new dict; :attr:`data` itself is never modified
        """
        position = {
            'match_start': self.start,
            'match_end': self.end
        }
        if not self.data:
            return position

        enriched = self.data.copy()
        enriched.update(position)
        return enriched

    def is_zwj(self) -> bool:
        """
        Tells whether the matched emoji contains a zero-width-joiner.

        :returns: True if this is a ZWJ-emoji, False otherwise
        """
        return _ZWJ in self.emoji

    def split(self) -> Union['EmojiMatchZWJ', 'EmojiMatch']:
        """
        Breaks a ZWJ-emoji up into its parts.

        :returns: An :class:`EmojiMatchZWJ` containing the "sub-emoji" if this is a ZWJ-emoji, otherwise self
        """
        return EmojiMatchZWJ(self) if self.is_zwj() else self

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f'{cls_name}({self.emoji}, {self.start}:{self.end})'
class EmojiMatchZWJ(EmojiMatch):
    """
    A match made up of several emoji that are connected by
    zero-width-joiners (ZWJ/``\\u200D``). The individual emoji are
    available as :attr:`emojis`."""

    __slots__ = ('emojis', )

    def __init__(self, match: EmojiMatch):
        super().__init__(match.emoji, match.start, match.end, match.data)

        #: List of sub emoji as EmojiMatch objects
        self.emojis: List[EmojiMatch] = []

        # Walk over the parts between the joiners and keep track of where
        # each part sits in the original string.
        offset = match.start
        for part in match.emoji.split(_ZWJ):
            stop = offset + len(part)
            part_data = unicode_codes.EMOJI_DATA.get(part, None)
            self.emojis.append(EmojiMatch(part, offset, stop, part_data))
            # The next part begins one position later: skip the joiner
            offset = stop + 1

    def join(self) -> str:
        """
        Concatenates the sub emoji, separated by ZWJ, into one string
        """
        return _ZWJ.join(sub.emoji for sub in self.emojis)

    def is_zwj(self) -> bool:
        return True

    def split(self) -> 'EmojiMatchZWJ':
        return self

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f'{cls_name}({self.join()}, {self.start}:{self.end})'
class EmojiMatchZWJNonRGI(EmojiMatchZWJ):
    """
    A match made up of several emoji that are connected by
    zero-width-joiners (ZWJ/``\\u200D``), used only for sequences that are
    not "recommended for general interchange" (non-RGI) by Unicode.org.
    The data property of this class is always None.
    """

    def __init__(self, first_emoji_match: EmojiMatch,
                 second_emoji_match: EmojiMatch):
        #: List of sub emoji as EmojiMatch objects
        self.emojis = [first_emoji_match, second_emoji_match]
        self._update()

    def _update(self) -> None:
        # Recompute everything that is derived from the list of sub emoji
        first, last = self.emojis[0], self.emojis[-1]
        self.emoji = _ZWJ.join(sub.emoji for sub in self.emojis)
        self.start = first.start
        self.end = last.end
        self.data = None

    def _add(self, next_emoji_match: EmojiMatch) -> None:
        # Extend the sequence by one more emoji and refresh derived fields
        self.emojis.append(next_emoji_match)
        self._update()
class Token(NamedTuple):
    """
    One unit of a tokenized string, as a named tuple ``(chars, value)``.

    ``chars`` is the matched substring. ``value`` is the
    :class:`EmojiMatch` object if the substring is an emoji; for a single
    character that is not a unicode emoji it is that character itself.
    """
    chars: str
    value: Union[str, EmojiMatch]
def tokenize(string: str, keep_zwj: bool) -> Iterator[Token]:
    """
    Finds unicode emoji in a string. Yields all normal characters as a named
    tuple :class:`Token` ``(char, char)`` and all emoji as :class:`Token` ``(chars, EmojiMatch)``.

    Tokens are buffered and only yielded once it is certain that the last
    emoji in the buffer is not part of an ongoing ZWJ-sequence.
    The variation selectors ``\\uFE0E`` and ``\\uFE0F`` are dropped when
    they are not part of a matched emoji.

    :param string: String contains unicode characters. MUST BE UNICODE.
    :param keep_zwj: If True, ZWJ-characters (``\\u200D``) that join non-RGI emoji
        are yielded as normal characters; if False they are skipped
    :return: An iterable of tuples :class:`Token` ``(char, char)`` or :class:`Token` ``(chars, EmojiMatch)``
    """
    tree = get_search_tree()
    EMOJI_DATA = unicode_codes.EMOJI_DATA
    # Buffer of tokens that were not yielded yet:
    # [ Token(oldsubstring0, EmojiMatch), Token(char1, char1), ... ]
    result: List[Token] = []
    i = 0
    length = len(string)
    # Index of chars in string that are skipped, i.e. the ZWJ-char in
    # non-RGI-ZWJ-sequences. This is a set because membership is tested
    # for every position that is scanned.
    ignore: Set[int] = set()
    while i < length:
        consumed = False
        char = string[i]
        if i in ignore:
            # A ZWJ that was flagged earlier: step over it, optionally
            # keeping it in the output as a normal character
            i += 1
            if char == _ZWJ and keep_zwj:
                result.append(Token(char, char))
            continue

        elif char in tree:
            # Follow the search tree as far as the string allows
            j = i + 1
            sub_tree = tree[char]
            while j < length and string[j] in sub_tree:
                if j in ignore:
                    # Never match across a flagged ZWJ
                    break
                sub_tree = sub_tree[string[j]]
                j += 1
            if 'data' in sub_tree:
                emj_data = sub_tree['data']
                code_points = string[i:j]

                # We cannot yield the result here, we need to defer
                # the call until we are sure that the emoji is finished
                # i.e. we're not inside an ongoing ZWJ-sequence

                match_obj = EmojiMatch(code_points, i, j, emj_data)

                # The `i += 1` at the end of the loop moves the cursor to j
                i = j - 1
                consumed = True
                result.append(Token(code_points, match_obj))

        elif char == _ZWJ and result and result[-1].chars in EMOJI_DATA and i > 0 and string[i - 1] in tree:
            # the current char is ZWJ and the last match was an emoji

            # Flag this ZWJ, drop the token(s) in front of it from the
            # buffer and scan them again from their start
            ignore.add(i)
            if EMOJI_DATA[result[-1].chars]["status"] == unicode_codes.STATUS["component"]:
                # last match was a component, it could be ZWJ+EMOJI+COMPONENT
                # or ZWJ+COMPONENT
                i = i - sum(len(t.chars) for t in result[-2:])
                if string[i] == _ZWJ:
                    # It's ZWJ+COMPONENT, move one back
                    i += 1
                    del result[-1]
                else:
                    # It's ZWJ+EMOJI+COMPONENT, move two back
                    del result[-2:]
            else:
                # last match result[-1] was a normal emoji, move cursor
                # before the emoji
                i = i - len(result[-1].chars)
                del result[-1]
            continue

        elif result:
            # A normal character ends any pending sequence: flush the buffer
            yield from result
            result = []

        if not consumed and char != '\uFE0E' and char != '\uFE0F':
            result.append(Token(char, char))
        i += 1

    yield from result
def filter_tokens(matches: Iterator[Token], emoji_only: bool, join_emoji: bool) -> Iterator[Token]:
    """
    Filters the output of `tokenize()`

    :param matches: An iterable of tuples of the form ``(match_str, result)``
        where ``result`` is either an EmojiMatch or a string.
    :param emoji_only: If True, only EmojiMatch are returned in the output.
        If False all characters are returned
    :param join_emoji: If True, multiple EmojiMatch are merged into
        a single :class:`EmojiMatchZWJNonRGI` if they are separated only by a ZWJ.

    :return: An iterable of tuples :class:`Token` ``(char, char)``,
        :class:`Token` ``(chars, EmojiMatch)`` or :class:`Token` ``(chars, EmojiMatchZWJNonRGI)``
    """
    if not join_emoji:
        if not emoji_only:
            # Nothing to filter and nothing to join
            yield from matches
            return

        # Only emoji are wanted: drop every token whose value is not an
        # EmojiMatch. That covers ZWJ characters as well as all other
        # plain characters.
        for token in matches:
            if isinstance(token.value, EmojiMatch):
                yield token
        return

    # Combine multiple EmojiMatch that are separated by ZWJs into
    # a single EmojiMatchZWJNonRGI
    previous_is_emoji = False
    previous_is_zwj = False
    pre_previous_is_emoji = False
    # Emoji tokens collected since the last non-emoji character; they are
    # held back because the last one may still grow
    accumulator: List[Token] = []
    for token in matches:
        pre_previous_is_emoji = previous_is_emoji
        if previous_is_emoji and token.value == _ZWJ:
            # A ZWJ right after an emoji: remember it, do not emit it
            previous_is_zwj = True
        elif isinstance(token.value, EmojiMatch):
            if pre_previous_is_emoji and previous_is_zwj:
                # EMOJI + ZWJ + EMOJI: merge into the last accumulated token
                if isinstance(accumulator[-1].value, EmojiMatchZWJNonRGI):
                    accumulator[-1].value._add(token.value)  # type: ignore
                    accumulator[-1] = Token(accumulator[-1].chars +
                                            _ZWJ + token.chars, accumulator[-1].value)
                else:
                    prev = accumulator.pop()
                    accumulator.append(
                        Token(prev.chars + _ZWJ + token.chars,
                              EmojiMatchZWJNonRGI(
                                  prev.value,  # type: ignore
                                  token.value)))
            else:
                accumulator.append(token)
            previous_is_emoji = True
            previous_is_zwj = False
        else:
            # Other character, not an emoji
            previous_is_emoji = False
            previous_is_zwj = False
            yield from accumulator
            if not emoji_only:
                yield token
            accumulator = []
    yield from accumulator
def get_search_tree() -> Dict[str, Any]:
    """
    Generate a search tree for demojize().

    The tree is built from :data:`EMOJI_DATA` on the first call and stored
    in the module-level ``_SEARCH_TREE``; later calls return that same
    object. Every character of an emoji is one level of nested dicts and
    the key ``'data'`` holds the :data:`EMOJI_DATA` entry of the emoji
    that ends at that level.

    Example of a search tree::

        EMOJI_DATA =
            {'a': {'en': ':Apple:'},
             'b': {'en': ':Bus:'},
             'ba': {'en': ':Bat:'},
             'band': {'en': ':Beatles:'},
             'bandit': {'en': ':Outlaw:'},
             'bank': {'en': ':BankOfEngland:'},
             'bb': {'en': ':BB-gun:'},
             'c': {'en': ':Car:'}}

        _SEARCH_TREE =
            {'a': {'data': {'en': ':Apple:'}},
             'b': {'a': {'data': {'en': ':Bat:'},
                         'n': {'d': {'data': {'en': ':Beatles:'},
                                     'i': {'t': {'data': {'en': ':Outlaw:'}}}},
                               'k': {'data': {'en': ':BankOfEngland:'}}}},
                   'b': {'data': {'en': ':BB-gun:'}},
                   'data': {'en': ':Bus:'}},
             'c': {'data': {'en': ':Car:'}}}

                       _SEARCH_TREE
                     /      |       ⧵
                   /        |         ⧵
                a           b           c
                |        /  |  ⧵        |
                |       /   |    ⧵      |
            :Apple:   ba  :Bus:   bb  :Car:
                     /  ⧵          |
                    /    ⧵         |
                :Bat:    ban    :BB-gun:
                        /   ⧵
                       /     ⧵
                    band      bank
                   /   ⧵        |
                  /     ⧵       |
              bandi  :Beatles: :BankOfEngland:
                |
              bandit
                |
             :Outlaw:

    :return: The root dict of the search tree
    """
    global _SEARCH_TREE
    if _SEARCH_TREE is not None:
        return _SEARCH_TREE

    root: Dict[str, Any] = {}
    _SEARCH_TREE = root
    for emj, emj_data in unicode_codes.EMOJI_DATA.items():
        node = root
        # Descend one level per character, creating levels as needed
        for char in emj:
            node = node.setdefault(char, {})
        if emj:
            # The node reached by the last character carries the data
            node['data'] = emj_data
    return root