Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Demojize performance #197

Merged
merged 5 commits into from
Dec 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
emoji
=====

1.6.2
-----
* Improve performance of demojize()
* Added more tests
* Added a warning when use_aliases=True is combined with a language other than 'en' in emojize() or demojize()

1.6.1
-----
* Allow multiple aliases
Expand All @@ -14,7 +20,6 @@ emoji
* emoji.version(string) method added
* Included 'variant' in the dict of dicts


1.5.0
-----
* Emojis of English version updated to the Emoji Charts v14.0
Expand Down
2 changes: 1 addition & 1 deletion emoji/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
'EMOJI_ALIAS_UNICODE_ENGLISH', 'UNICODE_EMOJI_ALIAS_ENGLISH', 'EMOJI_DATA',
]

__version__ = '1.6.1'
__version__ = '1.6.2'
__author__ = 'Taehoon Kim, Kevin Wurster and Tahir Jalilov'
__email__ = 'carpedm20@gmail.com'
# and wursterk@gmail.com, tahir.jalilov@gmail.com
Expand Down
191 changes: 148 additions & 43 deletions emoji/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import re
import sys
import warnings

from emoji import unicode_codes

Expand All @@ -23,6 +24,7 @@
PY2 = sys.version_info[0] == 2

_EMOJI_REGEXP = None
_SEARCH_TREE = None
_DEFAULT_DELIMITER = ':'


Expand Down Expand Up @@ -52,7 +54,8 @@ def emojize(
:param use_aliases: (optional) Enable emoji aliases. See ``emoji.UNICODE_EMOJI_ALIAS``.
:param delimiters: (optional) Use delimiters other than _DEFAULT_DELIMITER
:param variant: (optional) Choose variation selector between "base"(None), VS-15 ("text_type") and VS-16 ("emoji_type")
:param language: Choose language of emoji name
:param language: Choose language of emoji name: language code 'es', 'de', etc. or 'alias'
to use English aliases
:param version: (optional) Max version. If set to an Emoji Version,
all emoji above this version will be ignored.
:param handle_version: (optional) Replace the emoji above ``version``
Expand All @@ -74,20 +77,22 @@ def emojize(
:raises ValueError: if ``variant`` is neither None, 'text_type' or 'emoji_type'

"""
EMOJI_UNICODE = unicode_codes.EMOJI_UNICODE[language]

if use_aliases or language == 'alias':
if language not in ('en', 'alias'):
warnings.warn("use_aliases=True is only supported for language='en'. "
"It is recommended to use emojize(string, language='alias') instead", stacklevel=2)
use_aliases = True
language = 'en'

EMOJI_UNICODE = unicode_codes.EMOJI_ALIAS_UNICODE_ENGLISH if use_aliases else unicode_codes.EMOJI_UNICODE[language]
pattern = re.compile(u'(%s[\\w\\-&.’”“()!#*+?–,/]+%s)' % delimiters, flags=re.UNICODE)

def replace(match):
mg = match.group(1).replace(delimiters[0], _DEFAULT_DELIMITER).replace(
delimiters[1], _DEFAULT_DELIMITER
)
if use_aliases:
emj = unicode_codes.EMOJI_ALIAS_UNICODE_ENGLISH.get(mg)
else:
emj = EMOJI_UNICODE.get(mg)

mg = match.group(1)[len(delimiters[0]):-len(delimiters[1])]
emj = EMOJI_UNICODE.get(_DEFAULT_DELIMITER + mg + _DEFAULT_DELIMITER)
if emj is None:
return mg
return match.group(1)

if version is not None:
if emj in unicode_codes.EMOJI_DATA and unicode_codes.EMOJI_DATA[emj]['E'] > version:
Expand Down Expand Up @@ -134,7 +139,8 @@ def demojize(
:param string: String contains unicode characters. MUST BE UNICODE.
:param use_aliases: (optional) Return emoji aliases. See ``emoji.UNICODE_EMOJI_ALIAS``.
:param delimiters: (optional) Use delimiters other than ``_DEFAULT_DELIMITER``
:param language: (optional) Choose language of emoji name
:param language: Choose language of emoji name: language code 'es', 'de', etc. or 'alias'
to use English aliases
:param version: (optional) Max version. If set to an Emoji Version,
all emoji above this version will be removed.
:param handle_version: (optional) Replace the emoji above ``version``
Expand All @@ -156,24 +162,60 @@ def demojize(

"""

codes_dict = unicode_codes.UNICODE_EMOJI_ALIAS_ENGLISH if use_aliases else unicode_codes.UNICODE_EMOJI[language]

def replace(match):
emj = match.group(0)
val = codes_dict.get(emj)
if val is None:
return emj
if version is not None:
if emj in unicode_codes.EMOJI_DATA and unicode_codes.EMOJI_DATA[emj]['E'] > version:
if callable(handle_version):
return handle_version(emj, unicode_codes.EMOJI_DATA[emj])
elif handle_version is not None:
return str(handle_version)
if language == 'alias':
language = 'en'
use_aliases = True
elif use_aliases and language != 'en':
warnings.warn("use_aliases=True is only supported for language='en'. "
"It is recommended to use demojize(string, language='alias') instead", stacklevel=2)
language = 'en'

tree = _get_search_tree()
result = []
i = 0
length = len(string)
while i < length:
consumed = False
char = string[i]
if char in tree:
j = i + 1
sub_tree = tree[char]
while j < length and string[j] in sub_tree:
sub_tree = sub_tree[string[j]]
j += 1
if 'data' in sub_tree:
emj_data = sub_tree['data']
code_points = string[i:j]
replace_str = None
if version is not None and emj_data['E'] > version:
if callable(handle_version):
emj_data = emj_data.copy()
emj_data['match_start'] = i
emj_data['match_end'] = j
replace_str = handle_version(code_points, emj_data)
elif handle_version is not None:
replace_str = str(handle_version)
else:
replace_str = None
elif language in emj_data:
if use_aliases and 'alias' in emj_data:
replace_str = delimiters[0] + emj_data['alias'][0][1:-1] + delimiters[1]
else:
replace_str = delimiters[0] + emj_data[language][1:-1] + delimiters[1]
else:
return ''
return delimiters[0] + val[1:-1] + delimiters[1]
# The emoji exists, but it is not translated, so we keep the emoji
replace_str = code_points

i = j - 1
consumed = True
if replace_str:
result.append(replace_str)

if not consumed and char != u'\ufe0e' and char != u'\ufe0f':
result.append(char)
i += 1

return get_emoji_regexp().sub(replace, string).replace(u'\ufe0e', '').replace(u'\ufe0f', '')
return "".join(result)


def replace_emoji(string, replace='', language=None, version=-1):
Expand All @@ -189,20 +231,17 @@ def replace_emoji(string, replace='', language=None, version=-1):
:param language: (optional) Parameter is no longer used
"""

if version <= 0 and not callable(replace):
return get_emoji_regexp().sub(replace, string).replace(u'\ufe0e', '').replace(u'\ufe0f', '')

def replace_fct(match):
emj = match.group(0)

if emj in unicode_codes.EMOJI_DATA and unicode_codes.EMOJI_DATA[emj]['E'] > version:
if version > -1:
def f(emj, emj_data):
if emj_data['E'] <= version:
return emj # Do not replace emj
if callable(replace):
return replace(emj, unicode_codes.EMOJI_DATA[emj])
else:
return str(replace)
return emj
return replace(emj, emj_data)
return str(replace)

return get_emoji_regexp().sub(replace_fct, string).replace(u'\ufe0e', '').replace(u'\ufe0f', '')
return demojize(string, use_aliases=False, language='en', version=-1, handle_version=f)
else:
return demojize(string, use_aliases=False, language='en', version=-1, handle_version=replace)


def get_emoji_regexp(language=None):
Expand Down Expand Up @@ -232,12 +271,14 @@ def emoji_lis(string, language=None):
"""
_entities = []

for match in get_emoji_regexp().finditer(string):
def f(emj, emj_data):
_entities.append({
'location': match.start(),
'emoji': match.group(),
'location': emj_data['match_start'],
'emoji': emj,
})

demojize(string, use_aliases=False, language='en',
version=-1, handle_version=f)
return _entities


Expand Down Expand Up @@ -285,6 +326,7 @@ def version(string):

# Try to find first emoji in string
version = []

def f(e, emoji_data):
version.append(emoji_data['E'])
return ''
Expand All @@ -300,3 +342,66 @@ def f(e, emoji_data):
return version[0]

raise ValueError("No emoji found in string")


def _get_search_tree():
    """
    Return the search tree used by demojize(), building it on first use.

    The tree is a nested dict keyed by single characters. Following the
    characters of an emoji from the root leads to a node whose ``'data'``
    entry holds that emoji's ``unicode_codes.EMOJI_DATA`` value. The finished
    tree is cached in the module-level ``_SEARCH_TREE`` and reused by every
    later call.

    Example of a search tree::

        EMOJI_DATA =
        {'a': {'en': ':Apple:'},
         'b': {'en': ':Bus:'},
         'ba': {'en': ':Bat:'},
         'band': {'en': ':Beatles:'},
         'bandit': {'en': ':Outlaw:'},
         'bank': {'en': ':BankOfEngland:'},
         'bb': {'en': ':BB-gun:'},
         'c': {'en': ':Car:'}}

        _SEARCH_TREE =
        {'a': {'data': {'en': ':Apple:'}},
         'b': {'a': {'data': {'en': ':Bat:'},
                     'n': {'d': {'data': {'en': ':Beatles:'},
                                 'i': {'t': {'data': {'en': ':Outlaw:'}}}},
                           'k': {'data': {'en': ':BankOfEngland:'}}}},
               'b': {'data': {'en': ':BB-gun:'}},
               'data': {'en': ':Bus:'}},
         'c': {'data': {'en': ':Car:'}}}

                       _SEARCH_TREE
                     /      |       ⧵
                   /        |         ⧵
                 a          b           c
                 |        / | ⧵         |
                 |       /  |  ⧵        |
             :Apple:   ba :Bus: bb    :Car:
                      /  ⧵       |
                     /    ⧵      |
                 :Bat:    ban  :BB-gun:
                         /   ⧵
                        /     ⧵
                     band     bank
                    /   ⧵       |
                   /     ⧵      |
               bandi :Beatles: :BankOfEngland:
                 |
               bandit
                 |
              :Outlaw:

    :return: The (cached) nested dict described above.
    """
    global _SEARCH_TREE
    if _SEARCH_TREE is None:
        # Build into a local and publish it only once it is complete, so a
        # concurrent caller (or a build interrupted by an exception) can
        # never observe, or permanently cache, a partially filled tree.
        tree = {}
        for emj, emj_data in unicode_codes.EMOJI_DATA.items():
            node = tree
            for char in emj:
                node = node.setdefault(char, {})
            # An empty key has no path in the tree; never attach data to the root.
            if emj:
                node['data'] = emj_data
        _SEARCH_TREE = tree
    return _SEARCH_TREE
Loading