Skip to content

Commit

Permalink
Merge pull request #197 from cvzi/demojize_performance
Browse files Browse the repository at this point in the history
Demojize performance
  • Loading branch information
TahirJalilov committed Dec 6, 2021
2 parents 290529c + b3b0dc6 commit e35fc45
Show file tree
Hide file tree
Showing 5 changed files with 455 additions and 79 deletions.
7 changes: 6 additions & 1 deletion CHANGES.md
@@ -1,6 +1,12 @@
emoji
=====

1.6.2
-----
* Improve performance of demojize()
* Added more tests
* Added a warning when a language other than 'en' is used with use_aliases=True in emojize() or demojize()

1.6.1
-----
* Allow multiple aliases
Expand All @@ -14,7 +20,6 @@ emoji
* emoji.version(string) method added
* Included 'variant' in the dict of dicts


1.5.0
-----
* Emojis of English version updated to the Emoji Charts v14.0
Expand Down
2 changes: 1 addition & 1 deletion emoji/__init__.py
Expand Up @@ -30,7 +30,7 @@
'EMOJI_ALIAS_UNICODE_ENGLISH', 'UNICODE_EMOJI_ALIAS_ENGLISH', 'EMOJI_DATA',
]

__version__ = '1.6.1'
__version__ = '1.6.2'
__author__ = 'Taehoon Kim, Kevin Wurster and Tahir Jalilov'
__email__ = 'carpedm20@gmail.com'
# and wursterk@gmail.com, tahir.jalilov@gmail.com
Expand Down
191 changes: 148 additions & 43 deletions emoji/core.py
Expand Up @@ -11,6 +11,7 @@

import re
import sys
import warnings

from emoji import unicode_codes

Expand All @@ -23,6 +24,7 @@
PY2 = sys.version_info[0] == 2

_EMOJI_REGEXP = None
_SEARCH_TREE = None
_DEFAULT_DELIMITER = ':'


Expand Down Expand Up @@ -52,7 +54,8 @@ def emojize(
:param use_aliases: (optional) Enable emoji aliases. See ``emoji.UNICODE_EMOJI_ALIAS``.
:param delimiters: (optional) Use delimiters other than _DEFAULT_DELIMITER
:param variant: (optional) Choose variation selector between "base"(None), VS-15 ("text_type") and VS-16 ("emoji_type")
:param language: Choose language of emoji name
:param language: Choose language of emoji name: language code 'es', 'de', etc. or 'alias'
to use English aliases
:param version: (optional) Max version. If set to an Emoji Version,
all emoji above this version will be ignored.
:param handle_version: (optional) Replace the emoji above ``version``
Expand All @@ -74,20 +77,22 @@ def emojize(
:raises ValueError: if ``variant`` is neither None, 'text_type' or 'emoji_type'
"""
EMOJI_UNICODE = unicode_codes.EMOJI_UNICODE[language]

if use_aliases or language == 'alias':
if language not in ('en', 'alias'):
warnings.warn("use_aliases=True is only supported for language='en'. "
"It is recommended to use emojize(string, language='alias') instead", stacklevel=2)
use_aliases = True
language = 'en'

EMOJI_UNICODE = unicode_codes.EMOJI_ALIAS_UNICODE_ENGLISH if use_aliases else unicode_codes.EMOJI_UNICODE[language]
pattern = re.compile(u'(%s[\\w\\-&.’”“()!#*+?–,/]+%s)' % delimiters, flags=re.UNICODE)

def replace(match):
mg = match.group(1).replace(delimiters[0], _DEFAULT_DELIMITER).replace(
delimiters[1], _DEFAULT_DELIMITER
)
if use_aliases:
emj = unicode_codes.EMOJI_ALIAS_UNICODE_ENGLISH.get(mg)
else:
emj = EMOJI_UNICODE.get(mg)

mg = match.group(1)[len(delimiters[0]):-len(delimiters[1])]
emj = EMOJI_UNICODE.get(_DEFAULT_DELIMITER + mg + _DEFAULT_DELIMITER)
if emj is None:
return mg
return match.group(1)

if version is not None:
if emj in unicode_codes.EMOJI_DATA and unicode_codes.EMOJI_DATA[emj]['E'] > version:
Expand Down Expand Up @@ -134,7 +139,8 @@ def demojize(
:param string: String contains unicode characters. MUST BE UNICODE.
:param use_aliases: (optional) Return emoji aliases. See ``emoji.UNICODE_EMOJI_ALIAS``.
:param delimiters: (optional) User delimiters other than ``_DEFAULT_DELIMITER``
:param language: (optional) Choose language of emoji name
:param language: Choose language of emoji name: language code 'es', 'de', etc. or 'alias'
to use English aliases
:param version: (optional) Max version. If set to an Emoji Version,
all emoji above this version will be removed.
:param handle_version: (optional) Replace the emoji above ``version``
Expand All @@ -156,24 +162,60 @@ def demojize(
"""

codes_dict = unicode_codes.UNICODE_EMOJI_ALIAS_ENGLISH if use_aliases else unicode_codes.UNICODE_EMOJI[language]

def replace(match):
emj = match.group(0)
val = codes_dict.get(emj)
if val is None:
return emj
if version is not None:
if emj in unicode_codes.EMOJI_DATA and unicode_codes.EMOJI_DATA[emj]['E'] > version:
if callable(handle_version):
return handle_version(emj, unicode_codes.EMOJI_DATA[emj])
elif handle_version is not None:
return str(handle_version)
if language == 'alias':
language = 'en'
use_aliases = True
elif use_aliases and language != 'en':
warnings.warn("use_aliases=True is only supported for language='en'. "
"It is recommended to use demojize(string, language='alias') instead", stacklevel=2)
language = 'en'

tree = _get_search_tree()
result = []
i = 0
length = len(string)
while i < length:
consumed = False
char = string[i]
if char in tree:
j = i + 1
sub_tree = tree[char]
while j < length and string[j] in sub_tree:
sub_tree = sub_tree[string[j]]
j += 1
if 'data' in sub_tree:
emj_data = sub_tree['data']
code_points = string[i:j]
replace_str = None
if version is not None and emj_data['E'] > version:
if callable(handle_version):
emj_data = emj_data.copy()
emj_data['match_start'] = i
emj_data['match_end'] = j
replace_str = handle_version(code_points, emj_data)
elif handle_version is not None:
replace_str = str(handle_version)
else:
replace_str = None
elif language in emj_data:
if use_aliases and 'alias' in emj_data:
replace_str = delimiters[0] + emj_data['alias'][0][1:-1] + delimiters[1]
else:
replace_str = delimiters[0] + emj_data[language][1:-1] + delimiters[1]
else:
return ''
return delimiters[0] + val[1:-1] + delimiters[1]
# The emoji exists, but it is not translated, so we keep the emoji
replace_str = code_points

i = j - 1
consumed = True
if replace_str:
result.append(replace_str)

if not consumed and char != u'\ufe0e' and char != u'\ufe0f':
result.append(char)
i += 1

return get_emoji_regexp().sub(replace, string).replace(u'\ufe0e', '').replace(u'\ufe0f', '')
return "".join(result)


def replace_emoji(string, replace='', language=None, version=-1):
Expand All @@ -189,20 +231,17 @@ def replace_emoji(string, replace='', language=None, version=-1):
:param language: (optional) Parameter is no longer used
"""

if version <= 0 and not callable(replace):
return get_emoji_regexp().sub(replace, string).replace(u'\ufe0e', '').replace(u'\ufe0f', '')

def replace_fct(match):
emj = match.group(0)

if emj in unicode_codes.EMOJI_DATA and unicode_codes.EMOJI_DATA[emj]['E'] > version:
if version > -1:
def f(emj, emj_data):
if emj_data['E'] <= version:
return emj # Do not replace emj
if callable(replace):
return replace(emj, unicode_codes.EMOJI_DATA[emj])
else:
return str(replace)
return emj
return replace(emj, emj_data)
return str(replace)

return get_emoji_regexp().sub(replace_fct, string).replace(u'\ufe0e', '').replace(u'\ufe0f', '')
return demojize(string, use_aliases=False, language='en', version=-1, handle_version=f)
else:
return demojize(string, use_aliases=False, language='en', version=-1, handle_version=replace)


def get_emoji_regexp(language=None):
Expand Down Expand Up @@ -232,12 +271,14 @@ def emoji_lis(string, language=None):
"""
_entities = []

for match in get_emoji_regexp().finditer(string):
def f(emj, emj_data):
_entities.append({
'location': match.start(),
'emoji': match.group(),
'location': emj_data['match_start'],
'emoji': emj,
})

demojize(string, use_aliases=False, language='en',
version=-1, handle_version=f)
return _entities


Expand Down Expand Up @@ -285,6 +326,7 @@ def version(string):

# Try to find first emoji in string
version = []

def f(e, emoji_data):
version.append(emoji_data['E'])
return ''
Expand All @@ -300,3 +342,66 @@ def f(e, emoji_data):
return version[0]

raise ValueError("No emoji found in string")


def _get_search_tree():
    """
    Return the character search tree used by demojize(), building it on first use.

    The tree is a trie over the keys of ``unicode_codes.EMOJI_DATA``: every
    character of an emoji sequence is one level of nested dicts, and the node
    reached after the last character stores that emoji's data dict under the
    key ``'data'``. The finished tree is cached in the module-level
    ``_SEARCH_TREE`` and the same object is returned on every later call.

    Example of a search tree::

        EMOJI_DATA =
        {'a': {'en': ':Apple:'},
         'b': {'en': ':Bus:'},
         'ba': {'en': ':Bat:'},
         'band': {'en': ':Beatles:'},
         'bandit': {'en': ':Outlaw:'},
         'bank': {'en': ':BankOfEngland:'},
         'bb': {'en': ':BB-gun:'},
         'c': {'en': ':Car:'}}

        _SEARCH_TREE =
        {'a': {'data': {'en': ':Apple:'}},
         'b': {'a': {'data': {'en': ':Bat:'},
                     'n': {'d': {'data': {'en': ':Beatles:'},
                                 'i': {'t': {'data': {'en': ':Outlaw:'}}}},
                           'k': {'data': {'en': ':BankOfEngland:'}}}},
               'b': {'data': {'en': ':BB-gun:'}},
               'data': {'en': ':Bus:'}},
         'c': {'data': {'en': ':Car:'}}}

                       _SEARCH_TREE
                     /      |       ⧵
                   /        |         ⧵
                 a          b           c
                 |        / | ⧵         |
                 |       /  |   ⧵       |
             :Apple:   ba :Bus:  bb   :Car:
                      /  ⧵        |
                     /    ⧵       |
                 :Bat:    ban  :BB-gun:
                         /   ⧵
                        /     ⧵
                     band     bank
                    /   ⧵       |
                   /     ⧵      |
               bandi :Beatles: :BankOfEngland:
                 |
               bandit
                 |
              :Outlaw:

    :return: Nested dict mapping single characters to sub-trees; a node that
        terminates an emoji additionally has a ``'data'`` entry.
    """
    global _SEARCH_TREE
    if _SEARCH_TREE is None:
        # Build into a local first and publish it only once it is complete.
        # Assigning the empty dict to the global up front would let an
        # interrupted or re-entrant build leave a partial tree cached forever.
        tree = {}
        for emj, emj_data in unicode_codes.EMOJI_DATA.items():
            if not emj:
                # An empty key has no characters to index; it contributes nothing.
                continue
            node = tree
            for char in emj:
                node = node.setdefault(char, {})
            # 'data' cannot collide with a child key: children are single characters.
            node['data'] = emj_data
        _SEARCH_TREE = tree
    return _SEARCH_TREE

0 comments on commit e35fc45

Please sign in to comment.