From cfe81a42094de28421292a03809ecec6aa9cb831 Mon Sep 17 00:00:00 2001 From: cvzi Date: Thu, 18 Nov 2021 16:11:48 +0100 Subject: [PATCH 1/5] Improve performance of demojize --- emoji/core.py | 175 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 139 insertions(+), 36 deletions(-) diff --git a/emoji/core.py b/emoji/core.py index eeb2b26e..afcf376f 100644 --- a/emoji/core.py +++ b/emoji/core.py @@ -11,6 +11,7 @@ import re import sys +import warnings from emoji import unicode_codes @@ -23,6 +24,7 @@ PY2 = sys.version_info[0] == 2 _EMOJI_REGEXP = None +_SEARCH_TREE = None _DEFAULT_DELIMITER = ':' @@ -74,18 +76,21 @@ def emojize( :raises ValueError: if ``variant`` is neither None, 'text_type' or 'emoji_type' """ - EMOJI_UNICODE = unicode_codes.EMOJI_UNICODE[language] + + if use_aliases and (language not in ('en', 'alias')): + warnings.warn("use_aliases=True is only supported in combination with language='en', use emojize(string, language='alias') for short", stacklevel=2) + + use_aliases = (use_aliases and language == 'en') or language == 'alias' + + EMOJI_UNICODE = unicode_codes.EMOJI_ALIAS_UNICODE_ENGLISH if use_aliases else unicode_codes.EMOJI_UNICODE[language] pattern = re.compile(u'(%s[\\w\\-&.’”“()!#*+?–,/]+%s)' % delimiters, flags=re.UNICODE) def replace(match): mg = match.group(1).replace(delimiters[0], _DEFAULT_DELIMITER).replace( delimiters[1], _DEFAULT_DELIMITER ) - if use_aliases: - emj = unicode_codes.EMOJI_ALIAS_UNICODE_ENGLISH.get(mg) - else: - emj = EMOJI_UNICODE.get(mg) + emj = EMOJI_UNICODE.get(mg) if emj is None: return mg @@ -156,24 +161,61 @@ def demojize( """ - codes_dict = unicode_codes.UNICODE_EMOJI_ALIAS_ENGLISH if use_aliases else unicode_codes.UNICODE_EMOJI[language] - - def replace(match): - emj = match.group(0) - val = codes_dict.get(emj) - if val is None: - return emj - if version is not None: - if emj in unicode_codes.EMOJI_DATA and unicode_codes.EMOJI_DATA[emj]['E'] > version: - if callable(handle_version): - return handle_version(emj, unicode_codes.EMOJI_DATA[emj]) - elif handle_version is not None: - return str(handle_version) + if language == 'alias': + language = 'en' + use_aliases = True + else: + if use_aliases and language != 'en': + warnings.warn("use_aliases=True is only supported in combination with language='en', use demojize(string, language='alias') for short", stacklevel=2) + use_aliases = use_aliases and language == 'en' + + tree = _get_search_tree() + result = [] + i = 0 + length = len(string) + while i < length: + consumed = False + char = string[i] + if char in tree: + j = i + 1 + sub_tree = tree[char] + while j < length and string[j] in sub_tree: + sub_tree = sub_tree[string[j]] + j += 1 + if 'data' in sub_tree: + emj_data = sub_tree['data'] + code_points = string[i:j] + replace_str = None + if version is not None and emj_data['E'] > version: + if callable(handle_version): + emj_data = emj_data.copy() + emj_data['match_start'] = i + emj_data['match_end'] = j + replace_str = handle_version(code_points, emj_data) + elif handle_version is not None: + replace_str = str(handle_version) + else: + replace_str = None + elif language in emj_data: + if use_aliases and 'alias' in emj_data: + replace_str = delimiters[0] + emj_data['alias'][0][1:-1] + delimiters[1] + else: + replace_str = delimiters[0] + emj_data[language][1:-1] + delimiters[1] else: - return '' - return delimiters[0] + val[1:-1] + delimiters[1] + # The emoji exists, but it is not translated, so we keep the emoji + # TODO write a test for this case + replace_str = code_points + + i = j - 1 + consumed = True + if replace_str: + result.append(replace_str) - return get_emoji_regexp().sub(replace, string).replace(u'\ufe0e', '').replace(u'\ufe0f', '') + if not consumed and char != u'\ufe0e' and char != u'\ufe0f': + result.append(char) + i += 1 + + return "".join(result) def replace_emoji(string, replace='', language=None, version=-1): @@ -189,20 +231,17 @@ def replace_emoji(string, replace='', language=None, version=-1): :param language: (optional) Parameter is no longer used """ - if version <= 0 and not callable(replace): - return get_emoji_regexp().sub(replace, string).replace(u'\ufe0e', '').replace(u'\ufe0f', '') - - def replace_fct(match): - emj = match.group(0) - - if emj in unicode_codes.EMOJI_DATA and unicode_codes.EMOJI_DATA[emj]['E'] > version: + if version > -1: + def f(emj, emj_data): + if emj_data['E'] <= version: + return emj # Do not replace emj if callable(replace): - return replace(emj, unicode_codes.EMOJI_DATA[emj]) - else: - return str(replace) - return emj + return replace(emj, emj_data) + return str(replace) - return get_emoji_regexp().sub(replace_fct, string).replace(u'\ufe0e', '').replace(u'\ufe0f', '') + return demojize(string, use_aliases=False, language='en', version=-1, handle_version=f) + else: + return demojize(string, use_aliases=False, language='en', version=-1, handle_version=replace) def get_emoji_regexp(language=None): @@ -232,12 +271,14 @@ def emoji_lis(string, language=None): """ _entities = [] - for match in get_emoji_regexp().finditer(string): + def f(emj, emj_data): _entities.append({ - 'location': match.start(), - 'emoji': match.group(), + 'location': emj_data['match_start'], + 'emoji': emj, }) + demojize(string, use_aliases=False, language='en', + version=-1, handle_version=f) return _entities @@ -300,3 +341,65 @@ def f(e, emoji_data): return version[0] raise ValueError("No emoji found in string") + +def _get_search_tree(): + """ + Generate a search tree for demojize() + Example of a search tree:: + + EMOJI_DATA = + {'a': {'en': ':Apple:'}, + 'b': {'en': ':Bus:'}, + 'ba': {'en': ':Bat:'}, + 'band': {'en': ':Beatles:'}, + 'bandit': {'en': ':Outlaw:'}, + 'bank': {'en': ':BankOfEngland:'}, + 'bb': {'en': ':BB-gun:'}, + 'c': {'en': ':Car:'}} + + _SEARCH_TREE = + {'a': {'data': {'en': ':Apple:'}}, + 'b': {'a': {'data': {'en': ':Bat:'}, + 'n': {'d': {'data': {'en': ':Beatles:'}, + 'i': {'t': {'data': {'en': ':Outlaw:'}}}}, + 'k': {'data': {'en': ':BankOfEngland:'}}}}, + 'b': {'data': {'en': ':BB-gun:'}}, + 'data': {'en': ':Bus:'}}, + 'c': {'data': {'en': ':Car:'}}} + + _SEARCH_TREE + / | ⧵ + / | ⧵ + a b c + | / | ⧵ | + | / | ⧵ | + :Apple: ba :Bus: bb :Car: + / ⧵ | + / ⧵ | + :Bat: ban :BB-gun: + / ⧵ + / ⧵ + band bank + / ⧵ | + / ⧵ | + bandi :Beatles: :BankOfEngland: + | + bandit + | + :Outlaw: + + + """ + global _SEARCH_TREE + if _SEARCH_TREE is None: + _SEARCH_TREE = {} + for emj in unicode_codes.EMOJI_DATA: + sub_tree = _SEARCH_TREE + lastidx = len(emj) - 1 + for i, char in enumerate(emj): + if char not in sub_tree: + sub_tree[char] = {} + sub_tree = sub_tree[char] + if i == lastidx: + sub_tree['data'] = unicode_codes.EMOJI_DATA[emj] + return _SEARCH_TREE From bd9d89d5753a1c9fda4f01875b5bc7e90e043b27 Mon Sep 17 00:00:00 2001 From: cvzi Date: Thu, 18 Nov 2021 16:12:06 +0100 Subject: [PATCH 2/5] More tests --- tests/test_core.py | 280 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 246 insertions(+), 34 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 146cc74a..03d93de1 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -5,6 +5,7 @@ from __future__ import unicode_literals +import random import re import emoji import pytest @@ -36,29 +37,38 @@ def test_emojize_complicated_string(): expected = emoji.emojize(actual, False) assert expected == actual, '%s != %s' % (expected, actual) + def test_emojize_languages(): for lang_code, emoji_pack in emoji.EMOJI_UNICODE.items(): for name, emj in emoji_pack.items(): assert emoji.emojize(name, language=lang_code) == emj + def test_demojize_languages(): - for lang_code, emoji_pack in emoji.UNICODE_EMOJI.items(): - for emj, name in emoji_pack.items(): + for lang_code, emoji_pack in emoji.EMOJI_UNICODE.items(): + for name, emj in emoji_pack.items(): assert emoji.demojize(emj, language=lang_code) == name def test_emojize_variant(): - remove_variant = lambda s: re.sub(u'[\ufe0e\ufe0f]$', '', s) - - assert emoji.emojize(':Taurus:', variant=None) == emoji.EMOJI_UNICODE['en'][':Taurus:'] - assert emoji.emojize(':Taurus:', variant=None) == emoji.emojize(':Taurus:') - assert emoji.emojize(':Taurus:', variant='text_type') == remove_variant(emoji.EMOJI_UNICODE['en'][':Taurus:']) + u'\ufe0e' - assert emoji.emojize(':Taurus:', variant='emoji_type') == remove_variant(emoji.EMOJI_UNICODE['en'][':Taurus:']) + u'\ufe0f' - - assert emoji.emojize(':admission_tickets:', variant=None) == emoji.EMOJI_UNICODE['en'][':admission_tickets:'] - assert emoji.emojize(':admission_tickets:', variant=None) == emoji.emojize(':admission_tickets:') - assert emoji.emojize(':admission_tickets:', variant='text_type') == remove_variant(emoji.EMOJI_UNICODE['en'][':admission_tickets:']) + u'\ufe0e' - assert emoji.emojize(':admission_tickets:', variant='emoji_type') == remove_variant(emoji.EMOJI_UNICODE['en'][':admission_tickets:']) + u'\ufe0f' + def remove_variant(s): return re.sub(u'[\ufe0e\ufe0f]$', '', s) + + assert emoji.emojize( + ':Taurus:', variant=None) == emoji.EMOJI_UNICODE['en'][':Taurus:'] + assert emoji.emojize(':Taurus:', variant=None) == emoji.emojize(':Taurus:') + assert emoji.emojize(':Taurus:', variant='text_type') == remove_variant( + emoji.EMOJI_UNICODE['en'][':Taurus:']) + u'\ufe0e' + assert emoji.emojize(':Taurus:', variant='emoji_type') == remove_variant( + emoji.EMOJI_UNICODE['en'][':Taurus:']) + u'\ufe0f' + + assert emoji.emojize( + ':admission_tickets:', variant=None) == emoji.EMOJI_UNICODE['en'][':admission_tickets:'] + assert emoji.emojize(':admission_tickets:', variant=None) == emoji.emojize( + ':admission_tickets:') + assert emoji.emojize(':admission_tickets:', variant='text_type') == remove_variant( + emoji.EMOJI_UNICODE['en'][':admission_tickets:']) + u'\ufe0e' + assert emoji.emojize(':admission_tickets:', variant='emoji_type') == remove_variant( + emoji.EMOJI_UNICODE['en'][':admission_tickets:']) + u'\ufe0f' with pytest.raises(ValueError): emoji.emojize(':admission_tickets:', variant=False) @@ -70,21 +80,23 @@ def test_emojize_variant(): emoji.emojize(':admission_tickets:', variant='wrong') assert emoji.emojize(":football:", use_aliases=False) == ':football:' - assert emoji.emojize(":football:", variant="text_type", use_aliases=False) == ':football:' - assert emoji.emojize(":football:", use_aliases=True) == u'\U0001F3C8' - assert emoji.emojize(":football:", variant="emoji_type", use_aliases=True) == u'\U0001F3C8' + assert emoji.emojize(":football:", variant="text_type", + use_aliases=False) == ':football:' + assert emoji.emojize(":football:", use_aliases=True) == u'\U0001F3C8' + assert emoji.emojize(":football:", variant="emoji_type", + use_aliases=True) == u'\U0001F3C8' def test_demojize_removes_variant(): # demojize should remove all variant indicators \ufe0e and \ufe0f from the string text = "".join([emoji.emojize(':Taurus:', variant='text_type'), - emoji.emojize(':Taurus:', variant='emoji_type'), - emoji.emojize(':admission_tickets:', variant='text_type'), - emoji.emojize(':admission_tickets:', variant='emoji_type'), - emoji.emojize(':alien:', variant='text_type'), - emoji.emojize(':alien:', variant='emoji_type'), - emoji.emojize(':atom_symbol:', variant='text_type'), - emoji.emojize(':atom_symbol:', variant='emoji_type')]) + emoji.emojize(':Taurus:', variant='emoji_type'), + emoji.emojize(':admission_tickets:', variant='text_type'), + emoji.emojize(':admission_tickets:', variant='emoji_type'), + emoji.emojize(':alien:', variant='text_type'), + emoji.emojize(':alien:', variant='emoji_type'), + emoji.emojize(':atom_symbol:', variant='text_type'), + emoji.emojize(':atom_symbol:', variant='emoji_type')]) for lang_code in emoji.UNICODE_EMOJI: result = emoji.demojize(text, language=lang_code) @@ -102,26 +114,81 @@ def test_alias(): assert emoji.emojize(':soccer:', use_aliases=False) == ':soccer:' assert emoji.emojize(':soccer:', use_aliases=True) == u'\U000026BD' assert emoji.emojize(':football:', use_aliases=False) == ':football:' - assert emoji.emojize(':football:', use_aliases=True) == u'\U0001F3C8' + assert emoji.emojize(':football:', use_aliases=True) == u'\U0001F3C8' # Multiple aliases for one emoji: - assert emoji.emojize(':thumbsup:', use_aliases=True) == emoji.emojize(':+1:', use_aliases=True) - assert emoji.emojize(':thumbsup:', use_aliases=True) == emoji.emojize(':thumbs_up:', use_aliases=True) + assert emoji.emojize(':thumbsup:', use_aliases=True) == emoji.emojize( + ':+1:', use_aliases=True) + assert emoji.emojize(':thumbsup:', use_aliases=True) == emoji.emojize( + ':thumbs_up:', use_aliases=True) assert emoji.emojize(':thumbsup:', use_aliases=True) == u'\U0001f44d' + thumbsup = u'\U0001f44d' + assert emoji.demojize(thumbsup, use_aliases=True) != thumbsup + assert emoji.demojize(thumbsup, use_aliases=True) != ':thumbs_up:' + assert emoji.demojize(thumbsup, use_aliases=True) != emoji.demojize( + thumbsup, use_aliases=False) + + thailand = u'🇹🇭' + assert emoji.demojize(thailand, use_aliases=True) != thailand + assert emoji.demojize(thailand, use_aliases=True) != ':Thailand:' + assert emoji.demojize(thailand, use_aliases=True) != emoji.demojize( + thailand, use_aliases=False) + assert emoji.demojize(thailand, use_aliases=True, version=1.0) != emoji.demojize( + thailand, use_aliases=True) + + # No alias + for emj, emoji_data in emoji.EMOJI_DATA.items(): + if emoji_data['status'] != emoji.STATUS['fully_qualified']: + continue + if 'alias' not in emoji_data: + assert emoji.emojize(emoji_data['en'], use_aliases=True) != emoji_data['en'] + assert emoji.demojize(emj, use_aliases=True) == emoji_data['en'] + + # language='alias' + assert emoji.emojize(':flag_for_Thailand:', use_aliases=True, language="en") == thailand + assert emoji.emojize(':flag_for_Thailand:', language="alias") == thailand + assert emoji.emojize(':flag_for_Thailand:', language="alias", use_aliases=True) == thailand + assert emoji.demojize(thailand, use_aliases=True, language="en") == ':flag_for_Thailand:' + assert emoji.demojize(thailand, language="alias") ==':flag_for_Thailand:' + assert emoji.demojize(thailand, language="alias", use_aliases=True) ==':flag_for_Thailand:' + def test_invalid_alias(): # Invalid aliases should be passed through untouched assert emoji.emojize(':tester:', use_aliases=True) == ':tester:' assert emoji.emojize(':footbal:', use_aliases=True) == ':footbal:' assert emoji.emojize(':socer:', use_aliases=True) == ':socer:' - emoji.emojize(':socer:', use_aliases=True, variant="text_type") == ':socer:' + emoji.emojize(':socer:', use_aliases=True, + variant="text_type") == ':socer:' + + +@pytest.mark.filterwarnings("ignore") +def test_alias_wrong_language(): + # Alias with wrong languages + thailand = u'🇹🇭' + with pytest.warns(UserWarning) as w: + emoji.emojize(':flag_for_Thailand:', use_aliases=True, language="es") + assert emoji.emojize(':flag_for_Thailand:', use_aliases=True, language="es") == ':flag_for_Thailand:' + assert emoji.emojize(':flag_for_Thailand:', use_aliases=True, language="en") == thailand + + with pytest.warns(UserWarning) as w: + emoji.demojize(thailand, use_aliases=True, language="es") + assert emoji.demojize(thailand, use_aliases=True, language="es") == ':bandera_tailandia:' + assert emoji.demojize(thailand, use_aliases=True, language="en") == ':flag_for_Thailand:' def test_demojize_name_only(): - for name in emoji.EMOJI_UNICODE.keys(): - oneway = emoji.emojize(name, False) - roundtrip = emoji.demojize(oneway) - assert name == roundtrip, '%s != %s' % (name, roundtrip) + for emj, item in emoji.EMOJI_DATA.items(): + if item['status'] != emoji.STATUS['fully_qualified']: + continue + for lang_code in emoji.UNICODE_EMOJI: + if not lang_code in item: + continue + name = item[lang_code] + oneway = emoji.emojize(name, use_aliases=False, language=lang_code) + assert oneway == emj + roundtrip = emoji.demojize(oneway, language=lang_code) + assert name == roundtrip, '%s != %s' % (name, roundtrip) def test_demojize_complicated_string(): @@ -131,11 +198,22 @@ def test_demojize_complicated_string(): assert constructed == destructed, '%s != %s' % (constructed, destructed) +def test_demojize_delimiters(): + for e in [u'\U000026BD', u'\U0001f44d', u'\U0001F3C8']: + for d in [(":", ":"), ("a", "b"), ("123", "456"), (u"😁", u"👌")]: + s = emoji.demojize(e, delimiters=d) + assert s.startswith(d[0]) + assert s.endswith(d[1]) + + def test_emoji_lis(): - assert emoji.emoji_lis('Hi, I am fine. 😁') == [{'location': 15, 'emoji': '😁'}] + assert emoji.emoji_lis('Hi, I am 👌 test')[0]['location'] == 9 assert emoji.emoji_lis('Hi') == [] - if len('Hello 🇫🇷👌') < 10: # skip this test on python with UCS-2 as the string length/positions are different - assert emoji.emoji_lis('Hello 🇫🇷👌') == [{'emoji': '🇫🇷', 'location': 6}, {'emoji': '👌', 'location': 8}] + if len('Hello 🇫🇷👌') < 10: # skip these tests on python with UCS-2 as the string length/positions are different + assert emoji.emoji_lis('Hi, I am fine. 😁') == [ + {'location': 15, 'emoji': '😁'}] + assert emoji.emoji_lis('Hello 🇫🇷👌') == [ + {'emoji': '🇫🇷', 'location': 6}, {'emoji': '👌', 'location': 8}] def test_distinct_emoji_lis(): @@ -167,3 +245,137 @@ def test_is_emoji(): assert emoji.is_emoji('😁') assert not emoji.is_emoji('H') assert emoji.is_emoji('🇫🇷') + + +def test_long_emoji(): + assert emoji.demojize('This is \U0001F9D1\U0001F3FC\U0000200D\U0001F37C example text') == 'This is :person_feeding_baby_medium-light_skin_tone: example text' + assert emoji.demojize('This is \U0001f468\U0001f3ff\u200d\u2764\ufe0f\u200d\U0001f468\U0001f3ff example text \U0001F469\U0001F3FB\U0000200D\U0001F91D\U0000200D\U0001F468\U0001F3FF') == 'This is :couple_with_heart_man_man_dark_skin_tone: example text :woman_and_man_holding_hands_light_skin_tone_dark_skin_tone:' + assert emoji.demojize('This is \U0001f468\U0001f3ff\u200d\u2764\ufe0f\u200d\U0001f468\U0001f3ff\U0001f468\U0001f3ff\u200d\u2764\ufe0f\u200d\U0001f48b\u200d\U0001f468\U0001f3ff example text \U0001F469\U0001F3FB\U0000200D\U0001F91D\U0000200D\U0001F468\U0001F3FF') == 'This is :couple_with_heart_man_man_dark_skin_tone::kiss_man_man_dark_skin_tone: example text :woman_and_man_holding_hands_light_skin_tone_dark_skin_tone:' + assert emoji.demojize('\U0001F46B\U0001F3FB This is \U0001f468\U0001f3ff\U0001f468\U0001f3ff\u200d\u2764\ufe0f\u200d\U0001f468\U0001f3ff\U0001f468\U0001f3ff\u200d\u2764\ufe0f\u200d\U0001f48b\u200d\U0001f468\U0001f3ff example text \U0001F469\U0001F3FB\U0000200D\U0001F91D\U0000200D\U0001F468\U0001F3FF') == ':woman_and_man_holding_hands_light_skin_tone: This is :man_dark_skin_tone::couple_with_heart_man_man_dark_skin_tone::kiss_man_man_dark_skin_tone: example text :woman_and_man_holding_hands_light_skin_tone_dark_skin_tone:' + assert emoji.demojize('\U0001F46B\U0001F3FB\U0001F46B\U0001F3FB\U0001F469\U0001F3FB\U0000200D\U0001F91D\U0000200D\U0001F468\U0001F3FF\U0001FAF1\U0001F3FD\U0001FAF1\U0001F3FD\U0000200D\U0001FAF2\U0001F3FF') == ':woman_and_man_holding_hands_light_skin_tone::woman_and_man_holding_hands_light_skin_tone::woman_and_man_holding_hands_light_skin_tone_dark_skin_tone::rightwards_hand_medium_skin_tone::handshake_medium_skin_tone_dark_skin_tone:' + s = ":crossed_fingers_medium-light_skin_tone::crossed_fingers::crossed_fingers_dark_skin_tone:" + assert emoji.demojize(emoji.demojize(s)) == s + + + +def test_untranslated(): + for emj, item in emoji.EMOJI_DATA.items(): + if item['status'] != emoji.STATUS['fully_qualified']: + continue + if 'es' not in item: + # untranslated + value = emoji.emojize(item['en'], language='en') + roundtrip = emoji.demojize(value, language='es') + assert roundtrip == value, '%s != %s (from %s)' % (roundtrip.encode("unicode-escape").decode(), value.encode("unicode-escape").decode(), item['en']) + else: + # translated + value = emoji.emojize(item['en'], language='en') + roundtrip = emoji.demojize(value, language='es') + assert roundtrip == item['es'], '%s != %s' % (roundtrip, item['es']) + + +def test_text(): + UCS2 = len('Hello 🇫🇷👌') > 9 # don't break up characters on python with UCS-2 + + text = u"""Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. +Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. +Excepteur sint occaecat in reprehenderit in cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +Stróż pchnął kość w quiz gędźb vel fax myjń. +Høj bly gom vandt fræk sexquiz på wc. +Съешь же ещё этих мягких французских булок, да выпей чаю. +За миг бях в чужд плюшен скърцащ фотьойл. +هلا سكنت بذي ضغثٍ فقد زعموا — شخصت تطلب ظبياً راح مجتازا +שפן אכל קצת גזר בטעם חסה, ודי +ऋषियों को सताने वाले दुष्ट राक्षसों के राजा रावण का सर्वनाश करने वाले विष्णुवतार भगवान श्रीराम, अयोध्या के महाराज दशरथ के बड़े सपुत्र थे। +とりなくこゑす ゆめさませ みよあけわたる ひんかしを そらいろはえて おきつへに ほふねむれゐぬ もやのうち +視野無限廣,窗外有藍天 +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. +Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +""" + + def add_random_emoji(text, lst, select=lambda emj_data: emj_data['en']): + + text = text + + emoji_list = [] + text_with_unicode = u"" + text_with_placeholder = u"" + for i in range(0, len(text), 10): + while True: + emj, emj_data = random.choice(lst) + placeholder = select(emj_data) + if placeholder: + break + + if UCS2: + j = text.find(u" ", i, i + 10) + if j == -1: + continue + else: + j = random.randint(i, i + 10) + + text_with_unicode += text[i:j] + text_with_unicode += emj + text_with_unicode += text[j:i + 10] + + text_with_placeholder += text[i:j] + text_with_placeholder += placeholder + text_with_placeholder += text[j:i + 10] + + emoji_list.append(emj) + + return text_with_unicode, text_with_placeholder, emoji_list + + def clean(s): + return s.replace(u'\u200d', '').replace(u'\ufe0f', '') + + all_emoji_list = list(emoji.EMOJI_DATA.items()) + qualified_emoji_list = list((emj, item) for emj, item in emoji.EMOJI_DATA.items() if item['status'] == emoji.STATUS['fully_qualified']) + + # qualified emoji + text_with_unicode, text_with_placeholder, emoji_list = add_random_emoji(text, qualified_emoji_list) + assert emoji.demojize(text_with_unicode) == text_with_placeholder + assert emoji.emojize(text_with_placeholder) == text_with_unicode + if not UCS2: + assert emoji.replace_emoji(text_with_unicode, u'') == text + assert set(emoji.distinct_emoji_lis(text_with_unicode)) == set(emoji_list) + for i, lis in enumerate(emoji.emoji_lis(text_with_unicode)): + assert lis['emoji'] == emoji_list[i] + + # qualified emoji from "es" + selector = lambda emoji_data: emoji_data["es"] if "es" in emoji_data else False + text_with_unicode, text_with_placeholder, emoji_list = add_random_emoji(text, qualified_emoji_list, selector) + assert emoji.demojize(text_with_unicode, language="es") == text_with_placeholder + assert emoji.emojize(text_with_placeholder, language="es") == text_with_unicode + if not UCS2: + assert emoji.replace_emoji(text_with_unicode, u'') == text + assert set(emoji.distinct_emoji_lis(text_with_unicode)) == set(emoji_list) + for i, lis in enumerate(emoji.emoji_lis(text_with_unicode)): + assert lis['emoji'] == emoji_list[i] + + # qualified emoji from "alias" + selector = lambda emoji_data: emoji_data["alias"][0] if "alias" in emoji_data else False + text_with_unicode, text_with_placeholder, emoji_list = add_random_emoji(text, qualified_emoji_list, selector) + assert emoji.demojize(text_with_unicode, use_aliases=True) == text_with_placeholder + assert emoji.emojize(text_with_placeholder, use_aliases=True) == text_with_unicode + if not UCS2: + assert emoji.replace_emoji(text_with_unicode, u'') == text + assert set(emoji.distinct_emoji_lis(text_with_unicode)) == set(emoji_list) + for i, lis in enumerate(emoji.emoji_lis(text_with_unicode)): + assert lis['emoji'] == emoji_list[i] + + # all emoji + text_with_unicode, text_with_placeholder, emoji_list = add_random_emoji(text, all_emoji_list) + assert emoji.demojize(text_with_unicode) == text_with_placeholder + assert clean(emoji.emojize(text_with_placeholder)) == clean(text_with_unicode) + if not UCS2: + assert emoji.replace_emoji(text_with_unicode, u'') == text + assert set(emoji.distinct_emoji_lis(text_with_unicode)) == set(emoji_list) + for i, lis in enumerate(emoji.emoji_lis(text_with_unicode)): + assert lis['emoji'] == emoji_list[i] + + +def test_text_multiple_times(): + for i in range(100): + test_text() From 148d531cf21eaafc0387c31fe48780442544a8ce Mon Sep 17 00:00:00 2001 From: cvzi Date: Thu, 18 Nov 2021 16:30:06 +0100 Subject: [PATCH 3/5] Code style --- emoji/core.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/emoji/core.py b/emoji/core.py index afcf376f..b085f88b 100644 --- a/emoji/core.py +++ b/emoji/core.py @@ -78,7 +78,8 @@ def emojize( """ if use_aliases and (language not in ('en', 'alias')): - warnings.warn("use_aliases=True is only supported in combination with language='en', use emojize(string, language='alias') for short", stacklevel=2) + warnings.warn("use_aliases=True is only supported for language='en'. " + "It is recommended to use emojize(string, language='alias') instead", stacklevel=2) use_aliases = (use_aliases and language == 'en') or language == 'alias' @@ -166,7 +167,8 @@ def demojize( use_aliases = True else: if use_aliases and language != 'en': - warnings.warn("use_aliases=True is only supported in combination with language='en', use demojize(string, language='alias') for short", stacklevel=2) + warnings.warn("use_aliases=True is only supported for language='en'. " + "It is recommended to use demojize(string, language='alias') instead", stacklevel=2) use_aliases = use_aliases and language == 'en' tree = _get_search_tree() @@ -203,7 +205,6 @@ def demojize( replace_str = delimiters[0] + emj_data[language][1:-1] + delimiters[1] else: # The emoji exists, but it is not translated, so we keep the emoji - # TODO write a test for this case replace_str = code_points i = j - 1 @@ -326,6 +327,7 @@ def version(string): # Try to find first emoji in string version = [] + def f(e, emoji_data): version.append(emoji_data['E']) return '' @@ -342,6 +344,7 @@ def f(e, emoji_data): raise ValueError("No emoji found in string") + def _get_search_tree(): """ Generate a search tree for demojize() From 424acc5cd91cdc3ab7d001d7078bdd23e5deb977 Mon Sep 17 00:00:00 2001 From: cvzi Date: Fri, 26 Nov 2021 23:48:13 +0100 Subject: [PATCH 4/5] use_aliases=True overrides language='...' (this restores the behaviour of previous versions) Bugfix: Default delimiters were used instead of the custom delimiters in emojize() when an unknown emoji was found. --- emoji/core.py | 31 +++++++++++------------ tests/test_core.py | 62 +++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 71 insertions(+), 22 deletions(-) diff --git a/emoji/core.py b/emoji/core.py index b085f88b..d9967a31 100644 --- a/emoji/core.py +++ b/emoji/core.py @@ -54,7 +54,8 @@ def emojize( :param use_aliases: (optional) Enable emoji aliases. See ``emoji.UNICODE_EMOJI_ALIAS``. :param delimiters: (optional) Use delimiters other than _DEFAULT_DELIMITER :param variant: (optional) Choose variation selector between "base"(None), VS-15 ("text_type") and VS-16 ("emoji_type") - :param language: Choose language of emoji name + :param language: Choose language of emoji name: language code 'es', 'de', etc. or 'alias' + to use English aliases :param version: (optional) Max version. If set to an Emoji Version, all emoji above this version will be ignored. :param handle_version: (optional) Replace the emoji above ``version`` @@ -77,23 +78,21 @@ def emojize( """ - if use_aliases and (language not in ('en', 'alias')): - warnings.warn("use_aliases=True is only supported for language='en'. " - "It is recommended to use emojize(string, language='alias') instead", stacklevel=2) - - use_aliases = (use_aliases and language == 'en') or language == 'alias' + if use_aliases or language == 'alias': + if language not in ('en', 'alias'): + warnings.warn("use_aliases=True is only supported for language='en'. " + "It is recommended to use emojize(string, language='alias') instead", stacklevel=2) + use_aliases = True + language = 'en' EMOJI_UNICODE = unicode_codes.EMOJI_ALIAS_UNICODE_ENGLISH if use_aliases else unicode_codes.EMOJI_UNICODE[language] pattern = re.compile(u'(%s[\\w\\-&.’”“()!#*+?–,/]+%s)' % delimiters, flags=re.UNICODE) def replace(match): - mg = match.group(1).replace(delimiters[0], _DEFAULT_DELIMITER).replace( - delimiters[1], _DEFAULT_DELIMITER - ) - - emj = EMOJI_UNICODE.get(mg) + mg = match.group(1)[len(delimiters[0]):-len(delimiters[1])] + emj = EMOJI_UNICODE.get(_DEFAULT_DELIMITER + mg + _DEFAULT_DELIMITER) if emj is None: - return mg + return match.group(1) if version is not None: if emj in unicode_codes.EMOJI_DATA and unicode_codes.EMOJI_DATA[emj]['E'] > version: @@ -140,7 +139,8 @@ def demojize( :param string: String contains unicode characters. MUST BE UNICODE. :param use_aliases: (optional) Return emoji aliases. See ``emoji.UNICODE_EMOJI_ALIAS``. :param delimiters: (optional) User delimiters other than ``_DEFAULT_DELIMITER`` - :param language: (optional) Choose language of emoji name + :param language: Choose language of emoji name: language code 'es', 'de', etc. or 'alias' + to use English aliases :param version: (optional) Max version. If set to an Emoji Version, all emoji above this version will be removed. :param handle_version: (optional) Replace the emoji above ``version`` @@ -165,11 +165,10 @@ def demojize( if language == 'alias': language = 'en' use_aliases = True - else: - if use_aliases and language != 'en': + elif use_aliases and language != 'en': warnings.warn("use_aliases=True is only supported for language='en'. " "It is recommended to use demojize(string, language='alias') instead", stacklevel=2) - use_aliases = use_aliases and language == 'en' + language = 'en' tree = _get_search_tree() result = [] diff --git a/tests/test_core.py b/tests/test_core.py index 03d93de1..7cef4b08 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -11,6 +11,11 @@ import pytest +def ascii(s): + # return escaped Code points \U000AB123 + return s.encode("unicode-escape").decode() + + def test_emojize_name_only(): for lang_code, emoji_pack in emoji.EMOJI_UNICODE.items(): for name in emoji_pack.keys(): @@ -108,6 +113,8 @@ def test_emojize_invalid_emoji(): string = '__---___--Invalid__--__-Name' assert emoji.emojize(string, False) == string + string = ':: baby:: :_: : : : : :-: :+:' + assert emoji.emojize(string, False) == string def test_alias(): # When use_aliases=False aliases should be passed through untouched @@ -162,20 +169,28 @@ def test_invalid_alias(): variant="text_type") == ':socer:' -@pytest.mark.filterwarnings("ignore") def test_alias_wrong_language(): # Alias with wrong languages thailand = u'🇹🇭' with pytest.warns(UserWarning) as w: emoji.emojize(':flag_for_Thailand:', use_aliases=True, language="es") - assert emoji.emojize(':flag_for_Thailand:', use_aliases=True, language="es") == ':flag_for_Thailand:' + with pytest.warns(UserWarning) as w: + assert emoji.emojize(':flag_for_Thailand:', use_aliases=True, language="de") == thailand + with pytest.warns(UserWarning) as w: + assert emoji.emojize(':flag_for_Thailand:', use_aliases=True, language="es") == thailand + assert emoji.emojize(':flag_for_Thailand:', use_aliases=False, language="es") == ':flag_for_Thailand:' assert emoji.emojize(':flag_for_Thailand:', use_aliases=True, language="en") == thailand + assert emoji.emojize(':flag_for_Thailand:', use_aliases=False, language="alias") == thailand + assert emoji.emojize(':flag_for_Thailand:', use_aliases=True, language="alias") == thailand with pytest.warns(UserWarning) as w: emoji.demojize(thailand, use_aliases=True, language="es") - assert emoji.demojize(thailand, use_aliases=True, language="es") == ':bandera_tailandia:' + with pytest.warns(UserWarning) as w: + assert emoji.demojize(thailand, use_aliases=True, language="es") == ':flag_for_Thailand:' + assert emoji.demojize(thailand, use_aliases=False, language="es") == ':bandera_tailandia:' assert emoji.demojize(thailand, use_aliases=True, language="en") == ':flag_for_Thailand:' - + assert emoji.demojize(thailand, use_aliases=False, language="alias") == ':flag_for_Thailand:' + assert emoji.demojize(thailand, use_aliases=True, language="alias") == ':flag_for_Thailand:' def test_demojize_name_only(): for emj, item in emoji.EMOJI_DATA.items(): @@ -200,11 +215,23 @@ def test_demojize_complicated_string(): def test_demojize_delimiters(): for e in [u'\U000026BD', u'\U0001f44d', u'\U0001F3C8']: - for d in [(":", ":"), ("a", "b"), ("123", "456"), (u"😁", u"👌")]: + for d in [(":", ":"), ("a", "b"), ("!", "!!"), ("123", "456"), (u"😁", u"👌")]: s = emoji.demojize(e, delimiters=d) assert s.startswith(d[0]) assert s.endswith(d[1]) + text = u"Example of a text with an emoji%sin a sentence" + for e in [u'\U000026BD', u'\U0001f44d', u'\U0001F3C8']: + for d in [(":", ":"), ("!", "-!-"), ("-", "-"), (":", "::"), ("::", "::"), (u"😁", u"👌")]: + text_with_unicode = text % e + demojized_text = emoji.demojize(text_with_unicode, delimiters=d) + assert text_with_unicode != demojized_text + assert e not in demojized_text + assert emoji.emojize(demojized_text, delimiters=d) == text_with_unicode + text_with_emoji = text % emoji.demojize(e, delimiters=d) + assert demojized_text == text_with_emoji + assert emoji.emojize(text_with_emoji, delimiters=d) == text_with_unicode + def test_emoji_lis(): assert emoji.emoji_lis('Hi, I am 👌 test')[0]['location'] == 9 @@ -266,7 +293,7 @@ def test_untranslated(): # untranslated value = emoji.emojize(item['en'], language='en') roundtrip = emoji.demojize(value, language='es') - assert roundtrip == value, '%s != %s (from %s)' % (roundtrip.encode("unicode-escape").decode(), value.encode("unicode-escape").decode(), item['en']) + assert roundtrip == value, '%s != %s (from %s)' % (ascii(roundtrip), ascii(value), item['en']) else: # translated value = emoji.emojize(item['en'], language='en') @@ -377,5 +404,28 @@ def clean(s): def test_text_multiple_times(): + # Run test_text() multiple times because it relies on a random text for i in range(100): test_text() + + +def test_invalid_chars(): + invalidchar = u"\U0001F20F" + assert emoji.demojize(invalidchar) == invalidchar, "%r != %r" % (ascii(emoji.demojize(invalidchar)), ascii(invalidchar)) + assert emoji.demojize(invalidchar) == invalidchar, "%r != %r" % (ascii(emoji.demojize(invalidchar)), ascii(invalidchar)) + + invalidchar = u"u\2302 ⌂" + assert emoji.demojize(invalidchar) == invalidchar, "%r != %r" % (ascii(emoji.demojize(invalidchar)), ascii(invalidchar)) + assert emoji.demojize(invalidchar) == invalidchar, "%r != %r" % (ascii(emoji.demojize(invalidchar)), ascii(invalidchar)) + + +def test_combine_with_component(): + text = u"Example of a combined emoji%sin a sentence" + + combined = emoji.emojize(text % u":woman_dark_skin_tone:") + seperated = emoji.emojize(text % u":woman::dark_skin_tone:") + assert combined == seperated, "%r != %r" % (ascii(combined), ascii(seperated)) + + combined = emoji.emojize(text % u":woman_dark_skin_tone_white_hair:") + seperated = emoji.emojize(text % u":woman::dark_skin_tone:\u200d:white_hair:") + assert combined == seperated, "%r != %r" % (ascii(combined), ascii(seperated)) From b3b0dc6ccb62bbe8a0ab57bba8c6e786ced6d59f Mon Sep 17 00:00:00 2001 From: Tahir Jalilov Date: Mon, 6 Dec 2021 15:00:58 +0400 Subject: [PATCH 5/5] small fixes --- CHANGES.md | 7 ++++++- emoji/__init__.py | 2 +- tests/test_core.py | 5 +++-- tests/test_dict.py | 3 +++ 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 0880d87d..fb47dbfd 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,12 @@ emoji ===== +1.6.2 +----- +* Improve performance of demojize() +* Added more tests +* Added warning when someone uses any other language than 'en' with use_aliases=True in emojize() + 1.6.1 ----- * Allow multiple aliases @@ -14,7 +20,6 @@ emoji * emoji.version(string) method added * Included 'variant' in the dict of dicts - 1.5.0 ----- * Emojis of English version updated to the Emoji Charts v14.0 diff --git a/emoji/__init__.py b/emoji/__init__.py index 9edc0293..ce1f6e89 100644 --- a/emoji/__init__.py +++ b/emoji/__init__.py @@ -30,7 +30,7 @@ 'EMOJI_ALIAS_UNICODE_ENGLISH', 'UNICODE_EMOJI_ALIAS_ENGLISH', 'EMOJI_DATA', ] -__version__ = '1.6.1' +__version__ = '1.6.2' __author__ = 'Taehoon Kim, Kevin Wurster and Tahir Jalilov' __email__ = 'carpedm20@gmail.com' # and wursterk@gmail.com, tahir.jalilov@gmail.com diff --git a/tests/test_core.py b/tests/test_core.py index 7cef4b08..b3386d16 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -116,6 +116,7 @@ def test_emojize_invalid_emoji(): string = ':: baby:: :_: : : : : :-: :+:' assert emoji.emojize(string, False) == string + def test_alias(): # When use_aliases=False aliases should be passed through untouched assert emoji.emojize(':soccer:', use_aliases=False) == ':soccer:' @@ -165,7 +166,7 @@ def test_invalid_alias(): assert emoji.emojize(':tester:', use_aliases=True) == ':tester:' assert emoji.emojize(':footbal:', use_aliases=True) == ':footbal:' assert emoji.emojize(':socer:', use_aliases=True) == ':socer:' - emoji.emojize(':socer:', use_aliases=True, + assert emoji.emojize(':socer:', use_aliases=True, variant="text_type") == ':socer:' @@ -192,6 +193,7 @@ def test_alias_wrong_language(): assert emoji.demojize(thailand, use_aliases=False, language="alias") == ':flag_for_Thailand:' assert emoji.demojize(thailand, use_aliases=True, language="alias") == ':flag_for_Thailand:' + def test_demojize_name_only(): for emj, item in emoji.EMOJI_DATA.items(): if item['status'] != emoji.STATUS['fully_qualified']: @@ -284,7 +286,6 @@ def test_long_emoji(): assert emoji.demojize(emoji.demojize(s)) == s - def test_untranslated(): for emj, item in emoji.EMOJI_DATA.items(): if item['status'] != emoji.STATUS['fully_qualified']: diff --git a/tests/test_dict.py b/tests/test_dict.py index 9460bda6..d1afe0d9 100644 --- a/tests/test_dict.py +++ b/tests/test_dict.py @@ -8,6 +8,8 @@ import emoji _all_languages = None + + def all_languages(): """List of all language keys in EMOJI_DATA""" @@ -48,6 +50,7 @@ def check_duplicate_names(lang): assert name not in seen seen[name] = 0 + def test_duplicate_names(): """Check that there are no duplicate names in the fully_qualified except for differnt variants""" for lang in all_languages():