In [1]:
!pip install tiktoken
!pip install emoji

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.7.0
Collecting emoji
  Downloading emoji-2.12.1-py3-none-any.whl.metadata (5.4 kB)
Downloading emoji-2.12.1-py3-none-any.whl (431 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.4/431.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.12.1


In [2]:
# Initialise the GPT-4 tokenizer. `enc` is the shared encoder that the cells
# below use to turn text into token ids and token ids back into text.
import tiktoken
enc = tiktoken.encoding_for_model("gpt-4")
print(enc.n_vocab) # number of tokens in total

100277


In [3]:
# Build the emoji pool: every key of emoji.EMOJI_DATA, shuffled with a fixed
# seed so that the order is repeatable for a given version of the package.
import random
import emoji

random.seed(15)
emojis = list(emoji.EMOJI_DATA)
random.shuffle(emojis)
print(len(emojis)) # number of possible emoji

5034


In [4]:
def text_to_tokens(text, max_per_row=10):
    """Render `text` as emoji, one emoji per tokenizer token.

    The text is encoded with the module-level `enc` tokenizer and every
    distinct token id is paired with one entry of the module-level `emojis`
    list, so equal tokens are always shown as the same emoji.

    Args:
        text: String to tokenize.
        max_per_row: Number of emoji per output line; must be at least 1.

    Returns:
        A tuple `(out, id_to_emoji)`. `out` is the emoji string with rows
        separated by newlines; `id_to_emoji` maps each token id occurring in
        `text` to the emoji that stands in for it.

    Raises:
        ValueError: If `max_per_row` is smaller than 1, or if `text` contains
            more distinct tokens than there are emoji to assign.
    """
    if max_per_row < 1:
        # A negative step would otherwise yield an empty string silently.
        raise ValueError(f"max_per_row must be at least 1, got {max_per_row}")

    ids = enc.encode(text)
    unique_tokens = set(ids)
    # zip() stops at the shorter input, so without this check surplus tokens
    # would be left out of the mapping and fail later with a bare KeyError.
    if len(unique_tokens) > len(emojis):
        raise ValueError(
            f"Text has {len(unique_tokens)} distinct tokens but only "
            f"{len(emojis)} emoji are available"
        )

    # map all tokens we see to a unique emoji
    id_to_emoji = dict(zip(unique_tokens, emojis))

    # do the translation, one output row per slice of max_per_row tokens
    lines = [
        ''.join(id_to_emoji[token_id] for token_id in ids[start:start + max_per_row])
        for start in range(0, len(ids), max_per_row)
    ]
    return '\n'.join(lines), id_to_emoji

In [5]:
# Example: tokenize one sentence and print the returned tuple, i.e. the emoji
# string followed by the token id -> emoji mapping.
text = """How many letters 'r' in the word 'strawberry'?"""
print(text_to_tokens(text, max_per_row=20))

('🧑\u200d🦽👩🏿\u200d❤️\u200d💋\u200d👨🏻🤾🏻\u200d♀️🙍\u200d♀️🤙🏻🧑🏾\u200d🦼\u200d➡️✌🏿💂📏🙍\u200d♀️🈴🧎\u200d♀🍏🧑\u200d🦼\u200d➡️', {675: '🧎\u200d♀', 3492: '📏', 12197: '🤾🏻\u200d♀️', 6: '🧑🏾\u200d🦼\u200d➡️', 15717: '🍏', 364: '🙍\u200d♀️', 304: '✌🏿', 81: '🤙🏻', 496: '🈴', 71090: '🧑\u200d🦼\u200d➡️', 4438: '🧑\u200d🦽', 279: '💂', 1690: '👩🏿\u200d❤️\u200d💋\u200d👨🏻'})


In [6]:
# Raw token ids for the same sentence, to compare with the mapping keys
# printed by the previous cell.
enc.encode(text)

[4438, 1690, 12197, 364, 81, 6, 304, 279, 3492, 364, 496, 675, 15717, 71090]

In [7]:
def _split_known_emojis(chunk, emoji_to_id, candidates):
    """Split `chunk` into mapped emoji and return their token ids, or None.

    At each position the first entry of `candidates` (ordered longest first)
    that matches is consumed. None is returned as soon as a position matches
    no candidate.
    """
    ids = []
    pos = 0
    while pos < len(chunk):
        for candidate in candidates:
            if chunk.startswith(candidate, pos):
                ids.append(emoji_to_id[candidate])
                pos += len(candidate)
                break
        else:
            return None
    return ids


def tokens_to_text(emoji_text, id_to_emoji):
    """Decode an emoji string back into text using a token id -> emoji mapping.

    Args:
        emoji_text: Emoji string, e.g. the first value returned by
            `text_to_tokens`. Characters that `emoji.emoji_list` does not
            report as emoji (such as the row-separating newlines) are ignored.
        id_to_emoji: Mapping from token id to emoji, e.g. the second value
            returned by `text_to_tokens`.

    Returns:
        The text obtained by decoding the recovered token ids with the
        module-level `enc` tokenizer.

    Raises:
        ValueError: If an emoji found in `emoji_text` is neither in the
            mapping nor a concatenation of emoji from the mapping.
    """
    emoji_to_id = {symbol: token_id for token_id, symbol in id_to_emoji.items()}
    # Longest first so the fallback split prefers the most specific emoji;
    # empty strings are dropped because they could never advance the split.
    candidates = sorted((s for s in emoji_to_id if s), key=len, reverse=True)

    ids = []
    for item in emoji.emoji_list(emoji_text):
        char = item['emoji']
        if char in emoji_to_id:
            ids.append(emoji_to_id[char])
            continue
        # emoji_list matches against the whole emoji database, so two adjacent
        # mapped emoji (e.g. a base emoji followed by a skin-tone modifier)
        # can come back as one merged emoji. Try to split it back apart.
        pieces = _split_known_emojis(char, emoji_to_id, candidates)
        if pieces is None:
            raise ValueError(f"Emoji {char} not found in the mapping")
        ids.extend(pieces)
    return enc.decode(ids)


In [9]:
# Round trip: text -> emoji -> text.
words = """your words """
# text_to_tokens returns the emoji string and the token id -> emoji mapping.
emoji_seq, emoji_ids = text_to_tokens(words, max_per_row=15)
print("Emoji Sequence:\n", emoji_seq)
print("Encoded Tokens:\n", enc.encode(words))
# NOTE: despite the label below, emoji_ids is keyed by token id and its
# values are the emoji.
print("Emoji to Token Mapping:\n", emoji_ids)
print("Decoded Text:\n", tokens_to_text(emoji_seq, emoji_ids))

Emoji Sequence:
 🤾🏻‍♀️🧎‍♀📏
Encoded Tokens:
 [22479, 4339, 220]
Emoji to Token Mapping:
 {4339: '🧎\u200d♀', 220: '📏', 22479: '🤾🏻\u200d♀️'}
Decoded Text:
 your words 
