In [1]:
%pip install --upgrade tiktoken
import tiktoken
import csv
from tiktoken import Encoding

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Start from OpenAI's cl100k_base encoding; its split pattern, merge table and
# existing special tokens are reused unchanged by the custom encoder below.
cl100k_base = tiktoken.get_encoding("cl100k_base")

# SELFIES punctuation that should each map to a single dedicated token.
# List order fixes the ids: the first symbol gets 100264, the last 100271.
selfies_symbols = ["[", "]", "=", "#", "-", "+", "@", "H"]
selfies_special_tokens = {
    symbol: 100264 + offset for offset, symbol in enumerate(selfies_symbols)
}

# Custom encoding = cl100k_base plus the SELFIES symbols as special tokens.
# NOTE(review): the ids are assumed not to clash with any cl100k_base rank or
# special-token id - confirm against the installed tiktoken version.
enc = tiktoken.Encoding(
    name="SELFIES_encoder",
    pat_str=cl100k_base._pat_str,
    mergeable_ranks=cl100k_base._mergeable_ranks,
    special_tokens={**cl100k_base._special_tokens, **selfies_special_tokens},
)

# The set of custom symbols, intended for use as `allowed_special` when
# calling `enc.encode`.
specials_allowed = set(selfies_symbols)

In [None]:
def compare_encodings(example_string: str) -> None:
    """Print how several encodings tokenize the same string.

    The string is encoded with the stock ``gpt2``, ``p50k_base`` and
    ``cl100k_base`` encodings and then with the notebook's custom ``enc``
    encoder. For each one the token count, the token integers and the raw
    token bytes are printed.

    Args:
        example_string: Text to tokenize (the notebook passes SELFIES strings).

    Returns:
        None. All results are written to stdout.

    Raises:
        ValueError: Propagated from tiktoken if the text contains a special
            token that the encoder being used does not allow.
    """
    # print the example string
    print(f'\nExample string: "{example_string}"')
    # for each encoding, print the # of tokens, the token integers, and the token bytes
    for encoding_name in ["gpt2", "p50k_base", "cl100k_base"]:
        encoding = tiktoken.get_encoding(encoding_name)
        token_integers = encoding.encode(example_string)
        num_tokens = len(token_integers)
        token_bytes = [encoding.decode_single_token_bytes(token) for token in token_integers]
        print()
        print(f"{encoding_name}: {num_tokens} tokens")
        print(f"token integers: {token_integers}")
        print(f"token bytes: {token_bytes}")

    # Allow every custom symbol, not just a subset. tiktoken raises ValueError
    # when the text contains a registered special token that is not in
    # `allowed_special`, and SELFIES strings routinely contain '#', '-', '+',
    # '@' and 'H' in addition to '[', ']' and '='.
    token_ints = enc.encode(example_string, allowed_special=specials_allowed)
    num_custom = len(token_ints)
    custom_bytes = [enc.decode_single_token_bytes(token) for token in token_ints]
    print()
    print(f"Custom encoder: {num_custom}")
    print(f"token integers: {token_ints}")
    print(f"token bytes: {custom_bytes}")


# Run the comparison on three sample SELFIES strings, in order.
for selfies_example in (
    "[C][N][Branch][C][=C][C][=C][C][=C][Ring1][=Branch][pop][C][=C][C][=C][C][=C][Branch][C][=Branch][=O][pop][N][C][C][Branch][O][pop][C][C][O][C][C][Ring1][#Branch][pop][Ring1][=Branch]",
    "[=C][O][C][=N][C][Branch][C][=Branch][=O][pop][OH0-1][pop][=C][C][Branch][C][Branch][F][pop][F][pop][=C][Branch][Br][pop][Ring1][#C]",
    "[NH3+1][C][C][O][C][Branch][N][N][=C][C][=Branch][=O][pop][N][C][=Branch][=O][pop][Ring1][#Branch][pop][C][Branch][O][pop][C][Branch][O][pop][Ring1][P]",
):
    compare_encodings(selfies_example)