In [1]:
import os
import json
from collections import OrderedDict

### Steps

1. **Download** `vocab.json` from [here](https://huggingface.co/DeepChem/SmilesTokenizer_PubChem_1M/tree/main).
2. Place it in the `/data-bin/tokenizers/tokenizer_smiles` folder.
3. Rename it to `deepchem_vocab.json` (the file name the cells below load).

In [2]:
# Relative location of the SMILES tokenizer assets (vocab files, tokenizer JSONs).
tokenizer_dir = os.path.normpath("./tokenizers/tokenizer_smiles")

# os.chdir with a relative path is not idempotent: re-running this cell would
# try to descend a second time and raise FileNotFoundError. Skip the chdir when
# the working directory already is the tokenizer folder.
if not os.path.normpath(os.getcwd()).endswith(os.sep + tokenizer_dir):
    os.chdir(tokenizer_dir)

In [3]:
# Parse the downloaded DeepChem vocab; the JSON object's keys are the tokens.
with open("./deepchem_vocab.json", "r") as vocab_file:
    vocab = json.load(vocab_file)

In [4]:
# Iterating a dict yields its keys, i.e. the tokens in the order they were loaded.
vocab_tokens = list(vocab)

print(len(vocab_tokens))

591


In [5]:
vocab_tokens

['[PAD]',
 '[unused1]',
 '[unused2]',
 '[unused3]',
 '[unused4]',
 '[unused5]',
 '[unused6]',
 '[unused7]',
 '[unused8]',
 '[unused9]',
 '[unused10]',
 '[UNK]',
 '[CLS]',
 '[SEP]',
 '[MASK]',
 'c',
 'C',
 '(',
 ')',
 'O',
 '1',
 '2',
 '=',
 'N',
 '.',
 'n',
 '3',
 'F',
 'Cl',
 '>>',
 '~',
 '-',
 '4',
 '[C@H]',
 'S',
 '[C@@H]',
 '[O-]',
 'Br',
 '#',
 '/',
 '[nH]',
 '[N+]',
 's',
 '5',
 'o',
 'P',
 '[Na+]',
 '[Si]',
 'I',
 '[Na]',
 '[Pd]',
 '[K+]',
 '[K]',
 '[P]',
 'B',
 '[C@]',
 '[C@@]',
 '[Cl-]',
 '6',
 '[OH-]',
 '\\',
 '[N-]',
 '[Li]',
 '[H]',
 '[2H]',
 '[NH4+]',
 '[c-]',
 '[P-]',
 '[Cs+]',
 '[Li+]',
 '[Cs]',
 '[NaH]',
 '[H-]',
 '[O+]',
 '[BH4-]',
 '[Cu]',
 '7',
 '[Mg]',
 '[Fe+2]',
 '[n+]',
 '[Sn]',
 '[BH-]',
 '[Pd+2]',
 '[CH]',
 '[I-]',
 '[Br-]',
 '[C-]',
 '[Zn]',
 '[B-]',
 '[F-]',
 '[Al]',
 '[P+]',
 '[BH3-]',
 '[Fe]',
 '[C]',
 '[AlH4]',
 '[Ni]',
 '[SiH]',
 '8',
 '[Cu+2]',
 '[Mn]',
 '[AlH]',
 '[nH+]',
 '[AlH4-]',
 '[O-2]',
 '[Cr]',
 '[Mg+2]',
 '[NH3+]',
 '[S@]',
 '[Pt]',
 '[Al+3]',
 

In [6]:
# Special tokens for the new vocab. Order is important: this list is prepended
# to the DeepChem tokens and written one token per line to vocab.txt, so a
# token's position here is its line index in that file
# ("<s>" -> 0, "[PAD]" -> 1, "</s>" -> 2, ...).
special_tokens = [
    "<s>",
    "[PAD]",
    "</s>",
    "[UNK]",
    "[SEP]",
    "[CLS]",
    "[MASK]",
    "[unused1]",
    "[unused2]",
    "[unused3]",
    "[unused4]",
    "[unused5]",
    "[unused6]",
    "[unused7]",
    "[unused8]",
    "[unused9]",
    "[unused10]"
]

In [7]:
# Inspect the head of the DeepChem vocab: its first 15 entries are the special
# tokens ([PAD], [unused1]..[unused10], [UNK], [CLS], [SEP], [MASK]) that are
# removed in the next cell.
vocab_tokens[:15]

['[PAD]',
 '[unused1]',
 '[unused2]',
 '[unused3]',
 '[unused4]',
 '[unused5]',
 '[unused6]',
 '[unused7]',
 '[unused8]',
 '[unused9]',
 '[unused10]',
 '[UNK]',
 '[CLS]',
 '[SEP]',
 '[MASK]']

In [8]:
# Drop every token that is already listed in `special_tokens` instead of
# slicing off a hard-coded 15 entries: this keeps working if the upstream vocab
# reorders or adds special tokens, and guarantees that merging the two lists
# cannot produce duplicate entries.
special_token_set = set(special_tokens)
filtered_vocab_tokens = [
    token for token in vocab_tokens if token not in special_token_set
]
filtered_vocab_tokens

['c',
 'C',
 '(',
 ')',
 'O',
 '1',
 '2',
 '=',
 'N',
 '.',
 'n',
 '3',
 'F',
 'Cl',
 '>>',
 '~',
 '-',
 '4',
 '[C@H]',
 'S',
 '[C@@H]',
 '[O-]',
 'Br',
 '#',
 '/',
 '[nH]',
 '[N+]',
 's',
 '5',
 'o',
 'P',
 '[Na+]',
 '[Si]',
 'I',
 '[Na]',
 '[Pd]',
 '[K+]',
 '[K]',
 '[P]',
 'B',
 '[C@]',
 '[C@@]',
 '[Cl-]',
 '6',
 '[OH-]',
 '\\',
 '[N-]',
 '[Li]',
 '[H]',
 '[2H]',
 '[NH4+]',
 '[c-]',
 '[P-]',
 '[Cs+]',
 '[Li+]',
 '[Cs]',
 '[NaH]',
 '[H-]',
 '[O+]',
 '[BH4-]',
 '[Cu]',
 '7',
 '[Mg]',
 '[Fe+2]',
 '[n+]',
 '[Sn]',
 '[BH-]',
 '[Pd+2]',
 '[CH]',
 '[I-]',
 '[Br-]',
 '[C-]',
 '[Zn]',
 '[B-]',
 '[F-]',
 '[Al]',
 '[P+]',
 '[BH3-]',
 '[Fe]',
 '[C]',
 '[AlH4]',
 '[Ni]',
 '[SiH]',
 '8',
 '[Cu+2]',
 '[Mn]',
 '[AlH]',
 '[nH+]',
 '[AlH4-]',
 '[O-2]',
 '[Cr]',
 '[Mg+2]',
 '[NH3+]',
 '[S@]',
 '[Pt]',
 '[Al+3]',
 '[S@@]',
 '[S-]',
 '[Ti]',
 '[Zn+2]',
 '[PH]',
 '[NH2+]',
 '[Ru]',
 '[Ag+]',
 '[S+]',
 '[I+3]',
 '[NH+]',
 '[Ca+2]',
 '[Ag]',
 '9',
 '[Os]',
 '[Se]',
 '[SiH2]',
 '[Ca]',
 '[Ti+4]',
 '[Ac]',
 '

In [9]:
# Build the final vocab: our special tokens first, then the DeepChem tokens.
final_vocab_tokens = [*special_tokens, *filtered_vocab_tokens]

print("Len", len(final_vocab_tokens))
final_vocab_tokens

Len 593


['<s>',
 '[PAD]',
 '</s>',
 '[UNK]',
 '[SEP]',
 '[CLS]',
 '[MASK]',
 '[unused1]',
 '[unused2]',
 '[unused3]',
 '[unused4]',
 '[unused5]',
 '[unused6]',
 '[unused7]',
 '[unused8]',
 '[unused9]',
 '[unused10]',
 'c',
 'C',
 '(',
 ')',
 'O',
 '1',
 '2',
 '=',
 'N',
 '.',
 'n',
 '3',
 'F',
 'Cl',
 '>>',
 '~',
 '-',
 '4',
 '[C@H]',
 'S',
 '[C@@H]',
 '[O-]',
 'Br',
 '#',
 '/',
 '[nH]',
 '[N+]',
 's',
 '5',
 'o',
 'P',
 '[Na+]',
 '[Si]',
 'I',
 '[Na]',
 '[Pd]',
 '[K+]',
 '[K]',
 '[P]',
 'B',
 '[C@]',
 '[C@@]',
 '[Cl-]',
 '6',
 '[OH-]',
 '\\',
 '[N-]',
 '[Li]',
 '[H]',
 '[2H]',
 '[NH4+]',
 '[c-]',
 '[P-]',
 '[Cs+]',
 '[Li+]',
 '[Cs]',
 '[NaH]',
 '[H-]',
 '[O+]',
 '[BH4-]',
 '[Cu]',
 '7',
 '[Mg]',
 '[Fe+2]',
 '[n+]',
 '[Sn]',
 '[BH-]',
 '[Pd+2]',
 '[CH]',
 '[I-]',
 '[Br-]',
 '[C-]',
 '[Zn]',
 '[B-]',
 '[F-]',
 '[Al]',
 '[P+]',
 '[BH3-]',
 '[Fe]',
 '[C]',
 '[AlH4]',
 '[Ni]',
 '[SiH]',
 '8',
 '[Cu+2]',
 '[Mn]',
 '[AlH]',
 '[nH+]',
 '[AlH4-]',
 '[O-2]',
 '[Cr]',
 '[Mg+2]',
 '[NH3+]',
 '[S@]',
 '[P

In [10]:
# Persist the vocab as plain text, one token per line.
with open("./vocab.txt", "w") as vocab_file:
    vocab_file.writelines(token + "\n" for token in final_vocab_tokens)
    print('Done')

Done


In [11]:
# Read vocab.txt back as a list of lines with the line terminators stripped.
with open("./vocab.txt", "r") as vocab_file:
    k = vocab_file.read().splitlines()

In [12]:
# Map each token to its line index in vocab.txt. The trailing-newline strip is
# kept for parity with the loop version, although splitlines() already
# removed the terminators.
vocab = OrderedDict(
    (line.rstrip("\n"), position) for position, line in enumerate(k)
)

In [13]:
# Tokens are dict keys, so duplicate lines in vocab.txt would collapse into a
# single entry; equal lengths therefore confirm every written token is unique.
assert len(final_vocab_tokens) == len(vocab)

In [14]:
# Sample SMILES used to compare the tokenizers; it contains bracket atoms
# ([nH], [O-], [N+]) and the two-letter element Cl.
test_smiles = "CCc1c[nH]c2[O-]c1CC(N)C2(Cl)C[N+]"

### Old tokenizer 

In [15]:
from tokenizers import Tokenizer

In [16]:
# Created from a modified tokenizer.json from https://huggingface.co/DeepChem/SmilesTokenizer_PubChem_1M/tree/main
# BPE Tokenizer
tokenizer_old = Tokenizer.from_file("./old_tokenizer_smiles.json")

In [17]:
# The old tokenizer does not keep bracket atoms or "Cl" intact: in the output
# "[nH]" comes out as "n", "[O-]" as "O", "-", "Cl" as "C" and "[N+]" as "N".
tokenizer_old.encode(test_smiles).tokens

['[CLS]',
 'C',
 'C',
 'c',
 '1',
 'c',
 'n',
 'c',
 '2',
 'O',
 '-',
 'c',
 '1',
 'C',
 'C',
 '(',
 'N',
 ')',
 'C',
 '2',
 '(',
 'C',
 ')',
 'C',
 'N',
 '[SEP]']

### DeepChem tokenizer

In [18]:
# from https://github.com/deepchem/deepchem/blob/master/deepchem/feat/smiles_tokenizer.py
from deepchem_smiles_tokenizer import SmilesTokenizer, BasicSmilesTokenizer

In [19]:
tokenizer_deepchem = SmilesTokenizer("./deepchem_vocab.json")

In [20]:
tokenizer_deepchem.tokenize(test_smiles)

['C',
 'C',
 'c',
 '1',
 'c',
 '[nH]',
 'c',
 '2',
 '[O-]',
 'c',
 '1',
 'C',
 'C',
 '(',
 'N',
 ')',
 'C',
 '2',
 '(',
 'Cl',
 ')',
 'C',
 '[N+]']

### SMILES Pair Encoding tokenizer

In [53]:
# https://github.com/XinhaoLi74/SmilesPE
# Atom-level pre-tokenizer from SmilesPE; for this input its output is the same
# token list the DeepChem tokenizer produced.
from SmilesPE.pretokenizer import atomwise_tokenizer

atomwise_tokenizer(test_smiles)

['C',
 'C',
 'c',
 '1',
 'c',
 '[nH]',
 'c',
 '2',
 '[O-]',
 'c',
 '1',
 'C',
 'C',
 '(',
 'N',
 ')',
 'C',
 '2',
 '(',
 'Cl',
 ')',
 'C',
 '[N+]']

### Bert WordPiece tokenizer

In [22]:
from tokenizers import BertWordPieceTokenizer, Tokenizer, pre_tokenizers
import re

# Atom-level SMILES regex. Alternatives are tried left to right: a whole
# bracket expression (e.g. "[nH]") first, then `Br?` / `Cl?` so that "Br" and
# "Cl" are preferred over bare "B" / "C", the remaining single-letter atoms,
# the punctuation symbols, `>>?` for ">" or ">>", `%` followed by two digits,
# and finally a single digit.
SMI_REGEX_PATTERN = r"""(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"""

# Load the WordPiece tokenizer from vocab.txt with text cleaning, Chinese
# character handling, accent stripping and lowercasing all switched off.
tokenizer_wordpiece = BertWordPieceTokenizer.from_file(
    "vocab.txt",
    clean_text=False,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=False,
)

In [23]:
regex = re.compile(SMI_REGEX_PATTERN)

# Pre-tokenize the SMILES at atom level (approach taken from metaseq). The
# pattern has a single capture group, so findall already returns a flat list
# of the matched strings.
test_smiles_tokens = regex.findall(test_smiles.rstrip())
test_smiles_tokens

['C',
 'C',
 'c',
 '1',
 'c',
 '[nH]',
 'c',
 '2',
 '[O-]',
 'c',
 '1',
 'C',
 'C',
 '(',
 'N',
 ')',
 'C',
 '2',
 '(',
 'Cl',
 ')',
 'C',
 '[N+]']

In [24]:
tokenizer_wordpiece.encode(test_smiles_tokens, is_pretokenized=True).tokens

['[CLS]',
 'C',
 'C',
 'c',
 '1',
 'c',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 'c',
 '2',
 '[UNK]',
 'O',
 '-',
 '[UNK]',
 'c',
 '1',
 'C',
 'C',
 '(',
 'N',
 ')',
 'C',
 '2',
 '(',
 'Cl',
 ')',
 'C',
 '[UNK]',
 'N',
 '[UNK]',
 '[UNK]',
 '[SEP]']

In [25]:
# Reason:
# even though is_pretokenized=True, the tokenizer's own pre-tokenizer still
# runs on each supplied word and splits it on punctuation, so "[nH]" is broken
# into pieces that are not in the vocab (three [UNK]s in the output).
tokenizer_wordpiece.encode(["[nH]"], is_pretokenized=True).tokens

['[CLS]', '[UNK]', '[UNK]', '[UNK]', '[SEP]']

In [26]:
# Trick to get rid of the punctuation pre-tokenizer: replace it with a splitter
# on "&" (presumably chosen because "&" never appears in these SMILES tokens —
# TODO confirm), so each pre-tokenized word reaches WordPiece intact.
tokenizer_wordpiece.pre_tokenizer = pre_tokenizers.CharDelimiterSplit(delimiter="&")

In [27]:
# Now the Punctuation pretokenizer is off, that's why tokenizer treats '[nH]' as one token
tokenizer_wordpiece.encode(["[nH]"], is_pretokenized=True).tokens

['[CLS]', '[nH]', '[SEP]']

In [29]:
tokenizer_wordpiece.encode(test_smiles_tokens, is_pretokenized=True).tokens

['[CLS]',
 'C',
 'C',
 'c',
 '1',
 'c',
 '[nH]',
 'c',
 '2',
 '[O-]',
 'c',
 '1',
 'C',
 'C',
 '(',
 'N',
 ')',
 'C',
 '2',
 '(',
 'Cl',
 ')',
 'C',
 '[N+]',
 '[SEP]']

In [30]:
# Reference implementation of WordPiece encoding, adapted from
# https://huggingface.co/learn/nlp-course/chapter6/6?fw=pt
def encode_word(word, vocab=None):
    """Greedily split ``word`` into the longest matching WordPiece tokens.

    Args:
        word: The (pre-tokenized) string to encode.
        vocab: Optional container of known tokens supporting ``in``. When
            omitted, ``tokenizer_wordpiece.get_vocab()`` is used.

    Returns:
        The list of matched pieces, where every piece after the first carries
        the ``##`` continuation prefix; ``["[UNK]"]`` as soon as no prefix of
        the remaining text is a known token; ``[]`` for an empty ``word``.
    """
    if vocab is None:
        # Fetch the vocab once up front; the lookup is loop-invariant, so there
        # is no need to re-query the tokenizer for every candidate prefix.
        vocab = tokenizer_wordpiece.get_vocab()

    tokens = []
    while len(word) > 0:
        i = len(word)
        # Shrink the candidate prefix until it is a known token.
        while i > 0 and word[:i] not in vocab:
            i -= 1
        if i == 0:
            # Not even a single-character prefix matched: the whole word is unknown.
            return ["[UNK]"]
        tokens.append(word[:i])
        word = word[i:]
        if len(word) > 0:
            # The remainder is a continuation piece and must match "##..." entries.
            word = f"##{word}"
    return tokens

In [31]:
tokenizer_wordpiece.normalizer.normalize_str("C[nH]")

'C[nH]'

In [32]:
encode_word("[nH]")

['[nH]']

### Default Transformers Tokenizers

In [33]:
from transformers import BertTokenizer, T5Tokenizer, GPT2Tokenizer

In [34]:
# WordPiece
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [35]:
# BPE
gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [36]:
# SentencePiece
t5_tokenizer = T5Tokenizer.from_pretrained("t5-large", model_max_length=512)

In [37]:
# Vocab sizes
len(bert_tokenizer.get_vocab()), len(gpt_tokenizer.get_vocab()), len(t5_tokenizer.get_vocab())

(28996, 50257, 32100)

In [38]:
text_1 = "I have a new GPU!"

In [39]:
text_2 = "Don't you love 🤗 Transformers? We sure do."

In [40]:
# Same SMILES string as `test_smiles`, here fed to the general-purpose tokenizers.
text_3 = "CCc1c[nH]c2[O-]c1CC(N)C2(Cl)C[N+]"

In [41]:
# Text 1

In [42]:
bert_tokenizer.tokenize(text_1)

['I', 'have', 'a', 'new', 'GP', '##U', '!']

In [43]:
gpt_tokenizer.tokenize(text_1)

['I', 'Ġhave', 'Ġa', 'Ġnew', 'ĠGPU', '!']

In [44]:
t5_tokenizer.tokenize(text_1)

['▁I', '▁have', '▁', 'a', '▁new', '▁GPU', '!']

In [45]:
# Text 2

In [46]:
bert_tokenizer.tokenize(text_2)

['Don',
 "'",
 't',
 'you',
 'love',
 '[UNK]',
 'Transformers',
 '?',
 'We',
 'sure',
 'do',
 '.']

In [47]:
gpt_tokenizer.tokenize(text_2)

['Don',
 "'t",
 'Ġyou',
 'Ġlove',
 'ĠðŁ',
 '¤',
 'Ĺ',
 'ĠTransformers',
 '?',
 'ĠWe',
 'Ġsure',
 'Ġdo',
 '.']

In [48]:
t5_tokenizer.tokenize(text_2)

['▁Don',
 "'",
 't',
 '▁you',
 '▁love',
 '▁',
 '🤗',
 '▁Transformer',
 's',
 '?',
 '▁We',
 '▁sure',
 '▁do',
 '.']

In [49]:
# Text 3

In [50]:
bert_tokenizer.tokenize(text_3)

['CC',
 '##c',
 '##1',
 '##c',
 '[',
 'n',
 '##H',
 ']',
 'c',
 '##2',
 '[',
 'O',
 '-',
 ']',
 'c',
 '##1',
 '##CC',
 '(',
 'N',
 ')',
 'C',
 '##2',
 '(',
 'C',
 '##l',
 ')',
 'C',
 '[',
 'N',
 '+',
 ']']

In [51]:
gpt_tokenizer.tokenize(text_3)

['CC',
 'c',
 '1',
 'c',
 '[',
 'n',
 'H',
 ']',
 'c',
 '2',
 '[',
 'O',
 '-',
 ']',
 'c',
 '1',
 'CC',
 '(',
 'N',
 ')',
 'C',
 '2',
 '(',
 'Cl',
 ')',
 'C',
 '[',
 'N',
 '+',
 ']']

In [52]:
t5_tokenizer.tokenize(text_3)

['▁',
 'CC',
 'c',
 '1',
 'c',
 '[',
 'n',
 'H',
 ']',
 'c',
 '2',
 '[',
 'O',
 '-',
 ']',
 'c',
 '1',
 'CC',
 '(',
 'N',
 ')',
 'C',
 '2',
 '(',
 'C',
 'l',
 ')',
 'C',
 '[',
 'N',
 '+',
 ']']