In [2]:
import selfies as sf
from rdkit import Chem
from transformers import AutoTokenizer

### Run the example from the original SELFIES paper

In [3]:
# From https://github.com/aspuru-guzik-group/selfies
# Round-trips benzene through SELFIES: SMILES -> SELFIES -> SMILES.

benzene = "c1ccccc1"

# SMILES -> SELFIES -> SMILES translation
# Report which direction failed and re-raise. Silently passing here would only
# postpone the failure to a confusing NameError on `benzene_sf` further down.
try:
    benzene_sf = sf.encoder(benzene)  # [C][=C][C][=C][C][=C][Ring1][=Branch1]
    benzene_smi = sf.decoder(benzene_sf)  # C1=CC=CC=C1
except sf.EncoderError as err:
    print(f"sf.encoder error for SMILES {benzene!r}: {err}")
    raise
except sf.DecoderError as err:
    # benzene_sf is bound here: the decoder only runs after encoding succeeded.
    print(f"sf.decoder error for SELFIES {benzene_sf!r}: {err}")
    raise

# Number of SELFIES symbols in the encoded string.
len_benzene = sf.len_selfies(benzene_sf)  # 8

# Break the SELFIES string into its individual bracketed symbols for display.
symbols_benzene = list(sf.split_selfies(benzene_sf))
symbols_benzene

['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[=Branch1]']

In [3]:
# Canonicalise the input SMILES with RDKit; if the result equals `benzene`,
# the input was already in canonical form.
benzene_mol = Chem.MolFromSmiles(benzene)
canonical_benzene = Chem.MolToSmiles(benzene_mol)
canonical_benzene

'c1ccccc1'

In [4]:
# SMILES string produced by sf.decoder from the SELFIES encoding of benzene
# (the SELFIES -> SMILES half of the round trip).
benzene_smi

'C1=CC=CC=C1'

In [5]:
# Canonicalise the decoded SMILES with RDKit so it can be compared against
# the canonical form of the original input.
roundtrip_mol = Chem.MolFromSmiles(benzene_smi)
Chem.MolToSmiles(roundtrip_mol)

'c1ccccc1'

### Create the tokenizer JSON files

In [6]:
# Load the pretrained tokenizer identified by "zjunlp/MolGen-large".
# NOTE(review): presumably fetched from the Hugging Face Hub on first use — confirm
# network access / local cache is available when re-running.
tokenizer_molgen = AutoTokenizer.from_pretrained("zjunlp/MolGen-large")

In [7]:
# Display the tokenizer's repr (class, vocab size, max length, special tokens).
tokenizer_molgen

BartTokenizerFast(name_or_path='zjunlp/MolGen-large', vocab_size=4, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)}, clean_up_tokenization_spaces=True)

In [8]:
# Display the full token -> id mapping; the SELFIES symbols appear as individual tokens.
tokenizer_molgen.get_vocab()

{'[11CH3]': 107,
 '[B@H1-1]': 121,
 '[18F]': 97,
 '[#S]': 40,
 '[=C]': 35,
 '[P@H1]': 81,
 '[/C@]': 31,
 '[N-1]': 84,
 '[\\Cl]': 37,
 '[C@@H1]': 15,
 '[S@+1]': 72,
 '[/O]': 69,
 '[Ring2]': 90,
 '[/B]': 125,
 '[P@@]': 59,
 '[PH2]': 51,
 '[N]': 19,
 '[N@+1]': 111,
 '[S]': 60,
 '[=S@@+1]': 42,
 '[\\CH0]': 176,
 '[\\CH1-1]': 153,
 '[\\Br]': 142,
 '[S@@H1]': 45,
 '[Br]': 62,
 '[Br+1]': 166,
 '[\\B]': 159,
 '[=PH1]': 132,
 '[\\NH1]': 103,
 '[S+1]': 61,
 '[/F]': 6,
 '[S@@]': 154,
 '[/I]': 83,
 '[#N]': 23,
 '[/N]': 67,
 '<s>': 0,
 '[\\S@@]': 180,
 '[#Branch2]': 178,
 '[=Branch1]': 33,
 '[/123I]': 7,
 '[=N+1]': 144,
 '[B@-1]': 179,
 '[127I]': 36,
 '[\\P]': 74,
 '[\\C@]': 145,
 '[=S@]': 44,
 '[\\C@H1]': 49,
 '[124I]': 136,
 '[Si]': 119,
 '[/C@@H1]': 13,
 '[-/Ring2]': 156,
 '[=P@@]': 53,
 '[=O+1]': 96,
 '[SH0]': 182,
 '[/CH1-1]': 88,
 '[\\Si]': 168,
 '[S@@+1]': 57,
 '[P@+1]': 158,
 '[#P]': 87,
 '[CH2-1]': 110,
 '[Sn]': 114,
 '[\\SH1]': 151,
 '<mask>': 184,
 '[=SH1]': 117,
 '[/NH1]': 24,
 '[F]': 6

In [9]:
# Total number of entries in the token -> id mapping returned by get_vocab().
len(tokenizer_molgen.get_vocab())

185

In [10]:
# Write the tokenizer files to ./tokenizers/tokenizer_selfies (path is relative to
# the notebook's working directory); the call returns the paths of the files written.
tokenizer_molgen.save_pretrained("./tokenizers/tokenizer_selfies")

('./tokenizers/tokenizer_selfies/tokenizer_config.json',
 './tokenizers/tokenizer_selfies/special_tokens_map.json',
 './tokenizers/tokenizer_selfies/vocab.json',
 './tokenizers/tokenizer_selfies/merges.txt',
 './tokenizers/tokenizer_selfies/added_tokens.json',
 './tokenizers/tokenizer_selfies/tokenizer.json')