In [1]:
import os
import json
from collections import OrderedDict

### Steps

1. **Download** `vocab.json` from [here](https://huggingface.co/DeepChem/SmilesTokenizer_PubChem_1M/tree/main)
2. Place it in the `/data-bin/tokenizers/tokenizer_smiles` folder
3. Rename it to `deepchem_vocab.json` (the file name the cells below open)

In [2]:
# Switch the working directory so the relative "./..." paths in the following
# cells are resolved inside the tokenizer folder.
# NOTE(review): the recorded run raised FileNotFoundError here, so this folder
# must exist relative to where the kernel was started — confirm the location.
os.chdir("./tokenizers/tokenizer_smiles")

FileNotFoundError: [Errno 2] No such file or directory: './tokenizers/tokenizer_smiles'

In [3]:
# Load the DeepChem vocabulary (its keys are the tokens).
# The encoding is pinned so the read does not depend on the platform default.
with open("./deepchem_vocab.json", "r", encoding="utf-8") as f:
    vocab = json.load(f)

FileNotFoundError: [Errno 2] No such file or directory: './deepchem_vocab.json'

In [4]:
# Iterating a dict yields its keys in insertion order, so list() is enough.
vocab_tokens = list(vocab)

print(len(vocab_tokens))

591


In [5]:
vocab_tokens

['[PAD]',
 '[unused1]',
 '[unused2]',
 '[unused3]',
 '[unused4]',
 '[unused5]',
 '[unused6]',
 '[unused7]',
 '[unused8]',
 '[unused9]',
 '[unused10]',
 '[UNK]',
 '[CLS]',
 '[SEP]',
 '[MASK]',
 'c',
 'C',
 '(',
 ')',
 'O',
 '1',
 '2',
 '=',
 'N',
 '.',
 'n',
 '3',
 'F',
 'Cl',
 '>>',
 '~',
 '-',
 '4',
 '[C@H]',
 'S',
 '[C@@H]',
 '[O-]',
 'Br',
 '#',
 '/',
 '[nH]',
 '[N+]',
 's',
 '5',
 'o',
 'P',
 '[Na+]',
 '[Si]',
 'I',
 '[Na]',
 '[Pd]',
 '[K+]',
 '[K]',
 '[P]',
 'B',
 '[C@]',
 '[C@@]',
 '[Cl-]',
 '6',
 '[OH-]',
 '\\',
 '[N-]',
 '[Li]',
 '[H]',
 '[2H]',
 '[NH4+]',
 '[c-]',
 '[P-]',
 '[Cs+]',
 '[Li+]',
 '[Cs]',
 '[NaH]',
 '[H-]',
 '[O+]',
 '[BH4-]',
 '[Cu]',
 '7',
 '[Mg]',
 '[Fe+2]',
 '[n+]',
 '[Sn]',
 '[BH-]',
 '[Pd+2]',
 '[CH]',
 '[I-]',
 '[Br-]',
 '[C-]',
 '[Zn]',
 '[B-]',
 '[F-]',
 '[Al]',
 '[P+]',
 '[BH3-]',
 '[Fe]',
 '[C]',
 '[AlH4]',
 '[Ni]',
 '[SiH]',
 '8',
 '[Cu+2]',
 '[Mn]',
 '[AlH]',
 '[nH+]',
 '[AlH4-]',
 '[O-2]',
 '[Cr]',
 '[Mg+2]',
 '[NH3+]',
 '[S@]',
 '[Pt]',
 '[Al+3]',
 

In [6]:
# Special tokens that lead the new vocabulary. Order is important: it is the
# order in which they are prepended to the remaining tokens.
special_tokens = [
    "<s>", "[PAD]", "</s>", "[UNK]", "[SEP]", "[CLS]", "[MASK]",
    # Reserved placeholder slots "[unused1]" ... "[unused10]".
    *(f"[unused{slot}]" for slot in range(1, 11)),
]

In [7]:
# find special tokens in vocab and remove
vocab_tokens[:15]

['[PAD]',
 '[unused1]',
 '[unused2]',
 '[unused3]',
 '[unused4]',
 '[unused5]',
 '[unused6]',
 '[unused7]',
 '[unused8]',
 '[unused9]',
 '[unused10]',
 '[UNK]',
 '[CLS]',
 '[SEP]',
 '[MASK]']

In [8]:
# Drop every token that is re-added through `special_tokens`, instead of
# assuming they are exactly the first 15 entries of the DeepChem vocabulary.
filtered_vocab_tokens = [tok for tok in vocab_tokens if tok not in special_tokens]
filtered_vocab_tokens

['c',
 'C',
 '(',
 ')',
 'O',
 '1',
 '2',
 '=',
 'N',
 '.',
 'n',
 '3',
 'F',
 'Cl',
 '>>',
 '~',
 '-',
 '4',
 '[C@H]',
 'S',
 '[C@@H]',
 '[O-]',
 'Br',
 '#',
 '/',
 '[nH]',
 '[N+]',
 's',
 '5',
 'o',
 'P',
 '[Na+]',
 '[Si]',
 'I',
 '[Na]',
 '[Pd]',
 '[K+]',
 '[K]',
 '[P]',
 'B',
 '[C@]',
 '[C@@]',
 '[Cl-]',
 '6',
 '[OH-]',
 '\\',
 '[N-]',
 '[Li]',
 '[H]',
 '[2H]',
 '[NH4+]',
 '[c-]',
 '[P-]',
 '[Cs+]',
 '[Li+]',
 '[Cs]',
 '[NaH]',
 '[H-]',
 '[O+]',
 '[BH4-]',
 '[Cu]',
 '7',
 '[Mg]',
 '[Fe+2]',
 '[n+]',
 '[Sn]',
 '[BH-]',
 '[Pd+2]',
 '[CH]',
 '[I-]',
 '[Br-]',
 '[C-]',
 '[Zn]',
 '[B-]',
 '[F-]',
 '[Al]',
 '[P+]',
 '[BH3-]',
 '[Fe]',
 '[C]',
 '[AlH4]',
 '[Ni]',
 '[SiH]',
 '8',
 '[Cu+2]',
 '[Mn]',
 '[AlH]',
 '[nH+]',
 '[AlH4-]',
 '[O-2]',
 '[Cr]',
 '[Mg+2]',
 '[NH3+]',
 '[S@]',
 '[Pt]',
 '[Al+3]',
 '[S@@]',
 '[S-]',
 '[Ti]',
 '[Zn+2]',
 '[PH]',
 '[NH2+]',
 '[Ru]',
 '[Ag+]',
 '[S+]',
 '[I+3]',
 '[NH+]',
 '[Ca+2]',
 '[Ag]',
 '9',
 '[Os]',
 '[Se]',
 '[SiH2]',
 '[Ca]',
 '[Ti+4]',
 '[Ac]',
 '

In [9]:
# Special tokens come first, followed by the remaining DeepChem tokens.
final_vocab_tokens = [*special_tokens, *filtered_vocab_tokens]

print("Len", len(final_vocab_tokens))
final_vocab_tokens

Len 593


['<s>',
 '[PAD]',
 '</s>',
 '[UNK]',
 '[SEP]',
 '[CLS]',
 '[MASK]',
 '[unused1]',
 '[unused2]',
 '[unused3]',
 '[unused4]',
 '[unused5]',
 '[unused6]',
 '[unused7]',
 '[unused8]',
 '[unused9]',
 '[unused10]',
 'c',
 'C',
 '(',
 ')',
 'O',
 '1',
 '2',
 '=',
 'N',
 '.',
 'n',
 '3',
 'F',
 'Cl',
 '>>',
 '~',
 '-',
 '4',
 '[C@H]',
 'S',
 '[C@@H]',
 '[O-]',
 'Br',
 '#',
 '/',
 '[nH]',
 '[N+]',
 's',
 '5',
 'o',
 'P',
 '[Na+]',
 '[Si]',
 'I',
 '[Na]',
 '[Pd]',
 '[K+]',
 '[K]',
 '[P]',
 'B',
 '[C@]',
 '[C@@]',
 '[Cl-]',
 '6',
 '[OH-]',
 '\\',
 '[N-]',
 '[Li]',
 '[H]',
 '[2H]',
 '[NH4+]',
 '[c-]',
 '[P-]',
 '[Cs+]',
 '[Li+]',
 '[Cs]',
 '[NaH]',
 '[H-]',
 '[O+]',
 '[BH4-]',
 '[Cu]',
 '7',
 '[Mg]',
 '[Fe+2]',
 '[n+]',
 '[Sn]',
 '[BH-]',
 '[Pd+2]',
 '[CH]',
 '[I-]',
 '[Br-]',
 '[C-]',
 '[Zn]',
 '[B-]',
 '[F-]',
 '[Al]',
 '[P+]',
 '[BH3-]',
 '[Fe]',
 '[C]',
 '[AlH4]',
 '[Ni]',
 '[SiH]',
 '8',
 '[Cu+2]',
 '[Mn]',
 '[AlH]',
 '[nH+]',
 '[AlH4-]',
 '[O-2]',
 '[Cr]',
 '[Mg+2]',
 '[NH3+]',
 '[S@]',
 '[P

In [10]:
# Write one token per line, in list order.
# UTF-8 is set explicitly so the file does not depend on the platform default.
with open("./vocab.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{token}\n" for token in final_vocab_tokens)
# Report only after the with-block has closed (and flushed) the file.
print('Done')

Done


In [4]:
# Read the vocabulary back as a list with one entry per line; splitlines()
# removes the line terminators.
with open("./vocab.txt", "r", encoding="utf-8") as f:
    k = f.read().splitlines()

FileNotFoundError: [Errno 2] No such file or directory: './vocab.txt'

In [12]:
# Map each token to its position in the list read from vocab.txt.
vocab = OrderedDict(
    (line.rstrip("\n"), position) for position, line in enumerate(k)
)

In [13]:
assert len(final_vocab_tokens) == len(vocab)

In [13]:
test_smiles = "CCc1c[nH]c2[O-]c1CC(N)C2(Cl)C[N+]"

### Old tokenizer 

In [5]:
from tokenizers import Tokenizer

In [10]:
# Built from a modified copy of tokenizer.json from
# https://huggingface.co/DeepChem/SmilesTokenizer_PubChem_1M/tree/main
# (BPE tokenizer).
tokenizer_old = Tokenizer.from_file("./tokenizers/tokenizer_sm/old_tokenizer_smiles.json")

In [14]:
tokenizer_old.encode(test_smiles).tokens

['[CLS]',
 'C',
 'C',
 'c',
 '1',
 'c',
 'n',
 'c',
 '2',
 'O',
 '-',
 'c',
 '1',
 'C',
 'C',
 '(',
 'N',
 ')',
 'C',
 '2',
 '(',
 'C',
 ')',
 'C',
 'N',
 '[SEP]']

### DeepChem tokenizer

In [15]:
# from https://github.com/deepchem/deepchem/blob/master/deepchem/feat/smiles_tokenizer.py
from deepchem_smiles_tokenizer import SmilesTokenizer, BasicSmilesTokenizer

ModuleNotFoundError: No module named 'deepchem_smiles_tokenizer'

In [19]:
tokenizer_deepchem = SmilesTokenizer("./deepchem_vocab.json")

In [20]:
tokenizer_deepchem.tokenize(test_smiles)

['C',
 'C',
 'c',
 '1',
 'c',
 '[nH]',
 'c',
 '2',
 '[O-]',
 'c',
 '1',
 'C',
 'C',
 '(',
 'N',
 ')',
 'C',
 '2',
 '(',
 'Cl',
 ')',
 'C',
 '[N+]']

### SMILES Pair Encoding tokenizer

In [16]:
# https://github.com/XinhaoLi74/SmilesPE
from SmilesPE.pretokenizer import atomwise_tokenizer

atomwise_tokenizer(test_smiles)

ModuleNotFoundError: No module named 'SmilesPE'

### Bert WordPiece tokenizer

In [20]:
from tokenizers import BertWordPieceTokenizer, Tokenizer, pre_tokenizers
import re

# Atom-level SMILES pattern: a whole bracket atom ("[...]") is one match, then
# Br/Cl (or plain B/C), the other listed atom letters, branch/bond/ring
# symbols, ">" or ">>", "%" followed by two digits, and finally single digits.
SMI_REGEX_PATTERN = r"""(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"""

# WordPiece tokenizer loaded from the vocab.txt file; every optional text
# clean-up switch is passed as False.
tokenizer_wordpiece = BertWordPieceTokenizer.from_file(
                "./tokenizers/tokenizer_sm/vocab.txt", 
                clean_text=False,
                handle_chinese_chars=False,
                strip_accents=False,
                lowercase=False
            )  

In [21]:
regex = re.compile(SMI_REGEX_PATTERN)

# code from metaseq
# The pattern has a single capture group, so findall() already returns the
# list of token strings; no wrapping comprehension is needed.
test_smiles_tokens = regex.findall(test_smiles.rstrip())
test_smiles_tokens

['C',
 'C',
 'c',
 '1',
 'c',
 '[nH]',
 'c',
 '2',
 '[O-]',
 'c',
 '1',
 'C',
 'C',
 '(',
 'N',
 ')',
 'C',
 '2',
 '(',
 'Cl',
 ')',
 'C',
 '[N+]']

In [23]:
tokenizer_wordpiece.encode(test_smiles_tokens, is_pretokenized=True).tokens

['[CLS]',
 'C',
 'C',
 'c',
 '1',
 'c',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 'c',
 '2',
 '[UNK]',
 'O',
 '-',
 '[UNK]',
 'c',
 '1',
 'C',
 'C',
 '(',
 'N',
 ')',
 'C',
 '2',
 '(',
 'Cl',
 ')',
 'C',
 '[UNK]',
 'N',
 '[UNK]',
 '[UNK]',
 '[SEP]']

In [24]:
# Reason
# Even though the input is passed with is_pretokenized=True, the tokenizer
# still applies its punctuation pre-tokenization, so "[nH]" comes out as
# three [UNK] tokens instead of one vocabulary entry.
tokenizer_wordpiece.encode(["[nH]"], is_pretokenized=True).tokens

['[CLS]', '[UNK]', '[UNK]', '[UNK]', '[SEP]']

In [25]:
# TRICK to get rid of the Punctuation pre-tokenizer: replace it with a
# CharDelimiterSplit on "&".
# NOTE(review): presumably "&" never occurs in the SMILES tokens, so this
# split never fires — confirm.
tokenizer_wordpiece.pre_tokenizer = pre_tokenizers.CharDelimiterSplit(delimiter="&")

In [26]:
# Now the Punctuation pretokenizer is off, that's why tokenizer treats '[nH]' as one token
tokenizer_wordpiece.encode(["[nH]"], is_pretokenized=True).tokens

['[CLS]', '[nH]', '[SEP]']

In [27]:
tokenizer_wordpiece.encode(test_smiles_tokens, is_pretokenized=True).tokens

['[CLS]',
 'C',
 'C',
 'c',
 '1',
 'c',
 '[nH]',
 'c',
 '2',
 '[O-]',
 'c',
 '1',
 'C',
 'C',
 '(',
 'N',
 ')',
 'C',
 '2',
 '(',
 'Cl',
 ')',
 'C',
 '[N+]',
 '[SEP]']

In [28]:
# Reference WordPiece encoding, adapted from the Hugging Face NLP course:
# https://huggingface.co/learn/nlp-course/chapter6/6?fw=pt
def encode_word(word, vocab=None):
    """Split ``word`` into WordPiece tokens by greedy longest-prefix matching.

    Args:
        word: The string to encode.
        vocab: Optional container of known tokens (anything supporting ``in``).
            Defaults to ``tokenizer_wordpiece.get_vocab()``.

    Returns:
        The list of matched pieces; every piece after the first carries the
        ``##`` continuation prefix. ``["[UNK]"]`` is returned as soon as the
        remaining text has no prefix in the vocabulary, and ``[]`` is returned
        for an empty word.
    """
    if vocab is None:
        # Fetch the vocabulary once instead of on every prefix probe.
        vocab = tokenizer_wordpiece.get_vocab()

    pieces = []
    remaining = word
    while remaining:
        # The "##" marker is kept apart from the text being consumed. If it
        # were glued onto the remainder, a vocabulary containing "#" or "##"
        # (the SMILES vocabulary contains "#") could match the marker alone
        # and the loop would never terminate.
        marker = "##" if pieces else ""
        end = len(remaining)
        while end > 0 and marker + remaining[:end] not in vocab:
            end -= 1
        if end == 0:
            return ["[UNK]"]
        pieces.append(marker + remaining[:end])
        remaining = remaining[end:]
    return pieces

In [29]:
tokenizer_wordpiece.normalizer.normalize_str("C[nH]")

'C[nH]'

In [30]:
encode_word("[nH]")

['[nH]']

### Default Transformers Tokenizers

In [33]:
from transformers import BertTokenizer, T5Tokenizer, GPT2Tokenizer

In [34]:
# WordPiece
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [35]:
# BPE
gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [36]:
# SentencePiece
t5_tokenizer = T5Tokenizer.from_pretrained("t5-large", model_max_length=512)

In [37]:
# Vocab sizes
len(bert_tokenizer.get_vocab()), len(gpt_tokenizer.get_vocab()), len(t5_tokenizer.get_vocab())

(28996, 50257, 32100)

In [38]:
text_1 = "I have a new GPU!"

In [39]:
# The emoji had been mangled by a wrong character decoding; it is restored so
# the tokenizers below are shown the intended non-ASCII symbol.
text_2 = "Don't you love 🤗 Transformers? We sure do."

In [40]:
text_3 = "CCc1c[nH]c2[O-]c1CC(N)C2(Cl)C[N+]"

In [41]:
# Text 1

In [42]:
bert_tokenizer.tokenize(text_1)

['I', 'have', 'a', 'new', 'GP', '##U', '!']

In [43]:
gpt_tokenizer.tokenize(text_1)

['I', 'Ġhave', 'Ġa', 'Ġnew', 'ĠGPU', '!']

In [44]:
t5_tokenizer.tokenize(text_1)

['▁I', '▁have', '▁', 'a', '▁new', '▁GPU', '!']

In [45]:
# Text 2

In [46]:
bert_tokenizer.tokenize(text_2)

['Don',
 "'",
 't',
 'you',
 'love',
 '[UNK]',
 'Transformers',
 '?',
 'We',
 'sure',
 'do',
 '.']

In [47]:
gpt_tokenizer.tokenize(text_2)

['Don',
 "'t",
 'Ġyou',
 'Ġlove',
 'ĠðŁ',
 '¤',
 'Ĺ',
 'ĠTransformers',
 '?',
 'ĠWe',
 'Ġsure',
 'Ġdo',
 '.']

In [48]:
t5_tokenizer.tokenize(text_2)

['▁Don',
 "'",
 't',
 '▁you',
 '▁love',
 '▁',
 '🤗',
 '▁Transformer',
 's',
 '?',
 '▁We',
 '▁sure',
 '▁do',
 '.']

In [49]:
# Text 3

In [50]:
bert_tokenizer.tokenize(text_3)

['CC',
 '##c',
 '##1',
 '##c',
 '[',
 'n',
 '##H',
 ']',
 'c',
 '##2',
 '[',
 'O',
 '-',
 ']',
 'c',
 '##1',
 '##CC',
 '(',
 'N',
 ')',
 'C',
 '##2',
 '(',
 'C',
 '##l',
 ')',
 'C',
 '[',
 'N',
 '+',
 ']']

In [51]:
gpt_tokenizer.tokenize(text_3)

['CC',
 'c',
 '1',
 'c',
 '[',
 'n',
 'H',
 ']',
 'c',
 '2',
 '[',
 'O',
 '-',
 ']',
 'c',
 '1',
 'CC',
 '(',
 'N',
 ')',
 'C',
 '2',
 '(',
 'Cl',
 ')',
 'C',
 '[',
 'N',
 '+',
 ']']

In [52]:
t5_tokenizer.tokenize(text_3)

['▁',
 'CC',
 'c',
 '1',
 'c',
 '[',
 'n',
 'H',
 ']',
 'c',
 '2',
 '[',
 'O',
 '-',
 ']',
 'c',
 '1',
 'CC',
 '(',
 'N',
 ')',
 'C',
 '2',
 '(',
 'C',
 'l',
 ')',
 'C',
 '[',
 'N',
 '+',
 ']']

In [12]:
test_smiles = "CCc1c[nH]c2[O-]c1CC(N)C2(Cl)C[N+]"

In [13]:
from tokenizers import Tokenizer

from transformers import PreTrainedTokenizerFast

In [14]:
# Relative to the working directory, like the other tokenizer paths in this
# notebook, instead of one user's absolute home directory.
sf_like_tokenizer_path = "./tokenizers/tokenizer_sm/tokenizer.json"

In [15]:
sm_like_tokenizer = Tokenizer.from_file(sf_like_tokenizer_path)

In [16]:
tokenizer = PreTrainedTokenizerFast(tokenizer_object=sm_like_tokenizer)

In [17]:
tokenizer.tokenize(test_smiles)

['C',
 'C',
 'c',
 '1',
 'c',
 '[nH]',
 'c',
 '2',
 '[O-]',
 'c',
 '1',
 'C',
 'C',
 '(',
 'N',
 ')',
 'C',
 '2',
 '(',
 'Cl',
 ')',
 'C',
 '[N+]']

In [7]:
tokenizer("[C][N]")

{'input_ids': [0, 86, 169, 2], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [8]:
tokenizer.tokenize("[C][N]")

['[C]', '[N]']

In [19]:
len(tokenizer.get_vocab())

583

In [10]:
import json

In [15]:
with open("./tokenizers/tokenizer_sm/vocab.txt", "r") as f:
    sm_old_vocab = f.read().splitlines()

In [17]:
len(sm_old_vocab)

593

In [18]:
sm_old_vocab

['<s>',
 '[PAD]',
 '</s>',
 '[UNK]',
 '[SEP]',
 '[CLS]',
 '[MASK]',
 '[unused1]',
 '[unused2]',
 '[unused3]',
 '[unused4]',
 '[unused5]',
 '[unused6]',
 '[unused7]',
 '[unused8]',
 '[unused9]',
 '[unused10]',
 'c',
 'C',
 '(',
 ')',
 'O',
 '1',
 '2',
 '=',
 'N',
 '.',
 'n',
 '3',
 'F',
 'Cl',
 '>>',
 '~',
 '-',
 '4',
 '[C@H]',
 'S',
 '[C@@H]',
 '[O-]',
 'Br',
 '#',
 '/',
 '[nH]',
 '[N+]',
 's',
 '5',
 'o',
 'P',
 '[Na+]',
 '[Si]',
 'I',
 '[Na]',
 '[Pd]',
 '[K+]',
 '[K]',
 '[P]',
 'B',
 '[C@]',
 '[C@@]',
 '[Cl-]',
 '6',
 '[OH-]',
 '\\',
 '[N-]',
 '[Li]',
 '[H]',
 '[2H]',
 '[NH4+]',
 '[c-]',
 '[P-]',
 '[Cs+]',
 '[Li+]',
 '[Cs]',
 '[NaH]',
 '[H-]',
 '[O+]',
 '[BH4-]',
 '[Cu]',
 '7',
 '[Mg]',
 '[Fe+2]',
 '[n+]',
 '[Sn]',
 '[BH-]',
 '[Pd+2]',
 '[CH]',
 '[I-]',
 '[Br-]',
 '[C-]',
 '[Zn]',
 '[B-]',
 '[F-]',
 '[Al]',
 '[P+]',
 '[BH3-]',
 '[Fe]',
 '[C]',
 '[AlH4]',
 '[Ni]',
 '[SiH]',
 '8',
 '[Cu+2]',
 '[Mn]',
 '[AlH]',
 '[nH+]',
 '[AlH4-]',
 '[O-2]',
 '[Cr]',
 '[Mg+2]',
 '[NH3+]',
 '[S@]',
 '[P

In [21]:
# Skip the 17 leading entries of the old vocabulary (the special tokens
# "<s>" ... "[unused10]") and keep only the entries that follow them.
useful_vocab = sm_old_vocab[17:]

In [22]:
useful_vocab

['c',
 'C',
 '(',
 ')',
 'O',
 '1',
 '2',
 '=',
 'N',
 '.',
 'n',
 '3',
 'F',
 'Cl',
 '>>',
 '~',
 '-',
 '4',
 '[C@H]',
 'S',
 '[C@@H]',
 '[O-]',
 'Br',
 '#',
 '/',
 '[nH]',
 '[N+]',
 's',
 '5',
 'o',
 'P',
 '[Na+]',
 '[Si]',
 'I',
 '[Na]',
 '[Pd]',
 '[K+]',
 '[K]',
 '[P]',
 'B',
 '[C@]',
 '[C@@]',
 '[Cl-]',
 '6',
 '[OH-]',
 '\\',
 '[N-]',
 '[Li]',
 '[H]',
 '[2H]',
 '[NH4+]',
 '[c-]',
 '[P-]',
 '[Cs+]',
 '[Li+]',
 '[Cs]',
 '[NaH]',
 '[H-]',
 '[O+]',
 '[BH4-]',
 '[Cu]',
 '7',
 '[Mg]',
 '[Fe+2]',
 '[n+]',
 '[Sn]',
 '[BH-]',
 '[Pd+2]',
 '[CH]',
 '[I-]',
 '[Br-]',
 '[C-]',
 '[Zn]',
 '[B-]',
 '[F-]',
 '[Al]',
 '[P+]',
 '[BH3-]',
 '[Fe]',
 '[C]',
 '[AlH4]',
 '[Ni]',
 '[SiH]',
 '8',
 '[Cu+2]',
 '[Mn]',
 '[AlH]',
 '[nH+]',
 '[AlH4-]',
 '[O-2]',
 '[Cr]',
 '[Mg+2]',
 '[NH3+]',
 '[S@]',
 '[Pt]',
 '[Al+3]',
 '[S@@]',
 '[S-]',
 '[Ti]',
 '[Zn+2]',
 '[PH]',
 '[NH2+]',
 '[Ru]',
 '[Ag+]',
 '[S+]',
 '[I+3]',
 '[NH+]',
 '[Ca+2]',
 '[Ag]',
 '9',
 '[Os]',
 '[Se]',
 '[SiH2]',
 '[Ca]',
 '[Ti+4]',
 '[Ac]',
 '

In [26]:
len(useful_vocab)

576

In [11]:
# Load the tokenizer.json definition as a plain dict so it can be edited.
# NOTE: despite its name, `vocab` holds the whole tokenizer config here.
# The encoding is pinned so the read does not depend on the platform default.
with open(sf_like_tokenizer_path, "r", encoding="utf-8") as f:
    vocab = json.load(f)

In [12]:
vocab

{'version': '1.0',
 'truncation': None,
 'padding': None,
 'added_tokens': [{'id': 0,
   'content': '<s>',
   'single_word': False,
   'lstrip': False,
   'rstrip': False,
   'normalized': True,
   'special': True},
  {'id': 1,
   'content': '<pad>',
   'single_word': False,
   'lstrip': False,
   'rstrip': False,
   'normalized': True,
   'special': True},
  {'id': 2,
   'content': '</s>',
   'single_word': False,
   'lstrip': False,
   'rstrip': False,
   'normalized': True,
   'special': True},
  {'id': 3,
   'content': '<unk>',
   'single_word': False,
   'lstrip': False,
   'rstrip': False,
   'normalized': True,
   'special': True},
  {'id': 5,
   'content': '<mask>',
   'single_word': False,
   'lstrip': True,
   'rstrip': False,
   'normalized': True,
   'special': True},
  {'id': 6,
   'content': '[Canon]',
   'single_word': False,
   'lstrip': False,
   'rstrip': False,
   'normalized': True,
   'special': False},
  {'id': 7,
   'content': '[Rand]',
   'single_word': False,
 

In [29]:
# Register every entry of `useful_vocab` as an added token with consecutive ids.
# The loop is driven by `useful_vocab` itself rather than a hard-coded 576, so
# a vocabulary of a different size is neither truncated nor overrun.
# NOTE(review): 8 is assumed to be the first unused id — the displayed dump of
# the file is cut off after id 7; confirm. Re-running this cell appends the
# same entries again.
first_new_id = 8
for token_id, content in enumerate(useful_vocab, start=first_new_id):
    vocab["added_tokens"].append({
        'id': token_id,
        'content': content,
        'single_word': False,
        'lstrip': True,
        'rstrip': False,
        'normalized': True,
        'special': True,
    })

In [11]:
vocab

NameError: name 'vocab' is not defined

In [10]:
len(vocab["added_tokens"])

NameError: name 'vocab' is not defined