## Figuring out the tokenizer update.

### 1. Hugging Face's Tokenizer Example


In [25]:
from transformers import AutoTokenizer

model_ckpt = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt,
                                          cache_dir="/data/bobby/huggingface-cache/models")



In [26]:
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

In [34]:
tokenizer.add_special_tokens({'pad_token': '<pad>'})
res = tokenizer(corpus, padding=True, truncation=True, return_tensors='pt')
res

{'input_ids': tensor([[ 1212,   318,   262, 12905,  2667, 15399, 20537,    13, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257],
        [ 1212,  6843,   318,   546, 11241,  1634,    13, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257],
        [ 1212,  2665,  2523,  1811, 11241,  7509, 16113,    13, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257],
        [32365,    11,   345,   481,   307,  1498,   284,  1833,   703,   484,
           389,  8776,   290,  7716, 16326,    13]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [33]:
tokenizer.decode(tokenizer.encode(corpus[1]))

'This chapter is about tokenization.'

In [36]:
tokenizer.decode(res['input_ids'][0])

'This is the Hugging Face Course.<pad><pad><pad><pad><pad><pad><pad><pad>'

In [37]:
from datasets import load_dataset
dataset = load_dataset('glue', 'mrpc')

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/649k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [41]:
import pandas as pd

pd.DataFrame(dataset['train'])

Unnamed: 0,sentence1,sentence2,label,idx
0,"Amrozi accused his brother , whom he called "" ...","Referring to him as only "" the witness "" , Amr...",1,0
1,Yucaipa owned Dominick 's before selling the c...,Yucaipa bought Dominick 's in 1995 for $ 693 m...,0,1
2,They had published an advertisement on the Int...,"On June 10 , the ship 's owners had published ...",1,2
3,"Around 0335 GMT , Tab shares were up 19 cents ...","Tab shares jumped 20 cents , or 4.6 % , to set...",0,3
4,"The stock rose $ 2.11 , or about 11 percent , ...",PG & E Corp. shares jumped $ 1.63 or 8 percent...,1,4
...,...,...,...,...
3663,""" At this point , Mr. Brando announced : ' Som...","Brando said that "" somebody ought to put a bul...",1,4071
3664,"Martin , 58 , will be freed today after servin...",Martin served two thirds of a five-year senten...,0,4072
3665,""" We have concluded that the outlook for price...","In a statement , the ECB said the outlook for ...",1,4073
3666,The notification was first reported Friday by ...,MSNBC.com first reported the CIA request on Fr...,1,4074


## 2. Self Created Method

In [1]:
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

In [2]:
import re

def pre_tokenize(text, special_tokens):
    # Escape special tokens for regex
    escaped_tokens = [re.escape(token) for token in special_tokens]
    # Create a regex pattern to match special tokens
    special_tokens_pattern = '|'.join(escaped_tokens)
    
    # Split text using the pattern, keeping special tokens intact
    parts = re.split(f'({special_tokens_pattern})', text)
    
    # Process parts to combine spaces with non-special tokens
    tokens = []
    for part in parts:
        if part in special_tokens:
            tokens.append(part)
        else:
            # Find words and spaces, and combine them
            sub_tokens = re.findall(r'\s*\S+|\s+', part)
            tokens.extend(sub_tokens)
    
    return tokens

# Example usage
input_text = "<pad> hello world [CLS] test [SEP]"

special_tokens = ['<pad>', '[CLS]', '[SEP]', '[MASK]']

tokens = pre_tokenize(input_text, special_tokens)

print("Input text:", input_text)
print("Tokens:", tokens)

Input text: <pad> hello world [CLS] test [SEP]
Tokens: ['<pad>', ' hello', ' world', ' ', '[CLS]', ' test', ' ', '[SEP]']


In [3]:
pre_tokenize(input_text,['<pad>', '[CLS]', '[SEP]', '[MASK]'])

['<pad>', ' hello', ' world', ' ', '[CLS]', ' test', ' ', '[SEP]']

In [4]:
from collections import defaultdict

word_freq = defaultdict(int)

for text in corpus:
    words_with_offsets = pre_tokenize(text,special_tokens)
    new_words = [word for word in words_with_offsets]
    for word in new_words:
        word_freq[word] += 1

print(word_freq)

defaultdict(<class 'int'>, {'This': 3, ' is': 2, ' the': 1, ' Hugging': 1, ' Face': 1, ' Course.': 1, ' chapter': 1, ' about': 1, ' tokenization.': 1, ' section': 1, ' shows': 1, ' several': 1, ' tokenizer': 1, ' algorithms.': 1, 'Hopefully,': 1, ' you': 1, ' will': 1, ' be': 1, ' able': 1, ' to': 1, ' understand': 1, ' how': 1, ' they': 1, ' are': 1, ' trained': 1, ' and': 1, ' generate': 1, ' tokens.': 1})


In [5]:
alphabet = []

for word in word_freq.keys():
    for letter in word:
        if letter not in alphabet:
            alphabet.append(letter)

alphabet.sort()

print(alphabet)

[' ', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z']


In [6]:
vocab = ['<pad>'] + alphabet.copy()
print(vocab)

['<pad>', ' ', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z']


In [7]:
## time to split the words into individual characters before 
## performing the BPE tokenization

splits = {word: [c for c in word] for word in word_freq.keys()}
splits

{'This': ['T', 'h', 'i', 's'],
 ' is': [' ', 'i', 's'],
 ' the': [' ', 't', 'h', 'e'],
 ' Hugging': [' ', 'H', 'u', 'g', 'g', 'i', 'n', 'g'],
 ' Face': [' ', 'F', 'a', 'c', 'e'],
 ' Course.': [' ', 'C', 'o', 'u', 'r', 's', 'e', '.'],
 ' chapter': [' ', 'c', 'h', 'a', 'p', 't', 'e', 'r'],
 ' about': [' ', 'a', 'b', 'o', 'u', 't'],
 ' tokenization.': [' ',
  't',
  'o',
  'k',
  'e',
  'n',
  'i',
  'z',
  'a',
  't',
  'i',
  'o',
  'n',
  '.'],
 ' section': [' ', 's', 'e', 'c', 't', 'i', 'o', 'n'],
 ' shows': [' ', 's', 'h', 'o', 'w', 's'],
 ' several': [' ', 's', 'e', 'v', 'e', 'r', 'a', 'l'],
 ' tokenizer': [' ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'e', 'r'],
 ' algorithms.': [' ', 'a', 'l', 'g', 'o', 'r', 'i', 't', 'h', 'm', 's', '.'],
 'Hopefully,': ['H', 'o', 'p', 'e', 'f', 'u', 'l', 'l', 'y', ','],
 ' you': [' ', 'y', 'o', 'u'],
 ' will': [' ', 'w', 'i', 'l', 'l'],
 ' be': [' ', 'b', 'e'],
 ' able': [' ', 'a', 'b', 'l', 'e'],
 ' to': [' ', 't', 'o'],
 ' understand': [' ', 'u', 'n'

In [8]:
def compute_pair_freqs(splits):
    pair_freqs = defaultdict(int)
    for word, freq in word_freq.items():
        chars = splits[word]
        if len(chars) == 1:
            continue
        for i in range(len(chars) - 1):
            pair_freqs[(chars[i], chars[i + 1])] += freq
    return pair_freqs

In [9]:
pair_freqs = compute_pair_freqs(splits)

In [10]:
for i, key in enumerate(pair_freqs.keys()):
    print(f'{key}: {pair_freqs[key]}')
    if i >= 5:
        break

('T', 'h'): 3
('h', 'i'): 3
('i', 's'): 5
(' ', 'i'): 2
(' ', 't'): 7
('t', 'h'): 3


In [11]:
best_pair = ""
max_freq = None

for pair, freq in pair_freqs.items():
    if max_freq is None or max_freq < freq:
        best_pair = pair
        max_freq = freq

print(best_pair, max_freq)

(' ', 't') 7


In [12]:
def merge_pair(a, b, splits):
    for word in word_freq:
        split = splits[word]
        if len(split) == 1:
            continue

        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                split = split[:i] + [a + b] + split[i + 2:]
            else:
                i += 1
        splits[word] = split
    return splits

In [13]:
## create a vocab size of 50
vocab_size = 100
merges = {}

while len(vocab) < vocab_size:
    pair_freqs = compute_pair_freqs(splits)
    best_pair = ""
    max_freq = None
    for pair, freq in pair_freqs.items():
        if max_freq is None or max_freq < freq:
            best_pair = pair
            max_freq = freq
    splits = merge_pair(best_pair[0], best_pair[1], splits)
    merges[best_pair] = best_pair[0] + best_pair[1]
    vocab.append(best_pair[0] + best_pair[1])
    

In [14]:
merges

{(' ', 't'): ' t',
 ('i', 's'): 'is',
 ('e', 'r'): 'er',
 (' ', 'a'): ' a',
 (' t', 'o'): ' to',
 ('e', 'n'): 'en',
 ('T', 'h'): 'Th',
 ('Th', 'is'): 'This',
 ('o', 'u'): 'ou',
 ('s', 'e'): 'se',
 (' to', 'k'): ' tok',
 (' tok', 'en'): ' token',
 ('n', 'd'): 'nd',
 (' ', 'is'): ' is',
 (' t', 'h'): ' th',
 (' th', 'e'): ' the',
 ('i', 'n'): 'in',
 (' a', 'b'): ' ab',
 (' token', 'i'): ' tokeni',
 (' tokeni', 'z'): ' tokeniz',
 ('a', 't'): 'at',
 ('i', 'o'): 'io',
 ('io', 'n'): 'ion',
 (' ', 'se'): ' se',
 ('h', 'o'): 'ho',
 ('ho', 'w'): 'how',
 ('s', '.'): 's.',
 ('l', 'l'): 'll',
 (' ', 'H'): ' H',
 (' H', 'u'): ' Hu',
 (' Hu', 'g'): ' Hug',
 (' Hug', 'g'): ' Hugg',
 (' Hugg', 'in'): ' Huggin',
 (' Huggin', 'g'): ' Hugging',
 (' ', 'F'): ' F',
 (' F', 'a'): ' Fa',
 (' Fa', 'c'): ' Fac',
 (' Fac', 'e'): ' Face',
 (' ', 'C'): ' C',
 (' C', 'ou'): ' Cou',
 (' Cou', 'r'): ' Cour',
 (' Cour', 'se'): ' Course',
 (' Course', '.'): ' Course.',
 (' ', 'c'): ' c',
 (' c', 'h'): ' ch',
 (' ch', 

In [15]:
print(vocab)

['<pad>', ' ', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', ' t', 'is', 'er', ' a', ' to', 'en', 'Th', 'This', 'ou', 'se', ' tok', ' token', 'nd', ' is', ' th', ' the', 'in', ' ab', ' tokeni', ' tokeniz', 'at', 'io', 'ion', ' se', 'ho', 'how', 's.', 'll', ' H', ' Hu', ' Hug', ' Hugg', ' Huggin', ' Hugging', ' F', ' Fa', ' Fac', ' Face', ' C', ' Cou', ' Cour', ' Course', ' Course.', ' c', ' ch', ' cha', ' chap', ' chapt', ' chapter', ' abou', ' about', ' tokenizat', ' tokenization', ' tokenization.', ' sec', ' sect', ' section', ' s', ' show', ' shows', ' sev', ' sever', ' severa', ' several', ' tokenizer', ' al', ' alg', ' algo', ' algor']


In [16]:
text = 'Hello <pad> world! This is not a tokenizer!'

pre_tokenize_result = pre_tokenize(text, ['<pad>'])
pre_tokenized_text  = [word for word in pre_tokenize_result]
splits =[[word] if any(substring in word for substring in ['<pad>']) else [l for l in word] for word in pre_tokenized_text]

for pair, merge in merges.items():
    for idx, split in enumerate(splits):
        i = 0
        while i < len(split) - 1:
            if split[i] == pair[0] and split[i + 1] == pair[1]:
                split = split[:i] + [merge] + split[i + 2:]
            else:
                i += 1
        splits[idx] = split

splits

[['H', 'e', 'll', 'o'],
 [' '],
 ['<pad>'],
 [' ', 'w', 'o', 'r', 'l', 'd', '!'],
 [' ', 'This'],
 [' is'],
 [' ', 'n', 'o', 't'],
 [' a'],
 [' tokenizer', '!']]

In [17]:
print(''.join(sum(splits,[])))

Hello <pad> world! This is not a tokenizer!


In [18]:
pre_tokenize(text, ['<pad>'])

['Hello', ' ', '<pad>', ' world!', ' This', ' is', ' not', ' a', ' tokenizer!']

In [19]:
def tokenize(text):

    pre_tokenize_result = pre_tokenize(text, special_tokens)
    pre_tokenized_text  = [word for word in pre_tokenize_result]
    splits =[[word] if any(substring in word for substring in ['<pad>']) else [l for l in word] for word in pre_tokenized_text]

    for pair, merge in merges.items():
        for idx, split in enumerate(splits):
            i = 0
            while i < len(split) - 1:
                if split[i] == pair[0] and split[i + 1] == pair[1]:
                    split = split[:i] + [merge] + split[i + 2:]
                else:
                    i += 1
            splits[idx] = split

    return sum(splits,[])


In [20]:
print(tokenize('Hello <pad> world. This is not a tokenizer.'))

['H', 'e', 'll', 'o', ' ', '<pad>', ' ', 'w', 'o', 'r', 'l', 'd', '.', ' ', 'This', ' is', ' ', 'n', 'o', 't', ' a', ' tokenizer', '.']


In [21]:
chtoi = {ch:i for i, ch in enumerate(vocab)}
itoch = {i:ch for i, ch in enumerate(vocab)}

In [22]:
[chtoi[ch] for ch in tokenize('Hello <pad> world. This is not a tokenizer.')]


[6,
 12,
 58,
 21,
 1,
 0,
 1,
 28,
 21,
 23,
 18,
 11,
 3,
 1,
 38,
 44,
 1,
 20,
 21,
 25,
 34,
 95,
 3]

In [23]:
''.join([itoch[i] for i in [chtoi[ch] for ch in tokenize('Hello <pad> world. This is not a tokenizer.')]])

'Hello <pad> world. This is not a tokenizer.'

In [24]:
[chtoi[ch] for ch in tokenize('Hello <pad> world. This is not a tokenizer.')]

[6,
 12,
 58,
 21,
 1,
 0,
 1,
 28,
 21,
 23,
 18,
 11,
 3,
 1,
 38,
 44,
 1,
 20,
 21,
 25,
 34,
 95,
 3]

In [25]:
sample = [chtoi[ch] for ch in tokenize('Hello <pad> world. This is not a tokenizer.')]

In [30]:
import torch

torch.where(torch.tensor(sample) == 0, 0, 1)

tensor([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [None]:
### now write a python function that will return 2 things
### 1. the tokenized text e.g. [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
### 2. the attention mask e.g. [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
### 3. also create a decoder


In [158]:
import re

def pre_tokenize(text, special_tokens):
    # Escape special tokens for regex
    escaped_tokens = [re.escape(token) for token in special_tokens]
    # Create a regex pattern to match special tokens
    special_tokens_pattern = '|'.join(escaped_tokens)
    
    # Split text using the pattern, keeping special tokens intact
    parts = re.split(f'({special_tokens_pattern})', text)
    
    # Process parts to combine spaces with non-special tokens
    tokens = []
    for part in parts:
        if part in special_tokens:
            tokens.append(part)
        else:
            # Find words and spaces, and combine them
            sub_tokens = re.findall(r'\s*\S+|\s+', part)
            tokens.extend(sub_tokens)
    
    return tokens

# Example usage
input_text = "<pad> hello world [CLS] test [SEP]"

tokens = pre_tokenize(input_text, special_tokens)

print("Input text:", input_text)
print("Tokens:", tokens)


Input text: <pad> hello world [CLS] test [SEP]
Tokens: ['<pad>', ' hello', ' world', ' ', '[CLS]', ' test', ' ', '[SEP]']


In [1]:
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2",
                                          cache_dir = "/data/bobby/huggingface-cache/models")



tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [46]:
from collections import defaultdict

word_freq = defaultdict(int)

for text in corpus:
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    new_words = [word for word, offset in words_with_offsets]
    for word in new_words:
        word_freq[word] += 1

print(word_freq)

defaultdict(<class 'int'>, {'This': 3, 'Ġis': 2, 'Ġthe': 1, 'ĠHugging': 1, 'ĠFace': 1, 'ĠCourse': 1, '.': 4, 'Ġchapter': 1, 'Ġabout': 1, 'Ġtokenization': 1, 'Ġsection': 1, 'Ġshows': 1, 'Ġseveral': 1, 'Ġtokenizer': 1, 'Ġalgorithms': 1, 'Hopefully': 1, ',': 1, 'Ġyou': 1, 'Ġwill': 1, 'Ġbe': 1, 'Ġable': 1, 'Ġto': 1, 'Ġunderstand': 1, 'Ġhow': 1, 'Ġthey': 1, 'Ġare': 1, 'Ġtrained': 1, 'Ġand': 1, 'Ġgenerate': 1, 'Ġtokens': 1})


In [47]:
alphabet = []

for word in word_freq.keys():
    for letter in word:
        if letter not in alphabet:
            alphabet.append(letter)

alphabet.sort()

print(alphabet)

[',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'Ġ']


In [48]:
vocab = ['<|endoftext|>'] + alphabet.copy()
print(vocab)

['<|endoftext|>', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'Ġ']


In [49]:
## time to split the words into individual characters before 
## performing the BPE tokenization

splits = {word: [c for c in word] for word in word_freq.keys()}
splits

{'This': ['T', 'h', 'i', 's'],
 'Ġis': ['Ġ', 'i', 's'],
 'Ġthe': ['Ġ', 't', 'h', 'e'],
 'ĠHugging': ['Ġ', 'H', 'u', 'g', 'g', 'i', 'n', 'g'],
 'ĠFace': ['Ġ', 'F', 'a', 'c', 'e'],
 'ĠCourse': ['Ġ', 'C', 'o', 'u', 'r', 's', 'e'],
 '.': ['.'],
 'Ġchapter': ['Ġ', 'c', 'h', 'a', 'p', 't', 'e', 'r'],
 'Ġabout': ['Ġ', 'a', 'b', 'o', 'u', 't'],
 'Ġtokenization': ['Ġ',
  't',
  'o',
  'k',
  'e',
  'n',
  'i',
  'z',
  'a',
  't',
  'i',
  'o',
  'n'],
 'Ġsection': ['Ġ', 's', 'e', 'c', 't', 'i', 'o', 'n'],
 'Ġshows': ['Ġ', 's', 'h', 'o', 'w', 's'],
 'Ġseveral': ['Ġ', 's', 'e', 'v', 'e', 'r', 'a', 'l'],
 'Ġtokenizer': ['Ġ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'e', 'r'],
 'Ġalgorithms': ['Ġ', 'a', 'l', 'g', 'o', 'r', 'i', 't', 'h', 'm', 's'],
 'Hopefully': ['H', 'o', 'p', 'e', 'f', 'u', 'l', 'l', 'y'],
 ',': [','],
 'Ġyou': ['Ġ', 'y', 'o', 'u'],
 'Ġwill': ['Ġ', 'w', 'i', 'l', 'l'],
 'Ġbe': ['Ġ', 'b', 'e'],
 'Ġable': ['Ġ', 'a', 'b', 'l', 'e'],
 'Ġto': ['Ġ', 't', 'o'],
 'Ġunderstand': ['Ġ', 'u', 'n'

In [50]:
def compute_pair_freqs(splits):
    pair_freqs = defaultdict(int)
    for word, freq in word_freq.items():
        chars = splits[word]
        if len(chars) == 1:
            continue
        for i in range(len(chars) - 1):
            pair_freqs[(chars[i], chars[i + 1])] += freq
    return pair_freqs

In [51]:
pair_freqs = compute_pair_freqs(splits)

In [52]:
for i, key in enumerate(pair_freqs.keys()):
    print(f'{key}: {pair_freqs[key]}')
    if i >= 5:
        break

('T', 'h'): 3
('h', 'i'): 3
('i', 's'): 5
('Ġ', 'i'): 2
('Ġ', 't'): 7
('t', 'h'): 3


In [53]:
best_pair = ""
max_freq = None

for pair, freq in pair_freqs.items():
    if max_freq is None or max_freq < freq:
        best_pair = pair
        max_freq = freq

print(best_pair, max_freq)

('Ġ', 't') 7


In [54]:
def merge_pair(a, b, splits):
    for word in word_freq:
        split = splits[word]
        if len(split) == 1:
            continue

        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                split = split[:i] + [a + b] + split[i + 2:]
            else:
                i += 1
        splits[word] = split
    return splits

In [55]:
merge_pair("Ġ", "t", splits)

{'This': ['T', 'h', 'i', 's'],
 'Ġis': ['Ġ', 'i', 's'],
 'Ġthe': ['Ġt', 'h', 'e'],
 'ĠHugging': ['Ġ', 'H', 'u', 'g', 'g', 'i', 'n', 'g'],
 'ĠFace': ['Ġ', 'F', 'a', 'c', 'e'],
 'ĠCourse': ['Ġ', 'C', 'o', 'u', 'r', 's', 'e'],
 '.': ['.'],
 'Ġchapter': ['Ġ', 'c', 'h', 'a', 'p', 't', 'e', 'r'],
 'Ġabout': ['Ġ', 'a', 'b', 'o', 'u', 't'],
 'Ġtokenization': ['Ġt',
  'o',
  'k',
  'e',
  'n',
  'i',
  'z',
  'a',
  't',
  'i',
  'o',
  'n'],
 'Ġsection': ['Ġ', 's', 'e', 'c', 't', 'i', 'o', 'n'],
 'Ġshows': ['Ġ', 's', 'h', 'o', 'w', 's'],
 'Ġseveral': ['Ġ', 's', 'e', 'v', 'e', 'r', 'a', 'l'],
 'Ġtokenizer': ['Ġt', 'o', 'k', 'e', 'n', 'i', 'z', 'e', 'r'],
 'Ġalgorithms': ['Ġ', 'a', 'l', 'g', 'o', 'r', 'i', 't', 'h', 'm', 's'],
 'Hopefully': ['H', 'o', 'p', 'e', 'f', 'u', 'l', 'l', 'y'],
 ',': [','],
 'Ġyou': ['Ġ', 'y', 'o', 'u'],
 'Ġwill': ['Ġ', 'w', 'i', 'l', 'l'],
 'Ġbe': ['Ġ', 'b', 'e'],
 'Ġable': ['Ġ', 'a', 'b', 'l', 'e'],
 'Ġto': ['Ġt', 'o'],
 'Ġunderstand': ['Ġ', 'u', 'n', 'd', 'e', 'r', '

In [56]:
## create a vocab size of 50
vocab_size = 50
merges = {}

while len(vocab) < vocab_size:
    pair_freqs = compute_pair_freqs(splits)
    best_pair = ""
    max_freq = None
    for pair, freq in pair_freqs.items():
        if max_freq is None or max_freq < freq:
            best_pair = pair
            max_freq = freq
    splits = merge_pair(best_pair[0], best_pair[1], splits)
    merges[best_pair] = best_pair[0] + best_pair[1]
    vocab.append(best_pair[0] + best_pair[1])
    

In [57]:
merges

{('i', 's'): 'is',
 ('e', 'r'): 'er',
 ('Ġ', 'a'): 'Ġa',
 ('Ġt', 'o'): 'Ġto',
 ('e', 'n'): 'en',
 ('T', 'h'): 'Th',
 ('Th', 'is'): 'This',
 ('o', 'u'): 'ou',
 ('s', 'e'): 'se',
 ('Ġto', 'k'): 'Ġtok',
 ('Ġtok', 'en'): 'Ġtoken',
 ('n', 'd'): 'nd',
 ('Ġ', 'is'): 'Ġis',
 ('Ġt', 'h'): 'Ġth',
 ('Ġth', 'e'): 'Ġthe',
 ('i', 'n'): 'in',
 ('Ġa', 'b'): 'Ġab',
 ('Ġtoken', 'i'): 'Ġtokeni',
 ('Ġtokeni', 'z'): 'Ġtokeniz'}

In [58]:
print(vocab)

['<|endoftext|>', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'Ġ', 'is', 'er', 'Ġa', 'Ġto', 'en', 'Th', 'This', 'ou', 'se', 'Ġtok', 'Ġtoken', 'nd', 'Ġis', 'Ġth', 'Ġthe', 'in', 'Ġab', 'Ġtokeni', 'Ġtokeniz']


In [59]:
splits

{'This': ['This'],
 'Ġis': ['Ġis'],
 'Ġthe': ['Ġthe'],
 'ĠHugging': ['Ġ', 'H', 'u', 'g', 'g', 'in', 'g'],
 'ĠFace': ['Ġ', 'F', 'a', 'c', 'e'],
 'ĠCourse': ['Ġ', 'C', 'ou', 'r', 'se'],
 '.': ['.'],
 'Ġchapter': ['Ġ', 'c', 'h', 'a', 'p', 't', 'er'],
 'Ġabout': ['Ġab', 'ou', 't'],
 'Ġtokenization': ['Ġtokeniz', 'a', 't', 'i', 'o', 'n'],
 'Ġsection': ['Ġ', 'se', 'c', 't', 'i', 'o', 'n'],
 'Ġshows': ['Ġ', 's', 'h', 'o', 'w', 's'],
 'Ġseveral': ['Ġ', 'se', 'v', 'er', 'a', 'l'],
 'Ġtokenizer': ['Ġtokeniz', 'er'],
 'Ġalgorithms': ['Ġa', 'l', 'g', 'o', 'r', 'i', 't', 'h', 'm', 's'],
 'Hopefully': ['H', 'o', 'p', 'e', 'f', 'u', 'l', 'l', 'y'],
 ',': [','],
 'Ġyou': ['Ġ', 'y', 'ou'],
 'Ġwill': ['Ġ', 'w', 'i', 'l', 'l'],
 'Ġbe': ['Ġ', 'b', 'e'],
 'Ġable': ['Ġab', 'l', 'e'],
 'Ġto': ['Ġto'],
 'Ġunderstand': ['Ġ', 'u', 'nd', 'er', 's', 't', 'a', 'nd'],
 'Ġhow': ['Ġ', 'h', 'o', 'w'],
 'Ġthey': ['Ġthe', 'y'],
 'Ġare': ['Ġa', 'r', 'e'],
 'Ġtrained': ['Ġt', 'r', 'a', 'in', 'e', 'd'],
 'Ġand': ['Ġa', 'nd

In [60]:
text = 'This is not a token <|endoftext|>'

pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
pre_tokenized_text  = [word for word, offset in pre_tokenize_result]
splits = [[l for l in word] for word in pre_tokenized_text]

for pair, merge in merges.items():
    for idx, split in enumerate(splits):
        i = 0
        while i < len(split) - 1:
            if split[i] == pair[0] and split[i + 1] == pair[1]:
                split = split[:i] + [merge] + split[i + 2:]
            else:
                i += 1
        splits[idx] = split



In [61]:
splits

[['This'],
 ['Ġis'],
 ['Ġ', 'n', 'o', 't'],
 ['Ġa'],
 ['Ġ', 't', 'o', 'k', 'en'],
 ['Ġ', '<', '|'],
 ['en', 'd', 'o', 'f', 't', 'e', 'x', 't'],
 ['|', '>']]

In [62]:
vocab

['<|endoftext|>',
 ',',
 '.',
 'C',
 'F',
 'H',
 'T',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'y',
 'z',
 'Ġ',
 'is',
 'er',
 'Ġa',
 'Ġto',
 'en',
 'Th',
 'This',
 'ou',
 'se',
 'Ġtok',
 'Ġtoken',
 'nd',
 'Ġis',
 'Ġth',
 'Ġthe',
 'in',
 'Ġab',
 'Ġtokeni',
 'Ġtokeniz']

In [69]:
pre_tokenize_result

[('This', (0, 4)),
 ('Ġis', (4, 7)),
 ('Ġnot', (7, 11)),
 ('Ġa', (11, 13)),
 ('Ġtoken', (13, 19)),
 ('Ġ<|', (19, 22)),
 ('endoftext', (22, 31)),
 ('|>', (31, 33))]

In [72]:
import re

# Define special tokens
special_tokens = ['<pad>']

# Define basic vocabulary including special tokens and alphabets
vocab = special_tokens + alphabet.copy()

vocab


['<pad>',
 ',',
 '.',
 'C',
 'F',
 'H',
 'T',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'y',
 'z',
 'Ġ']

In [73]:
def pre_tokenize(text, special_tokens):
    # Escape special tokens for regex
    escaped_tokens = [re.escape(token) for token in special_tokens]
    # Create a regex pattern to match special tokens
    special_tokens_pattern = '|'.join(escaped_tokens)
    
    # Regex pattern to split by special tokens or spaces
    # Use capturing groups to preserve delimiters in the output
    pattern = f'({special_tokens_pattern})|( )'
    
    # Split text using the pattern
    tokens = re.split(pattern, text)
    
    # Filter out None values but keep empty strings (which represent spaces)
    tokens = [token for token in tokens if token is not None]
    
    return tokens


In [74]:
# Example input string with a special token and spaces
input_text = "<pad> hello world"

# Pre-tokenize the input string
tokens = pre_tokenize(input_text, ['<pad>'])

In [75]:
tokens

['', '<pad>', '', ' ', 'hello', ' ', 'world']

In [83]:
import re

# Example special tokens
special_tokens = ['<pad>', '[CLS]', '[SEP]', '[MASK]']

# Example vocabulary and merges (for demonstration purposes)
vocab = special_tokens + list('abcdefghijklmnopqrstuvwxyz')
merges = {('l', 'o'): 'lo', ('w', 'o'): 'wo', ('lo', 'r'): 'lor', ('r', 'l'): 'rl'}

def pre_tokenize(text, special_tokens):
    # Escape special tokens for regex
    escaped_tokens = [re.escape(token) for token in special_tokens]
    # Create a regex pattern to match special tokens or spaces
    special_tokens_pattern = '|'.join(escaped_tokens)
    
    # Split text using special tokens
    tokens = re.split(f'({special_tokens_pattern})', text)
    
    # Further split by spaces but keep the special tokens intact
    result = []
    for token in tokens:
        if token in special_tokens:
            result.append(token)
        else:
            # Split by space and keep spaces as separate tokens
            sub_tokens = re.split(r'(\s+)', token)
            result.extend([sub_token for sub_token in sub_tokens if sub_token])

    return result


def tokenize(text):
    pre_tokenized_text = pre_tokenize(text, special_tokens)
    
    # Split tokens into characters, but preserve special tokens
    splits = []
    for word in pre_tokenized_text:
        if word in special_tokens or word == ' ':
            splits.append([word])
        else:
            splits.append([char for char in word])

    # Apply BPE merges
    for pair, merge in merges.items():
        for idx, split in enumerate(splits):
            if split[0] in special_tokens or split[0] == ' ':
                continue
            i = 0
            while i < len(split) - 1:
                if split[i] == pair[0] and split[i + 1] == pair[1]:
                    split = split[:i] + [merge] + split[i + 2 :]
                else:
                    i += 1
            splits[idx] = split

    # Flatten the list of lists
    return sum(splits, [])

# Example usage
input_text = "<pad> hello world"

tokens = tokenize(input_text)

print("Input text:", input_text)
print("Tokens:", tokens)


Input text: <pad> hello world
Tokens: ['<pad>', ' ', 'h', 'e', 'l', 'lo', ' ', 'wo', 'rl', 'd']


In [79]:
pre_tokenized_text = pre_tokenize(input_text, special_tokens)

In [82]:
pre_tokenized_text

['', '<pad>', '', ' ', 'hello', ' ', 'world']

In [80]:
splits = []
for word in pre_tokenized_text:
    if word in special_tokens or word == ' ':
        splits.append([word])
    else:
        splits.append([char for char in word])

In [81]:
splits

[[],
 ['<pad>'],
 [],
 [' '],
 ['h', 'e', 'l', 'l', 'o'],
 [' '],
 ['w', 'o', 'r', 'l', 'd']]

In [65]:
from transformers import PreTrainedTokenizer

# Define special tokens
special_tokens_test = ['[CLS]', '[SEP]', '[PAD]', '[MASK]']

# Define a small vocabulary for BPE tokenization
vocab_test = {
    '[CLS]': 0,
    '[SEP]': 1,
    '[PAD]': 2,
    '[MASK]': 3,
    'Hello': 4,
    'world': 5,
    ',': 6,
    '!': 7,
    'H': 8,
    'e': 9,
    'l': 10,
    'o': 11,
    'w': 12,
    'r': 13,
    'd': 14,
    'lo': 15,
    'ld': 16,
    'Hell': 17,
    'wor': 18,
    'orl': 19,
    'rl': 20
}

In [84]:
import re

def pre_tokenize(text, special_tokens):
    # Escape special tokens for regex
    escaped_tokens = [re.escape(token) for token in special_tokens]
    # Create a regex pattern to match special tokens
    special_tokens_pattern = '|'.join(escaped_tokens)
    
    # Pattern to match special tokens, words, and spaces
    pattern = f'({special_tokens_pattern})|(\\s+)|(\\S+)'
    
    # Split text using the pattern
    tokens = re.findall(pattern, text)
    
    # Flatten the list of tuples returned by re.findall and filter out empty strings
    tokens = [token for group in tokens for token in group if token]
    
    return tokens

# Example usage
input_text = "<pad> hello world [CLS] test [SEP]"

tokens = pre_tokenize(input_text, special_tokens)

print("Input text:", input_text)
print("Tokens:", tokens)


Input text: <pad> hello world [CLS] test [SEP]
Tokens: ['<pad>', ' ', 'hello', ' ', 'world', ' ', '[CLS]', ' ', 'test', ' ', '[SEP]']


In [86]:
import re

def pre_tokenize(text, special_tokens):
    # Escape special tokens for regex
    escaped_tokens = [re.escape(token) for token in special_tokens]
    # Create a regex pattern to match special tokens
    special_tokens_pattern = '|'.join(escaped_tokens)
    
    # Pattern to match special tokens or sequences of non-whitespace characters preceded by optional spaces
    pattern = f'({special_tokens_pattern})|(\\s*\\S+)'
    
    # Split text using the pattern
    tokens = re.findall(pattern, text)
    
    # Flatten the list of tuples returned by re.findall and filter out empty strings
    tokens = [token for group in tokens for token in group if token]
    
    return tokens

# Example usage
input_text = "<pad> hello world [CLS] test [SEP] hello world"

tokens = pre_tokenize(input_text, special_tokens)

print("Input text:", input_text)
print("Tokens:", tokens)


Input text: <pad> hello world [CLS] test [SEP] hello world
Tokens: ['<pad>', ' hello', ' world', ' [CLS]', ' test', ' [SEP]', ' hello', ' world']


In [1]:
with open('/home/bobby/code-repo/astar-projects/project-smallville/data/input.txt', 'r') as f:
    text = f.read()

In [151]:
class character:
    def __init__(self):
        with open('/home/bobby/code-repo/astar-projects/project-smallville/data/input.txt', 'r') as f:
            self.text = f.read()
        self.char = sorted(list(set(self.text)))
        self.vocab = {ch:i for i,ch in enumerate(self.char)} ## char to id
        # self.vocab_size = len(self.vocab)

        self.special_tokens = [
            'pad_token',
            'mask_token'
        ]
        self.pad_token = None
        self.pad_token_id = None
        self.mask_token = None
        self.mask_token_id = None
    
    def add_special_tokens(self, dict):
        '''
        This function will add special tokens to the vocabulary.
        '''
        assert set(dict.keys()).issubset(self.special_tokens), 'Invalid special tokens'

        ## get the current vocab size at the current point in time
        current_vocab_size = len(self.vocab)

        ## add the special tokens to the vocab
        for token in dict.keys():
            self.vocab[dict[token]] = current_vocab_size
            current_vocab_size += 1
        
        ## update the special tokens
        self.pad_token = dict['pad_token']
        self.pad_token_id = self.vocab[self.pad_token]
        self.mask_token = dict['mask_token']
        self.mask_token_id = self.vocab[self.mask_token]

    @property
    def vocab_size(self):
        return len(self.vocab)

    def decode(self, ids):
        '''
        Given a list of ids, this function will return the decoded text.
        '''
        idx_to_char = {i:ch for ch,i in self.vocab.items()}
        text = ''.join([idx_to_char[i] for i in ids])
        return text        


    def encode(self, text):
        '''
        Given a text, this function will return the encoded ids.
        '''
        tokens = []
        # words = text.split()
        # for word in words:
        #     if word in [self.pad_token, self.mask_token]:
        #         tokens.append(word)
        #     else:
        #         tokens.extend(list(word))

        i = 0

        while i < len(text):
            # Check if the current substring matches any special token
            matched_special_token = False
            for special_token in [self.pad_token, self.mask_token]:
                if text[i:i+len(special_token)] == special_token:
                    tokens.append(special_token)
                    i += len(special_token)
                    matched_special_token = True
                    break

            if not matched_special_token:
                # If not a special token, tokenize character by character
                tokens.append(text[i])
                i += 1
                
        out = [self.vocab[ch] for ch in tokens]
        return out


In [152]:
tokenizer = character()

In [153]:
tokenizer.vocab_size

65

In [154]:
tokenizer.decode(tokenizer.encode('feef defef'))

TypeError: object of type 'NoneType' has no len()

In [155]:
tokenizer.add_special_tokens({'pad_token':'<pad>', 'mask_token':'<mask>'})

In [156]:
tokenizer.mask_token_id

66

In [157]:
tokenizer.vocab

{'\n': 0,
 ' ': 1,
 '!': 2,
 '$': 3,
 '&': 4,
 "'": 5,
 ',': 6,
 '-': 7,
 '.': 8,
 '3': 9,
 ':': 10,
 ';': 11,
 '?': 12,
 'A': 13,
 'B': 14,
 'C': 15,
 'D': 16,
 'E': 17,
 'F': 18,
 'G': 19,
 'H': 20,
 'I': 21,
 'J': 22,
 'K': 23,
 'L': 24,
 'M': 25,
 'N': 26,
 'O': 27,
 'P': 28,
 'Q': 29,
 'R': 30,
 'S': 31,
 'T': 32,
 'U': 33,
 'V': 34,
 'W': 35,
 'X': 36,
 'Y': 37,
 'Z': 38,
 'a': 39,
 'b': 40,
 'c': 41,
 'd': 42,
 'e': 43,
 'f': 44,
 'g': 45,
 'h': 46,
 'i': 47,
 'j': 48,
 'k': 49,
 'l': 50,
 'm': 51,
 'n': 52,
 'o': 53,
 'p': 54,
 'q': 55,
 'r': 56,
 's': 57,
 't': 58,
 'u': 59,
 'v': 60,
 'w': 61,
 'x': 62,
 'y': 63,
 'z': 64,
 '<pad>': 65,
 '<mask>': 66}

In [158]:
tokenizer.vocab_size

67

In [161]:
tokenizer.decode(tokenizer.encode('ewfeef <pad>'))

'ewfeef <pad>'

In [150]:
tokenizer.decode(tokenizer.encode('feef <pad>'))

KeyError: '<'

In [47]:
class CustomTokenizer:
    def __init__(self):
        self.vocab = {
            "hello": 0,
            "world": 1,
            "I": 2,
            "am": 3,
            "a": 4,
            "tokenizer": 5
        }
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        self.pad_token = None
        self.mask_token = None
        self.pad_token_id = None
        self.mask_token_id = None
        self.vocab_len = len(self.vocab)

    def add_special_tokens(self, pad_token, mask_token):
        # Add special tokens to the vocabulary
        original_vocab_size = len(self.vocab)
        self.vocab[pad_token] = original_vocab_size
        self.vocab[mask_token] = original_vocab_size + 1

        # Update inverse vocabulary
        self.inv_vocab[original_vocab_size] = pad_token
        self.inv_vocab[original_vocab_size + 1] = mask_token

        # Store special tokens and their IDs
        self.pad_token = pad_token
        self.mask_token = mask_token
        self.pad_token_id = self.vocab[pad_token]
        self.mask_token_id = self.vocab[mask_token]

    def tokenize(self, text):
        # Use updated vocabulary to tokenize text
        return [self.vocab.get(token) for token in text.split()]

    def convert_ids_to_tokens(self, ids):
        return [self.inv_vocab.get(id, "<unk>") for id in ids]

In [48]:
test_tokenizer = CustomTokenizer()

In [49]:
test_tokenizer.vocab

{'hello': 0, 'world': 1, 'I': 2, 'am': 3, 'a': 4, 'tokenizer': 5}

In [50]:
test_tokenizer.tokenize('hello <pad> world')

[0, None, 1]

In [52]:
test_tokenizer.vocab_len

6

In [53]:
test_tokenizer.add_special_tokens('<pad>', '<mask>')

In [54]:
test_tokenizer.tokenize('hello <pad> world')

[0, 6, 1]

In [55]:
test_tokenizer.vocab_len

6

In [56]:
len(test_tokenizer.vocab)

8

In [57]:
class BPETokenizer:
    def __init__(self, vocab):
        self.vocab = vocab

    def tokenize(self, text):
        # Step 2: Tokenization
        subwords = self._bpe_tokenization(text)

        # Step 3: Subword Units to Token IDs
        token_ids = [self.vocab[subword] for subword in subwords]

        return token_ids

    def _bpe_tokenization(self, text):
        # Placeholder BPE tokenization method
        # Replace this with your BPE tokenization logic
        # For demonstration, let's split words into characters
        return list(text)

# Step 1: Initialize Vocabulary
# Example vocabulary mapping subword units to token IDs
vocab = {
    "This": 0,
    "is": 1,
    "a": 2,
    "B": 3,
    "P": 4,
    "E": 5,
    "token": 6,
    "izer": 7,
    "example": 8
}

# Initialize BPE tokenizer
tokenizer = BPETokenizer(vocab)

# Input text
text = "This is a BPE tokenizer example"

# Step 4: Output
# Encode text
encoded_text = tokenizer.tokenize(text)
print("Encoded Text:", encoded_text)


KeyError: 'T'

In [58]:
list("This is a BPE tokenizer example")

['T',
 'h',
 'i',
 's',
 ' ',
 'i',
 's',
 ' ',
 'a',
 ' ',
 'B',
 'P',
 'E',
 ' ',
 't',
 'o',
 'k',
 'e',
 'n',
 'i',
 'z',
 'e',
 'r',
 ' ',
 'e',
 'x',
 'a',
 'm',
 'p',
 'l',
 'e']

In [60]:
tokenizer = character()

In [67]:
tokenizer.decode(tokenizer.encode('feef'))

'feef'

In [43]:
chars = sorted(list(set(text)))
vocab = {ch:i for i,ch in enumerate(chars)} ## char to id
idx_to_char = {i:ch for ch,i in vocab.items()} ## id to char

In [47]:
vocabzz

{'\n': 0,
 ' ': 1,
 '!': 2,
 '$': 3,
 '&': 4,
 "'": 5,
 ',': 6,
 '-': 7,
 '.': 8,
 '3': 9,
 ':': 10,
 ';': 11,
 '?': 12,
 'A': 13,
 'B': 14,
 'C': 15,
 'D': 16,
 'E': 17,
 'F': 18,
 'G': 19,
 'H': 20,
 'I': 21,
 'J': 22,
 'K': 23,
 'L': 24,
 'M': 25,
 'N': 26,
 'O': 27,
 'P': 28,
 'Q': 29,
 'R': 30,
 'S': 31,
 'T': 32,
 'U': 33,
 'V': 34,
 'W': 35,
 'X': 36,
 'Y': 37,
 'Z': 38,
 'a': 39,
 'b': 40,
 'c': 41,
 'd': 42,
 'e': 43,
 'f': 44,
 'g': 45,
 'h': 46,
 'i': 47,
 'j': 48,
 'k': 49,
 'l': 50,
 'm': 51,
 'n': 52,
 'o': 53,
 'p': 54,
 'q': 55,
 'r': 56,
 's': 57,
 't': 58,
 'u': 59,
 'v': 60,
 'w': 61,
 'x': 62,
 'y': 63,
 'z': 64}

In [46]:
idx_to_char

{0: '\n',
 1: ' ',
 2: '!',
 3: '$',
 4: '&',
 5: "'",
 6: ',',
 7: '-',
 8: '.',
 9: '3',
 10: ':',
 11: ';',
 12: '?',
 13: 'A',
 14: 'B',
 15: 'C',
 16: 'D',
 17: 'E',
 18: 'F',
 19: 'G',
 20: 'H',
 21: 'I',
 22: 'J',
 23: 'K',
 24: 'L',
 25: 'M',
 26: 'N',
 27: 'O',
 28: 'P',
 29: 'Q',
 30: 'R',
 31: 'S',
 32: 'T',
 33: 'U',
 34: 'V',
 35: 'W',
 36: 'X',
 37: 'Y',
 38: 'Z',
 39: 'a',
 40: 'b',
 41: 'c',
 42: 'd',
 43: 'e',
 44: 'f',
 45: 'g',
 46: 'h',
 47: 'i',
 48: 'j',
 49: 'k',
 50: 'l',
 51: 'm',
 52: 'n',
 53: 'o',
 54: 'p',
 55: 'q',
 56: 'r',
 57: 's',
 58: 't',
 59: 'u',
 60: 'v',
 61: 'w',
 62: 'x',
 63: 'y',
 64: 'z'}

In [29]:
from transformers import AddedToken

In [31]:
check = AddedToken('<PAD>', lstrip=False, rstrip=True)
check

AddedToken("<PAD>", rstrip=True, lstrip=False, single_word=False, normalized=True, special=False)

In [36]:
check.id

AttributeError: 'tokenizers.AddedToken' object has no attribute 'id'

In [18]:
char = sorted(list(set(text)))

char.append('<pad>')
char.append('<mask>')

vocab_size = len(char)

In [19]:
vocab_size

67

In [20]:
idx_to_char = {i:ch for i, ch in enumerate(char)}
idx_to_char

{0: '\n',
 1: ' ',
 2: '!',
 3: '$',
 4: '&',
 5: "'",
 6: ',',
 7: '-',
 8: '.',
 9: '3',
 10: ':',
 11: ';',
 12: '?',
 13: 'A',
 14: 'B',
 15: 'C',
 16: 'D',
 17: 'E',
 18: 'F',
 19: 'G',
 20: 'H',
 21: 'I',
 22: 'J',
 23: 'K',
 24: 'L',
 25: 'M',
 26: 'N',
 27: 'O',
 28: 'P',
 29: 'Q',
 30: 'R',
 31: 'S',
 32: 'T',
 33: 'U',
 34: 'V',
 35: 'W',
 36: 'X',
 37: 'Y',
 38: 'Z',
 39: 'a',
 40: 'b',
 41: 'c',
 42: 'd',
 43: 'e',
 44: 'f',
 45: 'g',
 46: 'h',
 47: 'i',
 48: 'j',
 49: 'k',
 50: 'l',
 51: 'm',
 52: 'n',
 53: 'o',
 54: 'p',
 55: 'q',
 56: 'r',
 57: 's',
 58: 't',
 59: 'u',
 60: 'v',
 61: 'w',
 62: 'x',
 63: 'y',
 64: 'z',
 65: '<pad>',
 66: '<mask>'}