# Developing a simple Tokenizer from Scratch

In [11]:
# imports
import re

In [44]:
'''
    Example of a simple tokenizer that does not include in-depth preprocessing.
'''

class simpleTokenizer:
    """Minimal whitespace tokenizer that maps tokens to integer ids.

    The vocabulary is the sorted set of whitespace-separated tokens seen
    during training, followed by exactly one ``<unk>`` entry that stands in
    for out-of-vocabulary tokens. Punctuation is not split off, so
    ``"second,"`` and ``"second"`` are different tokens.

    Args:
        text: Optional training corpus. When it is not ``None`` the
            tokenizer is trained immediately.
        cased: Case-handling flag.
            NOTE(review): when ``True`` (the default) the text is
            lower-cased; when ``False`` it is left untouched. This is the
            reverse of what "cased" usually suggests, but it is preserved
            because existing callers rely on lower-casing by default.

    Attributes:
        token_to_idx: dict mapping token -> id.
        idx_to_token: dict mapping id -> token.
    """

    UNK_TOKEN = '<unk>'

    def __init__(self, text=None, cased=True):
        self.cased = cased
        # Seed the vocabulary with <unk> so that encode() on an untrained
        # tokenizer maps every token to <unk> instead of raising KeyError,
        # matching decode()'s existing fallback.
        self.token_to_idx = {self.UNK_TOKEN: 0}
        self.idx_to_token = {0: self.UNK_TOKEN}
        if text is not None:
            self.train(text)

    def preprocess(self, text):
        """Normalise raw text before tokenization.

        Lower-cases the text when ``self.cased`` is true (see the class
        NOTE about the flag's name) and collapses every run of spaces into
        a single space. Newlines are left intact.
        """
        if self.cased:
            text = text.lower()
        return re.sub(r' +', ' ', text)

    def train(self, text):
        """Build the vocabulary from ``text``, replacing any previous one.

        Prints progress information: the number of newline-separated lines
        (empty lines are counted too) and the final vocabulary size
        including ``<unk>``.
        """
        print('Training Tokenizer...')
        text = self.preprocess(text)
        lines = text.split('\n')
        print(f'Total Sentences: {len(lines)}')

        # str.split() with no argument splits on any whitespace, newlines
        # included, so this collects the same tokens as splitting per line.
        vocab = set(text.split())
        # A literal '<unk>' in the corpus must not produce a second entry:
        # it would leave token_to_idx and idx_to_token with different sizes.
        vocab.discard(self.UNK_TOKEN)
        vocab = sorted(vocab)
        vocab.append(self.UNK_TOKEN)
        print(f'Vocab size: {len(vocab)}')

        self.token_to_idx = {token: idx for idx, token in enumerate(vocab)}
        self.idx_to_token = {idx: token for idx, token in enumerate(vocab)}

    def encode(self, text):
        """Return the list of ids for the whitespace-separated tokens of ``text``.

        Tokens that are not in the vocabulary map to the id of ``<unk>``.
        """
        lookup = self.token_to_idx
        tokens = self.preprocess(text).split()
        # <unk> is looked up lazily, only when an unknown token is met.
        return [lookup[token] if token in lookup else lookup[self.UNK_TOKEN] for token in tokens]

    def decode(self, token_ids):
        """Return the space-joined tokens for ``token_ids``.

        Ids that are not in the vocabulary are rendered as ``<unk>``.
        """
        return ' '.join(self.idx_to_token.get(idx, self.UNK_TOKEN) for idx in token_ids)

In [45]:
# Training corpus for the tokenizer demo: six sentences, one per line.
# The literal starts and ends with a newline, so splitting it on '\n'
# yields two additional empty entries.
sample_text = '''
Lexical tokenization is conversion of a text into meaningful lexical tokens belonging to categories defined by a "lexer" program. 
In case of a natural language, those categories include nouns, verbs, adjectives, punctuations etc. 
In case of a programming language, the categories include identifiers, operators, grouping symbols and data types. 
Lexical tokenization is related to the type of tokenization used in large language models but with two differences. 
First, lexical tokenization is usually based on a lexical grammar, whereas LLM tokenizers are usually probability-based. 
Second, LLM tokenizers perform a second step that converts the tokens into numerical values
'''

In [46]:
# Passing the corpus to the constructor trains the tokenizer immediately
# (the constructor calls train() whenever text is not None).
tokenizer = simpleTokenizer(sample_text)

Training Tokenizer...
Total Sentences: 8
Vocab size: 66


In [47]:
# Encode one sentence into vocabulary ids; the bare trailing expression
# displays the resulting list as the cell output.
token_ids = tokenizer.encode('Second, LLM tokenizers perform a second step that converts the tokens into numerical values')
token_ids

[45, 29, 54, 38, 1, 44, 46, 49, 12, 50, 55, 23, 34, 61]

In [48]:
# Map the ids back to text. The result is lower-cased because preprocess()
# lower-cases its input when cased=True (the default).
tokenizer.decode(token_ids)

'second, llm tokenizers perform a second step that converts the tokens into numerical values'