## Workings of the new character-level tokenizer

The key feature of this tokenizer class is that it supports special tokens such as `<mask>` and `<pad>`, each of which is encoded as a single id.

In [1]:
## creating the character_new class

import re

class character_new:
    """Character-level tokenizer with reserved special tokens.

    The vocabulary is built from the distinct characters of a text file.
    Ids 0-4 are reserved for the special tokens ``<pad>``, ``<unk>``,
    ``<bos>``, ``<eos>`` and ``<mask>``; character ids start at
    ``vocab_offset``. Each special token found in the input is encoded as
    one id instead of being split into its characters.

    Note: ``vocab_size`` counts only the distinct characters. Use
    ``len(tokenizer)`` for the total number of ids, special tokens included.
    """

    ## a word with its leading whitespace, or a run of whitespace
    _WORD_OR_SPACE_RE = re.compile(r'\s*\S+|\s+')

    def __init__(self, text_path):
        """Load the text at ``text_path`` and build the id mappings.

        Args:
            text_path: path of the UTF-8 text file the character
                vocabulary is derived from.

        Raises:
            OSError: if the file cannot be opened.
        """
        ## Load the text file; the encoding is explicit so the vocabulary
        ## does not depend on the platform's default locale
        with open(text_path, 'r', encoding='utf-8') as file:
            self.text = file.read()

        ## special tokens and their reserved ids
        self.pad_token = '<pad>'
        self.unk_token = '<unk>'
        self.bos_token = '<bos>'
        self.eos_token = '<eos>'
        self.mask_token = '<mask>'
        self.special_tokens_dict = {
            self.pad_token: 0,
            self.unk_token: 1,
            self.bos_token: 2,
            self.eos_token: 3,
            self.mask_token: 4,
        }

        ## character ids are assigned in sorted order, after the special ids
        self.chars = sorted(set(self.text))
        self.vocab_size = len(self.chars)
        self.vocab_offset = max(self.special_tokens_dict.values()) + 1
        self.vocab = {
            char: index + self.vocab_offset
            for index, char in enumerate(self.chars)
        }
        self.vocab.update(self.special_tokens_dict)

        ## token -> id and id -> token lookup tables
        self.char_to_index = dict(self.vocab)
        self.index_to_char = {index: char for char, index in self.vocab.items()}

        ## one capturing group, so that re.split keeps the special tokens
        ## in its result; compiled once instead of on every call
        escaped_tokens = [re.escape(token) for token in self.special_tokens_dict]
        self._special_tokens_re = re.compile('(' + '|'.join(escaped_tokens) + ')')

    def __len__(self):
        """Return the total number of ids (characters plus special tokens)."""
        return len(self.vocab)

    def pre_tokenize(self, given_text):
        """Split ``given_text`` into special tokens and word/whitespace chunks.

        Args:
            given_text: the string to split.

        Returns:
            A list of strings. Every special token (``<mask>`` included) is
            its own item; the text in between is cut into chunks made of a
            word with its leading whitespace, or of whitespace alone.
        """
        tokens = []
        for part in self._special_tokens_re.split(given_text):
            if part in self.special_tokens_dict:
                tokens.append(part)
            else:
                ## an empty part (adjacent special tokens, or a special
                ## token at either end of the text) contributes nothing
                tokens.extend(self._WORD_OR_SPACE_RE.findall(part))
        return tokens

    def encode(self, given_text):
        """Convert ``given_text`` into a list of ids.

        Args:
            given_text: the string to encode.

        Returns:
            A list of ints: one id per special token and one id per
            character otherwise. Characters that are not in the vocabulary
            map to the ``<unk>`` id.
        """
        unk_index = self.char_to_index[self.unk_token]
        indices = []
        for token in self.pre_tokenize(given_text):
            if token in self.special_tokens_dict:
                indices.append(self.char_to_index.get(token, unk_index))
            else:
                ## extend in place: linear, unlike sum(list_of_lists, [])
                indices.extend(
                    self.char_to_index.get(char, unk_index) for char in token
                )
        return indices

    def decode(self, indices):
        """Convert an iterable of ids back into a string.

        Args:
            indices: iterable of ids.

        Returns:
            The concatenated tokens; an id that is not in the vocabulary
            is rendered as ``<unk>``.
        """
        return ''.join(
            self.index_to_char.get(index, self.unk_token) for index in indices
        )


In [2]:
## an example of what the joined, regex-escaped special tokens look like
import re

## escape each special token so the regex matches it literally,
## then join the alternatives with '|'
escaped_tokens = list(map(re.escape, ['<pad>', '<unk>', '<bos>', '<eos>']))
'|'.join(escaped_tokens)

'<pad>|<unk>|<bos>|<eos>'

In [3]:
## build the tokenizer; its character vocabulary comes from the text file at this path
tokenizer = character_new('/home/bobby/code-repo/astar-projects/project-smallville/data/input.txt')

In [4]:
## encode a sentence that contains a special token: '<pad>' becomes the single id 0
tokenizer.encode('hello <pad> world')

[51, 48, 55, 55, 58, 6, 0, 6, 66, 58, 61, 55, 47]

In [5]:
## round trip: '_' is not in the vocabulary, so it is encoded as <unk> and decoded as '<unk>'
tokenizer.decode(tokenizer.encode('hello <pad> world_'))

'hello <pad> world<unk>'

In [6]:
## inspect the token -> id mapping: character ids start at 5, special tokens use ids 0-4
tokenizer.vocab

{'\n': 5,
 ' ': 6,
 '!': 7,
 '$': 8,
 '&': 9,
 "'": 10,
 ',': 11,
 '-': 12,
 '.': 13,
 '3': 14,
 ':': 15,
 ';': 16,
 '?': 17,
 'A': 18,
 'B': 19,
 'C': 20,
 'D': 21,
 'E': 22,
 'F': 23,
 'G': 24,
 'H': 25,
 'I': 26,
 'J': 27,
 'K': 28,
 'L': 29,
 'M': 30,
 'N': 31,
 'O': 32,
 'P': 33,
 'Q': 34,
 'R': 35,
 'S': 36,
 'T': 37,
 'U': 38,
 'V': 39,
 'W': 40,
 'X': 41,
 'Y': 42,
 'Z': 43,
 'a': 44,
 'b': 45,
 'c': 46,
 'd': 47,
 'e': 48,
 'f': 49,
 'g': 50,
 'h': 51,
 'i': 52,
 'j': 53,
 'k': 54,
 'l': 55,
 'm': 56,
 'n': 57,
 'o': 58,
 'p': 59,
 'q': 60,
 'r': 61,
 's': 62,
 't': 63,
 'u': 64,
 'v': 65,
 'w': 66,
 'x': 67,
 'y': 68,
 'z': 69,
 '<pad>': 0,
 '<unk>': 1,
 '<bos>': 2,
 '<eos>': 3,
 '<mask>': 4}