In [1]:
import re

In [2]:
sample_text = """The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet at least once, making it perfect for testing various algorithms and systems. 

Machine learning has revolutionized the way we approach complex problems in computer science. From natural language processing to computer vision, these algorithms can identify patterns in data that would be impossible for humans to detect manually.

The transformer architecture, introduced in the paper "Attention is All You Need," has become the foundation for modern language models. These models use self-attention mechanisms to process sequences of tokens, allowing them to understand context and relationships between words in a sentence.

Tokenization is a crucial preprocessing step in natural language processing. It involves breaking down text into smaller units called tokens, which can be words, subwords, or even individual characters. Different tokenization strategies have different advantages: word-level tokenization preserves semantic meaning but struggles with out-of-vocabulary words, while subword tokenization like BPE (Byte Pair Encoding) can handle rare words by breaking them into smaller pieces.

The development of large language models like GPT, BERT, and T5 has shown remarkable capabilities in understanding and generating human-like text. These models are trained on vast amounts of text data and can perform tasks ranging from translation to question answering to creative writing.

However, with great power comes great responsibility. The ethical implications of AI systems are becoming increasingly important as these technologies become more prevalent in society. Issues such as bias, fairness, transparency, and accountability must be carefully considered when deploying AI systems in real-world applications."""

print(sample_text)

The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet at least once, making it perfect for testing various algorithms and systems. 

Machine learning has revolutionized the way we approach complex problems in computer science. From natural language processing to computer vision, these algorithms can identify patterns in data that would be impossible for humans to detect manually.

The transformer architecture, introduced in the paper "Attention is All You Need," has become the foundation for modern language models. These models use self-attention mechanisms to process sequences of tokens, allowing them to understand context and relationships between words in a sentence.

Tokenization is a crucial preprocessing step in natural language processing. It involves breaking down text into smaller units called tokens, which can be words, subwords, or even individual characters. Different tokenization strategies have different advantages: word-level to

In [3]:
# Split text into words and punctuation (excluding whitespace)
tokens = re.findall(r'\b\w+\b|[^\w\s]', sample_text)  # Extract words and punctuation, exclude whitespace
unique_tokens = list(set(tokens))  # Get unique tokens
print(f"Number of unique tokens: {len(unique_tokens)}")
print(f"Total tokens: {len(tokens)}")

Number of unique tokens: 195
Total tokens: 304


In [4]:
print(unique_tokens)

['question', 'called', ')', 'T5', 'from', 'great', 'impossible', 'revolutionized', 'computer', 'with', 'on', 'perform', 'such', 'pieces', 'Encoding', 'allowing', 'of', 'models', 'introduced', 'language', 'transparency', 'least', 'considered', 'breaking', 'be', 'modern', 'has', 'contains', 'translation', 'writing', 'systems', 'various', 'Pair', 'society', 'words', 'transformer', 'increasingly', 'prevalent', 'would', 'These', 'is', 'ranging', 'jumps', 'answering', 'brown', 'handle', 'Attention', '.', 'fox', 'applications', 'The', 'sequences', 'crucial', 'meaning', 'fairness', 'or', 'generating', 'trained', 'detect', 'Issues', 'BPE', 'into', 'the', 'shown', 'capabilities', 'patterns', 'more', 'between', 'a', 'semantic', 'architecture', 'preprocessing', 'Different', 'strategies', 'word', 'like', 'these', 'to', 'for', 'understand', 'struggles', 'manually', 'becoming', 'lazy', 'and', 'human', 'important', 'as', 'but', 'humans', 'testing', 'characters', 'remarkable', 'subwords', 'processing',

In [5]:
vocabulary = {word:idx for idx, word in enumerate(unique_tokens)}
vocabulary

{'question': 0,
 'called': 1,
 ')': 2,
 'T5': 3,
 'from': 4,
 'great': 5,
 'impossible': 6,
 'revolutionized': 7,
 'computer': 8,
 'with': 9,
 'on': 10,
 'perform': 11,
 'such': 12,
 'pieces': 13,
 'Encoding': 14,
 'allowing': 15,
 'of': 16,
 'models': 17,
 'introduced': 18,
 'language': 19,
 'transparency': 20,
 'least': 21,
 'considered': 22,
 'breaking': 23,
 'be': 24,
 'modern': 25,
 'has': 26,
 'contains': 27,
 'translation': 28,
 'writing': 29,
 'systems': 30,
 'various': 31,
 'Pair': 32,
 'society': 33,
 'words': 34,
 'transformer': 35,
 'increasingly': 36,
 'prevalent': 37,
 'would': 38,
 'These': 39,
 'is': 40,
 'ranging': 41,
 'jumps': 42,
 'answering': 43,
 'brown': 44,
 'handle': 45,
 'Attention': 46,
 '.': 47,
 'fox': 48,
 'applications': 49,
 'The': 50,
 'sequences': 51,
 'crucial': 52,
 'meaning': 53,
 'fairness': 54,
 'or': 55,
 'generating': 56,
 'trained': 57,
 'detect': 58,
 'Issues': 59,
 'BPE': 60,
 'into': 61,
 'the': 62,
 'shown': 63,
 'capabilities': 64,
 'patte

In [6]:
def preprocess_input(text):
    return re.findall(r'\b\w+\b|[^\w\s]', text)

In [7]:
def encoder(input):
    preprocessed_input = preprocess_input(input)
    encoded_tokens = []
    for token in preprocessed_input:
        encoded_tokens.append(vocabulary[token])
    return encoded_tokens

In [8]:
encoder("However understanding learning or")

[152, 124, 160, 55]

In [9]:
# Create reverse mapping from vocabulary
int_to_string_mapper = {idx: word for word, idx in vocabulary.items()}
int_to_string_mapper

{0: 'question',
 1: 'called',
 2: ')',
 3: 'T5',
 4: 'from',
 5: 'great',
 6: 'impossible',
 7: 'revolutionized',
 8: 'computer',
 9: 'with',
 10: 'on',
 11: 'perform',
 12: 'such',
 13: 'pieces',
 14: 'Encoding',
 15: 'allowing',
 16: 'of',
 17: 'models',
 18: 'introduced',
 19: 'language',
 20: 'transparency',
 21: 'least',
 22: 'considered',
 23: 'breaking',
 24: 'be',
 25: 'modern',
 26: 'has',
 27: 'contains',
 28: 'translation',
 29: 'writing',
 30: 'systems',
 31: 'various',
 32: 'Pair',
 33: 'society',
 34: 'words',
 35: 'transformer',
 36: 'increasingly',
 37: 'prevalent',
 38: 'would',
 39: 'These',
 40: 'is',
 41: 'ranging',
 42: 'jumps',
 43: 'answering',
 44: 'brown',
 45: 'handle',
 46: 'Attention',
 47: '.',
 48: 'fox',
 49: 'applications',
 50: 'The',
 51: 'sequences',
 52: 'crucial',
 53: 'meaning',
 54: 'fairness',
 55: 'or',
 56: 'generating',
 57: 'trained',
 58: 'detect',
 59: 'Issues',
 60: 'BPE',
 61: 'into',
 62: 'the',
 63: 'shown',
 64: 'capabilities',
 65: 'p

In [10]:
def decoder(ids):
    """Decode token indices back to text"""
    # Convert indices to tokens
    tokens = [int_to_string_mapper[id] for id in ids if id in int_to_string_mapper]
    
    # Post-processing: reconstruct text with proper spacing
    result = ""
    for i, token in enumerate(tokens):
        # Add space before words (but not before punctuation)
        if i > 0 and token.isalnum() and tokens[i-1].isalnum():
            result += " "
        result += token
    
    return result

In [11]:
decoder([0, 6, 7, 186])

'question impossible revolutionized involves'

Obviously, the drawback here is that we are not able to handle out-of-vocabulary words. Let's package this neatly into a class and also add support for out-of-vocabulary words. 

In [12]:
class Tokenizer:
    def __init__(self, vocabulary):
        """Initialize the tokenizer with a given vocabulary mapping.
        Parameters
        ----------
        vocabulary : dict
            A dictionary mapping tokens (str) to unique integer IDs.
        Adds an <_unkown_> token to handle out-of-vocabulary terms.
        """
        self.word_to_id = vocabulary
        self.id_to_word = {idx:word for word,idx in vocabulary.items()}
        # let's add a <_unkown_> token to our mappers
        self.word_to_id["<_unkown_>"] = len(vocabulary) + 1
        self.id_to_word[len(vocabulary) + 1] = "<_unkown_>"
    
    def preprocess_input(self, text):
        """Tokenize input text into a list of words and punctuation.
        Whitespace is ignored. This uses the same regex pattern as the
        vocabulary builder to ensure consistency.
        Parameters
        ----------
        text : str
            Raw input string to be tokenized.
        Returns
        -------
        list[str]
            List of tokens (words and punctuation).
        """
        return re.findall(r'\b\w+\b|[^\w\s]', text) 
    
    def postprocess_text(self, tokens):
        """Reconstruct text from a list of tokens.
        Inserts spaces between consecutive alphanumeric tokens while keeping
        punctuation attached to the preceding token.
        Parameters
        ----------
        tokens : list[str]
            List of tokens produced by the tokenizer.
        Returns
        -------
        str
            Human-readable string rebuilt from the tokens.
        """
        result = ""
        for i, token in enumerate(tokens):
        # Add space before words (but not before punctuation)
            if i > 0 and token.isalnum() and tokens[i-1].isalnum():
                result += " "
            result += token
        return result
                
    def encoder(self, input):
        """Convert raw text to a list of token IDs.
        Any token not present in the vocabulary is mapped to the
        <_unknown_> token ID.
        Parameters
        ----------
        input : str
            Raw input string.
        Returns
        -------
        list[int]
            Sequence of token IDs corresponding to the input text.
        """
        preprocessed_input = self.preprocess_input(input)
        encoded_tokens = []
        for token in preprocessed_input:
            if token not in self.word_to_id:
                encoded_tokens.append(self.word_to_id["<_unkown_>"])
            else:
                encoded_tokens.append(self.word_to_id[token])
        return encoded_tokens
    
    def decoder(self, ids):
        """Convert a sequence of token IDs back to human-readable text.
        Parameters
        ----------
        ids : list[int]
            Sequence of token IDs.
        Returns
        -------
        str
            Decoded text.
        """
        tokens = [self.id_to_word[id] for id in ids if id in self.id_to_word]
        # Post-processing: reconstruct text with proper spacing
        return self.postprocess_text(tokens)

In [14]:
tokenizer = Tokenizer(vocabulary)

test_text = "However understanding learning or"
encoded = tokenizer.encoder(test_text)
print(f"Original text: {test_text}")
print(f"Encoded IDs: {encoded}")

decoded = tokenizer.decoder(encoded)
print(f"Decoded text: {decoded}")

oov_text = "Machine learning with Python programming"
encoded_oov = tokenizer.encoder(oov_text)
print(f"\nOut-of-vocabulary test: {oov_text}")
print(f"Encoded IDs: {encoded_oov}")
decoded_oov = tokenizer.decoder(encoded_oov)
print(f"Decoded text: {decoded_oov}")

full_text = "The quick brown fox jumps over the lazy dog."
print(f"\nFull round-trip test:")
print(f"Original: {full_text}")
encoded_full = tokenizer.encoder(full_text)
print(f"Encoded: {encoded_full}")
decoded_full = tokenizer.decoder(encoded_full)
print(f"Decoded: {decoded_full}")


Original text: However understanding learning or
Encoded IDs: [152, 124, 160, 55]
Decoded text: However understanding learning or

Out-of-vocabulary test: Machine learning with Python programming
Encoded IDs: [117, 160, 9, 197, 197]
Decoded text: Machine learning with<_unkown_><_unkown_>

Full round-trip test:
Original: The quick brown fox jumps over the lazy dog.
Encoded: [50, 176, 44, 48, 42, 122, 62, 83, 148, 47]
Decoded: The quick brown fox jumps over the lazy dog.
