In [None]:
# LLM FROM SCRATCH

In [None]:
# Building a Byte Pair Encoding (BPE) Tokenizer from scratch

In [None]:
# Step 1: Prepare training data
# Note: here we are NOT training a neural network.
# We are just creating a list of tokens from text.
#
# What we need: a text corpus.
# The tokenizer will learn "merge rules" based on how often pairs of characters appear.
#
# Example:
#   "i"   -> token 1
#   "s"   -> token 2
#   "is"  -> token 3
#
# How it works:
# 1. Start with text split into individual characters.
#    (every character is always in the vocabulary)
# 2. Count how often each pair of adjacent symbols occurs.
# 3. Merge the most frequent pair into a new token (subword), then repeat.
# 4. Over time, the tokenizer builds a vocabulary that mixes
#    single characters + useful subword tokens.
#
# Important:
# - Characters always remain as fallback tokens.
# - Subwords get priority when tokenizing, making it more efficient.
#
# Concretely:
# - We scan the text and count how many times each pair of characters appears.
# - For example, if the pair "is" appears very often, we create a new token for it.
# - This reduces computation: instead of processing "i" and "s" separately,
#   we can treat "is" as a single token.
# - Note: "i" and "s" still remain in the vocabulary as individual tokens,
#   but whenever the pair "is" exists, it takes priority over the single characters.
#
# Iterative merges:
# - The process continues on top of previously created tokens.
# - For example, if "is" was already merged into a token, and we notice "his"
#   appears frequently, we merge "h" + "is" → "his".
# - Next, if "this" is common, we merge "t" + "his" → "this".
# - This way, the vocabulary gradually grows from characters → subwords → whole words,
#   depending on frequency in the training text.

In [None]:
# Toy training corpus: a list of short sentences, one string per document.
# The tokenizer's merge rules are learned from the symbol-pair frequencies
# found in this text.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

In [11]:
# Show the header followed by every document on its own line.
print("Corpus: ", *corpus, sep="\n")

Corpus: 
This is the first document.
This document is the second document.
And this is the third one.
Is this the first document?


In [None]:
# Step 2: Initialize vocabulary with unique characters
#
# The first version of our vocabulary is simply all the unique characters
# that appear in the training corpus.
# Each character will be treated as an initial token.
#
# In addition, we add a special end-of-word marker (</w>).
# This marker helps the tokenizer know where words end, so that
# frequent whole words or subwords can be merged properly later.
#
# Example:
#   "this"  →  ["t", "h", "i", "s", "</w>"]
#   "is"    →  ["i", "s", "</w>"]
#
# Gather every distinct character that occurs anywhere in the corpus.
# Note: the space character is collected as well, even though the
# pre-tokenization step splits on spaces and never emits it as a symbol.
unique_chars = set()
for doc in corpus:
    unique_chars.update(doc)

# Sorting gives the vocabulary a stable, reproducible order
# (set iteration order is not guaranteed).
vocab = sorted(unique_chars)

# The end-of-word marker is a single special symbol placed after the characters.
end_of_word = "</w>"
vocab.append(end_of_word)

In [14]:
# Report the starting vocabulary and how many symbols it contains.
print("Initial Vocabulary:", vocab, f"Vocabulary Size: {len(vocab)}", sep="\n")

Initial Vocabulary:
[' ', '.', '?', 'A', 'I', 'T', 'c', 'd', 'e', 'f', 'h', 'i', 'm', 'n', 'o', 'r', 's', 't', 'u', '</w>']
Vocabulary Size: 20


In [22]:
# Step 3: Pre-tokenize the corpus
#
# Goal:
# - Split text into words (by spaces, for simplicity).
# - Break each word into its characters.
# - Add the special end-of-word token (</w>) at the end of every word.
#
# Why?
# - This gives us the initial representation of words as sequences of characters.
# - Example: "This" → ("T", "h", "i", "s", "</w>")
#
# Implementation details:
# - We store each word as a tuple of characters (immutable).
#   Tuples can be used as dictionary keys, unlike lists.
# - We count how many times each word (as a sequence of characters) appears
#   in the whole corpus.
#
# Note:
# - Because every word is processed on its own, merges can never span a
#   word boundary. The </w> token additionally records where a word ends:
#   "document" → ("d", "o", "c", "u", "m", "e", "n", "t", "</w>")
#   This lets the tokenizer tell a word-final "t" (followed by "</w>")
#   apart from a "t" that occurs in the middle of a word.
#
# Word-frequency table: each key is a word spelled out as a tuple of single
# characters followed by the end-of-word marker; each value is the number of
# times that exact word form occurs in the corpus.
# Only spaces separate words, so punctuation stays attached to its word
# (e.g. "document." and "document" end up as different keys).
word_splits = {}
for doc in corpus:
    # Splitting on a single space can produce empty strings (for instance
    # with consecutive spaces); filter(None, ...) drops them.
    for word in filter(None, doc.split(' ')):
        # Tuples are hashable, so they can serve as dictionary keys.
        word_tuple = (*word, end_of_word)
        word_splits[word_tuple] = word_splits.get(word_tuple, 0) + 1

print("\nPre-tokenized Word Frequencies:")
print(word_splits)


Pre-tokenized Word Frequencies:
{('T', 'h', 'i', 's', '</w>'): 2, ('i', 's', '</w>'): 3, ('t', 'h', 'e', '</w>'): 4, ('f', 'i', 'r', 's', 't', '</w>'): 2, ('d', 'o', 'c', 'u', 'm', 'e', 'n', 't', '.', '</w>'): 2, ('d', 'o', 'c', 'u', 'm', 'e', 'n', 't', '</w>'): 1, ('s', 'e', 'c', 'o', 'n', 'd', '</w>'): 1, ('A', 'n', 'd', '</w>'): 1, ('t', 'h', 'i', 's', '</w>'): 2, ('t', 'h', 'i', 'r', 'd', '</w>'): 1, ('o', 'n', 'e', '.', '</w>'): 1, ('I', 's', '</w>'): 1, ('d', 'o', 'c', 'u', 'm', 'e', 'n', 't', '?', '</w>'): 1}


In [25]:
# Step 4: Count symbol pair frequencies
#
# Goal:
# - Take the dictionary of word splits we created (word_splits).
# - For each word, look at all adjacent pairs of symbols.
# - Count how many times each pair appears across the entire corpus.
#
# Example:
#   Input: {("T", "h", "i", "s", "</w>"): 2}
#   Output: {("T", "h"): 2, ("h", "i"): 2, ("i", "s"): 2, ("s", "</w>"): 2}

import collections 

def get_pair_state(splits):
    """
    Tally how often each pair of neighbouring symbols occurs, weighted by
    the frequency of the word it appears in.

    Args:
        splits: dict mapping a word (a tuple of symbols) to the number of
            times that word occurs.

    Returns:
        A collections.defaultdict(int) mapping (left_symbol, right_symbol)
        to the summed frequency over every position where the pair occurs.

    Example:
        Input:  {('T', 'h', 'i', 's', '</w>'): 2, ('i', 's', '</w>'): 3}
        Output: {('T', 'h'): 2, ('h', 'i'): 2, ('i', 's'): 5, ('s', '</w>'): 5}
    """
    # defaultdict(int) supplies 0 for a key it has not seen yet, so the "+="
    # below works without first checking whether the pair is already present
    # (a plain dict would raise KeyError instead).
    pair_totals = collections.defaultdict(int)

    for word, word_freq in splits.items():
        symbols = tuple(word)
        # Zipping the sequence with itself shifted by one yields every
        # adjacent pair; a word with fewer than two symbols yields none.
        for left, right in zip(symbols, symbols[1:]):
            # Every occurrence of the word contributes one occurrence of the pair.
            pair_totals[(left, right)] += word_freq
    return pair_totals
