# Theory in Code: Tokenizer

### Notebook Setup

In [None]:
# imports
import transformers


In [None]:
# Seed passed to train_test_split further down so the train/val split is reproducible.
SEED = 42

# Sample sentence fed to the GPT-2 and BERT tokenizers below.
example_text_to_tokenize = "This is an example sentence to be tokenized."

## Basic GPT-2 Tokenizer

In [None]:
# Load the pretrained "gpt2" tokenizer through the GPT-2-specific fast tokenizer class.
tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2")

# Alternative that loads the same tokenizer without naming the class explicitly:
# tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")

In [None]:
# Tokenize the example sentence; the result is indexed by "input_ids" in the next cell.
tokens = tokenizer(example_text_to_tokenize)
print(tokens)

In [None]:
# Decode the token ids back to text to check the round trip.
decoded_text = tokenizer.decode(tokens["input_ids"])
print(decoded_text)

In [None]:
# Print the tokenizer's special-tokens map.
print(tokenizer.special_tokens_map)

GPT-2 has only one true special token: `<|endoftext|>`


GPT-2 defines no padding token -> you can reuse `eos_token` for padding or add a custom `pad_token` as a special token.

In [None]:
# Get the vocab size from the tokenizer.
# NOTE(review): per the transformers docs, vocab_size counts only the base vocabulary,
# not tokens added afterwards (e.g. a custom pad token) — confirm, and use len(tokenizer)
# if added tokens must be included.
vocab_size = tokenizer.vocab_size
print(f"Vocab size: {vocab_size}")

In [None]:
## edge cases
# edge case 1: empty string (the input contains no characters at all)
print(tokenizer(""))

In [None]:
# edge case 2: string with only spaces (whitespace and nothing else)
print(tokenizer("     "))

In [None]:
# edge case 3: emojis and special characters
# one emoji (U+1F60A), one CJK character (U+4E24) and a lone newline
print("😊:", tokenizer("😊"))
print("两:", tokenizer("两"))
print("NewLine: ", tokenizer("\n"))

In [None]:
# edge case 4: a single very long English word (Supercalifragilisticexpialidocious)
print(tokenizer("Supercalifragilisticexpialidocious"))

In [None]:
# edge case 5: long German compound word
# (Donaudampfschifffahrtselektrizitätenhauptbetriebswerkbauunterbeamtengesellschaft)
# NOTE(review): the literal appears to carry soft hyphens (U+00AD) between the word's
# components; they are written as \u00ad escapes so the otherwise invisible characters
# are explicit. They are tokenized too — remove them to tokenize the bare word.
print(tokenizer("Donau\u00addampfschifffahrts\u00adelektrizitäten\u00adhauptbetriebswerk\u00adbauunterbeamten\u00adgesellschaft"))

In [None]:
# edge case 6: feed the tokenizer's own special-token strings back through it
# NOTE(review): per the markdown note above, GPT-2 has a single special token, so both
# lines presumably print the same encoding — confirm when running.
print(tokenizer(tokenizer.eos_token))
print(tokenizer(tokenizer.bos_token))

### Compare to other Tokenizers

In [None]:
# GPT-2 tokenizer: vocabulary size, special tokens, and encoding of the example sentence
print(tokenizer.vocab_size)
print(tokenizer.special_tokens_map)
print(tokenizer(example_text_to_tokenize))

In [None]:
# BERT tokenizer: the same three outputs as the GPT-2 cell above, for comparison
bert_tokenizer = transformers.BertTokenizerFast.from_pretrained("bert-base-uncased")
bert_tokens = bert_tokenizer(example_text_to_tokenize)
print(bert_tokenizer.vocab_size)
print(bert_tokenizer.special_tokens_map)
print(bert_tokens)

OpenAI Tokenizer: [OpenAI Platform Tokenizer Playground](https://platform.openai.com/tokenizer)

*A helpful rule of thumb is that one token generally corresponds to ~4 characters of text for common English text.*

## Example: NanoGPT

The full code from nanoGPT: [nanoGPT/data/openwebtext/prepare.py](https://github.com/karpathy/nanoGPT/blob/master/data/openwebtext/prepare.py)

In [None]:
import tiktoken
from datasets import load_dataset, Dataset, DatasetDict


In [None]:
# Define the tokenizer through tiktoken, using its "gpt2" encoding.
enc = tiktoken.get_encoding("gpt2")

In [None]:
# Non-streaming alternative, kept for reference:
# num_proc_load_dataset = 8 # number of processes to use for loading the dataset
# dataset = load_dataset("openwebtext", split="train[:5]", num_proc=num_proc_load_dataset)

## lighter variant with streaming, which does not load the whole dataset into memory
streamed_train = load_dataset("openwebtext", split="train", streaming=True)
# materialize only the first 100 streamed examples as a plain Python list
dataset = list(streamed_train.take(100))

In [None]:
# Since we just used the streaming variant, `dataset` is a plain list of examples and
# has to be converted back to a Dataset object (wrapped in a DatasetDict with a "train" split).
# The isinstance guard makes the cell safe to re-run: once `dataset` is already a
# DatasetDict, converting it again with Dataset.from_list would fail.
if isinstance(dataset, list):
    dataset = DatasetDict({
        "train": Dataset.from_list(dataset)
    })

In [None]:
# display the DatasetDict built in the previous cell
dataset

In [None]:
# shuffle with a fixed seed and hold out 5% of the examples
split_dataset = dataset["train"].train_test_split(
    test_size=0.05,
    seed=SEED,
    shuffle=True,
)

# the held-out split is created under the key 'test'; store it under 'val' instead
split_dataset['val'] = split_dataset.pop('test')

In [None]:
# display the resulting train/val splits
split_dataset

In [None]:
# nanoGPT function
def process(example):
    """Tokenize one dataset row and terminate it with the end-of-text token.

    Args:
        example: A dataset row; only its 'text' entry is read.

    Returns:
        A dict with 'ids' (the token ids followed by the EOT token) and
        'len' (the number of ids, EOT included).
    """
    # encode_ordinary ignores any special tokens that occur in the text
    token_ids = enc.encode_ordinary(example['text'])
    # close the document with the end-of-text token, e.g. 50256 for gpt2 bpe
    # (note carried over from nanoGPT: arguably EOT should be prepended, not appended)
    token_ids.append(enc.eot_token)
    return {'ids': token_ids, 'len': len(token_ids)}

# tokenize every split: 'text' is dropped, 'ids' and 'len' from process() take its place
tokenized = split_dataset.map(
    process,
    remove_columns=['text'],
    desc="tokenizing the splits",
    num_proc=8,
)

The `map` function is essentially a smarter for-each loop over the dataset:

```
for each example in dataset:
    new_example = process(example)
```

**Why map?**
- Preserves the dataset structure (keeps column names, features etc.)
- allows parallelization (`num_proc`)
- memory efficient
- supports batching (Better GPU utilization, Vectorized tokenization)
- designed for large-scale ML


**use map when working with datasets**


In [None]:
# show the first tokenized row of the train split, then of the validation split
for split_name in ('train', 'val'):
    print(tokenized[split_name][0])

In [None]:
# use map to get the text back from the tokenized dataset
def decode(example):
    """Turn one tokenized row back into text.

    Args:
        example: A dataset row; only its 'ids' entry is read.

    Returns:
        A dict with a single 'text' entry holding the decoded string.
    """
    return {'text': enc.decode(example['ids'])}

# decode every split: 'ids' and 'len' are dropped, 'text' from decode() takes their place
decoded = tokenized.map(
    decode,
    remove_columns=['ids', 'len'],
    desc="decoding the splits",
    num_proc=8
)

In [None]:
# show the first decoded row of the train split, then of the validation split
for split_name in ('train', 'val'):
    print(decoded[split_name][0])

## Structure Idea

Function head taken from `pikogpt/src/data/preprocessing.py`:

In [None]:
from transformers import GPT2TokenizerFast

def tokenize_documents(
    texts: list[str],
    tokenizer: GPT2TokenizerFast,
    batch_size: int = 1000,
    show_progress: bool = True
    ) -> list[list[int]]:
    """
    Tokenize documents with EOT token appended.

    Args:
        texts: List of text documents
        tokenizer: GPT-2 tokenizer
        batch_size: Number of documents to tokenize at once
        show_progress: Whether to show progress bar

    Returns:
        List of token ID lists (one per document)

    Note:
        Only the function head is shown in this notebook; there is no body here,
        so calling this definition as-is returns None instead of the documented list.
    """


## Project Stages Recommendations

- Example Code