Download the BookCorpus dataset. Take every 7-th sample (the indices are multiple of 7:[0,7,14,21,...]) from the entire dataset. This will result in a dataset with 10 million samples (exactly, 10,572,033). Use these samples to build a tokenizer with the BPE tokenization algorithm by varying the vocabulary size.

Normalizer: LowerCase
PreTokenizer: WhiteSpace
Model: BPE
Special tokens: [GO],[UNK],[PAD],[EOS]
PostProcessing: None

Tokenize the input text: “SEBI study finds 93% of individual F&O traders made losses between FY22 and FY24.” using the following configurations.

Question 1 - Keep the vocabulary size at 5000 and tokenize the input text using the learned vocabulary. Choose the number of tokens returned by the tokenizer.

In [1]:
!pip install datasets tokenizers
from datasets import load_dataset
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, processors
from tokenizers.normalizers import Lowercase
import re
dataset = load_dataset("bookcorpus", split='train')
indices = range(0, len(dataset), 7)
samples = [dataset[i]['text'] for i in indices]
tokenizer = Tokenizer(models.BPE())
tokenizer.normalizer = Lowercase()
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
special_tokens = ["[GO]", "[UNK]", "[PAD]", "[EOS]"]
tokenizer.add_special_tokens(special_tokens)
trainer = trainers.BpeTrainer(
    vocab_size=5000,
    special_tokens=special_tokens,
    continuing_subword_prefix="##"
)
tokenizer.train_from_iterator(samples, trainer=trainer)
input_text = "SEBI study finds 93% of individual F&O traders made losses between FY22 and FY24."
encoding = tokenizer.encode(input_text)
print(f"Tokens: {encoding.tokens}")
print(f"Number of tokens: {len(encoding.tokens)}")

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/18.5k [00:00<?, ?B/s]

bookcorpus.py:   0%|          | 0.00/3.25k [00:00<?, ?B/s]

The repository for bookcorpus contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/bookcorpus.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/74004228 [00:00<?, ? examples/s]

Tokens: ['se', 'bi', 'study', 'find', 's', '9', '3', '%', 'of', 'indi', 'vid', 'ual', 'f', '&', 'o', 'tr', 'ad', 'ers', 'made', 'lo', 'sses', 'between', 'fy', '2', '2', 'and', 'fy', '2', '4', '.']
Number of tokens: 30


Question 2 - Increase the vocabulary size to 10K, 15K and 32K. For each case, tokenize the same input with the newly learned vocabulary. Choose all the correct statements.

In [4]:
def train_tokenizer(vocab_size, samples):
    tokenizer = Tokenizer(models.BPE())
    tokenizer.normalizer = Lowercase()
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    special_tokens = ["[GO]", "[UNK]", "[PAD]", "[EOS]"]
    tokenizer.add_special_tokens(special_tokens)
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=special_tokens,
        continuing_subword_prefix="##"
    )
    tokenizer.train_from_iterator(samples, trainer=trainer)
    return tokenizer

input_text = "SEBI study finds 93% of individual F&O traders made losses between FY22 and FY24."
vocab_sizes = [10000, 15000, 32000]
for vocab_size in vocab_sizes:
    print(f"\n---- Tokenizing with vocabulary size {vocab_size} ----")
    tokenizer = train_tokenizer(vocab_size, samples)
    encoding = tokenizer.encode(input_text)
    print(f"Tokens: {encoding.tokens}")
    print(f"Number of tokens: {len(encoding.tokens)}")


---- Tokenizing with vocabulary size 10000 ----
Tokens: ['seb', '##i', 'study', 'finds', '9', '##3', '%', 'of', 'individual', 'f', '&', 'o', 'trad', '##ers', 'made', 'loss', '##es', 'between', 'f', '##y', '##2', '##2', 'and', 'f', '##y', '##2', '##4', '.']
Number of tokens: 28

---- Tokenizing with vocabulary size 15000 ----
Tokens: ['seb', '##i', 'study', 'finds', '9', '##3', '%', 'of', 'individual', 'f', '&', 'o', 'trad', '##ers', 'made', 'loss', '##es', 'between', 'f', '##y', '##2', '##2', 'and', 'f', '##y', '##2', '##4', '.']
Number of tokens: 28

---- Tokenizing with vocabulary size 32000 ----
Tokens: ['seb', '##i', 'study', 'finds', '9', '##3', '%', 'of', 'individual', 'f', '&', 'o', 'traders', 'made', 'losses', 'between', 'f', '##y', '##22', 'and', 'f', '##y', '##2', '##4', '.']
Number of tokens: 25


Question 3 -
Download the pre-trained tokenizer file “hopper.json” used in the lecture, from here . The tokenizer was trained on all 70 million samples in the BookCorpus dataset. Tokenize the same input text using this “hopper” tokenizer. How many tokens are there?

In [5]:
tokenizer = Tokenizer.from_file("/content/hopper.json")
input_text = "SEBI study finds 93% of individual F&O traders made losses between FY22 and FY24."
encoding = tokenizer.encode(input_text)
print(f"Tokens: {encoding.tokens}")
print(f"Number of tokens: {len(encoding.tokens)}")

Tokens: ['seb', '##i', 'study', 'finds', '9', '##3', '%', 'of', 'individual', 'f', '&', 'o', 'traders', 'made', 'losses', 'between', 'f', '##y', '##22', 'and', 'f', '##y', '##2', '##4', '.']
Number of tokens: 25


Question 4 - Suppose we know that the acronym “FY” will likely appear very frequently in most of the input text (assume the text comes from the financial domain). Therefore, we hope that adding it manually to the vocabulary might help. Add the token “FY” to the vocabulary and tokenize (use the Hopper tokenizer) the input text. Enter the number of tokens produced.

In [6]:
# tokenizer = Tokenizer.from_file("/content/hopper.json")
tokenizer.add_tokens(["FY"])
input_text = "SEBI study finds 93% of individual F&O traders made losses between FY22 and FY24."
encoding = tokenizer.encode(input_text)
print(f"Tokens: {encoding.tokens}")
print(f"Number of tokens: {len(encoding.tokens)}")

Tokens: ['seb', '##i', 'study', 'finds', '9', '##3', '%', 'of', 'individual', 'f', '&', 'o', 'traders', 'made', 'losses', 'between', 'fy', '22', 'and', 'fy', '24', '.']
Number of tokens: 22


Question 5 - Load the “bert-base-uncased” and "gpt2” tokenizers (use AutoTokenizer function from transformers). Which of the following special tokens are used in these tokenizers?

In [7]:
from transformers import AutoTokenizer
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")
special_tokens = ["[GO]", "[CLS]", "[BOS]", "[SEP]", "[<|endoftext|>]"]
bert_special_tokens = {token: token in bert_tokenizer.all_special_tokens for token in special_tokens}
gpt2_special_tokens = {token: token in gpt2_tokenizer.all_special_tokens for token in special_tokens}
print("Special tokens used in 'bert-base-uncased' tokenizer:")
for token, is_used in bert_special_tokens.items():
    print(f"{token}: {'Yes' if is_used else 'No'}")

print("\nSpecial tokens used in 'gpt2' tokenizer:")
for token, is_used in gpt2_special_tokens.items():
    print(f"{token}: {'Yes' if is_used else 'No'}")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Special tokens used in 'bert-base-uncased' tokenizer:
[GO]: No
[CLS]: Yes
[BOS]: No
[SEP]: Yes
[<|endoftext|>]: No

Special tokens used in 'gpt2' tokenizer:
[GO]: No
[CLS]: No
[BOS]: No
[SEP]: No
[<|endoftext|>]: No


Question 6 - By now, we have four tokenizers.

1. Custom tokenizer (vocab size 32K, trained on 10 million samples)
2. bert-base-uncased
3. gpt2
4. hopper

Use these four tokenizers to count the number of tokens for the entire “imdb” dataset (drop the “unsupervised” part of the dataset). Enter the tokenizers in order such that the size of the dataset (measured in tokens) as returned by the tokenizers is in ascending order. For example, if the first tokenizer yields the smallest number of tokens and the fourth tokenizer yields the largest, you would enter 1234 (without any spaces).”

In [9]:
!pip install datasets transformers tokenizers

from datasets import load_dataset
from transformers import AutoTokenizer
from tokenizers import Tokenizer
import numpy as np

imdb_dataset = load_dataset("imdb", split="train")

def load_custom_tokenizer():
    return Tokenizer.from_file("/content/hopper.json")
custom_tokenizer = load_custom_tokenizer()
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")
hopper_tokenizer = Tokenizer.from_file("/content/hopper.json")
tokenizers = [
    ("Custom Tokenizer (32K)", custom_tokenizer),
    ("BERT (bert-base-uncased)", bert_tokenizer),
    ("GPT-2", gpt2_tokenizer),
    ("Hopper", hopper_tokenizer)
]
def count_tokens(tokenizer, dataset):
    total_tokens = 0
    for sample in dataset:
        text = sample["text"]
        if isinstance(tokenizer, Tokenizer):
            encoding = tokenizer.encode(text)
            total_tokens += len(encoding.tokens)
        else:
            encoding = tokenizer.encode(text)
            total_tokens += len(encoding)

    return total_tokens
token_counts = []
for name, tokenizer in tokenizers:
    print(f"Counting tokens for {name}...")
    token_count = count_tokens(tokenizer, imdb_dataset)
    token_counts.append((name, token_count))
token_counts.sort(key=lambda x: x[1])
tokenizer_order = "".join([str(i+1) for i, _ in enumerate(token_counts)])
print(f"\nOrder of tokenizers based on token count (ascending): {tokenizer_order}")

Counting tokens for Custom Tokenizer (32K)...


Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors


Counting tokens for BERT (bert-base-uncased)...


Token indices sequence length is longer than the specified maximum sequence length for this model (1168 > 1024). Running this sequence through the model will result in indexing errors


Counting tokens for GPT-2...
Counting tokens for Hopper...

Order of tokenizers based on token count (ascending): 1234
