In [1]:
import torch
from datasets import Dataset, load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.tools import EncodingVisualizer
from tokenizers.trainers import WordLevelTrainer
from torch.utils.data import DataLoader

## Load dataset

https://huggingface.co/datasets/Salesforce/wikitext

https://huggingface.co/docs/transformers/perplexity#example-calculating-perplexity-with-gpt-2-in--transformers


In [2]:
ds = load_dataset("wikitext", "wikitext-2-raw-v1")

ds

Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at /home/mathewshen/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Mon Feb  3 22:19:08 2025).


DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

## Dataset info

In [3]:
type(ds)

datasets.dataset_dict.DatasetDict

In [4]:
isinstance(ds, dict)

True

In [5]:
ds.keys()

dict_keys(['test', 'train', 'validation'])

In [6]:
type(ds["train"])

datasets.arrow_dataset.Dataset

In [7]:
ds["train"].info

DatasetInfo(description='', citation='', homepage='', license='', features={'text': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, builder_name='parquet', dataset_name='wikitext', config_name='wikitext-2-raw-v1', version=0.0.0, splits={'test': SplitInfo(name='test', num_bytes=1305088, num_examples=4358, shard_lengths=None, dataset_name='wikitext'), 'train': SplitInfo(name='train', num_bytes=11061717, num_examples=36718, shard_lengths=None, dataset_name='wikitext'), 'validation': SplitInfo(name='validation', num_bytes=1159288, num_examples=3760, shard_lengths=None, dataset_name='wikitext')}, download_checksums={'hf://datasets/wikitext@b08601e04326c79dfdd32d625aee71d232d685c3/wikitext-2-raw-v1/test-00000-of-00001.parquet': {'num_bytes': 732610, 'checksum': None}, 'hf://datasets/wikitext@b08601e04326c79dfdd32d625aee71d232d685c3/wikitext-2-raw-v1/train-00000-of-00001.parquet': {'num_bytes': 6357543, 'checksum': None}, 'hf://datasets/wikitext@b08601e04326c79dfdd

## EDA

In [8]:
ds["train"][:3]

{'text': ['', ' = Valkyria Chronicles III = \n', '']}

In [9]:
def get_split_word_info(dataset, split):
    full_text = " ".join(dataset[split]["text"])
    words = full_text.split(" ")
    words_set = set(words)
    return len(words), len(words_set)

In [10]:
get_split_word_info(ds, "train")

(2112444, 76618)

In [11]:
get_split_word_info(ds, "validation")

(220111, 19884)

In [12]:
get_split_word_info(ds, "test")

(248464, 21143)

## Torch

In [13]:
training_dataloader = DataLoader(
    ds["train"],
    batch_size=8,
    num_workers=4,
)

training_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7fb1ae5bbe00>

In [14]:
for batch in training_dataloader:
    print(batch)
    break

{'text': ['', ' = Valkyria Chronicles III = \n', '', ' Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . \n', " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game m

In [15]:
len(batch["text"])

8

## Tokenization

In [16]:
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))

tokenizer

Tokenizer(version="1.0", truncation=None, padding=None, added_tokens=[], normalizer=None, pre_tokenizer=None, post_processor=None, decoder=None, model=WordLevel(vocab={}, unk_token="[UNK]"))

In [17]:
tokenizer.pre_tokenizer = Whitespace()

In [18]:
trainer = WordLevelTrainer(vocab_size=20000, min_frequency=3, special_tokens=["[UNK]"])

In [19]:
tokenizer.train_from_iterator(ds["train"]["text"], trainer)

In [20]:
text = "Hello, nice to meet you"

EncodingVisualizer(tokenizer)(text)

In [21]:
output = tokenizer.encode(text)

output.tokens

['[UNK]', ',', 'nice', 'to', 'meet', 'you']

In [22]:
output.ids

[0, 2, 6620, 7, 1609, 324]

In [23]:
output.type_ids

[0, 0, 0, 0, 0, 0]

In [24]:
tokenizer.get_vocab_size()

20000

In [25]:
# tokenizer.save('tokenizer-wikitext-2-raw-v1-train.json')

## Transformation

In [32]:
def text_to_token_ids(text: str, tokenizer: Tokenizer) -> list[int]:
    return {"ids": tokenizer.encode(text).ids}


text_to_token_ids(text="Hello, World!", tokenizer=tokenizer)

{'ids': [0, 2, 181, 367]}

In [40]:
def dataset_split_text_concat(dataset: Dataset, split: str) -> str:
    full_text = " ".join(dataset[split]["text"])
    return full_text


len(dataset_split_text_concat(dataset=ds, split="train"))

10929707

In [43]:
token_ids = tokenizer.encode(dataset_split_text_concat(dataset=ds, split="train")).ids

In [48]:
len(token_ids)

2074935

In [55]:
token_ids_tensor = torch.tensor(token_ids).unfold(0, 5, 5)

token_ids_tensor

tensor([[    9,  3922,  4421,   852,     9],
        [    0,   126,  3922,    89,    43],
        [    0,  4421,    23,   755,    43],
        ...,
        [ 2455,    70, 13989,  2089,     2],
        [   29,   146,   299,    35,   398],
        [   17,    31,  2493,  5462,     3]])