In [2]:
import torch
import tiktoken

# Record the library versions this notebook was run with, one per line.
print(torch.__version__, tiktoken.__version__, sep="\n")

2.6.0+cu126
0.8.0


### Load the Tiny Shakespeare data

In [3]:
import os
# `import urllib` alone does not load the `request` submodule; importing it
# explicitly keeps this cell working on a fresh kernel regardless of which
# other libraries happen to have imported it already.
import urllib.request

remote_text_path = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
local_text_path = "./input.txt"

# Download the corpus only once; later runs reuse the local copy.
if not os.path.exists(local_text_path):
    urllib.request.urlretrieve(remote_text_path, local_text_path)

with open(local_text_path, 'r', encoding='utf-8') as f:
    raw_text = f.read()

raw_text[:1000]  # Display the first 1000 characters of the text

"First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you know Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us kill him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be done: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citizens, the patricians good.\nWhat authority surfeits on would relieve us: if they\nwould yield us but the superfluity, while it were\nwholesome, we might guess they relieved us humanely;\nbut they think we are too dear: the leanness that\nafflicts us, the object of our misery, is as an\ninventory to particularise their abundance; our\nsufferance is a gain to them Let us revenge this with\nour pikes, ere we become rakes: for the gods know I\nspeak this in hunger 

### Bytepair encoding

- GPT-2 uses byte-pair encoding (BPE) to break sentences down into tokens.
- Words that are not in the vocabulary are broken down into subword units, so no special UNK token is needed for unknown words. For example, a new word such as "unfamiliarword" can be tokenized as [unfam, iliar, word].
- The BPE implementation in the tiktoken library is written in Rust for computational performance.

In [4]:
# Load tiktoken's "gpt2" encoding; bound to `tokenizer` for the encode/decode calls that follow.
tokenizer = tiktoken.get_encoding("gpt2")

In [9]:
# Round-trip a sentence containing made-up words through the tokenizer.
sample_text = "Hello my name is Ajayaaaa unk unknasd asda"
encoded_text = tokenizer.encode(sample_text)
decoded_text = tokenizer.decode(encoded_text)

# Token ids on the first line, the reconstructed text on the second.
print(encoded_text, decoded_text, sep="\n")


[15496, 616, 1438, 318, 22028, 323, 24794, 555, 74, 555, 15418, 292, 67, 355, 6814]
Hello my name is Ajayaaaa unk unknasd asda


In [10]:
# Tokenize the whole corpus; this rebinds `encoded_text` to the ids of `raw_text`.
encoded_text = tokenizer.encode(raw_text)
# Print only the first 100 ids to keep the output short.
print(encoded_text[:100])

[5962, 22307, 25, 198, 8421, 356, 5120, 597, 2252, 11, 3285, 502, 2740, 13, 198, 198, 3237, 25, 198, 5248, 461, 11, 2740, 13, 198, 198, 5962, 22307, 25, 198, 1639, 389, 477, 12939, 2138, 284, 4656, 621, 284, 1145, 680, 30, 198, 198, 3237, 25, 198, 4965, 5634, 13, 12939, 13, 198, 198, 5962, 22307, 25, 198, 5962, 11, 345, 760, 327, 1872, 385, 1526, 28599, 318, 4039, 4472, 284, 262, 661, 13, 198, 198, 3237, 25, 198, 1135, 760, 470, 11, 356, 760, 470, 13, 198, 198, 5962, 22307, 25, 198, 5756, 514, 1494, 683, 11, 290, 356]


### Data sampling with a sliding window
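- Predict the target from the input text. The target is the input shifted by one token, so the label at each position is simply the next token. Once the causal attention mask is applied, each position can only attend to the tokens before it, so every position in a window acts as its own next-token prediction example.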

In [24]:
encoded_text = tokenizer.encode("This is a test")
context_size = 2

# A window starting at i needs `context_size` input tokens plus one more token
# so that the shifted target is also full length. That leaves exactly
# len(encoded_text) - context_size valid start positions (none if the text is
# too short), so no hard-coded iteration cap or length check is needed.
for i in range(len(encoded_text) - context_size):
    inpt = encoded_text[i:i+context_size]
    # The target is the same window shifted one token to the right.
    target = encoded_text[i+1:i+context_size+1]
    decoded_input = tokenizer.decode(inpt)
    decoded_target = tokenizer.decode(target)
    print(f"Input: {inpt}, Target: {target}")
    print(f"Decoded Input: {decoded_input}, Decoded Target: {decoded_target}")


Input: [1212, 318], Target: [318, 257]
Decoded Input: This is, Decoded Target:  is a
Input: [318, 257], Target: [257, 1332]
Decoded Input:  is a, Decoded Target:  a test


In [16]:
# Display the token ids of the sentence used in the sliding-window example.
tokenizer.encode("This is a test")

[1212, 318, 257, 1332]

### Train a tokenizer on the wikitext-2-raw-v1 dataset

In [1]:
from datasets import load_dataset

# Load the three WikiText-2 (raw) splits in train / validation / test order.
train_dataset, val_dataset, test_dataset = (
    load_dataset("wikitext", "wikitext-2-raw-v1", split=split_name)
    for split_name in ("train", "validation", "test")
)

# Show the training split summary (features and row count).
train_dataset

Dataset({
    features: ['text'],
    num_rows: 36718
})

In [4]:
import tokenizers

# Create a tokenizer around a fresh BPE model.
# NOTE: this rebinds `tokenizer`, which earlier cells bound to the tiktoken "gpt2" encoding.
tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE())
# Byte-level pre-tokenization, configured not to add a prefix space.
tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=False)

# Quick check of how the pre-tokenizer splits a short string (pieces with their offsets).
tokenizer.pre_tokenizer.pre_tokenize_str("Hello, world!")


[('Hello', (0, 5)), (',', (5, 6)), ('Ġworld', (6, 12)), ('!', (12, 13))]

In [19]:
import os

# Train the BPE model on the WikiText-2 training split with a 25k-token
# vocabulary, reserving "<|endoftext|>" as a special token.
trainer = tokenizers.trainers.BpeTrainer(vocab_size=25000, special_tokens=["<|endoftext|>"])
tokenizer.train_from_iterator(train_dataset["text"], trainer=trainer)

# Configure the full pipeline BEFORE saving: `save` serializes the tokenizer as
# it is at call time, so a decoder attached afterwards would be missing from
# tokenizer.json and a reloaded tokenizer would decode to raw byte-level symbols.
tokenizer.post_processor = tokenizers.processors.ByteLevel(trim_offsets=False)
tokenizer.decoder = tokenizers.decoders.ByteLevel()

# Make sure the output directory exists so the save does not fail on a fresh checkout.
os.makedirs("../data", exist_ok=True)
tokenizer.save("../data/tokenizer.json")

# Smoke test: encode a sample sentence with the trained tokenizer.
tokenizer.encode("Hello my name is Ajay")






Encoding(num_tokens=7, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [20]:
# Display the token strings produced for the sample sentence.
tokenizer.encode("Hello my name is Ajay").tokens

['H', 'ello', 'Ġmy', 'Ġname', 'Ġis', 'ĠAj', 'ay']

In [22]:
# Keep the encoding result in `encoding` so its attributes can be inspected.
encoding = tokenizer.encode("Hello my name is Ajay")
# Display the token ids of the encoded sentence.
encoding.ids

[40, 14979, 1668, 1221, 301, 18603, 288]