<a href="https://colab.research.google.com/github/alexlinapp/proofLLM/blob/main/tokenization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tiktoken



In [2]:
from importlib.metadata import version
import re
import tiktoken
print("torch version:", version("torch"))
print("tiktoken version:", version("tiktoken"))
print("matplotlib version:", version("matplotlib"))
print("numpy version:", version("numpy"))

torch version: 2.6.0+cu124
tiktoken version: 0.9.0
matplotlib version: 3.10.0
numpy version: 2.0.2


In [3]:
import os
import urllib.request
if not os.path.exists("the-verdict.txt"):
    url = ("https://raw.githubusercontent.com/rasbt/"
           "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
           "the-verdict.txt")
    file_path = "the-verdict.txt"
    urllib.request.urlretrieve(url, file_path)

In [4]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
    print(raw_text[:99])
    pre_process = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
    pre_process = [item.strip() for item in pre_process if item.strip()]
    # print(pre_process)
    # print(len(pre_process))
    # print(pre_process[:30])
    all_words = sorted(set(pre_process))
    vocab_size = len(all_words)
    print(vocab_size)
    vocab = {token:integer for integer, token in enumerate(all_words)}
    print(vocab)
    print(all_words)
    all_words.extend(["<|endoftext|>", "<|unk|>"])
    vocab = {token:integer for integer, token in enumerate(all_words)}
    print(type(vocab.items()))

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 
1130
{'!': 0, '"': 1, "'": 2, '(': 3, ')': 4, ',': 5, '--': 6, '.': 7, ':': 8, ';': 9, '?': 10, 'A': 11, 'Ah': 12, 'Among': 13, 'And': 14, 'Are': 15, 'Arrt': 16, 'As': 17, 'At': 18, 'Be': 19, 'Begin': 20, 'Burlington': 21, 'But': 22, 'By': 23, 'Carlo': 24, 'Chicago': 25, 'Claude': 26, 'Come': 27, 'Croft': 28, 'Destroyed': 29, 'Devonshire': 30, 'Don': 31, 'Dubarry': 32, 'Emperors': 33, 'Florence': 34, 'For': 35, 'Gallery': 36, 'Gideon': 37, 'Gisburn': 38, 'Gisburns': 39, 'Grafton': 40, 'Greek': 41, 'Grindle': 42, 'Grindles': 43, 'HAD': 44, 'Had': 45, 'Hang': 46, 'Has': 47, 'He': 48, 'Her': 49, 'Hermia': 50, 'His': 51, 'How': 52, 'I': 53, 'If': 54, 'In': 55, 'It': 56, 'Jack': 57, 'Jove': 58, 'Just': 59, 'Lord': 60, 'Made': 61, 'Miss': 62, 'Money': 63, 'Monte': 64, 'Moon-dancers': 65, 'Mr': 66, 'Mrs': 67, 'My': 68, 'Never': 69, 'No': 70, 'Now': 71, 'Nutley': 72, 'Of': 73, 'Oh': 74, 'On': 75

In [5]:
import re # python's regex library (regular expression library)
text = "hello world. This is a some raw text"
result = re.split(r'(\s)', text)
print(result)
result = re.split(r'([,.] | \s)', text)
print(result)
result = [item for item in result if item.strip()]
print(result)
text = "hello world. This is test-- hi?"
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
result = [item for item in result if item.strip()]
print(result)

['hello', ' ', 'world.', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'some', ' ', 'raw', ' ', 'text']
['hello world', '. ', 'This is a some raw text']
['hello world', '. ', 'This is a some raw text']
['hello', 'world', '.', 'This', 'is', 'test', '--', 'hi', '?']


In [6]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab;
        self.int_to_str = {i:s for s,i in vocab.items()}
    def encode(self, text):
      preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
      preprocessed = [item for item in preprocessed if item.strip()]
      return [self.str_to_int[item] for item in preprocessed]

    def decode(self, integers):
      text = " ".join([self.int_to_str[item] for item in integers])
      text = re.sub(r'\s+([,.?!"()\'])', r'\1', text) # replace " ," etc with capture group, so only with ","
      return text
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab;
        self.int_to_str = {i:s for s,i in vocab.items()}
    def encode(self, text):
      preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
      preprocessed = [item for item in preprocessed if item.strip()]
      preprocessed = [item if item in self.str_to_int else "<|unk|>" for item in preprocessed]
      ids = [self.str_to_int[item] for item in preprocessed]
      return ids

    def decode(self, integers):
      text = " ".join([self.int_to_str[item] for item in integers])
      text = re.sub(r'\s+([,.?!"()\'])', r'\1', text) # replace " ," etc with capture group, so only with ","
      return text

In [7]:
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know,"
 Mrs. Gisburn said with pardonable pride. """
ids = tokenizer.encode(text)
print(ids)
print(tokenizer.decode(ids))
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
print(text)
tokenizer = SimpleTokenizerV2(vocab)
ids = tokenizer.encode(text)
print(tokenizer.decode(ids))

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.
Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.
<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


In [8]:
tokenizer = tiktoken.get_encoding("gpt2")
text = (
 "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
 "of someunknownPlace."
)
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)
strings = tokenizer.decode(integers)
print(strings)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]
Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


In [9]:
text =  "Akwirw ier"
integers = tokenizer.encode(text)
print(integers)
for integer in integers:
    print(tokenizer.decode([integer]))

[33901, 86, 343, 86, 220, 959]
Ak
w
ir
w
 
ier


In [10]:
import torch
from torch.utils.data import Dataset, DataLoader
class GPTDatasetV1(Dataset):
  def __init__(self, txt, tokenizer, max_length, stride) -> None:
     self.input_ids = [];
     self.target_ids = [];

     token_ids = tokenizer.encode(txt)  # tokenize input text, either use existing tokenizer or create one
     for i in range(0, len(token_ids) - max_length, stride):
        input_chunk = token_ids[i:i+max_length]
        target_chunk = token_ids[i+1:i+max_length+1]
        self.input_ids.append(torch.tensor(input_chunk))
        self.target_ids.append(torch.tensor(target_chunk))

  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return self.input_ids[idx], self.target_ids[idx]

In [13]:
def createDataLoaderV1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
  tokenizer = tiktoken.get_encoding("gpt2")
  dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
  dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)
  return dataloader

In [22]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
    data_loader = createDataLoaderV1(raw_text, batch_size=1, max_length=10, stride=2, shuffle=False)
    data_iter = iter(data_loader)
    print(next(data_iter))
    print(next(data_iter))
    print(next(data_iter))

[tensor([[   40,   367,  2885,  1464,  1807,  3619,   402,   271, 10899,  2138]]), tensor([[  367,  2885,  1464,  1807,  3619,   402,   271, 10899,  2138,   257]])]
[tensor([[ 2885,  1464,  1807,  3619,   402,   271, 10899,  2138,   257,  7026]]), tensor([[ 1464,  1807,  3619,   402,   271, 10899,  2138,   257,  7026, 15632]])]
[tensor([[ 1807,  3619,   402,   271, 10899,  2138,   257,  7026, 15632,   438]]), tensor([[ 3619,   402,   271, 10899,  2138,   257,  7026, 15632,   438,  2016]])]


In [26]:
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(num_embeddings=6, embedding_dim=3)
print(embedding_layer.weight)
print(embedding_layer(torch.tensor([3])))

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)
tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [62]:
vocab_size = 50257
output_dim = 256
token_embedding_layer = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=output_dim)
print(token_embedding_layer.weight.shape)
max_length = 4
dataloader = createDataLoaderV1(raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False)
data_iter = iter(dataloader)
input_ids, target_ids = next(data_iter)
print("Token ids:\n", input_ids)
print("Target ids:\n", target_ids)
print(token_embedding_layer(input_ids).shape)
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(num_embeddings=context_length, embedding_dim=output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)
input_embeddings = token_embedding_layer(input_ids) + pos_embeddings
print(input_embeddings.shape)

torch.Size([50257, 256])
Token ids:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Target ids:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])
torch.Size([8, 4, 256])
torch.Size([4, 256])
torch.Size([8, 4, 256])


In [39]:
# idx = torch.tensor([2, 3, 1])
# num_idx = max(idx)+1

# # The desired embedding dimension is a hyperparameter
# out_dim = 5
# torch.manual_seed(123)

# embedding = torch.nn.Embedding(num_idx, out_dim)
# print(embedding.weight)
# onehot = torch.nn.functional.one_hot(idx)
# print(onehot)
# print(embedding(idx))
# linear = torch.nn.Linear(num_idx, out_dim, bias=False)
# linear.weight #= torch.nn.Parameter(embedding.weight)
# print(linear.weight)
# linear.weight = torch.nn.Parameter(embedding.weight.T)
# print(linear.weight)
# #print(linear(idx))
# print(linear(onehot.float()))

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.3035, -0.5880,  1.5810],
        [ 1.3010,  1.2753, -0.2010, -0.1606, -0.4015],
        [ 0.6957, -1.8061, -1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096, -0.4076,  0.7953]], requires_grad=True)
tensor([[0, 0, 1, 0],
        [0, 0, 0, 1],
        [0, 1, 0, 0]])
tensor([[ 0.6957, -1.8061, -1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096, -0.4076,  0.7953],
        [ 1.3010,  1.2753, -0.2010, -0.1606, -0.4015]],
       grad_fn=<EmbeddingBackward0>)
Parameter containing:
tensor([[-0.4228, -0.1435, -0.3521,  0.0331],
        [-0.0934, -0.2682, -0.0455,  0.4737],
        [-0.0394,  0.0159, -0.0780,  0.0786],
        [ 0.4455,  0.3057,  0.1775,  0.1087],
        [ 0.1179,  0.1932, -0.0646, -0.4647]], requires_grad=True)
Parameter containing:
tensor([[ 0.3374,  1.3010,  0.6957, -2.8400],
        [-0.1778,  1.2753, -1.8061, -0.7849],
        [-0.3035, -0.2010, -1.1589, -1.4096],
        [-0.5880, -0.1606,  0.3255