In [193]:
import os
import urllib
import torch
from torch import Tensor
from torch.utils.data import DataLoader
from importlib.metadata import version

from DataloaderV1 import DataloaderV1
from SimpleTokenizerV1 import SimpleTokenizerV1
from SimpleTokenizerV2 import SimpleTokenizerV2

print("torch version:", version("torch"))
print("tiktoken version:", version("tiktoken"))



if torch.backends.mps.is_available():
    device = torch.device("mps")
    x: Tensor = torch.ones(1, device=device)
    print(f"x = {x} using {device} backend")
else:
    device = torch.device("cpu")
    x: Tensor = torch.ones(1, device=device)
    print(x)
# print(f"Running on : {device}")

torch version: 2.5.1
tiktoken version: 0.8.0
x = tensor([1.], device='mps:0') using mps backend


In [194]:
def get_some_text():
    # Download a text (book)
    bookUrl = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"
    filepath = "../../data/the-verdict.txt"
    # print(file_path)
    if not os.path.exists(filepath):
        urllib.request.urlretrieve(bookUrl, filepath)

    with open(filepath, "r", encoding="utf-8") as f:
        rawtext = f.read()

    print("Total characters in the story: ", len(rawtext))
    print("Total Lines in raw text: ", rawtext.count("\n"))
    return rawtext

raw_text = get_some_text()
print("Some text: ", raw_text[:49])

Total characters in the story:  20479
Total Lines in raw text:  164
Some text:  I HAD always thought Jack Gisburn rather a cheap 


In [195]:
import re

text = "Hello, world. This, is a test."
result = re.split(r'(\s)', text)
print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [196]:
result = re.split(r'([,.]|\s)', text)
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [197]:
# Strip whitespace from each item and then filter out any empty strings.
result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [198]:
preprocessed = re.split(r'([,.:;?_!"()\'\\]|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [199]:
print(len(preprocessed))

4690


## Converting tokens into token IDs


In [200]:
# Now we need to generate token IDs
# Now let us create a list of all unique tokens and sort them alphabetically to determine the vocabulary size
all_uniq_words = sorted(set(preprocessed))
vocab_size = len(all_uniq_words)
print("Vocab size: ", vocab_size)

Vocab size:  1130


In [201]:
# Now that we know the vocabulary size, lets enumerate and assign some numbers to them
vocab = {token:integer for integer, token in enumerate(all_uniq_words)}
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 10:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)


In [202]:
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know,"
           Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [203]:
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [204]:
tokenizer.decode(tokenizer.encode(text))

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

#### Some of these special tokens are:
- [BOS] (beginning of sequence) marks the beginning of text
- [EOS] (end of sequence) marks where the text ends (this is usually used to concatenate multiple unrelated texts, e.g., two different Wikipedia# articles or two different books, and so on)
- [PAD] (padding) if we train LLMs with a batch size greater than 1 (we may include multiple texts with different lengths; with the padding token we pad the shorter texts to the longest length so that all texts have an equal length)
- [UNK] to represent words that are not included in the vocabulary
- Note: GPT-2 does not need any of these tokens mentioned above but only uses an <|endoftext|> token to reduce complexity. <|endoftext|> token is analogous to the [EOS] token mentioned above
- *GPT also uses the <|endoftext|> for padding (since we typically use a mask when training on batched inputs, we would not attend padded tokens
anyways, so it does not matter what these tokens are)*
- GPT-2 does not use an <UNK> token for out-of-vocabulary words; instead,
- GPT-2 uses a byte-pair encoding (BPE) tokenizer, which breaks down words into subword units which we will discuss in a later section. We use the <|endoftext|> tokens between two independent sources of text


In [205]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])         # To handle unknown tokens
vocab = {token:integer for integer,token in enumerate(all_tokens)}
print(len(vocab.items()))

1132


In [206]:
tokenizer = SimpleTokenizerV2(vocab)
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [207]:
tokenizer.encode(text)

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]

In [208]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'

## Byte Pair Encoding

In [209]:
### Byte Pair Encoding

from importlib.metadata import version
import tiktoken
print("Tiktoken version: ", version("tiktoken"))

Tiktoken version:  0.8.0


In [210]:
tokenizer = tiktoken.get_encoding("gpt2")
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)

# BPE tokenizers break down unknown words into subwords and individual characters.
encoded_integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(f"Encoded: {encoded_integers}")
decoded_strings = tokenizer.decode(encoded_integers)
print(f"Decoded: {decoded_strings}\n")
#
print(tokenizer.encode("Akwirw ier"))
print(tokenizer.decode(tokenizer.encode("Akwirw ier")))

Encoded: [15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]
Decoded: Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.

[33901, 86, 343, 86, 220, 959]
Akwirw ier


## Data sampling with a sliding window

In [211]:
#
with open("../../data/the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
    f.close()

# Encoding with BPE tokenizer
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


- For each text chunk, we want the inputs and targets
- Since we want the model to predict the next word, the targets are the inputs shifted by one position to the right

In [212]:
enc_sample = enc_text[50:]
context_size = 4
#
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [213]:
# Next word prediction tasks can now be created by
for i in range(1, context_size + 1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    # Now we create the input output target pairs
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


- We will take care of the next-word prediction later chapter after we covered the attention mechanism
- For now, we will implement a simple data loader that iterates over an input dataset and returns the inputs and targets shifted by one

In [231]:
# Finally we need to create the embeddings for the tokens
# If we have a batch size of 8 with 4 tokens each it'll be an 8 x 4 x 256 tensor

max_length = 4

mydataloader = DataloaderV1(batch_size=8,
                            max_length=max_length,
                            stride=4,
                            shuffle=False,
                            drop_last=True,
                            num_workers=0)
#
dataloader:DataLoader = mydataloader.create_dataloader_v1(txt=raw_text)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
#
print("Inputs :\n", inputs)
print("Input tensor shape: ", inputs.shape)
print("Targets:\n", targets)
print("Target tensor shape: ", targets.shape)

Inputs :
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Input tensor shape:  torch.Size([8, 4])
Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])
Target tensor shape:  torch.Size([8, 4])


# Creating token embeddings
- The data is already almost ready for an LLM
- But lastly let us embed the tokens in a continuous vector representation using an embedding layer
- Usually, these embedding layers are part of the LLM itself and are updated (trained) during model training


In [232]:
input_ids = torch.tensor([2, 3, 5, 1])
vocab_size = 6
output_dim = 3

# The Embedding layer approach is essentially just a more efficient way of
# implementing one-hot encoding followed by matrix multiplication in a
# fully-connected layer

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


- Because the embedding layer is just a more efficient implementation that is equivalent to the one-hot encoding and matrix-multiplication approach it can be seen as a neural network layer that can be optimized via backpropagation

In [234]:
## For instance, To convert a token with id 3 into a 3-dimensional vector, we do the following:
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


- Note that the above is the 4th row in the embedding_layer weight matrix
- To embed all four input_ids values above, we do

In [255]:
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


In [256]:
# Since self-attentions are position agnostic, we should add some positional data.
# Absolute and relative positional data can be added. So let's create embeddings with say 256 dimensions
max_length = 4
dataloader = DataloaderV1(batch_size=8, max_length=max_length, stride=max_length, shuffle=False).create_dataloader_v1(raw_text)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


- If we sample data from the dataloader, we embed the tokens in each batch into a 256-dimensional vector
- If we have a batch size of 8 with 4 tokens each, this results in a *8 x 4 x 256 tensor*


In [243]:
# Now lets embed the input tensors
vocab_size = 50257
output_dim = 256
token_embedding_layer = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=output_dim)
token_embeddings = token_embedding_layer(inputs)
print("Token embeddings shape: ", token_embeddings.shape)

Token embeddings shape:  torch.Size([8, 4, 256])


- GPT-2 uses absolute position embeddings, so we just create another embedding layer that has the same embedding dimension as the token_embedding_ layer

In [248]:
context_length = max_length # context length is the length of positions we care for attention
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
# How the embedding layer weights look like
print("Positional Embedding Layer Weights: \n", pos_embedding_layer.weight)

Positional Embedding Weights: 
 Parameter containing:
tensor([[-1.0722, -0.3881, -0.9748,  ...,  0.1415,  0.2180,  0.6223],
        [ 1.3004,  0.3771, -0.5962,  ..., -0.0475,  0.5450,  0.6105],
        [ 1.5109, -2.0059,  0.9972,  ..., -1.6433, -0.1522, -0.7559],
        [-0.1626, -1.0717,  0.3817,  ...,  0.1037, -0.4367, -0.8756]],
       requires_grad=True)


In [253]:
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print("Positional Embeddings Shape: ", pos_embeddings.shape) # 4x256

Positional Embeddings Shape:  torch.Size([4, 256])


- To create the input embeddings used in an LLM, we simply add the token andthe positional embeddings

In [251]:
input_embeddings = token_embeddings + pos_embeddings
print("Position Merged Input Embeddings Shape: ", input_embeddings.shape)

Position Merged Input Embeddings Shape:  torch.Size([8, 4, 256])


In [238]:
# Now lets look at the dataloader
for batch in dataloader:
    inputs, targets = batch
    token_embeddings = token_embedding_layer(inputs)
    pos_embeddings = pos_embedding_layer(torch.arange(max_length))
    input_embeddings = token_embeddings + pos_embeddings
    break

print("Input Embeddings Shape: ", input_embeddings.shape)
print("Inputs tensor: ", x)
print("Targets tensor: ", y)

Input Embeddings Shape:  torch.Size([8, 4, 256])
Inputs tensor:  [290, 4920, 2241, 287]
Targets tensor:  [4920, 2241, 287, 257]
