# Chapter 2: Working with Text Data

In [None]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/1.2 MB[0m [31m16.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.2/1.2 MB[0m [31m22.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [None]:
# packages that are being used in this notebook
from importlib.metadata import version

print('torch version:', version('torch'))
print('tiktoken version:', version('tiktoken'))

torch version: 2.5.1+cu121
tiktoken version: 0.8.0


In [None]:
# load data
import os
import urllib.request

if not os.path.exists('the-verdict.txt'):
    url = ("https://raw.githubusercontent.com/rasbt/"
    "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
    "the-verdict.txt")

    file_path = 'the-verdict.txt'
    urllib.request.urlretrieve(url, file_path)

In [None]:
with open('the-verdict.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()

print('Total number of character:', len(raw_text))
print(raw_text[:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [None]:
import re

text = 'Hello, world. This, is a test.'
result = re.split(r'(\s)', text)

print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [None]:
result = re.split(r'([,.]|\s)', text)
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [None]:
result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [None]:
text = "Hello, world. Is this-- a test?"

result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
result = [item.strip() for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [None]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [None]:
print(len(preprocessed))

4690


In [None]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)

print(vocab_size)

1130


In [None]:
vocab = {token:integer for integer, token in enumerate(all_words)}

In [None]:
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


In [None]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token -> ID vocabulary.

    Tokens that are missing from the vocabulary are not handled here:
    ``encode`` raises ``KeyError`` for them.
    """

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> ID) and derive the reverse ID -> token map."""
        self.str_to_int = vocab
        self.int_to_str = dict(zip(vocab.values(), vocab.keys()))

    def encode(self, text):
        """Split ``text`` on punctuation, ``--`` and whitespace; return token IDs."""
        token_ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            if token:  # skip empty and whitespace-only pieces
                token_ids.append(self.str_to_int[token])
        return token_ids

    def decode(self, ids):
        """Join the tokens for ``ids`` with spaces, then re-attach punctuation."""
        joined = " ".join(self.int_to_str[token_id] for token_id in ids)
        # Remove the whitespace that the join put in front of these marks.
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)

In [None]:
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know,"
           Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [None]:
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [None]:
try:
    text = "Hello, do you like tea?"
    tokenizer.encode(text)
except KeyError as err:
    print("Key Error:", err)

Key Error: 'Hello'


In [None]:
# adding special context tokens

all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab =  {token:integer for integer, token in enumerate(all_tokens)}

In [None]:
len(vocab.items())

1132

In [None]:
for item in list(vocab.items())[-5:]:
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [None]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``."""

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> ID) and derive the reverse ID -> token map."""
        self.str_to_int = vocab
        self.int_to_str = dict(zip(vocab.values(), vocab.keys()))

    def encode(self, text):
        """Split ``text`` into tokens and return their IDs.

        Any token not present in the vocabulary is replaced by ``<|unk|>``
        before the ID lookup.
        """
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:
                continue  # empty or whitespace-only piece
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Join the tokens for ``ids`` with spaces, then re-attach punctuation."""
        joined = " ".join(self.int_to_str[token_id] for token_id in ids)
        # Remove the whitespace that the join put in front of these marks.
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)

In [None]:
tokenizer = SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "The the sunlit terraces of the palace."

text = " <|endoftext|> ".join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> The the sunlit terraces of the palace.


In [None]:
encoded_text = tokenizer.encode(text)
print(encoded_text)

[1131, 5, 355, 1126, 628, 975, 10, 1130, 93, 988, 956, 984, 722, 988, 1131, 7]


In [None]:
tokenizer.decode(encoded_text)

'<|unk|>, do you like tea? <|endoftext|> The the sunlit terraces of the <|unk|>.'

In [None]:
# Byte-pair encoding
import tiktoken

tokenizer = tiktoken.get_encoding('gpt2')

In [None]:
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
    "of someunknownPlace."
)
integers = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [None]:
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


In [None]:
# Data sampling
with open('the-verdict.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [None]:
enc_sample = enc_text[50:]

In [None]:
context_size = 4

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print(f"x: {x}")
print(f"y: {y}")

x: [290, 4920, 2241, 287]
y: [4920, 2241, 287, 257]


In [None]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(context, '--->', desired)

[290] ---> 4920
[290, 4920] ---> 2241
[290, 4920, 2241] ---> 287
[290, 4920, 2241, 287] ---> 257


In [None]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(tokenizer.decode(context), '--->', tokenizer.decode([desired]))

 and --->  established
 and established --->  himself
 and established himself --->  in
 and established himself in --->  a


In [None]:
import torch
torch.__version__

'2.5.1+cu121'

In [None]:
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once. Each sample is a window of ``max_length``
    token IDs, paired with the same window shifted one position to the right.
    Window start positions advance by ``stride``, and only windows that have a
    complete shifted target are kept.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        starts = range(0, len(token_ids) - max_length, stride)
        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in starts
        ]

    def __len__(self):
        """Number of windows extracted from the text."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [None]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader over sliding-window samples of ``txt``.

    The text is tokenized with tiktoken's ``gpt2`` encoding and wrapped in a
    ``GPTDatasetV1`` using ``max_length`` and ``stride``. The remaining
    arguments are forwarded unchanged to ``DataLoader``.
    """
    gpt2_tokenizer = tiktoken.get_encoding('gpt2')
    return DataLoader(
        GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [None]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [None]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=4, stride=4,
    shuffle=False
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("Targets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


In [None]:
input_ids = torch.tensor([2, 3, 5, 1])
vocab_size = 6
output_dim = 3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [None]:
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [None]:
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


In [None]:
# Encoding word positions
vocab_size = 50257
output_dim = 256
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [None]:
max_length = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length,
    stride=max_length, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [None]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [None]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


# Appendix A

In [None]:
import torch.nn.functional as F
from torch.autograd import grad

y = torch.tensor([1.0])
x1 = torch.tensor([1.1])
w1 = torch.tensor([2.2], requires_grad=True)
b = torch.tensor([0.0], requires_grad=True)

z = x1 * w1 + b
a = torch.sigmoid(z)

loss = F.binary_cross_entropy(a, y)

grad_L_w1 = grad(loss, w1, retain_graph=True)
grad_L_b = grad(loss, b, retain_graph=True)

print(grad_L_w1)
print(grad_L_b)

(tensor([-0.0898]),)
(tensor([-0.0817]),)


In [None]:
import torch

class NeuralNetwork(torch.nn.Module):
    """Multilayer perceptron with two hidden layers (30 and 20 units, ReLU).

    ``forward`` returns the raw output of the last linear layer; no softmax
    is applied inside the model.
    """

    def __init__(self, num_inputs, num_outputs):
        super().__init__()
        nn = torch.nn
        hidden_1, hidden_2 = 30, 20

        # Layers are created in input -> output order inside one Sequential,
        # so parameters are registered as layers.0, layers.2 and layers.4.
        self.layers = nn.Sequential(
            nn.Linear(num_inputs, hidden_1),
            nn.ReLU(),
            nn.Linear(hidden_1, hidden_2),
            nn.ReLU(),
            nn.Linear(hidden_2, num_outputs),
        )

    def forward(self, x):
        """Run ``x`` through the layer stack and return the logits."""
        return self.layers(x)


model = NeuralNetwork(50, 3)

In [None]:
print(model)

NeuralNetwork(
  (layers): Sequential(
    (0): Linear(in_features=50, out_features=30, bias=True)
    (1): ReLU()
    (2): Linear(in_features=30, out_features=20, bias=True)
    (3): ReLU()
    (4): Linear(in_features=20, out_features=3, bias=True)
  )
)


In [None]:
# Push one random 50-feature example through the untrained network.
x1 = torch.randn(50)
logits1 = model(x1)
print(f'{logits1=}')
# logits1 is 1-D (a single example, no batch axis), so normalize over dim=0.
probs = torch.softmax(logits1, dim=0)
print(f'{probs=}')

logits1=tensor([-0.1404, -0.1096,  0.0126], grad_fn=<ViewBackward0>)
probs=tensor([0.3128, 0.3226, 0.3645], grad_fn=<SoftmaxBackward0>)


In [None]:
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"{num_params=}")

num_params=2213


In [None]:
print(model.layers[0].weight)

Parameter containing:
tensor([[ 0.0590, -0.1167, -0.1109,  ...,  0.0418, -0.0664,  0.1084],
        [-0.1167,  0.0489, -0.0160,  ...,  0.0291, -0.0536,  0.0133],
        [ 0.0170, -0.1377,  0.0527,  ...,  0.0970,  0.0044,  0.0141],
        ...,
        [ 0.0349,  0.0673, -0.0366,  ..., -0.1031, -0.1050, -0.0131],
        [ 0.1008, -0.0266,  0.0058,  ...,  0.0517,  0.0077, -0.0105],
        [ 0.1203,  0.1377, -0.0354,  ..., -0.0970,  0.0474,  0.1329]],
       requires_grad=True)


In [None]:
model.layers[0].weight.shape

torch.Size([30, 50])

In [None]:
model.layers[0].bias

Parameter containing:
tensor([ 0.1323, -0.0839, -0.1320,  0.0486,  0.1137,  0.0720,  0.0674,  0.0393,
         0.0540,  0.0638,  0.0925, -0.1187,  0.0302,  0.0269,  0.1236, -0.1113,
         0.1176, -0.1412,  0.1154, -0.0304,  0.1120, -0.0291,  0.0223, -0.0597,
        -0.1341, -0.1011,  0.0268, -0.0505, -0.0747,  0.0933],
       requires_grad=True)

In [None]:
model.layers[0].bias.shape

torch.Size([30])

In [None]:
with torch.no_grad():
    X = torch.randn((1, 50))
    out = model(X)
    out = torch.softmax(out, dim=1)
print(out)

tensor([[0.2955, 0.3358, 0.3687]])


In [None]:
X_train = torch.tensor([
    [-1.2, 3.1],
    [-0.9, 2.9],
    [-0.5, 2.6],
    [2.3, -1.1],
    [2.7, -1.5]
])
y_train = torch.tensor([0, 0, 0, 1, 1])
X_test = torch.tensor([
    [-0.8, 2.8],
    [2.6, -1.6],
])
y_test = torch.tensor([0, 1])

In [None]:
from torch.utils.data import Dataset

class ToyDataset(Dataset):
    """Dataset that serves ``(features[i], labels[i])`` pairs."""

    def __init__(self, X, y):
        """Store the feature container ``X`` and the label container ``y``."""
        self.features, self.labels = X, y

    def __len__(self):
        """Number of examples, taken from the first dimension of the labels."""
        return self.labels.shape[0]

    def __getitem__(self, index):
        """Return the feature row and the label stored at ``index``."""
        return self.features[index], self.labels[index]

train_ds = ToyDataset(X_train, y_train)
test_ds = ToyDataset(X_test, y_test)

In [None]:
from torch.utils.data import DataLoader

torch.manual_seed(123)

train_loader = DataLoader(
    dataset=train_ds,
    batch_size=2,
    shuffle=True,
    num_workers=0,
    drop_last=True
)
test_loader = DataLoader(
    dataset=test_ds,
    batch_size=2,
    shuffle=False,
    num_workers=0
)

In [None]:
for idx, (x, y) in enumerate(train_loader):
    print(f"Batch {idx+1}:", x, y)

Batch 1: tensor([[ 2.3000, -1.1000],
        [-0.9000,  2.9000]]) tensor([1, 0])
Batch 2: tensor([[-1.2000,  3.1000],
        [-0.5000,  2.6000]]) tensor([0, 0])


In [None]:
import torch.nn.functional as F

torch.manual_seed(123)
model = NeuralNetwork(num_inputs=2, num_outputs=2)
optimizer = torch.optim.SGD(
    model.parameters(), lr=0.5
)
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for batch_idx, (features, labels) in enumerate(train_loader):
        logits = model(features)
        loss = F.cross_entropy(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # logging
        print(f"Epoch: {epoch+1:03d}/{num_epochs:03d}"
            f" | Batch {batch_idx:03d}/{len(train_loader):03d}"
            f" | Train Loss: {loss:.2f}")

    model.eval()
    # insert optional model evaluation code

Epoch: 001/003 | Batch 000/002 | Train Loss: 0.75
Epoch: 001/003 | Batch 001/002 | Train Loss: 0.65
Epoch: 002/003 | Batch 000/002 | Train Loss: 0.44
Epoch: 002/003 | Batch 001/002 | Train Loss: 0.13
Epoch: 003/003 | Batch 000/002 | Train Loss: 0.03
Epoch: 003/003 | Batch 001/002 | Train Loss: 0.00


In [None]:
model.eval()
with torch.no_grad():
    outputs = model(X_train)
print(outputs)

tensor([[ 2.8569, -4.1618],
        [ 2.5382, -3.7548],
        [ 2.0944, -3.1820],
        [-1.4814,  1.4816],
        [-1.7176,  1.7342]])


In [None]:
torch.set_printoptions(sci_mode=False)
probas = torch.softmax(outputs, dim=1)
print(probas)

tensor([[    0.9991,     0.0009],
        [    0.9982,     0.0018],
        [    0.9949,     0.0051],
        [    0.0491,     0.9509],
        [    0.0307,     0.9693]])


In [None]:
y_train

tensor([0, 0, 0, 1, 1])

In [None]:
probas.argmax(dim=1)

tensor([0, 0, 0, 1, 1])

In [None]:
def compute_accuracy(model, dataloader):
    """Return the fraction of examples in ``dataloader`` that ``model`` gets right.

    The model is switched to eval mode and left in it. The prediction for
    each example is the argmax of the logits over dim=1.
    """
    model.eval()
    num_correct = 0.0
    num_seen = 0

    for features, labels in dataloader:
        with torch.no_grad():
            predicted = model(features).argmax(dim=1)
        hits = labels == predicted
        # Accumulating as a tensor keeps the final division in tensor arithmetic.
        num_correct = num_correct + hits.sum()
        num_seen += len(hits)

    return (num_correct / num_seen).item()

In [None]:
compute_accuracy(model, train_loader)

1.0

In [None]:
compute_accuracy(model, test_loader)

1.0

In [None]:
# save model
torch.save(model.state_dict(), 'model.pht')

In [None]:
# load model
model = NeuralNetwork(2, 2)
model.load_state_dict(torch.load('model.pht', weights_only=True))

<All keys matched successfully>

In [None]:
tensor_1 = torch.tensor([1., 2., 3.])
tensor_2 = torch.tensor([4., 5., 6.])
print(tensor_1 + tensor_2)

tensor([5., 7., 9.])


# AGAIN

- Preparing text for large language model training
---

In [None]:
with open('the-verdict.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [None]:
import re
text = "Hello, world. This, is a test."

# split with only whitespaces
result = re.split(r'(\s)', text)
print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [None]:
# split by whitespaces, commas and periods
result = re.split(r'([,.]|\s)', text)
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [None]:
# Drop the empty strings and whitespace-only pieces that re.split leaves behind.
# str.strip() is the intended emptiness check (same filter as the other cells);
# str.split() only happened to work because an empty list is falsy.
result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [None]:
# split by additional special characters
text = "Hello, world. Is this-- a test?"
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
result = [item.strip() for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [None]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(len(preprocessed))

4690


In [None]:
# converting tokens into token IDs

all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

1130


In [None]:
# creating vocabulary
vocab = {token:integer for integer, token in enumerate(all_words)}
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


- Splitting text into word and subword tokens
---

In [None]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token -> ID vocabulary.

    Tokens that are missing from the vocabulary are not handled:
    ``encode`` raises ``KeyError`` for them.
    """

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> ID) and derive the reverse ID -> token map."""
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their IDs.

        The split pattern includes ``:`` and ``;``, matching the pattern used
        to build the vocabulary from the raw text. Without them, input such
        as ``"said:"`` stays a single token and fails the lookup even though
        ``said`` and ``:`` are both vocabulary entries.

        Raises:
            KeyError: if a token is not in the vocabulary.
        """
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        """Turn ``ids`` back into text: tokens joined by single spaces, with
        the space in front of punctuation marks removed."""
        text = " ".join([self.int_to_str[i] for i in ids])
        # ':' and ';' are split off as their own tokens by encode, so the
        # space the join puts in front of them has to be removed here as well.
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

tokenizer = SimpleTokenizerV1(vocab)
# The passage starts with a double-quote character: the first three quotes open
# the Python string and the fourth is part of the text. The recorded IDs begin
# with 1 (the ID of '"'), so the opening quote mark has to be present.
text = """"It's the last he painted, you know,"
Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


### Tokenizer V2

In [None]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]

all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(['<|endoftext|>', "<|unk|>"])
vocab = {token:integer for integer, token in enumerate(all_tokens)}

print(len(vocab.items()))

1132


In [None]:
vocab['<|unk|>']

1131

In [None]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``."""

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> ID) and derive the reverse ID -> token map."""
        self.str_to_int = vocab
        self.int_to_str = dict(zip(vocab.values(), vocab.keys()))

    def encode(self, text):
        """Split ``text`` into tokens and return their IDs, substituting the
        ``<|unk|>`` entry for any token the vocabulary does not contain."""
        stripped = (piece.strip() for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text))
        tokens = [token for token in stripped if token]
        return [
            self.str_to_int[token if token in self.str_to_int else "<|unk|>"]
            for token in tokens
        ]

    def decode(self, ids):
        """Join the tokens for ``ids`` with spaces, then re-attach punctuation."""
        words = [self.int_to_str[token_id] for token_id in ids]
        # Remove the whitespace that the join put in front of these marks.
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', " ".join(words))

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
print(text)

tokenizer_v2 = SimpleTokenizerV2(vocab)
print(tokenizer_v2.encode(text))
print(tokenizer_v2.decode(tokenizer_v2.encode(text)))

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.
[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]
<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


- Byte pair encoding as a more advanced way of tokenizing text
---

In [None]:
import tiktoken
tokenizer = tiktoken.get_encoding('gpt2')

In [None]:
text = (
"Hello, do you like tea? <|endoftext|> In the sunlit terraces"
"of someunknownPlace."
)

integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [None]:
strings = tokenizer.decode(integers)
strings

'Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.'

- Sampling training examples with a sliding window approach
---

- Converting tokens into vectors that feed into a large language model
---

In [None]:
# Quick experiment: a count-based bigram model over the word-level vocabulary
import torch

decode = {integer: token for token, integer in vocab.items()}

class BigramModel:
    """Count-based bigram language model over word-level tokens.

    Attributes:
        vocab: the token -> ID mapping passed in.
        vocab_size: number of entries in ``vocab``.
        itos: ID -> token mapping derived from ``vocab``.
        N: ``(vocab_size, vocab_size)`` tensor; ``N[a, b]`` counts how often
            token ``b`` directly follows token ``a`` in ``words``.
        P: row-normalized ``N``; ``P[a]`` is the next-token distribution
            after token ``a``.
    """

    def __init__(self, vocab: dict, words: list):
        """Count adjacent token pairs in ``words`` and normalize each row.

        Args:
            vocab: token -> ID mapping; every element of ``words`` must be a key.
            words: sequence of tokens in reading order.

        Raises:
            KeyError: if a token in ``words`` is not in ``vocab``.
        """
        self.vocab = vocab
        self.vocab_size = len(vocab)
        # Keep the reverse mapping on the instance so generate() always decodes
        # with the vocabulary this model was built from.
        self.itos = {integer: token for token, integer in vocab.items()}

        self.N = torch.zeros((self.vocab_size, self.vocab_size))
        ids = [self.vocab[word] for word in words]
        for ix1, ix2 in zip(ids, ids[1:]):
            self.N[ix1, ix2] += 1

        # Tokens that never occur as a context have an all-zero row; dividing
        # by its sum would give NaN. Those rows fall back to a uniform
        # distribution; rows with counts are normalized as before.
        row_totals = self.N.sum(dim=1, keepdim=True)
        uniform = torch.full_like(self.N, 1.0 / max(self.vocab_size, 1))
        self.P = torch.where(
            row_totals > 0, self.N / row_totals.clamp(min=1), uniform
        )

    def generate(self, max_length: int = 500, start_ix: int = 7):
        """Sample ``max_length`` tokens and return them as one string.

        Args:
            max_length: number of tokens to sample.
            start_ix: ID of the context token for the first sample; it is not
                part of the output. The default 7 is the ID of '.' in this
                notebook's vocabulary.

        Returns:
            The sampled tokens joined by spaces, with the space in front of
            punctuation marks removed.
        """
        out = []
        ix = start_ix
        for _ in range(max_length):
            ix = torch.multinomial(self.P[ix], num_samples=1).item()
            out.append(self.itos[ix])
        text = ' '.join(out)
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

model = BigramModel(vocab, preprocessed)
out = model.generate()
print(out)

And he just lay there and muddling; but on the honour being crowned by interesting -- she began to put it, and eighteenth-century pastels in fact that."( I could always thought him once or thought Jack Gisburn said -- that I felt to strain my dear Rickham, brought home to Mrs." And it stay!" He didn' man who had dropped my shoulder with a flash that Mrs. That' s domestic economy. He stood there, standing in him better; and as his fair sitters had married a little:" interesting" but he painted that my diagnosis suffered an endless vista of the canvas furiously, a lump of colour covered up his own sex fewer regrets were _ that lifted the speaking-tubes, presenting a deprecating laugh that my traps, on that I turned into circulation," Has he _ not the thought:" was silent; and watched me -- ah, the mantel-piece, on the sweetness.. Stroud himself, and threw back the florid vista of the canvas furiously, and twirling between the surest way I asked abruptly.. Gisburn' way of it was the portr

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

context_size = 8
batch_size = 4

class SimpleDataSet(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    Each sample is a window of ``max_length`` token IDs paired with the same
    window shifted one position to the right. Window starts advance by
    ``stride``.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        token_ids = tokenizer.encode(text)
        # Stop early enough that every window still has a full shifted target.
        last_start = len(token_ids) - max_length
        windows = [
            (token_ids[start:start + max_length],
             token_ids[start + 1:start + max_length + 1])
            for start in range(0, last_start, stride)
        ]
        self.input_ids = [torch.tensor(inputs) for inputs, _ in windows]
        self.target_ids = [torch.tensor(targets) for _, targets in windows]

    def __len__(self):
        """Number of windows extracted from the text."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` pair at ``index``."""
        return self.input_ids[index], self.target_ids[index]


dataloader = DataLoader(
    SimpleDataSet(raw_text, tokenizer_v2, context_size, context_size),
    shuffle=False,
    batch_size=batch_size,
    drop_last=True
)

In [None]:
class BigramLanguageModel(torch.nn.Module):
    """Neural bigram model: one embedding row of next-token logits per token."""

    def __init__(self, vocab_size):
        """Create a ``vocab_size`` x ``vocab_size`` embedding table."""
        super().__init__()
        self.emb = torch.nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx):
        """Look up the logits for each token ID in ``idx``.

        The result has the shape of ``idx`` with one trailing dimension of
        size ``vocab_size`` appended.
        """
        logits = self.emb(idx)
        return logits

    def generate(self, max_length: int = 500, start_ix: int = 7, tokenizer=None):
        """Sample ``max_length`` tokens and return them as one string.

        Args:
            max_length: number of tokens to sample.
            start_ix: ID of the context token for the first sample; it is not
                part of the output. The default 7 is the ID of '.' in this
                notebook's vocabulary.
            tokenizer: object with a ``decode(list_of_ids)`` method used to
                turn each sampled ID into text. Defaults to the notebook-level
                ``tokenizer_v2``.

        Returns:
            The decoded tokens joined by spaces, with the space in front of
            punctuation marks removed.
        """
        if tokenizer is None:
            tokenizer = tokenizer_v2  # notebook-level default, as before

        out = []
        ix = start_ix
        # Sampling needs no gradients; no_grad avoids building an autograd
        # graph on every step.
        with torch.no_grad():
            for _ in range(max_length):
                logits = self(torch.tensor(ix))
                p = torch.softmax(logits, dim=0)
                ix = torch.multinomial(p, num_samples=1).item()
                out.append(tokenizer.decode([ix]))
        text = ' '.join(out)
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

# Size the embedding table from the full vocabulary. `vocab_size` was computed
# from `all_words` before '<|endoftext|>' and '<|unk|>' were appended, so it is
# two entries short of `vocab`, and the IDs of those two tokens would index
# past the end of the table.
model = BigramLanguageModel(len(vocab))
model.generate()

'discussion stand Then mysterious idling adulation stroke back year insignificant fell sitters possessed which When over life-likeness won lifted good Money started manage worth Suddenly coat heard packed wild brush mirrors or since mere bric-a-brac By dozen chap object up-stream near poor Poor half-light hear sitter irony superb event silver pastels begun Of look full Poor lean( straining Riviera run object each amazement oval white friend paled moment make himself Suddenly luxury secret saw enough next occurred asked knew reared; his down modesty weeks never loathing Usually add amid terribly born shaking mantel-piece Rickham fancy audacities persuasively quote Arrt panelling background paint hanging: seemed so wasn adulation Once aesthetic arm-chairs balancing stammer brought be etching up-stream corner open embarrassed their somebody known craft similar breathing married Why waves interesting luncheon-table glad unaccountable thin technicalities rather light Be lift woman shrug und

In [None]:
from torch.optim import Adam

optimizer = Adam(model.parameters())

n_epochs = 1000
log_every = 100  # report periodically instead of printing a line for every epoch

for epoch in range(1, n_epochs + 1):
    for x, y in dataloader:
        logits = model(x)
        # Logits are (batch, seq_len, vocab). cross_entropy takes one row of
        # class scores per target, so the first two dimensions are flattened.
        n_classes = logits.size(-1)
        loss = F.cross_entropy(logits.view(-1, n_classes), y.view(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if epoch == 1 or epoch % log_every == 0:
        # .item() prints a plain float rather than a tensor repr with grad_fn.
        print(f"epoch {epoch:4d}/{n_epochs} | last-batch loss: {loss.item():.4f}")

tensor(7.5100, grad_fn=<NllLossBackward0>)
tensor(7.4390, grad_fn=<NllLossBackward0>)
tensor(7.3686, grad_fn=<NllLossBackward0>)
tensor(7.2987, grad_fn=<NllLossBackward0>)
tensor(7.2292, grad_fn=<NllLossBackward0>)
tensor(7.1603, grad_fn=<NllLossBackward0>)
tensor(7.0918, grad_fn=<NllLossBackward0>)
tensor(7.0240, grad_fn=<NllLossBackward0>)
tensor(6.9567, grad_fn=<NllLossBackward0>)
tensor(6.8900, grad_fn=<NllLossBackward0>)
tensor(6.8240, grad_fn=<NllLossBackward0>)
tensor(6.7586, grad_fn=<NllLossBackward0>)
tensor(6.6940, grad_fn=<NllLossBackward0>)
tensor(6.6301, grad_fn=<NllLossBackward0>)
tensor(6.5670, grad_fn=<NllLossBackward0>)
tensor(6.5046, grad_fn=<NllLossBackward0>)
tensor(6.4431, grad_fn=<NllLossBackward0>)
tensor(6.3824, grad_fn=<NllLossBackward0>)
tensor(6.3225, grad_fn=<NllLossBackward0>)
tensor(6.2636, grad_fn=<NllLossBackward0>)
tensor(6.2055, grad_fn=<NllLossBackward0>)
tensor(6.1483, grad_fn=<NllLossBackward0>)
tensor(6.0920, grad_fn=<NllLossBackward0>)
tensor(6.03

In [None]:
model.generate()

'I looked at tea beneath their palm-trees; he had forgotten to hear Mrs." -- on everlasting foundations underneath." Yes, oddly enough to me -- it, with his pictures with a failure being painted; and down in her spaniel in the bath-rooms, the head between the people manage to let it suddenly, on him, at him back his last word. It was dead? I had dropped my most out?" I had never touched a laugh: no preliminary work. It might put it all good fellow enough to affect me, basking under a degree he _ too moved away," Be dissatisfied with some one -- above the glory of it. What struck glimpse of poor Stroud. I can?" he thought of forcing it was his eyes grew dim, the loss to keep upstairs."" Mr. You ever knew just threw paint him deprecatingly, as you know where to my lies! It was posing to put it, and she was dead. The fact should mourn him say." He says they were reflected in advance, in a lump of it represented, none of pink azaleas, a year after Jack\' s" she\' t think of art, and claspi

In [None]:
y

tensor([[ 486,    6, 1002,  115,  500,  435,  392,    6]])

In [None]:
# Example of target with class probabilities
input = torch.randn(3, 5, requires_grad=True)
target = torch.randn(3, 5).softmax(dim=1)
loss = F.cross_entropy(input, target)
loss

tensor(2.1915, grad_fn=<DivBackward1>)

In [None]:
input

tensor([[ 0.4816,  0.4323, -1.3577,  0.1903,  0.1221],
        [ 1.2290, -0.1264, -1.6588, -0.7744, -0.3481],
        [-0.7693, -1.4444,  0.0292,  0.8525, -1.0355]], requires_grad=True)

In [None]:
target

tensor([[0.2412, 0.0744, 0.3192, 0.2221, 0.1430],
        [0.1430, 0.0784, 0.4124, 0.0189, 0.3473],
        [0.5149, 0.1306, 0.1269, 0.0491, 0.1786]])