In [1]:
import re
import pathlib

In [2]:
file_path = '../ch02/01_main-chapter-code/the-verdict.txt'

In [3]:
with open(file_path, 'r') as f:
    raw_text = f.read()

In [4]:
raw_text

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)\n\n"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it\'s going to send the value of my picture \'way up; but I don\'t think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing\'s lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn\'s "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?\n\nWell!--even 

In [5]:
len(raw_text)

20479

In [6]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)

In [7]:
preprocessed = [item.strip() for item in preprocessed if item.strip()]

In [8]:
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [9]:
len(preprocessed)

4690

In [10]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

1130


In [11]:
vocab = {token:integer for integer, token in enumerate(all_words)}

In [12]:
for i, item in enumerate(vocab):
    print(i, item)
    if i >= 50:
        break

0 !
1 "
2 '
3 (
4 )
5 ,
6 --
7 .
8 :
9 ;
10 ?
11 A
12 Ah
13 Among
14 And
15 Are
16 Arrt
17 As
18 At
19 Be
20 Begin
21 Burlington
22 But
23 By
24 Carlo
25 Chicago
26 Claude
27 Come
28 Croft
29 Destroyed
30 Devonshire
31 Don
32 Dubarry
33 Emperors
34 Florence
35 For
36 Gallery
37 Gideon
38 Gisburn
39 Gisburns
40 Grafton
41 Greek
42 Grindle
43 Grindles
44 HAD
45 Had
46 Hang
47 Has
48 He
49 Her
50 Hermia


In [13]:
class SimpleTokenizer:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s, i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_to_str[id] for id in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [14]:
simple_tokenizer = SimpleTokenizer(vocab)
text = """"It's the last he painted, you know,"
    Mrs. Gisburn said with pardonable pride."""

In [15]:
ids = simple_tokenizer.encode(text)

In [16]:
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [17]:
simple_tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [18]:
import tiktoken

In [19]:
tokenizer = tiktoken.get_encoding("gpt2")

In [20]:
text = ("Hello, do you like tea? <|endoftext|> In the sunlit terraces"
    "of someunknownPlace.")

In [21]:
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

In [22]:
tokenizer.decode(integers)

'Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.'

In [23]:
tokenizer.encode("Akwirw ier")

[33901, 86, 343, 86, 220, 959]

In [24]:
tokenizer.decode([33901, 86, 343, 86, 220, 959])

'Akwirw ier'

In [25]:
tokenizer.decode(tokenizer.encode("Akwirw ier"))

'Akwirw ier'

In [26]:
encoded_text = tokenizer.encode(raw_text)

In [27]:
print(len(encoded_text))

5145


In [28]:
encoded_sample = encoded_text[50:]

In [29]:
context_size = 4

In [30]:
x = encoded_sample[:context_size]
y = encoded_sample[1:context_size+1]

In [31]:
print(f"x:  {x}")
print(f"y:       {y}")

x:  [290, 4920, 2241, 287]
y:       [4920, 2241, 287, 257]


In [32]:
for i in range(1, context_size+1):
    context=  encoded_sample[:i]
    desired= encoded_sample[i]
    print(f"{tokenizer.decode(context)} ----> {tokenizer.decode([desired])}")

 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


In [2]:
import torch

In [49]:
vocab_size2 = 50257
output_dim = 256

In [50]:
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size2, output_dim)

In [60]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [63]:
max_length = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length,
    stride=max_length, shuffle=False
)
data_iter = iter(dataloader)
inputs,targets = next(data_iter)
print("Token IDs: \n", inputs)
print("Inputs Shape: \n", inputs.shape)

Token IDs: 
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Inputs Shape: 
 torch.Size([8, 4])


In [66]:
token_embeddings = embedding_layer(inputs)  

In [93]:
token_embeddings[1]

tensor([[-0.1082, -1.2723, -1.2217,  ..., -0.9199,  2.0073, -1.4138],
        [-0.2427,  0.9145,  1.0885,  ..., -0.8651,  3.5269,  0.7225],
        [-0.5434,  1.6203,  1.2222,  ...,  0.6814, -1.4032,  0.1492],
        [-0.3504, -0.9325, -1.2900,  ..., -1.4980,  0.1400,  0.3730]],
       grad_fn=<SelectBackward0>)

In [109]:
token_embeddings[1][1]

tensor([-0.2427,  0.9145,  1.0885, -0.4509,  0.1388,  0.4346, -0.2504,  0.6773,
        -1.3881,  0.6387,  1.4926, -0.3121,  0.5354,  0.1821, -0.7293, -0.1554,
         2.7412,  0.2703,  1.4398, -0.0604, -0.1633,  0.0075, -0.1972, -1.4763,
         0.9427,  0.9772, -1.3763, -0.3745,  1.2380,  0.3054,  0.1337, -1.1261,
        -0.7150,  0.1054, -1.8622, -0.1914, -0.3812, -0.5149,  0.1585, -0.2855,
        -1.3498, -1.0585,  1.1224, -1.2411, -1.8682, -0.3310, -1.6631, -1.0606,
         0.1304, -1.7107, -1.7785, -0.2362,  0.4212, -0.1894, -2.1316,  1.6154,
        -0.8644, -0.2564,  0.8379, -0.1591,  0.4706,  1.6730,  0.9082, -1.1894,
        -0.1265, -0.4909,  0.8210, -0.3116, -0.8850, -0.9793,  1.1803, -0.9689,
        -2.2596,  1.0069,  0.2675, -1.0806, -0.9930,  0.8564,  0.5325, -1.5294,
        -2.6811,  0.5051,  0.4283, -0.6133,  0.3168, -0.4942,  2.3976, -0.1153,
        -0.8231, -0.2001, -0.3997,  1.0749, -0.2622,  1.2662, -0.0810,  0.4281,
        -1.3073, -0.3272, -0.5569,  0.36

In [3]:
attn_score_2 = torch.dot(torch.tensor([0.43, 0.15, 0.89]), torch.tensor([0.55,0.87,0.66]))

In [4]:
torch.softmax(attn_score_2, dim=0)

tensor(1.)

In [5]:
attention_weights = attn_score_2 / attn_score_2.sum()

In [6]:
attention_weights

tensor(1.)

In [7]:
attn_score_2

tensor(0.9544)