In [1]:
# Read the entire short story into memory as one string.
with open("./data/the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

In [3]:
# Report how many characters were loaded.
print(f"The total number of characters: {len(raw_text)}")

The total number of characters: 20479


In [4]:
# Preview the first 99 characters as a sanity check on the load.
print(raw_text[:99])

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


## Tokenizing Text

In [6]:
import re

In [21]:
# Split on punctuation, double dashes and whitespace. The pattern is wrapped in
# a capturing group, so the delimiters themselves are kept in the result list.
result = re.split(r'([,.?_"\'()!]|--|\s)', raw_text)

In [22]:
# Drop empty strings and whitespace-only pieces produced by the split.
result = [item for item in result if item.strip()]

In [23]:
# Number of tokens that remain after filtering.
len(result)

4649

## Converting tokens to token IDs

In [152]:
# Unique tokens in sorted order; sorted() accepts the set directly and returns a list.
tokens = sorted(set(result))

In [153]:
# Append the special tokens only when they are missing, so that re-running this
# cell does not add duplicates (which would inflate vocab_size and shift the
# IDs assigned to the special tokens).
special_tokens = ["<|endoftext|>", "<|unk|>"]
tokens.extend([tok for tok in special_tokens if tok not in tokens])

In [154]:
# Vocabulary size, including the two special tokens appended above.
vocab_size = len(tokens)

In [155]:
# Show the vocabulary size.
print(vocab_size)

1161


In [156]:
# Lookup tables: integer ID -> token string, and the inverse token string -> ID.
# IDs are the positions of the tokens in the sorted `tokens` list.
ids_to_tokens = dict(enumerate(tokens))
token_to_ids = {token: idx for idx, token in ids_to_tokens.items()}

In [158]:
# ID assigned to the end-of-text special token.
token_to_ids["<|endoftext|>"]

1159

## Now let's put all this together inside a Python class

In [164]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed vocabulary.

    Text is split on punctuation, double dashes and whitespace, and every
    resulting token is mapped to its integer ID. Tokens that are not in the
    vocabulary are mapped to the ID of the "<|unk|>" entry.
    """

    def __init__(self, vocab):
        """Store the vocabulary and build the inverse lookup.

        Args:
            vocab: dict mapping token strings to integer IDs.
        """
        self.str_to_int = vocab
        self.int_to_str = {v: k for k, v in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token IDs.

        Args:
            text: the string to tokenize.

        Returns:
            List of integer token IDs, one per non-whitespace token.

        Raises:
            KeyError: if a token is unknown and the vocabulary has no
                "<|unk|>" entry to fall back on.
        """
        pieces = re.split(r'([,.?_!"()\']|--|\s)', text)
        # The split leaves empty strings and bare whitespace behind; drop them.
        pieces = [piece for piece in pieces if piece.strip()]
        return [
            self.str_to_int[piece] if piece in self.str_to_int else self.str_to_int["<|unk|>"]
            for piece in pieces
        ]

    def decode(self, token_ids):
        """Convert a sequence of token IDs back into a string.

        Tokens are joined with single spaces, then the whitespace in front of
        the punctuation characters , . ? ! " ( ) ' is removed.

        Args:
            token_ids: iterable of integer IDs present in the vocabulary.

        Returns:
            The reconstructed text.

        Raises:
            KeyError: if an ID is not present in the vocabulary.
        """
        joined = " ".join(self.int_to_str[token_id] for token_id in token_ids)
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)

In [165]:
# Build the tokenizer from the token -> ID vocabulary.
tokenizer = SimpleTokenizerV1(token_to_ids)

In [169]:
# Two unrelated sample sentences, separated by the end-of-text marker.
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = text1 + " <|endoftext|> " + text2

In [170]:
# Display the combined sample string.
text

'Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.'

## Byte pair encoding (BPE)

In [177]:
import tiktoken

In [178]:
# Load the GPT-2 encoding from tiktoken.
# NOTE: this rebinds `tokenizer`, replacing the SimpleTokenizerV1 instance
# created earlier; every later cell uses the tiktoken encoding.
tokenizer = tiktoken.get_encoding("gpt2")

In [179]:
# Encode the sample text; allowed_special permits the "<|endoftext|>" marker
# to appear in the input.
token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

In [180]:
# Display the resulting token IDs.
token_ids

[15496,
 11,
 466,
 345,
 588,
 8887,
 30,
 220,
 50256,
 554,
 262,
 4252,
 18250,
 8812,
 2114,
 286,
 262,
 20562,
 13]

In [175]:
# Round-trip check: decoding the IDs should give back the original text.
tokenizer.decode(token_ids)

'Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.'

In [182]:
# Encode made-up words to see how the tokenizer handles text it has no
# whole-word entry for: the result is several IDs rather than an unknown token.
tokenizer.encode("Akwirw ier")

[33901, 86, 343, 86, 220, 959]

## Data sampling with a sliding window

In [189]:
# Create the input/target pairs.
# input:  a sequence of token IDs
# target: the same sequence shifted one position to the right

In [210]:
# Encode the full text with the current `tokenizer`.
enc_text = tokenizer.encode(raw_text)

In [211]:
# Total number of token IDs in the encoded text.
print(len(enc_text))

5145


In [187]:
# Work with just the first 50 token IDs for the examples below.
enc_sample = enc_text[:50]

In [188]:
# Number of tokens in each input window.
context_size = 4

In [193]:
# x is the first context_size tokens; y is the same window shifted right by
# one, so y[k] is the token that follows x[k].
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

In [195]:
# Print both windows; the extra spaces after "y:" shift the second list to the right.
print(f"x: {x}")
print(f"y:     {y}")

x: [40, 367, 2885, 1464]
y:     [367, 2885, 1464, 1807]


In [202]:
# For illustration only: show how each growing context maps to the next token.
# This is not how the dataset below builds its samples.
for i in range(1, context_size+1):
    x = enc_sample[:i]  # context: the first i tokens
    y = enc_sample[i]   # target: the single token right after the context
    print(tokenizer.decode(x), "--->", tokenizer.decode([y]))

I --->  H
I H ---> AD
I HAD --->  always
I HAD always --->  thought


In [213]:
import torch
from torch.utils.data import Dataset

In [215]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is encoded once, then cut into windows of ``max_length`` tokens
    whose start positions are ``stride`` tokens apart. Each target window is
    its input window shifted right by one token.

    Only complete windows are kept: a start position is used only if both the
    input and the shifted target can be filled with ``max_length`` tokens.
    Every sample therefore has the same length, and a text with at most
    ``max_length`` tokens yields an empty dataset.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Encode ``txt`` and pre-compute all input/target windows.

        Args:
            txt: the raw text to sample from.
            tokenizer: object whose ``encode(txt)`` returns a sequence of
                integer token IDs.
            max_length: number of tokens per window; must be >= 1.
            stride: distance in tokens between consecutive window starts;
                must be >= 1.

        Raises:
            ValueError: if ``max_length`` or ``stride`` is smaller than 1.
        """
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        token_ids = tokenizer.encode(txt)

        self.max_length = max_length
        self.stride = stride

        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []

        # Stop max_length tokens before the end so that the target window
        # (which reaches one token further than the input) is always full.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1:i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [223]:
# Windows of 8 tokens, with each window starting one token after the previous one.
dataset = GPTDatasetV1(raw_text, tokenizer, max_length=8, stride=1)

In [224]:
# Iterator over the dataset's (input, target) pairs, starting at index 0.
data_iter = iter(dataset)

In [230]:
# Fetch the next (input, target) pair.
# NOTE: every run of this cell advances the iterator, so which pair ends up in
# `sample` depends on how many times the cell has been executed since
# `data_iter` was created.
sample = next(data_iter)

In [236]:
# Display the (input, target) pair; the target is the input shifted by one token.
sample

(tensor([ 1464,  1807,  3619,   402,   271, 10899,  2138,   257]),
 tensor([ 1807,  3619,   402,   271, 10899,  2138,   257,  7026]))

## Creating token embeddings

In [228]:
# Vocabulary size reported by the tokenizer.
tokenizer.n_vocab

50257

In [229]:
# Embedding table with one 256-dimensional vector per vocabulary entry.
embeddings = torch.nn.Embedding(
    num_embeddings=tokenizer.n_vocab,
    embedding_dim=256,
)

In [235]:
# Look up the embedding vectors for the sample's input IDs and check the shape
# of the result.
embeddings(sample[0]).shape

torch.Size([8, 256])

In [233]:
# The input token IDs of the sample (first element of the pair).
sample[0]

tensor([ 1464,  1807,  3619,   402,   271, 10899,  2138,   257])