## LOADING DATASET

In [9]:
# Read the entire text file into a single string.
with open("data/the-verdict.txt", mode='r', encoding='utf-8') as f:
    raw_text = f.read()

num_characters = len(raw_text)
print(f"num characters: {num_characters}")
# Last expression of the cell: preview the first 100 characters.
raw_text[:100]

num characters: 20479


'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g'

## TOKENIZING

In [10]:
import re

# Split the raw text on punctuation, double dashes and whitespace. The capture
# group makes re.split keep the separators as tokens of their own.
# A raw string is used (instead of an f-string with no placeholders) so that
# `\s` reaches the regex engine without an invalid-escape-sequence warning.
preprocessed = re.split(r"([,.:;?_!\"()']|--|\s)", raw_text)
# Drop the empty strings and whitespace-only pieces produced by the split.
preprocessed = [word for word in preprocessed if word.strip()]

# Vocabulary: sorted unique tokens, with the two special tokens appended last
# so they receive the highest ids.
all_words = sorted(set(preprocessed))
all_words.extend(['<|endoftext|>', '<|unk|>'])
print(all_words[:10])
print(f"Vocab size: {len(all_words)}")

# Lookup tables: token string -> integer id, and integer id -> token string.
stoi = {word: i for i, word in enumerate(all_words)}
itos = {i: word for i, word in enumerate(all_words)}



['!', '"', "'", '(', ')', ',', '--', '.', ':', ';']
Vocab size: 1132


In [11]:
class Tokenizer:
    """Word-level tokenizer backed by a fixed vocabulary.

    Args:
        vocab: iterable of token strings. A token's id is its position in
            the iteration order. The vocabulary must contain "<|unk|>" for
            texts with unknown tokens to be encodable.
    """

    def __init__(self, vocab):
        self.stoi = {word: i for i, word in enumerate(vocab)}
        self.itos = {i: word for i, word in enumerate(vocab)}

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their ids.

        The text is split on punctuation, underscores, double dashes and
        whitespace; `_` is included so the split matches the one used to
        build the vocabulary. Tokens missing from the vocabulary are mapped
        to "<|unk|>".

        Raises:
            KeyError: if an unknown token is met and "<|unk|>" is not in
                the vocabulary.
        """
        pieces = re.split(r"([,.:;?_!\"()']|--|\s)", text)
        # Discard empty / whitespace-only pieces left over by the split.
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        return [
            self.stoi[token if token in self.stoi else "<|unk|>"]
            for token in tokens
        ]

    def decode(self, ids):
        """Turn a sequence of ids back into a string.

        Tokens are joined with single spaces, then the whitespace in front
        of punctuation marks and double dashes is removed.

        Raises:
            KeyError: if an id is not in the vocabulary.
        """
        text = ' '.join([self.itos[idx] for idx in ids])
        text = re.sub(r'\s+([,.?;:\'"!()]|--)', r'\1', text)
        return text
    

# Round-trip two sample sentences through the word-level tokenizer.
tokenizer = Tokenizer(all_words)

text1 = "Hello, do you like football"
text2 = "In the sunlit terraces of the palace."
# Join the two samples with the end-of-text marker between them.
text = " <|endoftext|> ".join([text1, text2])

round_trip_ids = tokenizer.encode(text)
# Last expression of the cell: words outside the vocabulary decode as <|unk|>.
tokenizer.decode(round_trip_ids)



'<|unk|>, do you like <|unk|> <|endoftext|> In the sunlit terraces of the <|unk|>.'

## Byte-Pair Encoding (BPE)

In [43]:
import tiktoken
import torch

from torch.utils.data import Dataset, DataLoader

In [42]:
# Encode the full text with the 'gpt2' encoding from tiktoken.
tokenizer = tiktoken.get_encoding('gpt2')
enc_text = tokenizer.encode(raw_text)

# Slide a window of context_size + 1 tokens over the ids one position at a
# time: the first context_size tokens are the input, the same window shifted
# by one token is the target.
context_size = 8
inputs, outputs = [], []
for start in range(len(enc_text) - context_size):
    window = enc_text[start: start + context_size + 1]
    inputs.append(window[:-1])
    outputs.append(window[1:])

X = torch.tensor(inputs)
y = torch.tensor(outputs)

In [99]:
class GPTDataset(Dataset):
    """Dataset of (input, target) token-id windows cut from one text.

    The text is encoded once. A window of ``max_length`` ids is then taken
    every ``stride`` positions; the target of each window is the same window
    shifted one token to the right.

    Args:
        text: the text to encode.
        tokenizer: object whose ``encode(text, allowed_special=...)`` returns
            a sequence of integer token ids.
        max_length: number of tokens per window.
        stride: step between the starts of consecutive windows.

    Attributes:
        input_ids: long tensor holding one row per input window.
        target_ids: long tensor holding one row per target window.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        super().__init__()

        token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

        input_chunks, target_chunks = [], []
        # Stop max_length tokens before the end so every target window
        # (shifted by one) is still full length.
        for start in range(0, len(token_ids) - max_length, stride):
            input_chunks.append(token_ids[start: start + max_length])
            target_chunks.append(token_ids[start + 1: start + max_length + 1])

        # Pin the dtype: without it an empty chunk list (text not longer than
        # max_length tokens) would silently become a float32 tensor.
        self.input_ids = torch.tensor(input_chunks, dtype=torch.long)
        self.target_ids = torch.tensor(target_chunks, dtype=torch.long)

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` pair at ``index``."""
        return self.input_ids[index], self.target_ids[index]
    
def create_dataloader(text, batch_size=4, max_length=4, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader over GPTDataset windows of ``text``.

    The text is tokenized with the 'gpt2' encoding from tiktoken.

    Args:
        text: the text to tokenize and cut into windows.
        batch_size: number of windows per batch.
        max_length: number of tokens per window, passed to GPTDataset.
        stride: step between window starts, passed to GPTDataset.
        shuffle: forwarded to DataLoader.
        drop_last: forwarded to DataLoader.
        num_workers: forwarded to DataLoader.

    Returns:
        A DataLoader wrapping the GPTDataset built from ``text``.
    """
    bpe_tokenizer = tiktoken.get_encoding('gpt2')
    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    return DataLoader(
        GPTDataset(text, bpe_tokenizer, max_length, stride),
        **loader_options,
    )

# Load the text again so this cell does not rely on an earlier cell's raw_text.
with open("data/the-verdict.txt", mode='r', encoding='utf-8') as f:
    raw_text = f.read()

# One window per batch, in order, advancing a single token at a time.
dataloader = create_dataloader(
    raw_text,
    batch_size=1,
    max_length=4,
    stride=1,
    shuffle=False,
)

dataiter = iter(dataloader)
first_batch = next(dataiter)
# first_batch[0] holds the inputs and first_batch[1] the targets; decode the
# single row of each with the module-level `tokenizer`.
for batch_part in (first_batch[0], first_batch[1]):
    print(tokenizer.decode(batch_part.tolist()[0]))

I HAD always
 HAD always thought
