In [3]:
# Read the whole story into a single string; the trailing expression
# displays its length in characters.
with open('the-verdict.txt', encoding='utf-8') as story_file:
    raw_text = story_file.read()
len(raw_text)

20479

In [16]:
import re
text = "Hello, world. This, is a test."
# The capture group keeps every delimiter (punctuation, "--", whitespace)
# in the list returned by the split.
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
print(result)
# Discard the empty strings and whitespace-only pieces left by the split.
result = list(filter(str.strip, result))
print(result)


['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']
['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [25]:
# Split the story on punctuation, "--" and whitespace, keeping only the
# pieces that are not blank.
preprocessed = [
    piece
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
    if piece.strip()
]
print(len(preprocessed))

4690


In [36]:
# Every distinct token, in sorted order, receives its position as its id.
unique_words = sorted(set(preprocessed))
vocab_size = len(unique_words)
vocab = dict(zip(unique_words, range(vocab_size)))
# Inverse mapping (id -> token), used to turn ids back into text.
reverse_vocab = dict(enumerate(unique_words))

In [37]:
# Map every token of the preprocessed text to its vocabulary id.
token_idxs = [vocab[token] for token in preprocessed]

In [46]:
idxs = [1, 58, 2, 872, 1013, 615, 541, 763, 5, 608, 5, 1, 69, 7, 39, 873, 773, 812, 7]
# Look each id up in the inverse vocabulary and concatenate the tokens
# without any separator.
s = "".join(reverse_vocab[token_id] for token_id in idxs)
s

'"Jove\'sextingeledhermitpeople,lay,"Never.Gisburnsshadeplacequietly.'

In [51]:
class SimpleTokenizer:
    """Word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, ``--`` and a set of punctuation marks; every
    non-blank piece is looked up in the vocabulary.

    Args:
        vocab: Mapping from token string to integer id.
        unk_token: Token substituted for pieces that are missing from the
            vocabulary. The fallback is active only when ``unk_token`` is
            itself a key of ``vocab``; otherwise an unknown piece raises
            ``KeyError``, exactly like a plain dictionary lookup.
    """

    # Delimiters are captured so that punctuation survives the split as tokens.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Whitespace that decode() placed in front of a punctuation mark.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.?!"()\'])')

    def __init__(self, vocab, unk_token="<|unk|>"):
        self.token_to_ids = vocab
        self.ids_to_token = {idx: token for token, idx in vocab.items()}
        self.unk_token = unk_token

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Raises:
            KeyError: If a piece is not in the vocabulary and the vocabulary
                has no ``unk_token`` entry to fall back on.
        """
        pieces = self._SPLIT_PATTERN.split(text)
        tokens = [piece for piece in pieces if piece.strip()]

        if self.unk_token in self.token_to_ids:
            unk_id = self.token_to_ids[self.unk_token]
            return [self.token_to_ids.get(token, unk_id) for token in tokens]

        # No unknown-token entry available: keep the strict lookup.
        return [self.token_to_ids[token] for token in tokens]

    def decode(self, token_ids):
        """Convert token ids back into text.

        Tokens are joined with single spaces, then the whitespace directly in
        front of ``, . ? ! " ( ) '`` is removed. The result is therefore only
        an approximation of the original spacing (for example, a space
        remains after an apostrophe or an opening quote).

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        text = " ".join(self.ids_to_token[idx] for idx in token_ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [52]:
# Build a SimpleTokenizer from `vocab`, encode a sample sentence and print
# the resulting token ids.
tokenizer = SimpleTokenizer(vocab)

text = """"It's the last he painted, you know," Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [53]:
# Turn the token ids back into text.
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [56]:
tokenizer = SimpleTokenizer(vocab)
text = "Hello, do you like tea. Is this-- a test?"

# 'Hello' is not in the vocabulary, so the tokenizer cannot map it to an id.
# The KeyError is the point of this cell; it is caught and printed so that
# the notebook still runs from top to bottom.
try:
    encoded = tokenizer.encode(text)
except KeyError as err:
    print(f"KeyError: {err}")
else:
    print(encoded)

KeyError: 'Hello'

## Byte pair encoding (BPE)


In [60]:
import tiktoken
# Rebind `tokenizer` to tiktoken's 'gpt2' encoding.
tokenizer = tiktoken.get_encoding('gpt2')

In [61]:
# Encode a sample that contains the marker <|endoftext|>; the marker is
# listed in `allowed_special` for this call. Print the resulting ids.
text = "Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace."
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 617, 34680, 27271, 13]


In [62]:
# Decode the ids back into a string.
tokenizer.decode(integers)

'Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.'

In [65]:
import tiktoken
# Encode the whole story with the 'gpt2' encoding and show how many token
# ids it produces.
tokenizer = tiktoken.get_encoding("gpt2")
enc_text = tokenizer.encode(raw_text)
len(enc_text)

5145

In [66]:
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDataset(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once. Each sample pairs a window of ``max_length``
    token ids with the same window shifted one position to the right.

    Args:
        txt: Text to tokenize.
        tokenizer: Object whose ``encode(text, allowed_special=...)`` returns
            a sequence of integer token ids.
        max_length: Number of token ids per window; must be positive.
        stride: Distance between the starts of consecutive windows; must be
            positive.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Reject values that would otherwise yield empty windows or an
        # empty dataset without any error.
        if max_length <= 0:
            raise ValueError(f"max_length must be positive, got {max_length!r}")
        if stride <= 0:
            raise ValueError(f"stride must be positive, got {stride!r}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stop early enough that the shifted target window is still full.
        for start in range(0, len(token_ids) - max_length, stride):
            stop = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:stop]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:stop + 1]))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]
    
def create_dataloader(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from text.

    Args:
        txt: Text to tokenize with tiktoken's "gpt2" encoding.
        batch_size: Number of windows per batch.
        max_length: Number of token ids per window.
        stride: Distance between the starts of consecutive windows.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` over a ``GPTDataset`` built from ``txt``.
    """
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset (the class defined in this notebook is GPTDataset)
    dataset = GPTDataset(txt, tokenizer, max_length, stride)

    # Create dataloader, honouring the caller's num_workers
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [None]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()