In [1]:
import re

In [2]:
with open("./data/the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [3]:
print("Total number of characters:", len(raw_text))

Total number of characters: 20479


In [4]:
print(raw_text[:99])

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [5]:
preprocessed = re.split(r'([,.?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
all_tokens = sorted(set(preprocessed))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

In [6]:
vocab_size = len(all_tokens)

In [7]:
vocab_size

1161

In [8]:
vocab = {token: idx for idx, token in enumerate(all_tokens)}

In [9]:
len(vocab)

1161

In [10]:
class SimpleTokenizer:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {v:k for k, v in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] if s in self.str_to_int else self.str_to_int['<|unk|>'] for s in preprocessed]
        return ids

    def decode(self, ids):
        text = " ".join([simple_tokenizer.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.?_!"()\'])', r'\1', text)
        return text

In [11]:
simple_tokenizer = SimpleTokenizer(vocab)

In [12]:
text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""

In [13]:
ids = simple_tokenizer.encode(text)

In [14]:
simple_tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [15]:
simple_tokenizer.encode("Hello, do you like tea. Is this-- a test?")

[1160, 5, 362, 1155, 642, 1000, 7, 1160, 1024, 6, 119, 1160, 10]

In [16]:
text1 = "Hello, world. My first program!!"
text2 = "Please provide feedback and help me grow."

In [17]:
token_ids = simple_tokenizer.encode("<|endoftext|>".join([text1, text2]))

In [18]:
simple_tokenizer.decode(token_ids)

'<|unk|>, <|unk|>. My first <|unk|>!! <|unk|> <|unk|> <|unk|> and <|unk|> me <|unk|>.'

# BPE

In [20]:
import tiktoken
import importlib

In [24]:
print(f"tiktoken version: {importlib.metadata.version('tiktoken')}")

Tiktoken version: 0.7.0


In [25]:
tokenizer = tiktoken.get_encoding("gpt2")

In [34]:
text = "Hello, do you like tea? <|endoftext|> In the sunlit terraces" "of someunknownPlace."

In [36]:
enc_text = tokenizer.encode(raw_text)

In [37]:
len(enc_text)

5145

In [40]:
enc_sample = enc_text[50:]

In [41]:
context_size = 5

In [42]:
enc_sample[:context_size]

[290, 4920, 2241, 287, 257]

In [44]:
enc_sample[1:context_size+1]

[4920, 2241, 287, 257, 4489]

In [48]:
for i in range(1, context_size + 1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(tokenizer.decode(context), "--->", tokenizer.decode([desired]))

 and --->  established
 and established --->  himself
 and established himself --->  in
 and established himself in --->  a
 and established himself in a --->  vill


### Building a dataloader in torch