In [5]:
import tiktoken
import torch

In [9]:
# Load the short story used as the tokenizer's training corpus.
# The encoding is given explicitly so the character count does not depend on
# the platform's default locale encoding.
with open('../data/the-verdict.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()
print('total chars:', len(raw_text))
print(raw_text[:50])

total chars: 20479
I HAD always thought Jack Gisburn rather a cheap g


In [16]:
import re

# Split on commas, periods and whitespace. The capture group makes re.split
# keep the delimiters, so each punctuation mark becomes its own token;
# empty and whitespace-only pieces are filtered out afterwards.
text = 'Hello, world. This, is a test.'
result = [
    piece
    for piece in map(str.strip, re.split(r'([,.]|\s)', text))
    if piece
]
result

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']

In [22]:
# Same splitting idea with a wider punctuation set plus the double dash.
text = "Hello, world. Is this-- a test?"
result = [
    piece
    for piece in map(str.strip, re.split(r'([,.:;?_!()\']|--|\s)', text))
    if piece
]
result

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']

In [25]:
# Tokenise the full story: split on punctuation, double dashes and whitespace,
# then drop the empty / whitespace-only pieces.
preprocessed = [
    piece
    for piece in map(str.strip, re.split(r'([,.:;?"_!()\']|--|\s)', raw_text))
    if piece
]
print(preprocessed[:10])
print('total tokens', len(preprocessed))

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius']
total tokens 4690


In [26]:
# The vocabulary entries are the distinct tokens in sorted order.
unique_tokens = set(preprocessed)
all_words = sorted(unique_tokens)
vocab_size = len(all_words)
print('vocab size', vocab_size)

vocab size 1130


In [28]:
# Map every token to its position in the sorted token list.
vocab = dict(zip(all_words, range(len(all_words))))
# Preview the first 21 entries (ids 0 through 20).
for token, i in list(vocab.items())[:21]:
    print(i, '->', token)

0 -> !
1 -> "
2 -> '
3 -> (
4 -> )
5 -> ,
6 -> --
7 -> .
8 -> :
9 -> ;
10 -> ?
11 -> A
12 -> Ah
13 -> Among
14 -> And
15 -> Are
16 -> Arrt
17 -> As
18 -> At
19 -> Be
20 -> Begin


In [55]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token -> id vocabulary.

    Text is split on whitespace, on ``--`` and on the punctuation characters
    ``, . : ; ? " _ ! ( ) '``. Tokens missing from the vocabulary are not
    handled: ``encode`` raises ``KeyError`` for them.
    """

    # The capture group makes re.split keep the delimiters as tokens.
    _SPLIT_PATTERN = r'([,.:;?"_!()\']|--|\s)'
    # Whitespace in front of these characters is removed when decoding.
    # ':' and ';' are included because encode() splits them out as tokens too.
    _SPACE_BEFORE_PUNCT = r'\s+([,.:;?!"()\'])'

    def __init__(self, vocab):
        """Store ``vocab`` (token -> id) and build the inverse id -> token map."""
        self.str_to_i = vocab
        self.i_to_str = {i: s for s, i in vocab.items()}

    def encode(self, raw_text):
        """Return the list of token ids for ``raw_text``.

        Raises:
            KeyError: if a token of ``raw_text`` is not in the vocabulary.
        """
        pieces = (piece.strip() for piece in re.split(self._SPLIT_PATTERN, raw_text))
        # Empty / whitespace-only pieces carry no token and are skipped.
        return [self.str_to_i[piece] for piece in pieces if piece]

    def decode(self, ids):
        """Return the text for ``ids``: tokens joined by single spaces, with
        the space in front of punctuation removed.

        Raises:
            KeyError: if an id is not in the vocabulary.
        """
        tokens = [self.i_to_str[token_id] for token_id in ids]
        decoded_text = " ".join(tokens)
        return re.sub(self._SPACE_BEFORE_PUNCT, r'\1', decoded_text)

In [56]:
# Round-trip one sentence from the story through the V1 tokenizer.
sm_tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know," Mrs. Gisburn said with pardonable pride."""
ids = sm_tokenizer.encode(text)
print('ids', ids[:5])
# Decoding joins tokens with spaces, so the result only approximates the input.
decoded_text = sm_tokenizer.decode(ids)
print('decoded text', decoded_text)

ids [1, 56, 2, 850, 988]
decoded text " It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [58]:
# 'Hello' does not occur in the story, so the V1 tokenizer cannot encode it.
# The KeyError is the point of this cell: catch and display it so that
# Restart & Run All continues past this demonstration.
text = "Hello, do you like tea. Is this-- a test?"
sm_tokenizer = SimpleTokenizerV1(vocab)
try:
    sm_tokenizer.encode(text)
except KeyError as err:
    print(f'KeyError: {err}')

KeyError: 'Hello'

In [74]:
# Re-tokenise the story and rebuild the vocabulary, this time appending the
# two special tokens after the sorted story tokens.
preprocessed = [
    piece
    for piece in map(str.strip, re.split(r'([,.:;?"_!()\']|--|\s)', raw_text))
    if piece
]

all_tokens = sorted(set(preprocessed)) + ['<|endoftext|>', '<|unk|>']
vocab = dict(zip(all_tokens, range(len(all_tokens))))
print(len(vocab))

1132


In [75]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Text is split on whitespace, on ``--`` and on the punctuation characters
    ``, . : ; ? " _ ! ( ) '``. The vocabulary is expected to contain an
    ``'<|unk|>'`` entry; without one, encoding an unknown token raises
    ``KeyError``.
    """

    # The capture group makes re.split keep the delimiters as tokens.
    _SPLIT_PATTERN = r'([,.:;?"_!()\']|--|\s)'
    # Whitespace in front of these characters is removed when decoding.
    # ':' and ';' are included because encode() splits them out as tokens too.
    _SPACE_BEFORE_PUNCT = r'\s+([,.:;?!"()\'])'
    _UNK_TOKEN = '<|unk|>'

    def __init__(self, vocab):
        """Store ``vocab`` (token -> id) and build the inverse id -> token map."""
        self.str_to_i = vocab
        self.i_to_str = {i: s for s, i in vocab.items()}

    def encode(self, raw_text):
        """Return the list of token ids for ``raw_text``.

        Tokens that are not in the vocabulary are replaced by ``<|unk|>``
        before the id lookup.
        """
        pieces = (piece.strip() for piece in re.split(self._SPLIT_PATTERN, raw_text))
        # Empty / whitespace-only pieces carry no token and are skipped.
        known = (
            piece if piece in self.str_to_i else self._UNK_TOKEN
            for piece in pieces
            if piece
        )
        return [self.str_to_i[piece] for piece in known]

    def decode(self, ids):
        """Return the text for ``ids``: tokens joined by single spaces, with
        the space in front of punctuation removed.

        Raises:
            KeyError: if an id is not in the vocabulary.
        """
        tokens = [self.i_to_str[token_id] for token_id in ids]
        decoded_text = " ".join(tokens)
        return re.sub(self._SPACE_BEFORE_PUNCT, r'\1', decoded_text)

In [76]:
# With the V2 tokenizer, words missing from the vocabulary encode to the
# <|unk|> id instead of raising.
sm_tokenizer = SimpleTokenizerV2(vocab)
text = "Hello, do you like tea. Is this-- a test?"
ids = sm_tokenizer.encode(text)
print('ids', ids)
print('decoded texts', sm_tokenizer.decode(ids))

ids [1131, 5, 355, 1126, 628, 975, 7, 1131, 999, 6, 115, 1131, 10]
decoded texts <|unk|>, do you like tea. <|unk|> this -- a <|unk|>?


In [79]:
# Two independent texts separated by the <|endoftext|> special token.
sm_tokenizer = SimpleTokenizerV2(vocab)
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = f"{text1} <|endoftext|> {text2}"
print('text', text)
ids = sm_tokenizer.encode(text)
print('ids', ids)
print('decode text', sm_tokenizer.decode(ids))

text Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.
ids [1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]
decode text <|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


In [85]:
# Encode the same kind of input with tiktoken's GPT-2 encoding.
# <|endoftext|> has to be allowed explicitly via allowed_special.
gpt2_tokenizer = tiktoken.get_encoding('gpt2')
text = "Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace."
ids = gpt2_tokenizer.encode(text, allowed_special={'<|endoftext|>'})
print('ids', ids)
print('decode text', gpt2_tokenizer.decode(ids))

ids [15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 617, 34680, 27271, 13]
decode text Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.
