In [60]:
with open("verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [61]:
import re

preprocessed = re.split(r'([,.?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(len(preprocessed))

4649


In [62]:
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [63]:
#Converting tokens into token IDs
all_words = sorted(list(set(preprocessed)))
vocab_size = len(all_words)
print(vocab_size)


1159


In [64]:
vocab = {token:integer for integer, token in enumerate(all_words)}
# for i, item in enumerate(vocab.items()):
#     print(item)
#     if i > 50:
#         break

In [65]:
class SimpleTokeniserV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
    
    def encode(self, text):
        # Split text on special characters and whitespace
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'([,.?_!"()\']|--|\s)', r'\1',text)
        return text
        
# If text is: "Hello, world!"
# After re.split():
# preprocessed = ['Hello', ',', '', 'world', '!']

# After the cleaning loop:
# preprocessed = ['Hello', ',', 'world', '!']

In [66]:
tokeniser = SimpleTokeniserV1(vocab)
text = """It's the last he painted"""
ids = tokeniser.encode(text)
print(ids)

[58, 2, 872, 1013, 615, 541, 763]


In [67]:
print(tokeniser.decode(ids))

It ' s the last he painted


In [68]:
text = "Hello, do you like tea?"
x = tokeniser.encode(text)
print(x)

KeyError: 'Hello'

In [69]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|unk|>", "<|endoftext|>"])
vocab = {token:integer for integer, token in enumerate(all_tokens)}
print(len(vocab.items()))

1161


In [70]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1156)
('your', 1157)
('yourself', 1158)
('<|unk|>', 1159)
('<|endoftext|>', 1160)


In [78]:
class SimpleTokeniserV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
        
    def encode(self, text):
        def handle_token(token):
            if not token:
                return None
            token = token.strip()
            return token if token in self.str_to_int else "<|unk|>"
        
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
        preprocessed = [tk for tk in (handle_token(item) for item in preprocessed) if tk]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text) 
        return text

In [79]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [80]:
tokeniser = SimpleTokeniserV2(vocab)
print(tokeniser.encode(text))

[1159, 5, 1159, 362, 1159, 1155, 1159, 642, 1159, 1000, 10, 1159, 1160, 1159, 57, 1159, 1013, 1159, 981, 1159, 1009, 1159, 738, 1159, 1013, 1159, 1159, 7]
