Dataset used: 'The verdict' by Edith Warton(1908)

Step 1: Creating Tokens

In [1]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [2]:
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [3]:
import re

preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]

print(preprocessed[:15])
print(len(preprocessed))

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow']
4690


Step 2: Creating Token IDs

In [4]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
vocab = {token:integer for integer,token in enumerate(all_words)}
len(vocab)

1130

Implementing a python class for tokenization.
This class will have two methods, encode and decode.
Step 1: Store the vocabulary as class attribute for access in the encode and decode method
Step 2: Create an inverse vocabulary that maps token IDs back to the original text tokens
Step 3: process input text into token IDs
Step 4: Convert token IDs back into text

In [5]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.toInt = vocab
        self.toStr = {t:s for s, t in vocab.items()}

    def encode(self, text):
            preprocessed = re.split(r'[,.:;?_!"()\']|--|\s', text)
            preprocessed = [item.strip() for item in preprocessed if item.strip()]
            ids = [self.toInt[s] for s in preprocessed]
            return ids

    def decode(self, ids):
        text = " ".join([self.toStr[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [6]:
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know,"
           Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[56, 850, 988, 602, 533, 746, 1126, 596, 67, 38, 851, 1108, 754, 793]


In [7]:
tokenizer.decode(ids)

'It s the last he painted you know Mrs Gisburn said with pardonable pride'

In [8]:
temp = "Hello, do you like tea?"
#print(tokenizer.encode(temp))

In [9]:
all_tokens = sorted(set(preprocessed))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])
vocabV2 = {token:integer for integer,token in enumerate(all_tokens)}
len(vocabV2)

1132

In [10]:
for item in list(vocabV2.items())[-5:]:
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [11]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.toInt = vocab
        self.toStr = {integer:token for token, integer in vocab.items()}

    def encode(self, vocab):
        preprocessed = re.split(r'([,.:;?_!()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [
            item if item in self.toInt
            else "<|unk|>" for item in preprocessed            
        ]
        ids = [self.toInt[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        text = " ".join([self.toStr[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text


In [12]:
tokenizerV2 = SimpleTokenizerV2(vocabV2)

In [13]:
temp1 = "Hello, do you like tea?"
temp2 = "In the sunlit terraces of the palace."
temp = " <|endoftext|> ".join((temp1, temp2))
ids = tokenizerV2.encode(temp)
print(ids)

[1131, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [14]:
tokenizerV2.decode(ids)

'<|unk|>\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'