In [31]:
import re
import tiktoken
import torch
from torch.utils.data import DataLoader,Dataset

In [4]:
# Load the raw training text. The encoding is given explicitly so the result
# does not depend on the platform's default locale encoding, and so this cell
# reads the file the same way as the later cells that open it.
with open('The_Verdict.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Number of characters read (displayed as the cell output).
len(text)

20479

In [5]:
# Split the text on punctuation, double dashes and whitespace. The capture
# group makes re.split keep the delimiters, so punctuation becomes its own
# token; pieces that are empty after stripping are discarded.
# NOTE(review): ':' and ';' are not in the character class, so they stay
# attached to the preceding word (e.g. 'Carlo;' in the vocabulary output).
# Changing that requires updating the tokenizer classes' pattern as well.
preprocessed = [
    piece
    for piece in map(str.strip, re.split(r'([,.?_!"()\']|--|\s)', text))
    if piece
]
len(preprocessed)

4649

In [6]:
# Unique tokens in sorted order; a token's position becomes its integer id.
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

# token -> id mapping, followed by a dump of every (token, id) pair.
vocab = dict(zip(all_words, range(vocab_size)))
for item in vocab.items():
    print(item)


1159
('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Carlo;', 25)
('Chicago', 26)
('Claude', 27)
('Come', 28)
('Croft', 29)
('Destroyed', 30)
('Devonshire', 31)
('Don', 32)
('Dubarry', 33)
('Emperors', 34)
('Florence', 35)
('For', 36)
('Gallery', 37)
('Gideon', 38)
('Gisburn', 39)
('Gisburns', 40)
('Grafton', 41)
('Greek', 42)
('Grindle', 43)
('Grindle:', 44)
('Grindles', 45)
('HAD', 46)
('Had', 47)
('Hang', 48)
('Has', 49)
('He', 50)
('Her', 51)
('Hermia', 52)
('His', 53)
('How', 54)
('I', 55)
('If', 56)
('In', 57)
('It', 58)
('Jack', 59)
('Jove', 60)
('Just', 61)
('Lord', 62)
('Made', 63)
('Miss', 64)
('Money', 65)
('Monte', 66)
('Moon-dancers', 67)
('Mr', 68)
('Mrs', 69)
('My', 70)
('Never', 71)
('No', 72)
('Now', 73)
('Nutley', 74)
('Of', 75)
('Oh', 76)
('

In [7]:
class TokenizerV1:
    """Word-level tokenizer backed by a fixed token -> id vocabulary.

    Unknown tokens are not handled: encoding text that contains a token
    missing from the vocabulary raises ``KeyError``.
    """

    def __init__(self, vocab):
        """Store ``vocab`` (token -> id) and build the reverse id -> token map."""
        self.vocab = vocab
        self.inv_vocab = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self, text):
        """Split ``text`` into tokens and return their ids as a list.

        Punctuation, ``--`` and whitespace act as split points; punctuation is
        kept as its own token while whitespace-only pieces are dropped.
        """
        ids = []
        for piece in re.split(r'([,.?_!"()\']|--|\s)', text):
            piece = piece.strip()
            if piece:
                ids.append(self.vocab[piece])
        return ids

    def decode(self, ids):
        """Turn a sequence of ids back into a single string.

        Tokens are joined with spaces, then the space in front of the listed
        punctuation characters is removed.
        """
        joined = " ".join(self.inv_vocab[i] for i in ids)
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)

In [8]:
# Round-trip the whole text through the vocabulary-based tokenizer.
tokenizer = TokenizerV1(vocab)
encoded = tokenizer.encode(text)
decoded = tokenizer.decode(encoded)

# Print the reconstruction followed by the original text for comparison.
print(decoded, text, sep="\n")

I HAD always thought Jack Gisburn rather a cheap genius -- though a good fellow enough -- so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera.( Though I rather thought it would have been Rome or Florence.)" The height of his glory" -- that was what the women called it. I can hear Mrs. Gideon Thwing -- his last Chicago sitter -- deploring his unaccountable abdication." Of course it' s going to send the value of my picture' way up; but I don' t think of that, Mr. Rickham -- the loss to Arrt is all I think of." The word, on Mrs. Thwing' s lips, multiplied its _ rs _ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn' s" Moon-dancers" to say, with tears in her eyes:" We shall not look upon its like again"? Well! 

In [9]:
# Sorted unique tokens, with the two special tokens appended at the end so
# they receive the two highest ids.
all_words = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]
vocab_size = len(all_words)
print(vocab_size)

# token -> id mapping, followed by a dump of every (token, id) pair.
vocab = dict(zip(all_words, range(vocab_size)))
for item in vocab.items():
    print(item)

class TokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to "<|unk|>".

    Args:
        vocab: mapping from token string to integer id. It must contain an
            "<|unk|>" entry if text with unknown tokens is going to be encoded.
    """

    def __init__(self, vocab):
        self.vocab = vocab
        # Reverse lookup (id -> token) used by decode().
        self.inv_vocab = {v:k for k,v in vocab.items()}

    def encode(self, text):
        """Tokenize ``text`` and return the list of integer token ids.

        The text is split on punctuation, ``--`` and whitespace (punctuation
        is kept as its own token, whitespace-only pieces are dropped). Tokens
        that are not in the vocabulary are replaced by "<|unk|>" before the
        id lookup.

        Raises:
            KeyError: if an unknown token is encountered and the vocabulary
                has no "<|unk|>" entry.
        """
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
        preprocessed = [i.strip() for i in preprocessed if i.strip()]
        preprocessed = [item if item in self.vocab
                        else "<|unk|>" for item in preprocessed]
        # Convert tokens to ids so the result can be fed straight to decode().
        ids = [self.vocab[token] for token in preprocessed]

        return ids

    def decode(self,ids):
        """Convert a sequence of token ids back into a string.

        Tokens are joined with single spaces, then whitespace directly before
        the listed punctuation characters is removed.
        """
        text = " ".join([self.inv_vocab[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

1161
('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Carlo;', 25)
('Chicago', 26)
('Claude', 27)
('Come', 28)
('Croft', 29)
('Destroyed', 30)
('Devonshire', 31)
('Don', 32)
('Dubarry', 33)
('Emperors', 34)
('Florence', 35)
('For', 36)
('Gallery', 37)
('Gideon', 38)
('Gisburn', 39)
('Gisburns', 40)
('Grafton', 41)
('Greek', 42)
('Grindle', 43)
('Grindle:', 44)
('Grindles', 45)
('HAD', 46)
('Had', 47)
('Hang', 48)
('Has', 49)
('He', 50)
('Her', 51)
('Hermia', 52)
('His', 53)
('How', 54)
('I', 55)
('If', 56)
('In', 57)
('It', 58)
('Jack', 59)
('Jove', 60)
('Just', 61)
('Lord', 62)
('Made', 63)
('Miss', 64)
('Money', 65)
('Monte', 66)
('Moon-dancers', 67)
('Mr', 68)
('Mrs', 69)
('My', 70)
('Never', 71)
('No', 72)
('Now', 73)
('Nutley', 74)
('Of', 75)
('Oh', 76)
('

In [14]:
# Switch to tiktoken's 'gpt2' encoding.
# NOTE: this rebinds `tokenizer`, which an earlier cell set to a TokenizerV1
# instance; cells that expect the vocabulary-based tokenizer must be re-run
# before this one.
tokenizer = tiktoken.get_encoding('gpt2')

In [15]:
# Sample string containing the "<|endoftext|>" marker and a made-up word.
# NOTE: this rebinds `text`, which an earlier cell set to the contents of
# The_Verdict.txt.
text = "Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace."
# allowed_special explicitly permits "<|endoftext|>" for this encode() call.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 617, 34680, 27271, 13]


In [24]:
# Decode token ids 0..466 in a single call and print the resulting string.
# NOTE(review): 467 is an unexplained cutoff; confirm why this value was chosen.
strings = tokenizer.decode(list(range(467)))
print(strings)

!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~���������������������������������������������������������������������������������������������� 	
 ���������������������������������� t aheinreon theer sat w oen citisanores bed fing pou analar to m of in d h andicasle thionomllent n lst reve eroly be g Tct Sidot Iutet A is onimamowayadse that Cig forac yverur uld st M's he itationithirce youil B whol P with 1terch as we (ndill Dif 2agerske " Hem con W Rher was rod Fulate atrippore The seus pro haum are deainand orighestistabrom Nth com Gunop00 L notess ex vres Eewityant byelosortocqu from have suiveould sh thisntrapeightartment alustend--all Oack ch leiesredard�out J abearivallyourostghpt plast canakomeudThe his do


In [27]:
# Read the story as UTF-8 text.
with open("The_Verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
# Encode the whole text with the current `tokenizer` and report the token count.
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [29]:
# Skip the first 50 token ids, then take the next `context_size` ids as a
# sample input window.
enc_sample = enc_text[50:]
context_size = 4
x = enc_sample[:context_size]
# Bare expression so the notebook displays the window.
x

[290, 4920, 2241, 287]

In [33]:
class GPTDataset(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is encoded once with ``tokenizer.encode``. A window of
    ``max_length`` ids is then moved over the id sequence in steps of
    ``stride``; each target is the corresponding input shifted one position
    to the right.

    Args:
        txt: text to encode.
        tokenizer: object exposing ``encode(txt)`` that returns a sliceable
            sequence of integer ids.
        max_length: number of ids in every input/target window.
        stride: distance between the starts of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        super().__init__()
        self.tokenizer = tokenizer
        token_ids = tokenizer.encode(txt)

        # Stop early enough that the shifted target window stays inside the
        # encoded sequence.
        window_starts = range(0, len(token_ids) - max_length, stride)
        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Number of windows extracted from the text."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``index``."""
        return self.input_ids[index], self.target_ids[index]

In [34]:
def create_dataloader(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True):
    """Build a DataLoader over sliding-window samples of ``txt``.

    The text is encoded with tiktoken's 'gpt2' encoding and wrapped in a
    ``GPTDataset`` using ``max_length`` and ``stride``; ``batch_size``,
    ``shuffle`` and ``drop_last`` are forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    return DataLoader(
        GPTDataset(txt, tiktoken.get_encoding('gpt2'), max_length, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
    )

In [38]:
# Re-read the story so this cell does not depend on `raw_text` from earlier cells.
with open("The_Verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# batch_size=1, max_length=4 and stride=1 give one 4-token window per batch,
# with consecutive windows starting one token apart; shuffle=False keeps
# them in text order.
dataloader = create_dataloader(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False, drop_last=False)

data_iter = iter(dataloader) # explicit iterator so single batches can be pulled with next()
first_batch = next(data_iter)
print(first_batch)


[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]
