#Natural Language Processing


In [18]:
#libraries
import requests
from bs4 import BeautifulSoup
import re
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader

##Data Preprocessing

In [28]:
from warnings import simplefilter
#get some text
# A timeout keeps the cell from hanging forever if the server never answers,
# and raise_for_status() stops an HTTP error page from being tokenized as if
# it were the story text.
html = requests.get('https://en.wikisource.org/wiki/The_Verdict', timeout=30)
html.raise_for_status()
bs = BeautifulSoup(html.text, 'html.parser')
paragraphs = bs.find_all('p')
text = ' '.join(p.text for p in paragraphs)
# print(text)


#generate word tokens
# The capture group makes re.split keep the delimiters (punctuation and "--")
# as tokens of their own; empty and whitespace-only pieces are dropped below.
tokens = re.split(r'([,.:;?_!"()\']|--|\s)', text)
tokens = [token.strip() for token in tokens if token.strip()]
# print(tokens)


#convert token into token IDs (integers)

#convert the tokens into a set of sorted unique tokens
all_words = sorted(set(tokens))
#add 2 special tokens: one signaling the end of a text and one standing in for unknown words
all_words.extend(["<|endoftext|>", "<|unk|>"])

#create a dictionary mapping tokens (strings) to token IDs (integers)
vocab_size = len(all_words)
vocab = {token: token_id for token_id, token in enumerate(all_words)}


class SimpleTokenizer:
    """
    Word-level tokenizer that converts a string into token IDs and back.

    `vocab` is a dict mapping token strings to integer IDs. Tokens missing
    from the vocabulary are encoded with the ID stored under '<|unk|>'.
    """

    def __init__(self, vocab):
        # keep the forward map as given and build the reverse map once
        self.str_to_int = vocab
        self.int_to_str = {token_id: token for token, token_id in vocab.items()}

    def encode(self, text):
        """Split `text` into word/punctuation tokens and return their IDs."""
        lookup = self.str_to_int
        ids = []

        # the capture group keeps punctuation and "--" as separate pieces
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            piece = piece.strip()

            # skip the empty / whitespace-only pieces produced by the split
            if not piece:
                continue

            # fall back to the unknown token for out-of-vocabulary pieces
            key = piece if piece in lookup else '<|unk|>'
            ids.append(lookup[key])

        return ids

    def decode(self, ids):
        """Turn a sequence of token IDs back into a single string."""
        words = [self.int_to_str[token_id] for token_id in ids]
        joined = ' '.join(words)

        # drop the whitespace that the join put in front of punctuation
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)



#examples
# text = """"It's the last he painted, you know," Mrs. Gisburn said with pardonable pride."""


# text1 = "Hello, do you like tea?"
# text2 = "In the sunlit terraces of the palace."
# text = " <|endoftext|> ".join((text1, text2))

# tokenizer = SimpleTokenizer(vocab)
# print(tokenizer.encode(text))
# print(tokenizer.decode(tokenizer.encode(text)))


#byte pair tokenization
# tokenizer = tiktoken.get_encoding('gpt2')
# text = ("Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.")
# print(tokenizer.encode(text, allowed_special={'<|endoftext|>'}))
# print(tokenizer.decode(tokenizer.encode(text, allowed_special={'<|endoftext|>'})))


#create dataset
class GPTDataset(Dataset):
    """
    Dataset of (input, target) token-ID windows cut from one text.

    The text is encoded once with `tokenizer.encode(text)`; a window of
    `max_lenght` IDs is then slid over the result in steps of `stride`.
    Each target window is its input window shifted one position to the
    right, so every position's target is the next token.

    Args:
        text: raw text to encode.
        tokenizer: any object exposing `encode(text)` that returns a
            sliceable sequence of integer token IDs.
        max_lenght: number of token IDs per window (spelling kept for
            backward compatibility with existing callers).
        stride: distance between the starts of consecutive windows.
    """

    def __init__(self, text, tokenizer, max_lenght, stride):
        self.input_ids = []
        self.target_ids = []

        #tokenize the raw text
        token_ids = tokenizer.encode(text)

        #create sliding windows
        # The bound is taken from token_ids (the sequence actually sliced),
        # not from any module-level token list. Stopping at
        # len(token_ids) - max_lenght guarantees the shifted target window
        # is always full length.
        for start in range(0, len(token_ids) - max_lenght, stride):
            stop = start + max_lenght
            input_chunk = token_ids[start:stop]
            #during pre training the gpt predicts all tokens shifted by one at the same time
            target_chunk = token_ids[start + 1:stop + 1]

            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the (input, target) tensor pair at `index`."""
        return self.input_ids[index], self.target_ids[index]


#create dataloaders
def create_dataloader(text, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """
    Build a DataLoader of (input, target) token-ID windows for `text`.

    The text is encoded with tiktoken's 'gpt2' encoding and wrapped in a
    GPTDataset using `max_length` as the window size and `stride` as the
    step between windows. The remaining arguments are forwarded unchanged
    to torch's DataLoader.
    """
    windows = GPTDataset(text, tiktoken.get_encoding('gpt2'), max_length, stride)

    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )



# one window of 4 token IDs per batch, windows taken in order (no shuffling)
# with each window starting one token after the previous one
dataloader = create_dataloader(
    text,
    batch_size=1,
    max_length=4,
    stride=1,
    shuffle=False,
)

# for batch_index , batch_tuple in enumerate(dataloader):
#     print(batch_tuple)
#     break


[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]
