# Sample code for a PyTorch DataLoader in NLP

In [5]:
# Imports
import re
import torch
from torch.utils.data import DataLoader, Dataset

### Data Pre-Processor

In [32]:
# Sample text used to train a GPT-like model.
# It is the same text that was used in the tokenizer notebook.
# Each sentence sits on its own line of the literal below.

sample_text = '''Lexical tokenization is conversion of a text into meaningful lexical tokens belonging to categories defined by a "lexer" program. 
In case of a natural language, those categories include nouns, verbs, adjectives, punctuations etc. 
In case of a programming language, the categories include identifiers, operators, grouping symbols and data types. 
Lexical tokenization is related to the type of tokenization used in large language models but with two differences. 
First, lexical tokenization is usually based on a lexical grammar, whereas LLM tokenizers are usually probability-based. 
Second, LLM tokenizers perform a second step that converts the tokens into numerical values'''

In [33]:
# TODO: write the preprocessing steps for your own data.

# This sample needs no cleaning: a string is only split into one entry per
# line, and anything that is not a string is passed through untouched.
if isinstance(sample_text, str):
    processed_text = sample_text.split('\n')
else:
    processed_text = sample_text
processed_text

['Lexical tokenization is conversion of a text into meaningful lexical tokens belonging to categories defined by a "lexer" program. ',
 'In case of a natural language, those categories include nouns, verbs, adjectives, punctuations etc. ',
 'In case of a programming language, the categories include identifiers, operators, grouping symbols and data types. ',
 'Lexical tokenization is related to the type of tokenization used in large language models but with two differences. ',
 'First, lexical tokenization is usually based on a lexical grammar, whereas LLM tokenizers are usually probability-based. ',
 'Second, LLM tokenizers perform a second step that converts the tokens into numerical values']

In [34]:
# Tokenizer from the previous notebook, tokenizer.ipynb
class simpleTokenizer:
    """Whitespace tokenizer whose vocabulary is learned from raw text.

    The vocabulary is the sorted set of whitespace-separated tokens seen
    during training, followed by one ``'<unk>'`` entry that stands in for
    tokens (when encoding) or ids (when decoding) that are not known.

    Args:
        text: Optional training text. When given, the tokenizer is trained
            immediately on construction.
        cased: When true (the default) the text is lower-cased before it is
            tokenized; when false the original casing is kept.
            NOTE(review): the flag name suggests the opposite meaning. The
            behaviour is left unchanged because existing callers rely on
            the lower-casing default.
    """

    def __init__(self, text=None, cased=True):
        self.cased = cased
        self.token_to_idx = {}
        self.idx_to_token = {}
        if text is not None:
            self.train(text)

    def preprocess(self, text):
        """Normalise ``text``: optional lower-casing, then collapse runs of spaces."""
        if self.cased:
            text = text.lower()
        return re.sub(r' +', ' ', text)

    def train(self, text):
        """Rebuild the token <-> id maps from ``text``, replacing any previous ones."""
        print('Training Tokenizer...')
        lines = self.preprocess(text).split('\n')
        print(f'Total Sentences: {len(lines)}')

        vocab = {token for line in lines for token in line.split()}
        # '<unk>' is reserved. If it literally occurs in the text, drop it here
        # so that it is added exactly once below; otherwise it would receive two
        # ids and the two lookup tables would disagree in size.
        vocab.discard('<unk>')
        vocab = sorted(vocab)
        vocab.append('<unk>')
        print(f'Vocab size: {len(vocab)}')

        self.token_to_idx = {token: idx for idx, token in enumerate(vocab)}
        self.idx_to_token = dict(enumerate(vocab))

    def encode(self, text):
        """Return the list of token ids for ``text``; unknown tokens map to ``'<unk>'``.

        Raises:
            KeyError: if a token has to be looked up before the tokenizer
                has been trained (there is no ``'<unk>'`` id yet).
        """
        tokens = self.preprocess(text).split()
        if not tokens:
            return []
        unk_id = self.token_to_idx['<unk>']
        return [self.token_to_idx.get(token, unk_id) for token in tokens]

    def decode(self, token_ids):
        """Join the tokens for ``token_ids`` with spaces; unknown ids become ``'<unk>'``."""
        return ' '.join(self.idx_to_token.get(idx, '<unk>') for idx in token_ids)

In [35]:
# Train on all lines joined by single spaces, i.e. one string without
# line breaks between the entries of processed_text.
tokenizer = simpleTokenizer(text=' '.join(processed_text))

Training Tokenizer...
Total Sentences: 1
Vocab size: 66


In [36]:
# Encode every line of the text into its own list of token ids.
token_seq = list(map(tokenizer.encode, processed_text))
token_seq

[[28, 53, 24, 11, 35, 1, 48, 23, 30, 28, 55, 6, 52, 10, 14, 8, 1, 0, 40],
 [21, 9, 35, 1, 32, 26, 51, 10, 22, 33, 62, 2, 42, 16],
 [21, 9, 35, 1, 41, 26, 50, 10, 22, 20, 37, 19, 47, 3, 13, 58],
 [28, 53, 24, 43, 52, 50, 57, 35, 53, 59, 21, 27, 25, 31, 7, 64, 56, 15],
 [17, 28, 53, 24, 60, 5, 36, 1, 28, 18, 63, 29, 54, 4, 60, 39],
 [45, 29, 54, 38, 1, 44, 46, 49, 12, 50, 55, 23, 34, 61]]

In [59]:
# A GPT-like model is trained to predict the next word, so each sample pairs
# an input window with the same window shifted one token ahead; the shifted
# window is the target used to calculate the loss.

class GPTDataset(Dataset):
    """Sliding-window next-token dataset.

    Every element of ``x`` is encoded with ``tokenizer.encode`` and cut into
    windows of ``max_len`` token ids, moving ``stride`` ids between windows.
    The target of a window is the same window shifted one position to the
    right. With a positive ``stride``, a sequence of ``max_len`` ids or fewer
    contributes no samples.
    """

    def __init__(self, x, tokenizer, max_len, stride):
        encoded_lines = [tokenizer.encode(line) for line in x]

        inputs, targets = [], []
        for ids in encoded_lines:
            # The last start index still leaves room for the shifted target.
            for start in range(0, len(ids) - max_len, stride):
                stop = start + max_len
                inputs.append(torch.tensor(ids[start:stop]))
                targets.append(torch.tensor(ids[start + 1:stop + 1]))

        self.input_ids = inputs
        self.target_ids = targets

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair stored at ``idx``."""
        sample = self.input_ids[idx]
        label = self.target_ids[idx]
        return sample, label

        
        

In [60]:
# Windows of 4 token ids, advancing one id at a time within each line.
dataset = GPTDataset(x=processed_text, tokenizer=tokenizer, max_len=4, stride=1)

In [61]:
# First input window, its shifted target window, and the number of samples.
dataset.input_ids[0], dataset.target_ids[0], len(dataset)

(tensor([28, 53, 24, 11]), tensor([53, 24, 11, 35]), 73)

### Dataloader 

In [81]:
# PyTorch style dataloader

def create_dataloader(
    input_text,
    batch_size=2,
    max_len=4,
    stride=1,
    shuffle=False,
    drop_last=True,
    num_workers=0,
):
    """Wrap ``input_text`` in a ``GPTDataset`` and return a ``DataLoader`` over it.

    ``max_len`` and ``stride`` are forwarded to ``GPTDataset``; the remaining
    keyword arguments are forwarded to ``DataLoader`` unchanged.

    NOTE: the text is encoded with the module-level ``tokenizer``, which must
    exist before this function is called.
    """
    return DataLoader(
        GPTDataset(input_text, tokenizer=tokenizer, max_len=max_len, stride=stride),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [82]:
# Every other create_dataloader parameter is left at its default.
dataloader = create_dataloader(input_text=processed_text)

In [83]:
# Print only the first batch, then stop iterating.
for batch in dataloader:
    print(batch)
    break

[tensor([[28, 53, 24, 11],
        [53, 24, 11, 35]]), tensor([[53, 24, 11, 35],
        [24, 11, 35,  1]])]
