In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from bpe import HindiTokenizer

In [2]:
# Path to the training corpus, relative to the notebook's working directory.
file_path = "hindi_gpt/hindi.txt"

# Read the entire corpus into memory as one string. The encoding is given
# explicitly so decoding does not depend on the platform default.
with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

In [4]:
# Tokenizer from the project-local `bpe` module. `create_dataloader_v1` reads
# this module-level name, so this cell must run before any dataloader is built.
tokenizer = HindiTokenizer()

In [7]:
class hindiDataset(Dataset):
    """Sliding-window next-token dataset built from one piece of text.

    The text is tokenized once with ``tokenizer.encode``. A window of
    ``max_length`` token ids is then moved over the id sequence in steps of
    ``stride``. Every window yields an (input, target) pair where the target
    is the same window shifted one token to the right.

    Args:
        txt: Text to tokenize.
        tokenizer: Object exposing ``encode(txt)`` that returns a sequence of
            integer token ids.
        max_length: Number of tokens in each input/target window (>= 1).
        stride: Number of tokens to advance between consecutive windows (>= 1).

    Raises:
        ValueError: If ``max_length`` or ``stride`` is smaller than 1.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)

        # The upper bound `len(token_ids) - max_length` guarantees that the
        # shifted target window (which needs one extra token) stays in range.
        # Texts with at most `max_length` tokens therefore give an empty dataset.
        for start in range(0, len(token_ids) - max_length, stride):
            end = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:end]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:end + 1]))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``index``."""
        return self.input_ids[index], self.target_ids[index]


In [9]:
def create_dataloader_v1(
    txt,
    batch_size,
    max_length,
    stride,
    shuffle=True,
    drop_last=True,
    num_workers=0,
):
    """Build a ``DataLoader`` over sliding windows of ``txt``.

    The text is wrapped in a ``hindiDataset`` using the module-level
    ``tokenizer``, so that name must already be defined when this is called.

    Args:
        txt: Text to tokenize and split into windows.
        batch_size: Number of windows per batch.
        max_length: Window length in tokens, forwarded to ``hindiDataset``.
        stride: Step between windows in tokens, forwarded to ``hindiDataset``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    windows = hindiDataset(txt, tokenizer, max_length, stride)

    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [10]:
# Sanity check: batches of one window, 4 tokens wide, advancing 2 tokens at a
# time. Shuffling is off so the first batch starts at the beginning of the text.
dataloader = create_dataloader_v1(
    txt=raw_text,
    batch_size=1,
    max_length=4,
    stride=2,
    shuffle=False,
)

# Keep the iterator around; the following cell pulls the second batch from it.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[ 653, 6861,   96, 5065]]), tensor([[6861,   96, 5065, 6861]])]


In [11]:
# Fetch the next batch from the iterator created in the previous cell and
# unpack it into its input and target tensors.
inputs1, targets1 = next(data_iter)
print("Input: ", inputs1)
print("Target: ", targets1)

Input:  tensor([[  96, 5065, 6861, 2303]])
Target:  tensor([[5065, 6861, 2303, 4800]])


In [12]:
# Sizes used to build the embedding layers in the next cell.
# Number of rows in the token-embedding table.
# NOTE(review): presumably must cover every id HindiTokenizer can emit — confirm.
vocab_size = 100863
# Width of each embedding vector (shared by token and position embeddings).
output_dim = 256
# Number of rows in the position-embedding table.
context_length = 1024

In [13]:
# Lookup table with one `output_dim`-wide vector per token id.
token_embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)
# Lookup table with one `output_dim`-wide vector per position index.
pos_embedding_layer = torch.nn.Embedding(
    num_embeddings=context_length,
    embedding_dim=output_dim,
)

In [14]:
# Window length in tokens. It is also used as the stride so that consecutive
# windows do not overlap.
max_length = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False
)

# Take the first batch: 8 windows of `max_length` token ids each.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Input: ", inputs)
print("Target: ", targets)
print("Input shape: ", inputs.shape)

Input:  tensor([[  653,  6861,    96,  5065],
        [ 6861,  2303,  4800,    90],
        [ 6569, 33974,   693,   102],
        [  164,  1033,   124, 90890],
        [ 1469,   102, 12280,    95],
        [ 1139,    90,   574,  9889],
        [   97,   237, 17159,  8032],
        [   90,   473,    97,  8953]])
Target:  tensor([[ 6861,    96,  5065,  6861],
        [ 2303,  4800,    90,  6569],
        [33974,   693,   102,   164],
        [ 1033,   124, 90890,  1469],
        [  102, 12280,    95,  1139],
        [   90,   574,  9889,    97],
        [  237, 17159,  8032,    90],
        [  473,    97,  8953,    95]])
Input shape:  torch.Size([8, 4])


In [15]:
# Replace every token id in the batch with its embedding vector; the result
# gains a trailing dimension of size `output_dim`.
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [16]:
# Inspect the raw embedding values. No random seed is set in the cells above,
# so these numbers are expected to differ between kernel sessions.
print(token_embeddings)

tensor([[[ 0.1801,  0.5531, -1.2463,  ...,  0.5887, -0.4320, -0.5118],
         [ 0.9121, -0.1160, -1.2231,  ...,  0.3986, -0.7128,  0.3800],
         [-1.6821,  0.3227, -0.4879,  ..., -1.2016, -0.8116, -0.7672],
         [-0.1796, -0.1454,  0.3370,  ...,  0.1366,  0.0972, -1.1325]],

        [[ 0.9121, -0.1160, -1.2231,  ...,  0.3986, -0.7128,  0.3800],
         [ 1.1497,  0.0048,  1.1323,  ..., -0.5259,  0.5716,  0.2707],
         [ 0.8521, -1.3983, -1.0890,  ..., -0.0237, -0.0908, -0.2665],
         [ 1.6777,  0.2427, -0.8827,  ...,  0.3661,  0.1276, -0.2763]],

        [[-1.0159,  0.9452, -0.4051,  ..., -0.0180, -1.3032, -1.1961],
         [-0.9672,  0.6358,  1.8355,  ..., -0.2647,  0.2990,  0.6080],
         [-0.2828,  1.0133, -0.3841,  ...,  1.8431,  0.5580,  0.9344],
         [ 0.2710, -0.0894,  0.6879,  ...,  0.5631, -0.7318,  1.3994]],

        ...,

        [[ 0.0804,  0.7463,  2.2777,  ..., -0.8547,  1.1046,  2.4741],
         [ 1.6777,  0.2427, -0.8827,  ...,  0.3661,  0.12