In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from bpe import HindiTokenizer

In [2]:
# Location of the raw text corpus that the rest of the notebook tokenizes.
# NOTE(review): this is an absolute, machine-specific path; it must be adjusted
# (or made configurable) before the notebook can run on another machine.
file_path = "/home/angkul/my_data/coding/agi/hindi_gpt/hindi.txt"

# Read the entire file into memory as a single UTF-8 decoded string.
with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

In [3]:
# Build the tokenizer from the local `bpe` module with its default arguments.
# This module-level instance is the one `create_dataloader` picks up implicitly.
# NOTE(review): presumably loads a pre-trained BPE vocabulary — confirm in bpe.py.
tokenizer = HindiTokenizer()

In [4]:
class hindiDataset(Dataset):
    """Sliding-window next-token-prediction dataset over a tokenized text.

    The text is encoded once with ``tokenizer.encode``; a window of
    ``max_length`` token ids is then slid across the result in steps of
    ``stride``. Each sample is a pair ``(input_ids, target_ids)`` where the
    target is the input window shifted one token to the right.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(txt)`` that returns a sequence of
            integer token ids.
        max_length: Number of tokens in every input/target window (> 0).
        stride: Number of tokens the window advances between consecutive
            samples (> 0). ``stride == max_length`` gives non-overlapping
            inputs; smaller values give overlapping ones.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Fail loudly: a non-positive stride would otherwise yield an empty
        # dataset (or an opaque range() error), and a non-positive max_length
        # would yield empty or misaligned windows.
        if max_length <= 0:
            raise ValueError(f"max_length must be positive, got {max_length}")
        if stride <= 0:
            raise ValueError(f"stride must be positive, got {stride}")

        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)

        # Stop at len - max_length so that the target window, which reaches one
        # token further than the input window, always stays in bounds.
        for start in range(0, len(token_ids) - max_length, stride):
            end = start + max_length
            input_chunk = token_ids[start:end]
            target_chunk = token_ids[start + 1:end + 1]
            # torch.long is the index dtype expected by embedding layers and
            # by classification losses for their targets.
            self.input_ids.append(torch.tensor(input_chunk, dtype=torch.long))
            self.target_ids.append(torch.tensor(target_chunk, dtype=torch.long))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``index``."""
        return self.input_ids[index], self.target_ids[index]


In [5]:
def create_dataloader(txt, batch_size, max_length, stride, shuffle=True, drop_last=True, num_workers=0, tokenizer=None):
    """Build a DataLoader of sliding-window (input, target) token batches.

    Args:
        txt: Raw text to tokenize and window.
        batch_size: Number of windows per batch.
        max_length: Number of tokens in each window.
        stride: Number of tokens the window advances between samples.
        shuffle: Whether the DataLoader reshuffles samples every epoch.
        drop_last: Whether to drop a final batch smaller than ``batch_size``.
        num_workers: Number of DataLoader worker processes.
        tokenizer: Object with an ``encode(txt)`` method. When omitted, the
            module-level ``tokenizer`` is used, as existing callers expect.

    Returns:
        A ``torch.utils.data.DataLoader`` over a ``hindiDataset``.

    Raises:
        NameError: If ``tokenizer`` is omitted and no module-level
            ``tokenizer`` has been defined yet.
    """
    if tokenizer is None:
        # The parameter shadows the global of the same name, so the fallback
        # has to be looked up explicitly in the module namespace.
        try:
            tokenizer = globals()["tokenizer"]
        except KeyError:
            raise NameError("name 'tokenizer' is not defined") from None

    dataset = hindiDataset(txt, tokenizer, max_length, stride)

    # Keyword arguments avoid depending on DataLoader's positional order.
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

    return dataloader

In [6]:
# Sanity check of the windowing: one window per batch, 4 tokens per window,
# window advancing 2 tokens at a time. Shuffling is off so the first batch is
# the window that starts at the beginning of the text.
dataloader = create_dataloader(
    raw_text, batch_size=1, max_length=4, stride=2, shuffle=False
)

# Step through the loader by hand; `data_iter` is reused by the next cell.
data_iter = iter(dataloader)
first_batch = next(data_iter)
# Prints the batch as [inputs, targets]; targets are the inputs shifted by one token.
print(first_batch)

[tensor([[ 678, 6884,  109, 5216]]), tensor([[6884,  109, 5216, 6884]])]


In [7]:
# Fetch the second batch from the same iterator. With stride=2 its input window
# starts two tokens after the first batch's window.
inputs1, targets1 = next(data_iter)
print("Input: ", inputs1)
print("Target: ", targets1)

Input:  tensor([[ 109, 5216, 6884, 2367]])
Target:  tensor([[5216, 6884, 2367, 4942]])


In [8]:
# Sizes for the embedding layers created in the next cell.
# Number of rows in the token embedding table; every token id must be below it.
# NOTE(review): presumably the tokenizer's vocabulary size — confirm against bpe.py.
vocab_size = 110126
# Width of each embedding vector.
output_dim = 256
# Number of rows in the positional embedding table.
context_length = 1024

In [9]:
# NOTE(review): no random seed is set in the visible cells, so the initial
# embedding weights (and anything printed from them) differ between runs.
# Lookup table with one learnable `output_dim`-wide vector per token id.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
# Lookup table with one learnable `output_dim`-wide vector per position index
# in [0, context_length).
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [10]:
# Window length for this demo. It is passed to the loader below instead of
# repeating the literal, so the variable and the actual batches cannot drift apart.
max_length = 4
# stride == max_length, so consecutive input windows do not overlap.
dataloader = create_dataloader(
    raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False
)

# Take the first batch: 8 windows of `max_length` token ids each.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Input: ", inputs)
print("Target: ", targets)
print("Input shape: ", inputs.shape)

Input:  tensor([[   678,   6884,    109,   5216],
        [  6884,   2367,   4942,    103],
        [  6787,  35922,    721,    114],
        [   177,   1070,    137, 107782],
        [  1494,    114,  12652,    108],
        [  1085,    103,    594,  10173],
        [   110,    251,  17785,   8264],
        [   103,    494,    110,   9219]])
Target:  tensor([[  6884,    109,   5216,   6884],
        [  2367,   4942,    103,   6787],
        [ 35922,    721,    114,    177],
        [  1070,    137, 107782,   1494],
        [   114,  12652,    108,   1085],
        [   103,    594,  10173,    110],
        [   251,  17785,   8264,    103],
        [   494,    110,   9219,    108]])
Input shape:  torch.Size([8, 4])


In [11]:
# Look up an embedding vector for every token id in the batch; the result gains
# a trailing dimension of size `output_dim` on top of the shape of `inputs`.
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [12]:
# Dump the embedding tensor for inspection (PyTorch abbreviates it with "...").
print(token_embeddings)

tensor([[[-1.1929,  1.0425, -1.2517,  ...,  0.7110, -0.0654, -1.7875],
         [ 0.5091,  0.7915,  0.9778,  ..., -1.0899,  0.6510,  0.5326],
         [-0.4585,  1.2861, -0.3517,  ...,  0.0103,  0.4264,  0.6903],
         [ 1.2590, -0.9492, -0.6068,  ...,  0.1470,  0.2408, -1.0546]],

        [[ 0.5091,  0.7915,  0.9778,  ..., -1.0899,  0.6510,  0.5326],
         [ 0.3681, -1.2437, -0.7266,  ..., -0.2342, -0.3608,  0.7382],
         [ 1.2890,  1.1843, -0.0129,  ..., -0.9807, -2.2260, -0.9126],
         [-0.5270,  0.4361,  0.2539,  ...,  0.5521,  0.4655, -1.9145]],

        [[-0.7450, -1.1314,  1.0025,  ...,  0.4701, -0.4298,  0.6751],
         [-0.0891,  0.8334,  0.0422,  ..., -0.4854, -1.4634, -1.1526],
         [-0.0821, -0.3032,  0.6417,  ..., -0.1867, -0.0181,  0.2435],
         [ 0.8728,  0.6619,  0.3223,  ...,  1.1945,  0.3440,  0.1549]],

        ...,

        [[ 0.8437, -1.0063, -0.8142,  ...,  0.2404, -1.3223, -1.8691],
         [-0.5270,  0.4361,  0.2539,  ...,  0.5521,  0.46