In [1]:
import os

import torch
from torch.utils.data import Dataset, DataLoader

from bpe import HindiTokenizer

In [3]:
# Location of the training corpus. The default is the original local path;
# set the HINDI_CORPUS_PATH environment variable to point the notebook at a
# different copy without editing this cell.
file_path = os.environ.get(
    "HINDI_CORPUS_PATH",
    "/home/angkul/my_data/coding/agi/hindi_gpt/hindi.txt",
)

# Read the whole corpus into memory as a single UTF-8 decoded string.
with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

In [4]:
# Tokenizer from the local `bpe` module, constructed with its default arguments.
# create_dataloader() reads this notebook-level name instead of taking a
# tokenizer parameter, so this cell must run before any dataloader is built.
tokenizer = HindiTokenizer()

In [5]:
class hindiDataset(Dataset):
    """Sliding-window next-token dataset over a tokenized text.

    The text is encoded once with ``tokenizer.encode``. Each window of
    ``max_length`` token ids is stored as an input tensor, and the same
    window shifted right by one token is stored as its target tensor.
    Windows start every ``stride`` tokens; a window is only produced when
    its shifted target fits entirely inside the encoded text.

    Args:
        txt: Text to tokenize.
        tokenizer: Object with an ``encode(txt)`` method; the result is
            expected to be a sliceable sequence of integer token ids.
        max_length: Number of tokens in every input/target window.
        stride: Distance, in tokens, between consecutive window starts.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        if max_length <= 0:
            raise ValueError(f"max_length must be positive, got {max_length}")
        if stride <= 0:
            raise ValueError(f"stride must be positive, got {stride}")

        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)

        # Stop at len - max_length so the target slice (shifted by one)
        # never runs past the end of the token sequence.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i: i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``index``."""
        return self.input_ids[index], self.target_ids[index]


In [6]:
def create_dataloader(txt, batch_size, max_length, stride, shuffle=True, drop_last=True, num_workers=0):
    """Build a ``DataLoader`` over sliding windows of ``txt``.

    The text is wrapped in a ``hindiDataset`` using the notebook-level
    ``tokenizer`` object (it is not a parameter of this function), and the
    remaining arguments are forwarded to ``torch.utils.data.DataLoader``.

    Args:
        txt: Text to tokenize and window.
        batch_size: Number of windows per batch.
        max_length: Tokens per window, passed to ``hindiDataset``.
        stride: Step between window starts, passed to ``hindiDataset``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    return DataLoader(
        hindiDataset(txt, tokenizer, max_length, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [None]:
# Smoke test of the dataloader: one window per batch, unshuffled, with
# max_length=4 and stride=2 so consecutive windows overlap by two tokens.
dataloader = create_dataloader(
    raw_text, batch_size=1, max_length=4, stride=2, shuffle=False
)

# Keep the iterator in `data_iter`; the next cell pulls a second batch from it.
data_iter = iter(dataloader)
first_batch = next(data_iter)
# The batch is the (inputs, targets) pair; targets are the inputs shifted by one token.
print(first_batch)

[tensor([[ 661, 6735,   99, 5097]]), tensor([[6735,   99, 5097, 6735]])]


In [8]:
# Pull the next batch from the iterator created in the previous cell.
# With that cell's stride=2, this window starts two tokens after the first one.
inputs1, targets1 = next(data_iter)
print("Input: ", inputs1)
print("Target: ", targets1)

Input:  tensor([[  99, 5097, 6735, 2311]])
Target:  tensor([[5097, 6735, 2311, 4831]])


In [10]:
# Sizes for the embedding tables created in the next cell.
# vocab_size: rows in the token embedding table.
#   NOTE(review): presumably the HindiTokenizer vocabulary size — confirm against the tokenizer.
# output_dim: width of every embedding vector.
# context_length: rows in the positional embedding table.
vocab_size = 110126
output_dim = 256
context_length = 1024

In [11]:
# Lookup tables built as torch.nn.Embedding(num_embeddings, embedding_dim):
# one with vocab_size rows (indexed by token id) and one with context_length
# rows (indexed by position), both output_dim wide.
# NOTE(review): no random seed is set in the visible cells, so the initial
# weights (and any printed embedding values) will differ between runs.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [None]:
# Window length for this demo. It is passed as both max_length and stride
# below, so the windows are non-overlapping and the dataloader arguments
# cannot drift away from this variable.
max_length = 4
dataloader = create_dataloader(
    raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False
)

# Take the first batch of 8 windows: inputs and their one-token-shifted targets.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Input: ", inputs)
print("Target: ", targets)
print("Input shape: ", inputs.shape)

Input:  tensor([[  661,  6735,    99,  5097],
        [ 6735,  2311,  4831,    93],
        [ 6639, 34529,   704,   104],
        [  167,  1041,   127, 98689],
        [ 1456,   104, 12385,    98],
        [ 1056,    93,   577,  9970],
        [  100,   240, 17327,  8104],
        [   93,   480,   100,  9039]])
Target:  tensor([[ 6735,    99,  5097,  6735],
        [ 2311,  4831,    93,  6639],
        [34529,   704,   104,   167],
        [ 1041,   127, 98689,  1456],
        [  104, 12385,    98,  1056],
        [   93,   577,  9970,   100],
        [  240, 17327,  8104,    93],
        [  480,   100,  9039,    98]])
Input shape:  torch.Size([8, 4])


In [13]:
# Look up an output_dim-wide vector for every token id in the batch; the
# result gains a trailing embedding dimension on top of the input's shape.
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [15]:
# Inspect the embedded batch; PyTorch abbreviates large tensors with "..." when printing.
print(token_embeddings)

tensor([[[ 0.0748,  0.8920,  0.3683,  ..., -0.9974, -2.5803, -1.4525],
         [ 0.9381,  0.2983, -0.9376,  ...,  2.7328,  0.7609, -1.6878],
         [-0.0162, -1.1155,  0.9063,  ..., -0.7967, -0.1410,  0.0396],
         [ 1.2758,  0.6764,  0.0542,  ..., -0.6666, -1.3848, -0.3094]],

        [[ 0.9381,  0.2983, -0.9376,  ...,  2.7328,  0.7609, -1.6878],
         [ 0.4075, -0.6495, -1.1585,  ..., -0.2722, -1.1590,  0.6773],
         [ 0.4540, -1.0890, -1.3192,  ..., -1.3655, -0.7506,  0.4113],
         [ 0.2202,  1.1878,  0.1690,  ..., -0.4608,  1.3009, -1.5901]],

        [[ 0.6723,  0.9529,  1.3828,  ..., -1.0226,  0.3758,  0.3551],
         [ 0.1772, -0.6796, -0.5324,  ..., -1.2889,  0.2810, -0.5061],
         [-0.4776, -0.2462, -0.7918,  ...,  0.9150, -0.0212, -1.4770],
         [ 1.2324,  0.3857,  0.2084,  ...,  1.0293,  2.8439,  1.4019]],

        ...,

        [[ 1.7395,  0.3440, -0.6195,  ..., -0.2009,  0.1521, -1.1817],
         [ 0.2202,  1.1878,  0.1690,  ..., -0.4608,  1.30