In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from indicnlp.tokenize import indic_tokenize

In [2]:
file_path = "/home/angkul/my_data/coding/agi/llms/hindi.txt"

with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

In [3]:
t1 = ("इस संदर्भ में ब्लम एक सफल नेता के लिए ऐसे कार्य न करने के लिए निर्देश देते हैं। अंत।" 
"जिससे कर्मचारी और नेता के बीच दूरी बनी रहे।")

In [4]:
# get the unique tokens
# tokens1 = indic_tokenize.trivial_tokenize(t1)

# tokens = sorted(list(set(tokens1)))
# tokens.extend(["अंत"])

tokens = indic_tokenize.trivial_tokenize(raw_text)

tokens = sorted(list(set(tokens)))
tokens.extend(["अंत"])

In [5]:
print(tokens)

['\n\n', '\n\nगुरुत्वाकर्षण', '\n\nनेतृत्व', '\n18', '\n1850', '\nअतीत', '\nअधिकारियों', '\nअनेक', '\nअनौपचारिक', '\nअभिजातीय', '\nअर्थात्', '\nआगे', '\nइन', '\nइब्राहीम', '\nइस', '\nइसी', '\nउचित', '\nउदाहरण', '\nउद्योग', '\nउपर्युक्त', '\nएक', '\nओसवाल्ड', '\nकंफ्युशिअसवाद', '\nकर्मचारियों', '\nकारखाने', '\nकार्य', '\nकार्यक्षमता', '\nकुछ', '\nकेप्लर', '\nकैद', '\nकोई', '\nगुरुत्वाकर्षण', '\nछोटे', '\nजर्मन', '\nजिस', '\nतब', '\nतृतीय', '\nद्वितीय', '\nनर', '\nनिरंकुश', '\nनिश्चित', '\nनेता', '\nनेतृत्व', '\nनोअम', '\nपक्षियों', '\nपिंजर', '\nपेट्रीसिया', '\nपॉल', '\nप्रत्येक', '\nप्रथम', '\nप्रागैतिहासिक', '\nप्रारंभिक', '\nप्रेरणा', '\nबोनोबो', '\nब्रूस', '\nब्लम', '\nभारतीय', '\nमाना', '\nमोर', '\nयदि', '\nयह', '\nयहाँ', '\nरिचर्ड', '\nलीलावती', '\nवयस्क', '\nविद्वानों', '\nव्यक्तिगत', '\nसमस्याओं', '\nसामूहिक', '\nसोच', '\nस्मिथ', '\nहाउजर', '\nहालांकि', '!', '"', '%', "'", ',', '-', '.', '/', '0', '100', '100000', '115', '1300', '15', '16', '17', '170', '18', '1800', '1890', '19

In [6]:
vocab = {token:integer for integer, token in enumerate(tokens)}

In [7]:
print(vocab)

{'\n\n': 0, '\n\nगुरुत्वाकर्षण': 1, '\n\nनेतृत्व': 2, '\n18': 3, '\n1850': 4, '\nअतीत': 5, '\nअधिकारियों': 6, '\nअनेक': 7, '\nअनौपचारिक': 8, '\nअभिजातीय': 9, '\nअर्थात्': 10, '\nआगे': 11, '\nइन': 12, '\nइब्राहीम': 13, '\nइस': 14, '\nइसी': 15, '\nउचित': 16, '\nउदाहरण': 17, '\nउद्योग': 18, '\nउपर्युक्त': 19, '\nएक': 20, '\nओसवाल्ड': 21, '\nकंफ्युशिअसवाद': 22, '\nकर्मचारियों': 23, '\nकारखाने': 24, '\nकार्य': 25, '\nकार्यक्षमता': 26, '\nकुछ': 27, '\nकेप्लर': 28, '\nकैद': 29, '\nकोई': 30, '\nगुरुत्वाकर्षण': 31, '\nछोटे': 32, '\nजर्मन': 33, '\nजिस': 34, '\nतब': 35, '\nतृतीय': 36, '\nद्वितीय': 37, '\nनर': 38, '\nनिरंकुश': 39, '\nनिश्चित': 40, '\nनेता': 41, '\nनेतृत्व': 42, '\nनोअम': 43, '\nपक्षियों': 44, '\nपिंजर': 45, '\nपेट्रीसिया': 46, '\nपॉल': 47, '\nप्रत्येक': 48, '\nप्रथम': 49, '\nप्रागैतिहासिक': 50, '\nप्रारंभिक': 51, '\nप्रेरणा': 52, '\nबोनोबो': 53, '\nब्रूस': 54, '\nब्लम': 55, '\nभारतीय': 56, '\nमाना': 57, '\nमोर': 58, '\nयदि': 59, '\nयह': 60, '\nयहाँ': 61, '\nरिचर्ड': 62, '\nलीलावती

In [15]:
print(len(vocab))

2254


In [8]:
class hindiTokenizer:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s, i in vocab.items()}

    def encode(self, text):
        preproccessed = indic_tokenize.trivial_tokenize(text)
        preproccessed = [item.strip() for item in preproccessed if item.strip()]
        preproccessed = [item if item in self.str_to_int
                         else "अंत" for item in preproccessed]

        ids = [self.str_to_int[s] for s in preproccessed]
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        return text

In [9]:
to = hindiTokenizer(vocab)

In [10]:
to1 = "जिससे कर्मचारी और नेता के बीच दूरी बनी रहे।अनंत"
to.encode(to1)

[859, 540, 494, 1146, 605, 1488, 1008, 1450, 1711, 2248, 2254]

In [11]:
to.decode(to.encode(to1))

'जिससे कर्मचारी और नेता के बीच दूरी बनी रहे । अंत'

In [12]:
class hindiDataset:
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i: i+max_length]
            target_chunk = token_ids[i+1: i+max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, index):
        return self.input_ids[index], self.target_ids[index]


In [13]:
def create_dataloader_v1(txt, batch_size, max_length, stride, shuffle=True, drop_last = True, num_workers=0):

    tokenizer = to

    dataset = hindiDataset(txt, tokenizer, max_length, stride)

    dataloader = DataLoader(
        dataset, batch_size, shuffle, drop_last=drop_last, num_workers=num_workers
    )

    return dataloader

In [14]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=2, shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[1525, 1655, 1670, 1141]]), tensor([[1655, 1670, 1141, 1655]])]


In [16]:
inputs1, targets1 = next(data_iter)
print("Input: ", inputs1)
print("Target: ", targets1)

Input:  tensor([[1670, 1141, 1655,  965]])
Target:  tensor([[1141, 1655,  965,  478]])


In [17]:
vocab_size = 2254
output_dim = 256
context_length = 1024

In [18]:
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [19]:
max_length = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=4, stride=4, shuffle=False
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Input: ", inputs)
print("Target: ", targets)
print("Input shape: ", inputs.shape)

Input:  tensor([[1525, 1655, 1670, 1141],
        [1655,  965,  478,  605],
        [1040,  937, 1200,  555],
        [ 467, 1420,  494,  734],
        [1685,  555, 1166, 2233],
        [  77, 1001,  605,  215],
        [1522, 1647, 1665,  244],
        [  78,  800,  605, 1730]])
Target:  tensor([[1655, 1670, 1141, 1655],
        [ 965,  478,  605, 1040],
        [ 937, 1200,  555,  467],
        [1420,  494,  734, 1685],
        [ 555, 1166, 2233,   77],
        [1001,  605,  215, 1522],
        [1647, 1665,  244,   78],
        [ 800,  605, 1730, 1647]])
Input shape:  torch.Size([8, 4])


In [20]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [21]:
print(token_embeddings)

tensor([[[-1.4306, -1.0675, -0.2656,  ...,  0.0078, -2.9192,  1.4365],
         [ 2.0001,  0.2721, -0.8617,  ..., -1.4836, -0.1131,  0.4732],
         [ 0.8345, -0.9418,  0.8504,  ..., -0.4415,  0.1979,  1.1358],
         [-0.3314,  0.7942,  1.7510,  ...,  1.0738,  0.2079,  0.9983]],

        [[ 2.0001,  0.2721, -0.8617,  ..., -1.4836, -0.1131,  0.4732],
         [-0.3432,  0.7745,  0.6078,  ...,  0.5217, -0.5309, -0.0543],
         [-0.0109, -1.0853, -0.6166,  ..., -1.1420, -2.6438,  0.5352],
         [ 1.1050, -1.1989, -2.2794,  ...,  0.8033, -1.0606,  0.5993]],

        [[-0.9078,  1.3736,  0.7721,  ...,  0.9173, -1.0332,  0.4758],
         [-0.7471, -0.9954,  0.8860,  ...,  1.0148, -0.2438,  0.3308],
         [ 0.1225, -0.5613,  1.0978,  ...,  0.1773,  0.5645,  0.0490],
         [ 1.0105,  0.1749, -1.8779,  ..., -1.1783,  1.3749,  0.7687]],

        ...,

        [[ 0.3575,  0.8783,  1.6736,  ..., -0.4682,  0.5519, -0.6969],
         [-0.6814, -1.2627, -0.0612,  ...,  0.8053, -0.18