In [1]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import math
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

import numpy as np

In [4]:
data=pd.read_csv('hindi_english_parallel2.csv')

In [5]:
data.head()

Unnamed: 0,English,Hindi
0,Help!,बचाओ!
1,Jump.,उछलो.
2,Jump.,कूदो.
3,Jump.,छलांग.
4,Hello!,नमस्ते।


In [6]:
data.shape

(130476, 2)

In [22]:

START_TOKEN = '<start>'
PADDING_TOKEN = '<padding>'
END_TOKEN = '<end>'

english_vocabulary = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
                      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', '@',
                      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                      'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                      'Y', 'Z', '[', '\\', ']', '^', '_', '`',
                      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                      'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
                      'y', 'z', '{', '|', '}', '~', PADDING_TOKEN, END_TOKEN]



hindi_vocabulary = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
                    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', 'ँ', 'ं', 'ः',
                    'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ऌ', 'ए', 'ऐ', 'ओ', 'औ',
                    'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण',
                    'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह',
                    '़', 'ऽ', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'ॄ', 'ॅ', 'े', 'ै', 'ॉ', 'ो', 'ौ', '्', 'ॐ',
                    '०', '१', '२', '३', '४', '५', '६', '७', '८', '९', PADDING_TOKEN, END_TOKEN]

In [23]:
len(english_vocabulary)

97

In [24]:
len(hindi_vocabulary)

109

In [25]:
'ई'+ 'उ'

'ईउ'

In [26]:


index_to_hindi = {k:v for k,v in enumerate(hindi_vocabulary)}
hindi_to_index = {v:k for k,v in enumerate(hindi_vocabulary)}
index_to_english = {k:v for k,v in enumerate(english_vocabulary)}
english_to_index = {v:k for k,v in enumerate(english_vocabulary)}

In [27]:
index_to_hindi

{0: '<start>',
 1: ' ',
 2: '!',
 3: '"',
 4: '#',
 5: '$',
 6: '%',
 7: '&',
 8: "'",
 9: '(',
 10: ')',
 11: '*',
 12: '+',
 13: ',',
 14: '-',
 15: '.',
 16: '/',
 17: '0',
 18: '1',
 19: '2',
 20: '3',
 21: '4',
 22: '5',
 23: '6',
 24: '7',
 25: '8',
 26: '9',
 27: ':',
 28: '<',
 29: '=',
 30: '>',
 31: '?',
 32: 'ँ',
 33: 'ं',
 34: 'ः',
 35: 'अ',
 36: 'आ',
 37: 'इ',
 38: 'ई',
 39: 'उ',
 40: 'ऊ',
 41: 'ऋ',
 42: 'ऌ',
 43: 'ए',
 44: 'ऐ',
 45: 'ओ',
 46: 'औ',
 47: 'क',
 48: 'ख',
 49: 'ग',
 50: 'घ',
 51: 'ङ',
 52: 'च',
 53: 'छ',
 54: 'ज',
 55: 'झ',
 56: 'ञ',
 57: 'ट',
 58: 'ठ',
 59: 'ड',
 60: 'ढ',
 61: 'ण',
 62: 'त',
 63: 'थ',
 64: 'द',
 65: 'ध',
 66: 'न',
 67: 'प',
 68: 'फ',
 69: 'ब',
 70: 'भ',
 71: 'म',
 72: 'य',
 73: 'र',
 74: 'ल',
 75: 'व',
 76: 'श',
 77: 'ष',
 78: 'स',
 79: 'ह',
 80: '़',
 81: 'ऽ',
 82: 'ा',
 83: 'ि',
 84: 'ी',
 85: 'ु',
 86: 'ू',
 87: 'ृ',
 88: 'ॄ',
 89: 'ॅ',
 90: 'े',
 91: 'ै',
 92: 'ॉ',
 93: 'ो',
 94: 'ौ',
 95: '्',
 96: 'ॐ',
 97: '०',
 98: '१',
 99: '२',
 100

In [28]:
data.head()

Unnamed: 0,English,Hindi
0,Help!,बचाओ!
1,Jump.,उछलो.
2,Jump.,कूदो.
3,Jump.,छलांग.
4,Hello!,नमस्ते।


In [29]:
data.shape

(130476, 2)

In [30]:
sample=df[:100000]
sample.head()

Unnamed: 0,english,hindi
0,Give your application an accessibility workout,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें
1,Accerciser Accessibility Explorer,एक्सेर्साइसर पहुंचनीयता अन्वेषक
2,The default plugin layout for the bottom panel,निचले पटल के लिए डिफोल्ट प्लग-इन खाका
3,The default plugin layout for the top panel,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका
4,A list of plugins that are disabled by default,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...


In [31]:
import re
sample['hindi'][:4].apply(lambda x: re.sub("'", '', x))

Unnamed: 0,hindi
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक
2,निचले पटल के लिए डिफोल्ट प्लग-इन खाका
3,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका


In [32]:
sample['hindi'].to_list()[:10]

['अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें',
 'एक्सेर्साइसर पहुंचनीयता अन्वेषक',
 'निचले पटल के लिए डिफोल्ट प्लग-इन खाका',
 'ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका',
 'उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है',
 'अवधि को हाइलाइट रकें',
 'पहुंचनीय आसंधि (नोड) को चुनते समय हाइलाइट बक्से की अवधि',
 'सीमांत (बोर्डर) के रंग को हाइलाइट करें',
 'हाइलाइट किए गए सीमांत का रंग और अपारदर्शिता। ',
 'भराई के रंग को हाइलाइट करें']

In [33]:
engsen=sample['english'].to_list()
engsen[:10]

['Give your application an accessibility workout',
 'Accerciser Accessibility Explorer',
 'The default plugin layout for the bottom panel',
 'The default plugin layout for the top panel',
 'A list of plugins that are disabled by default',
 'Highlight duration',
 'The duration of the highlight box when selecting accessible nodes',
 'Highlight border color',
 'The color and opacity of the highlight border.',
 'Highlight fill color']

In [34]:
hindisen=sample['hindi'].to_list()
hindisen[:10]

['अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें',
 'एक्सेर्साइसर पहुंचनीयता अन्वेषक',
 'निचले पटल के लिए डिफोल्ट प्लग-इन खाका',
 'ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका',
 'उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है',
 'अवधि को हाइलाइट रकें',
 'पहुंचनीय आसंधि (नोड) को चुनते समय हाइलाइट बक्से की अवधि',
 'सीमांत (बोर्डर) के रंग को हाइलाइट करें',
 'हाइलाइट किए गए सीमांत का रंग और अपारदर्शिता। ',
 'भराई के रंग को हाइलाइट करें']

In [35]:
for i in engsen:
  print(i)
  print(type(i))
  break

Give your application an accessibility workout
<class 'str'>


In [36]:
[sentence.rstrip('\n') for sentence in engsen[:2]]

['Give your application an accessibility workout',
 'Accerciser Accessibility Explorer']

In [37]:
engsen = [str(sentence).rstrip('\n') for sentence in engsen]
hindisen = [str(sentence).rstrip('\n') for sentence in hindisen]


In [38]:
engsen[:2],hindisen[:2]

(['Give your application an accessibility workout',
  'Accerciser Accessibility Explorer'],
 ['अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें',
  'एक्सेर्साइसर पहुंचनीयता अन्वेषक'])

In [39]:
max(len(x) for x in engsen), max(len(x) for x in hindisen),


(1298, 1090)

In [40]:
import numpy as np
PERCENTILE = 97
print( f"{PERCENTILE}th percentile length Kannada: {np.percentile([len(x) for x in hindisen], PERCENTILE)}" )
print( f"{PERCENTILE}th percentile length English: {np.percentile([len(x) for x in engsen], PERCENTILE)}" )

97th percentile length Kannada: 91.0
97th percentile length English: 88.0


In [41]:

max_sequence_length = 200
# to check if a token or character/alphabet ins engsen or hindi is present in about hindi/eng vocab pf charceter
def is_valid_tokens(sentence, vocab):
    for token in list(set(sentence)):
        if token not in vocab:
            return False
    return True

#to check if engsend or hindisen each sent has max 200 charcers
def is_valid_length(sentence, max_sequence_length):
    return len(list(sentence)) < (max_sequence_length - 1) # need to re-add the end token so leaving 1 space

valid_sentence_indicies = []
for index in range(len(hindisen)):
    hindi_sentence, english_sentence = hindisen[index], engsen[index]
    if is_valid_length(hindi_sentence, max_sequence_length) \
      and is_valid_length(english_sentence, max_sequence_length) \
      and is_valid_tokens(hindi_sentence, hindi_vocabulary):
        valid_sentence_indicies.append(index)

print(f"Number of sentences: {len(hindisen)}")
print(f"Number of valid sentences: {len(valid_sentence_indicies)}")

Number of sentences: 100000
Number of valid sentences: 64908


In [42]:
#to craeted cuomt dataset pytorch inbuitl method used in our own csutom datas set class

from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):

    def __init__(self, english_sentences, kannada_sentences):
        self.english_sentences = english_sentences
        self.kannada_sentences = kannada_sentences
    #rturn number of eng/hind or sentences in a list
    def __len__(self):
        return len(self.english_sentences)
    #return 1:1 mapping of one lang to other
    def __getitem__(self, idx):
        return self.english_sentences[idx], self.kannada_sentences[idx]


In [43]:
#example
# Sample data
english_sentences = ["Hello", "How are you?", "Good morning"]
kannada_sentences = ["ಹಲೋ", "ನೀವು ಹೇಗಿದ್ದೀರಾ?", "ಶುಭೋದಯ"]

# Create dataset
dataset = TextDataset(english_sentences, kannada_sentences)

# Check dataset length
print(len(dataset))  # Output: 3

# Fetch a sample sentence pair
print(dataset[1])  # Output: ("How are you?", "ನೀವು ಹೇಗಿದ್ದೀರಾ?")

3
('How are you?', 'ನೀವು ಹೇಗಿದ್ದೀರಾ?')


In [44]:
dataset.english_sentences,dataset.kannada_sentences

(['Hello', 'How are you?', 'Good morning'],
 ['ಹಲೋ', 'ನೀವು ಹೇಗಿದ್ದೀರಾ?', 'ಶುಭೋದಯ'])

In [45]:
dataset[1]

('How are you?', 'ನೀವು ಹೇಗಿದ್ದೀರಾ?')

In [46]:

dataset = TextDataset(engsen, hindisen)

In [47]:
dataset[10]

('The color and opacity of the highlight fill.',
 'हाइलाइट किया गया भराई का रंग और पारदर्शिता। ')

In [48]:

batch_size = 3
train_loader = DataLoader(dataset, batch_size)
iterator = iter(train_loader)

for batch_num, batch in enumerate(iterator):
    print(batch)
    if batch_num > 3:
        break

[('Give your application an accessibility workout', 'Accerciser Accessibility Explorer', 'The default plugin layout for the bottom panel'), ('अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें', 'एक्सेर्साइसर पहुंचनीयता अन्वेषक', 'निचले पटल के लिए डिफोल्ट प्लग-इन खाका')]
[('The default plugin layout for the top panel', 'A list of plugins that are disabled by default', 'Highlight duration'), ('ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका', 'उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है', 'अवधि को हाइलाइट रकें')]
[('The duration of the highlight box when selecting accessible nodes', 'Highlight border color', 'The color and opacity of the highlight border.'), ('पहुंचनीय आसंधि (नोड) को चुनते समय हाइलाइट बक्से की अवधि', 'सीमांत (बोर्डर) के रंग को हाइलाइट करें', 'हाइलाइट किए गए सीमांत का रंग और अपारदर्शिता। ')]
[('Highlight fill color', 'The color and opacity of the highlight fill.', 'API Browser'), ('भराई के रंग को हाइलाइट करें', 'हाइलाइट किया गया भराई का रंग और पारदर्शिता। ', 'एपी

In [49]:

def tokenize(sentence, language_to_index, start_token=True, end_token=True):
    sentence_word_indicies = [language_to_index[token] for token in list(sentence)]
    if start_token:
        sentence_word_indicies.insert(0, language_to_index[START_TOKEN])
    if end_token:
        sentence_word_indicies.append(language_to_index[END_TOKEN])
    for _ in range(len(sentence_word_indicies), max_sequence_length):
        sentence_word_indicies.append(language_to_index[PADDING_TOKEN])
    return torch.tensor(sentence_word_indicies)

In [50]:
batch

[('Browse the various methods of the current accessible',
  'Hide private attributes',
  'Method'),
 ('इस समय जिसे प्राप्त किया गया हो, उसकी विभिन्न विधियों (मेथड) में विचरण करें',
  'निजी गुणों को छिपाएं',
  'विधि')]

In [51]:
eng_tokenized, hn_tokenized = [], []
for sentence_num in range(batch_size):
    eng_sentence, hn_sentence = batch[0][sentence_num], batch[1][sentence_num]
    eng_tokenized.append( tokenize(eng_sentence, english_to_index, start_token=False, end_token=False) )
    hn_tokenized.append( tokenize(hn_sentence, hindi_to_index, start_token=True, end_token=True) )
eng_tokenized = torch.stack(eng_tokenized)
hn_tokenized = torch.stack(hn_tokenized)


In [None]:


eng_tokenized# now we have token every word so here 95 is a apdding token u can check

tensor([[34, 82, 79, 87, 83, 69,  1, 84, 72, 69,  1, 86, 65, 82, 73, 79, 85, 83,
          1, 77, 69, 84, 72, 79, 68, 83,  1, 79, 70,  1, 84, 72, 69,  1, 67, 85,
         82, 82, 69, 78, 84,  1, 65, 67, 67, 69, 83, 83, 73, 66, 76, 69, 95, 95,
         95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
         95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
         95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
         95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
         95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
         95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
         95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
         95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
         95, 95],
        [40, 73, 68, 69,  1, 80, 82, 73, 86, 65, 84, 69,  1, 65, 84, 84, 82, 73,
         6

In [None]:
hn_tokenized

tensor([[  0,  37,  78,   1,  78,  71,  72,   1,  54,  83,  78,  90,   1,  67,
          95,  73,  82,  67,  95,  62,   1,  47,  83,  72,  82,   1,  49,  72,
          82,   1,  79,  93,  13,   1,  39,  78,  47,  84,   1,  75,  83,  70,
          83,  66,  95,  66,   1,  75,  83,  65,  83,  72,  93,  33,   1,   9,
          71,  90,  63,  59,  10,   1,  71,  90,  33,   1,  75,  83,  52,  73,
          61,   1,  47,  73,  90,  33, 108, 107, 107, 107, 107, 107, 107, 107,
         107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107,
         107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107,
         107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107,
         107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107,
         107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107,
         107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107,
         107, 107, 107, 107, 107, 107, 107, 107, 107

In [None]:
{i:j for i,j in english_to_index.items() if j==95}

{'<padding>': 95}

In [None]:
hn_tokenized

tensor([[  0,  37,  78,   1,  78,  71,  72,   1,  54,  83,  78,  90,   1,  67,
          95,  73,  82,  67,  95,  62,   1,  47,  83,  72,  82,   1,  49,  72,
          82,   1,  79,  93,  13,   1,  39,  78,  47,  84,   1,  75,  83,  70,
          83,  66,  95,  66,   1,  75,  83,  65,  83,  72,  93,  33,   1,   9,
          71,  90,  63,  59,  10,   1,  71,  90,  33,   1,  75,  83,  52,  73,
          61,   1,  47,  73,  90,  33, 108, 107, 107, 107, 107, 107, 107, 107,
         107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107,
         107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107,
         107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107,
         107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107,
         107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107,
         107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107,
         107, 107, 107, 107, 107, 107, 107, 107, 107

In [None]:
{i:j for i,j in hindi_to_index.items() if j==107}#above is for just batch of 3 sample sentences

{'<padding>': 107}

In [None]:
NEG_INFTY = -1e9

def create_masks(eng_batch, kn_batch):
    num_sentences = len(eng_batch)
    look_ahead_mask = torch.full([max_sequence_length, max_sequence_length] , True)
    look_ahead_mask = torch.triu(look_ahead_mask, diagonal=1)
    encoder_padding_mask = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)
    decoder_padding_mask_self_attention = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)
    decoder_padding_mask_cross_attention = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)

    for idx in range(num_sentences):
      eng_sentence_length, kn_sentence_length = len(eng_batch[idx]), len(kn_batch[idx])
      eng_chars_to_padding_mask = np.arange(eng_sentence_length + 1, max_sequence_length)
      kn_chars_to_padding_mask = np.arange(kn_sentence_length + 1, max_sequence_length)
      encoder_padding_mask[idx, :, eng_chars_to_padding_mask] = True
      encoder_padding_mask[idx, eng_chars_to_padding_mask, :] = True
      decoder_padding_mask_self_attention[idx, :, kn_chars_to_padding_mask] = True
      decoder_padding_mask_self_attention[idx, kn_chars_to_padding_mask, :] = True
      decoder_padding_mask_cross_attention[idx, :, eng_chars_to_padding_mask] = True
      decoder_padding_mask_cross_attention[idx, kn_chars_to_padding_mask, :] = True

    encoder_self_attention_mask = torch.where(encoder_padding_mask, NEG_INFTY, 0)
    decoder_self_attention_mask =  torch.where(look_ahead_mask + decoder_padding_mask_self_attention, NEG_INFTY, 0)
    decoder_cross_attention_mask = torch.where(decoder_padding_mask_cross_attention, NEG_INFTY, 0)
    print(f"encoder_self_attention_mask {encoder_self_attention_mask.size()}: {encoder_self_attention_mask[0, :10, :10]}")
    print(f"decoder_self_attention_mask {decoder_self_attention_mask.size()}: {decoder_self_attention_mask[0, :10, :10]}")
    print(f"decoder_cross_attention_mask {decoder_cross_attention_mask.size()}: {decoder_cross_attention_mask[0, :10, :10]}")
    return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask

In [None]:
batch

[('Browse the various methods of the current accessible',
  'Hide private attributes',
  'Method'),
 ('इस समय जिसे प्राप्त किया गया हो, उसकी विभिन्न विधियों (मेथड) में विचरण करें',
  'निजी गुणों को छिपाएं',
  'विधि')]

In [None]:
batch[0]

('Browse the various methods of the current accessible',
 'Hide private attributes',
 'Method')

In [None]:
#masking ofor deocder

create_masks(batch[0], batch[1])

encoder_self_attention_mask torch.Size([3, 200, 200]): tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
decoder_self_attention_mask torch.Size([3, 200, 200]): tensor([[ 0.0000e+00, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09,
         -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00, -1.0000e+09, -1.0000e+09, -1.0000e+09,
         -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00, -1.0000e+09, -1.0000e+09,
         -1.0000e

(tensor([[[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          ...,
          [-1.0000e+09, -1.0000e+09, -1.0000e+09,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          [-1.0000e+09, -1.0000e+09, -1.0000e+09,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          [-1.0000e+09, -1.0000e+09, -1.0000e+09,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09]],
 
         [[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          ...,
    

In [None]:
def get_device():
    return torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

class SentenceEmbedding(nn.Module):
    "For a given sentence, create an embedding"
    def __init__(self, max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN):
        super().__init__()
        self.vocab_size = len(language_to_index)
        self.max_sequence_length = max_sequence_length
        self.embedding = nn.Embedding(self.vocab_size, d_model)
        self.language_to_index = language_to_index
        self.position_encoder = RoPEEmbedding(d_model)
        self.dropout = nn.Dropout(p=0.1)
        self.START_TOKEN = START_TOKEN
        self.END_TOKEN = END_TOKEN
        self.PADDING_TOKEN = PADDING_TOKEN

    def batch_tokenize(self, batch, start_token=True, end_token=True):

        def tokenize(sentence, start_token=True, end_token=True):
            sentence_word_indicies = [self.language_to_index[token] for token in list(sentence)]
            if start_token:
                sentence_word_indicies.insert(0, self.language_to_index[self.START_TOKEN])
            if end_token:
                sentence_word_indicies.append(self.language_to_index[self.END_TOKEN])
            for _ in range(len(sentence_word_indicies), self.max_sequence_length):
                sentence_word_indicies.append(self.language_to_index[self.PADDING_TOKEN])

            return torch.tensor(sentence_word_indicies, device=x.device)

        tokenized = []
        for sentence_num in range(len(batch)):
           tokenized.append( tokenize(batch[sentence_num], start_token, end_token) )
        tokenized = torch.stack(tokenized).to(x.device)
        print('tokenized',tokenized)
        return tokenized

    def forward(self, x, start_token=False,end_token=True): # sentence
        x = self.batch_tokenize(x ,end_token)
        x = self.embedding(x)
        pos = self.position_encoder(x)
        x = self.dropout(x + pos)
        return x

class RoPEEmbedding(torch.nn.Module):
    def __init__(self, embedding_dim):
        super().__init__()
        assert embedding_dim % 2 == 0, "Embedding dimension must be even for RoPE"
        self.embedding_dim = embedding_dim

    def forward(self, x):
        """
        Forward pass for Rotary Position Embedding.

        Args:
        - x: Tensor of shape (batch_size, seq_len, embedding_dim)

        Returns:
        - Tensor with RoPE applied to the last two dimensions.
        """
        seq_len = x.shape[1]

        # Generate position indices
        position_ids = torch.arange(seq_len, dtype=torch.float32, device=x.device)

        # Compute the rotary angles
        freqs = 1.0 / (10000 ** (torch.arange(0, self.embedding_dim, 2, dtype=torch.float32, device=x.device) / self.embedding_dim))
        angles = torch.einsum('i,j->ij', position_ids, freqs)

        # Create the rotation matrix for sin and cos embeddings
        sin = torch.sin(angles).repeat_interleave(2, dim=-1)
        cos = torch.cos(angles).repeat_interleave(2, dim=-1)

        # Apply rotation using cos and sin embeddings
        x1 = x * cos + self.rotate_half(x) * sin
        return x1
    def rotate_half(self,x):
          """
          Rotate the last dimension by swapping adjacent components and negating the correct ones.
          """
          x1 = x[..., ::2]  # Elements at even positions: x1, x3, x5
          x2 = x[..., 1::2]  # Elements at odd positions: x2, x4, x6
          return torch.flatten(torch.stack([-x2, x1], dim=-1), start_dim=-2)  # Interleave and negate correctly



In [None]:
batch

[('Browse the various methods of the current accessible',
  'Hide private attributes',
  'Method'),
 ('इस समय जिसे प्राप्त किया गया हो, उसकी विभिन्न विधियों (मेथड) में विचरण करें',
  'निजी गुणों को छिपाएं',
  'विधि')]

In [None]:
english_to_index

{'<start>': 0,
 ' ': 1,
 '!': 2,
 '"': 3,
 '#': 4,
 '$': 5,
 '%': 6,
 '&': 7,
 "'": 8,
 '(': 9,
 ')': 10,
 '*': 11,
 '+': 12,
 ',': 13,
 '-': 14,
 '.': 15,
 '/': 16,
 '0': 17,
 '1': 18,
 '2': 19,
 '3': 20,
 '4': 21,
 '5': 22,
 '6': 23,
 '7': 24,
 '8': 25,
 '9': 26,
 ':': 27,
 '<': 28,
 '=': 29,
 '>': 30,
 '?': 31,
 '@': 32,
 'A': 33,
 'B': 34,
 'C': 35,
 'D': 36,
 'E': 37,
 'F': 38,
 'G': 39,
 'H': 40,
 'I': 41,
 'J': 42,
 'K': 43,
 'L': 44,
 'M': 45,
 'N': 46,
 'O': 47,
 'P': 48,
 'Q': 49,
 'R': 50,
 'S': 51,
 'T': 52,
 'U': 53,
 'V': 54,
 'W': 55,
 'X': 56,
 'Y': 57,
 'Z': 58,
 '[': 59,
 '\\': 60,
 ']': 61,
 '^': 62,
 '_': 63,
 '`': 64,
 'a': 65,
 'b': 66,
 'c': 67,
 'd': 68,
 'e': 69,
 'f': 70,
 'g': 71,
 'h': 72,
 'i': 73,
 'j': 74,
 'k': 75,
 'l': 76,
 'm': 77,
 'n': 78,
 'o': 79,
 'p': 80,
 'q': 81,
 'r': 82,
 's': 83,
 't': 84,
 'u': 85,
 'v': 86,
 'w': 87,
 'x': 88,
 'y': 89,
 'z': 90,
 '{': 91,
 '|': 92,
 '}': 93,
 '~': 94,
 '<padding>': 95,
 '<end>': 96}

In [None]:
len(batch[0][0])

52

In [None]:
batch[0][0]

'Browse the various methods of the current accessible'

In [None]:
tokenize(batch[1][0],hindi_to_index,start_token=True,end_token=True)

tensor([  0,  37,  78,   1,  78,  71,  72,   1,  54,  83,  78,  90,   1,  67,
         95,  73,  82,  67,  95,  62,   1,  47,  83,  72,  82,   1,  49,  72,
         82,   1,  79,  93,  13,   1,  39,  78,  47,  84,   1,  75,  83,  70,
         83,  66,  95,  66,   1,  75,  83,  65,  83,  72,  93,  33,   1,   9,
         71,  90,  63,  59,  10,   1,  71,  90,  33,   1,  75,  83,  52,  73,
         61,   1,  47,  73,  90,  33, 108, 107, 107, 107, 107, 107, 107, 107,
        107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107,
        107, 107])

In [None]:
hindi_to_index

{'<start>': 0,
 ' ': 1,
 '!': 2,
 '"': 3,
 '#': 4,
 '$': 5,
 '%': 6,
 '&': 7,
 "'": 8,
 '(': 9,
 ')': 10,
 '*': 11,
 '+': 12,
 ',': 13,
 '-': 14,
 '.': 15,
 '/': 16,
 '0': 17,
 '1': 18,
 '2': 19,
 '3': 20,
 '4': 21,
 '5': 22,
 '6': 23,
 '7': 24,
 '8': 25,
 '9': 26,
 ':': 27,
 '<': 28,
 '=': 29,
 '>': 30,
 '?': 31,
 'ँ': 32,
 'ं': 33,
 'ः': 34,
 'अ': 35,
 'आ': 36,
 'इ': 37,
 'ई': 38,
 'उ': 39,
 'ऊ': 40,
 'ऋ': 41,
 'ऌ': 42,
 'ए': 43,
 'ऐ': 44,
 'ओ': 45,
 'औ': 46,
 'क': 47,
 'ख': 48,
 'ग': 49,
 'घ': 50,
 'ङ': 51,
 'च': 52,
 'छ': 53,
 'ज': 54,
 'झ': 55,
 'ञ': 56,
 'ट': 57,
 'ठ': 58,
 'ड': 59,
 'ढ': 60,
 'ण': 61,
 'त': 62,
 'थ': 63,
 'द': 64,
 'ध': 65,
 'न': 66,
 'प': 67,
 'फ': 68,
 'ब': 69,
 'भ': 70,
 'म': 71,
 'य': 72,
 'र': 73,
 'ल': 74,
 'व': 75,
 'श': 76,
 'ष': 77,
 'स': 78,
 'ह': 79,
 '़': 80,
 'ऽ': 81,
 'ा': 82,
 'ि': 83,
 'ी': 84,
 'ु': 85,
 'ू': 86,
 'ृ': 87,
 'ॄ': 88,
 'ॅ': 89,
 'े': 90,
 'ै': 91,
 'ॉ': 92,
 'ो': 93,
 'ौ': 94,
 '्': 95,
 'ॐ': 96,
 '०': 97,
 '१': 98,
 '२': 99,
 '३'

In [None]:
d_model = 2
max_sequence_length = 100
hn_vocab_size = len(hindi_vocabulary)
eng_vocab_size = len(english_vocabulary)
engtokenization=SentenceEmbedding(max_sequence_length, d_model, hindi_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
t1=engtokenization(batch[1],start_token=True,end_token=True)
t1.shape

tokenized tensor([[  0,  37,  78,   1,  78,  71,  72,   1,  54,  83,  78,  90,   1,  67,
          95,  73,  82,  67,  95,  62,   1,  47,  83,  72,  82,   1,  49,  72,
          82,   1,  79,  93,  13,   1,  39,  78,  47,  84,   1,  75,  83,  70,
          83,  66,  95,  66,   1,  75,  83,  65,  83,  72,  93,  33,   1,   9,
          71,  90,  63,  59,  10,   1,  71,  90,  33,   1,  75,  83,  52,  73,
          61,   1,  47,  73,  90,  33, 108, 107, 107, 107, 107, 107, 107, 107,
         107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107,
         107, 107],
        [  0,  66,  83,  54,  84,   1,  49,  85,  61,  93,  33,   1,  47,  93,
           1,  53,  83,  67,  82,  43,  33, 108, 107, 107, 107, 107, 107, 107,
         107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107,
         107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107,
         107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107,
         107, 107, 107

torch.Size([3, 100, 2])

In [None]:
#see in above each sentence is encoded by 200 token 3 batch so 3,200 eahc token 2 dim so 3,200,2

In [None]:
#test sample


In [None]:
t1# bathc of 3 sentenses this is for english so eahc sentence has 200 char and eahc char has 2 dim

tensor([[[-0.0000e+00, -0.0000e+00],
         [ 3.6776e-01, -1.5366e+00],
         [-5.8199e-02,  2.1279e-01],
         [ 5.6764e-02, -1.3904e-01],
         [ 1.5382e-01,  5.0668e-02],
         [-1.5258e+00, -1.8942e+00],
         [-9.3235e-01, -3.5708e-01],
         [-4.5978e-01, -1.2244e+00],
         [ 4.1604e-01,  2.9574e+00],
         [-5.9714e-01, -1.5521e-01],
         [-2.0421e-01,  2.0034e-01],
         [-9.8733e-01, -6.9212e-02],
         [-1.7813e+00, -7.5952e-02],
         [ 2.5517e+00,  1.7287e+00],
         [ 4.9911e-03,  3.0778e-01],
         [ 3.5298e-01,  1.1101e+00],
         [-1.7555e-01,  2.5419e-01],
         [ 1.1416e+00, -9.6630e-01],
         [-1.2312e+00, -2.4771e+00],
         [-8.3439e-01, -1.1151e+00],
         [ 6.5675e-01,  2.3415e+00],
         [ 5.6464e-03, -4.7954e-01],
         [ 1.1032e-02, -1.6947e-02],
         [ 0.0000e+00,  1.1678e+00],
         [-1.7812e+00,  1.9309e-01],
         [-1.4836e+00, -2.5043e+00],
         [ 1.0656e+00, -2.5406e+00],
 

In [None]:
eng_tokenized.shape

torch.Size([3, 200])

In [None]:
len(eng_tokenized[1])

200

In [None]:
eng_tokenized#for 1st batch 3 eng sentences

tensor([[34, 82, 79, 87, 83, 69,  1, 84, 72, 69,  1, 86, 65, 82, 73, 79, 85, 83,
          1, 77, 69, 84, 72, 79, 68, 83,  1, 79, 70,  1, 84, 72, 69,  1, 67, 85,
         82, 82, 69, 78, 84,  1, 65, 67, 67, 69, 83, 83, 73, 66, 76, 69, 95, 95,
         95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
         95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
         95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
         95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
         95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
         95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
         95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
         95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
         95, 95],
        [40, 73, 68, 69,  1, 80, 82, 73, 86, 65, 84, 69,  1, 65, 84, 84, 82, 73,
         6

In [None]:
# above was character embedding now we try for wordise embeding

In [None]:

#def batch_tokenize(batch):
def tokenize(sentence):
  words = sentence.split()  # ✅ Now splits by words
  # for below The .get() method of a dictionary tries to find the token in language_to_index.If token exists in the dictionary, it returns the corresponding index.If token does not exist in the dictionary, it returns language_to_index[PADDING_TOKEN],
  sentence_word_indices = [language_to_index.get(token, language_to_index[PADDING_TOKEN]) for token in words]
  print('sentence_word_indices',sentence_word_indices)
  return torch.tensor(sentence_word_indices)


# === Special Tokens ===
START_TOKEN = "<START>"
END_TOKEN = "<END>"
PADDING_TOKEN = "<PAD>"

# === Define Example Vocabulary ===
language_to_index = {
    "Browse": 1, "the": 2, "various": 3, "methods": 4, "of": 5, "current": 6, "accessible": 7,
    "Hide": 8, "private": 9, "attributes": 10, "Method": 11,
    START_TOKEN: 12, END_TOKEN: 13, PADDING_TOKEN: 0  # Special tokens
}

batch = [
    "Browse the various methods of the current accessible",
    "Hide private attributes",
    "Method"
]

tokenized_sentences = [tokenize(sentence) for sentence in batch]
print("Tokenized Sentences (Word Indices):\n", tokenized_sentences)


sentence_word_indices [1, 2, 3, 4, 5, 2, 6, 7]
sentence_word_indices [8, 9, 10]
sentence_word_indices [11]
Tokenized Sentences (Word Indices):
 [tensor([1, 2, 3, 4, 5, 2, 6, 7]), tensor([ 8,  9, 10]), tensor([11])]


In [None]:
torch.stack(tokenized_sentences)# u get his error cuz size of all tensor shud be same

RuntimeError: stack expects each tensor to be equal size, but got [8] at entry 0 and [3] at entry 1

In [None]:
# so we make it same using truncation

def batch_tokenize(batch,max_seq_length,start,end):
  def tokenize(sentence):
    words = sentence.split()  # ✅ Now splits by words
    # for below The .get() method of a dictionary tries to find the token in language_to_index.If token exists in the dictionary, it returns the corresponding index.If token does not exist in the dictionary, it returns language_to_index[PADDING_TOKEN],
    sentence_word_indices = [language_to_index.get(token, language_to_index[PADDING_TOKEN]) for token in words]
    print('sentence_word_indices',sentence_word_indices)
    # ✅ Apply truncation BEFORE adding special tokens
    print('after truncation')
    sentence_word_indices = sentence_word_indices[:max_seq_length - (1 if start else 0) - (1 if end else 0)]
    print('sentence_word_indices',sentence_word_indices)
    # Apply padding if needed (padding should be added after truncation in case after trunction needs padding)
    if len(sentence_word_indices) < max_seq_length:
        padding_length = max_seq_length - len(sentence_word_indices)
        sentence_word_indices.extend([language_to_index[PADDING_TOKEN]] * padding_length)

    print('after padding')
    print('sentence_word_indices',sentence_word_indices)
    return torch.tensor(sentence_word_indices)

  return torch.stack([tokenize(sentence) for sentence in batch])

# === Special Tokens ===
START_TOKEN = "<START>"
END_TOKEN = "<END>"
PADDING_TOKEN = "<PAD>"

# === Define Example Vocabulary ===
language_to_index = {
    "Browse": 1, "the": 2, "various": 3, "methods": 4, "of": 5, "current": 6, "accessible": 7,
    "Hide": 8, "private": 9, "attributes": 10, "Method": 11,
    START_TOKEN: 12, END_TOKEN: 13, PADDING_TOKEN: 0  # Special tokens
}

batch = [
    "Browse the various methods of the current accessible",
    "Hide private attributes of them",
    "Method acting"
]
maxtoken=3


tokenized_sentences = batch_tokenize(batch,maxtoken,start=False,end=False)
print("Tokenized Sentences (Word Indices):\n", tokenized_sentences)


sentence_word_indices [1, 2, 3, 4, 5, 2, 6, 7]
after truncation
sentence_word_indices [1, 2, 3]
after padding
sentence_word_indices [1, 2, 3]
sentence_word_indices [8, 9, 10, 5, 0]
after truncation
sentence_word_indices [8, 9, 10]
after padding
sentence_word_indices [8, 9, 10]
sentence_word_indices [11, 0]
after truncation
sentence_word_indices [11, 0]
after padding
sentence_word_indices [11, 0, 0]
Tokenized Sentences (Word Indices):
 tensor([[ 1,  2,  3],
        [ 8,  9, 10],
        [11,  0,  0]])


In [None]:
language_to_index.get('Method','not found')

11

In [None]:
import torch
import torch.nn
# now add start and end token

# so we make it same using truncation

def batch_tokenize(batch,max_seq_length,language_to_index,start,end):
  def tokenize(sentence):
    words = sentence.split()  # ✅ Now splits by words
    # for below The .get() method of a dictionary tries to find the token in language_to_index.If token exists in the dictionary, it returns the corresponding index.If token does not exist in the dictionary, it returns language_to_index[PADDING_TOKEN],
    sentence_word_indices = [language_to_index.get(token, language_to_index[PADDING_TOKEN]) for token in words]
    print('sentence_word_indices',sentence_word_indices)
    # ✅ Apply truncation BEFORE adding special tokens
    print('after truncation')
    sentence_word_indices = sentence_word_indices[:max_seq_length - (1 if start else 0) - (1 if end else 0)]
    print('sentence_word_indices',sentence_word_indices)

    # ✅ Add special tokens
    if start:
        sentence_word_indices.insert(0, language_to_index[START_TOKEN])
        print('after start',sentence_word_indices)
    if end:
        sentence_word_indices.append(language_to_index[END_TOKEN])
        print('after end',sentence_word_indices)

    # Apply padding if needed (padding should be added after truncation in case after trunction needs padding)
    if len(sentence_word_indices) < max_seq_length:
        padding_length = max_seq_length - len(sentence_word_indices)
        sentence_word_indices.extend([language_to_index[PADDING_TOKEN]] * padding_length)

    print('after padding')
    print('sentence_word_indices',sentence_word_indices)
    return torch.tensor(sentence_word_indices)

  return torch.stack([tokenize(sentence) for sentence in batch])

# === Special Tokens ===
START_TOKEN = "<START>"
END_TOKEN = "<END>"
PADDING_TOKEN = "<PAD>"

# === Define Example Vocabulary ===
language_to_index = {
    "Browse": 1, "the": 2, "various": 3, "methods": 4, "of": 5, "current": 6, "accessible": 7,
    "Hide": 8, "private": 9, "attributes": 10, "Method": 11,
    START_TOKEN: 12, END_TOKEN: 13, PADDING_TOKEN: 0  # Special tokens
}
print('language_to_index',language_to_index)
batch = [
    "Browse the various methods of the current accessible",
    "Hide private attributes of them",
    "Method acting"
]
maxtoken=3


tokenized_sentences = batch_tokenize(batch,maxtoken,language_to_index=language_to_index,start=True,end=False)
print("Tokenized Sentences (Word Indices):\n", tokenized_sentences)


language_to_index {'Browse': 1, 'the': 2, 'various': 3, 'methods': 4, 'of': 5, 'current': 6, 'accessible': 7, 'Hide': 8, 'private': 9, 'attributes': 10, 'Method': 11, '<START>': 12, '<END>': 13, '<PAD>': 0}
sentence_word_indices [1, 2, 3, 4, 5, 2, 6, 7]
after truncation
sentence_word_indices [1, 2]
after start [12, 1, 2]
after padding
sentence_word_indices [12, 1, 2]
sentence_word_indices [8, 9, 10, 5, 0]
after truncation
sentence_word_indices [8, 9]
after start [12, 8, 9]
after padding
sentence_word_indices [12, 8, 9]
sentence_word_indices [11, 0]
after truncation
sentence_word_indices [11, 0]
after start [12, 11, 0]
after padding
sentence_word_indices [12, 11, 0]
Tokenized Sentences (Word Indices):
 tensor([[12,  1,  2],
        [12,  8,  9],
        [12, 11,  0]])


In [None]:
#max len 5
import torch
import torch.nn


def batch_tokenize(batch,max_seq_length,language_to_index,start,end):
  def tokenize(sentence):
    words = sentence.split()  # ✅ Now splits by words
    # for below The .get() method of a dictionary tries to find the token in language_to_index.If token exists in the dictionary, it returns the corresponding index.If token does not exist in the dictionary, it returns language_to_index[PADDING_TOKEN],
    sentence_word_indices = [language_to_index.get(token, language_to_index[PADDING_TOKEN]) for token in words]
    print('sentence_word_indices',sentence_word_indices)
    # ✅ Apply truncation BEFORE adding special tokens
    print('after truncation')
    sentence_word_indices = sentence_word_indices[:max_seq_length - (1 if start else 0) - (1 if end else 0)]
    print('sentence_word_indices',sentence_word_indices)

    # ✅ Add special tokens
    if start:
        sentence_word_indices.insert(0, language_to_index[START_TOKEN])
        print('after start',sentence_word_indices)
    if end:
        sentence_word_indices.append(language_to_index[END_TOKEN])
        print('after end',sentence_word_indices)

    # Apply padding if needed (padding should be added after truncation in case after trunction needs padding)
    if len(sentence_word_indices) < max_seq_length:
        padding_length = max_seq_length - len(sentence_word_indices)
        sentence_word_indices.extend([language_to_index[PADDING_TOKEN]] * padding_length)

    print('after padding')
    print('sentence_word_indices',sentence_word_indices)
    return torch.tensor(sentence_word_indices)

  return torch.stack([tokenize(sentence) for sentence in batch])

# === Special Tokens ===
START_TOKEN = "<START>"
END_TOKEN = "<END>"
PADDING_TOKEN = "<PAD>"

# === Define Example Vocabulary ===
language_to_index = {
    "Browse": 1, "the": 2, "various": 3, "methods": 4, "of": 5, "current": 6, "accessible": 7,
    "Hide": 8, "private": 9, "attributes": 10, "Method": 11,
    START_TOKEN: 12, END_TOKEN: 13, PADDING_TOKEN: 0  # Special tokens
}
print('language_to_index',language_to_index)
batch = [
    "Browse the various methods of the current accessible",
    "Hide private attributes of them",
    "Method acting"
]
maxtoken=5


tokenized_sentences = batch_tokenize(batch,maxtoken,language_to_index=language_to_index,start=True,end=True)
print("Tokenized Sentences (Word Indices):\n", tokenized_sentences)


language_to_index {'Browse': 1, 'the': 2, 'various': 3, 'methods': 4, 'of': 5, 'current': 6, 'accessible': 7, 'Hide': 8, 'private': 9, 'attributes': 10, 'Method': 11, '<START>': 12, '<END>': 13, '<PAD>': 0}
sentence_word_indices [1, 2, 3, 4, 5, 2, 6, 7]
after truncation
sentence_word_indices [1, 2, 3]
after start [12, 1, 2, 3]
after end [12, 1, 2, 3, 13]
after padding
sentence_word_indices [12, 1, 2, 3, 13]
sentence_word_indices [8, 9, 10, 5, 0]
after truncation
sentence_word_indices [8, 9, 10]
after start [12, 8, 9, 10]
after end [12, 8, 9, 10, 13]
after padding
sentence_word_indices [12, 8, 9, 10, 13]
sentence_word_indices [11, 0]
after truncation
sentence_word_indices [11, 0]
after start [12, 11, 0]
after end [12, 11, 0, 13]
after padding
sentence_word_indices [12, 11, 0, 13, 0]
Tokenized Sentences (Word Indices):
 tensor([[12,  1,  2,  3, 13],
        [12,  8,  9, 10, 13],
        [12, 11,  0, 13,  0]])


In [None]:
#now for eng to hin

# Hindi vocabulary
hindi_to_index = {
    "इस": 1, "समय": 2, "जिसे": 3, "प्राप्त": 4, "किया": 5, "गया": 6, "हो": 7, "विभिन्न": 8,
    "विधियों": 9, "में": 10, "विचरण": 11, "करें": 12, "निजी": 13, "गुणों": 14, "को": 15,
    "छिपाएं": 16, "विधि": 17,
    START_TOKEN: 18, END_TOKEN: 19, PADDING_TOKEN: 0  # Special tokens
}

# === Special Tokens ===
START_TOKEN = "<START>"
END_TOKEN = "<END>"
PADDING_TOKEN = "<PAD>"

# === Define Example Vocabulary ===
eng_to_index = {
    "Browse": 1, "the": 2, "various": 3, "methods": 4, "of": 5, "current": 6, "accessible": 7,
    "Hide": 8, "private": 9, "attributes": 10, "Method": 11,
    START_TOKEN: 12, END_TOKEN: 13, PADDING_TOKEN: 0  # Special tokens
}


# === Example Bilingual Sentences (English ↔ Hindi) ===
bilingual_batch = [
    ("Browse the various methods of the current accessible", "इस समय जिसे प्राप्त किया गया हो, उसकी विभिन्न विधियों (मेथड) में विचरण करें"),
    ("Hide private attributes", "निजी गुणों को छिपाएं"),
    ("Method", "विधि")
]

# === Parameters ===
max_seq_length = 5  # ✅ Limit to 5 words per sentence

# === Separate English and Hindi Sentences ===
english_sentences = [pair[0] for pair in bilingual_batch]
hindi_sentences = [pair[1] for pair in bilingual_batch]

# === Tokenization ===
english_tokenized = batch_tokenize(english_sentences, max_seq_length, eng_to_index,start=True,end=True)
hindi_tokenized = batch_tokenize(hindi_sentences, max_seq_length, hindi_to_index,start=True,end=True)

print("English Tokenized:\n", english_tokenized)
print("Hindi Tokenized:\n", hindi_tokenized)


sentence_word_indices [1, 2, 3, 4, 5, 2, 6, 7]
after truncation
sentence_word_indices [1, 2, 3]
after start [12, 1, 2, 3]
after end [12, 1, 2, 3, 13]
after padding
sentence_word_indices [12, 1, 2, 3, 13]
sentence_word_indices [8, 9, 10]
after truncation
sentence_word_indices [8, 9, 10]
after start [12, 8, 9, 10]
after end [12, 8, 9, 10, 13]
after padding
sentence_word_indices [12, 8, 9, 10, 13]
sentence_word_indices [11]
after truncation
sentence_word_indices [11]
after start [12, 11]
after end [12, 11, 13]
after padding
sentence_word_indices [12, 11, 13, 0, 0]
sentence_word_indices [1, 2, 3, 4, 5, 6, 0, 0, 8, 9, 0, 10, 11, 12]
after truncation
sentence_word_indices [1, 2, 3]
after start [18, 1, 2, 3]
after end [18, 1, 2, 3, 19]
after padding
sentence_word_indices [18, 1, 2, 3, 19]
sentence_word_indices [13, 14, 15, 16]
after truncation
sentence_word_indices [13, 14, 15]
after start [18, 13, 14, 15]
after end [18, 13, 14, 15, 19]
after padding
sentence_word_indices [18, 13, 14, 15, 19]

In [None]:
language_to_index

{'Browse': 1,
 'the': 2,
 'various': 3,
 'methods': 4,
 'of': 5,
 'current': 6,
 'accessible': 7,
 'Hide': 8,
 'private': 9,
 'attributes': 10,
 'Method': 11,
 '<START>': 12,
 '<END>': 13,
 '<PAD>': 0}

In [None]:
#max len 5 #final
import torch
import torch.nn


def batch_tokenize(batch,max_seq_length,language_to_index,start,end):
  def tokenize(sentence):
    words = sentence.split()  # ✅ Now splits by words
    # for below The .get() method of a dictionary tries to find the token in language_to_index.If token exists in the dictionary, it returns the corresponding index.If token does not exist in the dictionary, it returns language_to_index[PADDING_TOKEN],
    sentence_word_indices = [language_to_index.get(token, language_to_index[PADDING_TOKEN]) for token in words]
    # ✅ Apply truncation BEFORE adding special tokens
    sentence_word_indices = sentence_word_indices[:max_seq_length - (1 if start else 0) - (1 if end else 0)]

    # ✅ Add special tokens
    if start:
        sentence_word_indices.insert(0, language_to_index[START_TOKEN])

    if end:
        sentence_word_indices.append(language_to_index[END_TOKEN])


    # Apply padding if needed (padding should be added after truncation in case after trunction needs padding)
    if len(sentence_word_indices) < max_seq_length:
        padding_length = max_seq_length - len(sentence_word_indices)
        sentence_word_indices.extend([language_to_index[PADDING_TOKEN]] * padding_length)

    return torch.tensor(sentence_word_indices)

  return torch.stack([tokenize(sentence) for sentence in batch])

# === Special Tokens ===
START_TOKEN = "<START>"
END_TOKEN = "<END>"
PADDING_TOKEN = "<PAD>"
eng_to_index = {
    "Browse": 1, "the": 2, "various": 3, "methods": 4, "of": 5, "current": 6, "accessible": 7,
    "Hide": 8, "private": 9, "attributes": 10, "Method": 11,
    START_TOKEN: 12, END_TOKEN: 13, PADDING_TOKEN: 0  # Special tokens
}
hindi_to_index = {
    "इस": 1, "समय": 2, "जिसे": 3, "प्राप्त": 4, "किया": 5, "गया": 6, "हो": 7, "विभिन्न": 8,
    "विधियों": 9, "में": 10, "विचरण": 11, "करें": 12, "निजी": 13, "गुणों": 14, "को": 15,
    "छिपाएं": 16, "विधि": 17,
    START_TOKEN: 18, END_TOKEN: 19, PADDING_TOKEN: 0  # Special tokens
}

# === Example Bilingual Sentences (English ↔ Hindi) ===
bilingual_batch = [
    ("Browse the various methods of the current accessible", "इस समय जिसे प्राप्त किया गया हो, उसकी विभिन्न विधियों (मेथड) में विचरण करें"),
    ("Hide private attributes", "निजी गुणों को छिपाएं"),
    ("Method", "विधि")
]

# === Parameters ===
max_seq_length = 5  # ✅ Limit to 5 words per sentence

# === Separate English and Hindi Sentences ===
english_sentences = [pair[0] for pair in bilingual_batch]
hindi_sentences = [pair[1] for pair in bilingual_batch]

# === Tokenization ===
english_tokenized = batch_tokenize(english_sentences, max_seq_length, eng_to_index,start=True,end=True)
hindi_tokenized = batch_tokenize(hindi_sentences, max_seq_length, hindi_to_index,start=True,end=True)

print("English Tokenized:\n", english_tokenized)
print("Hindi Tokenized:\n", hindi_tokenized)





English Tokenized:
 tensor([[12,  1,  2,  3, 13],
        [12,  8,  9, 10, 13],
        [12, 11, 13,  0,  0]])
Hindi Tokenized:
 tensor([[18,  1,  2,  3, 19],
        [18, 13, 14, 15, 19],
        [18, 17, 19,  0,  0]])


In [None]:
class SentenceEmbedding(nn.Module):
    "For a given sentence, create an embedding"
    def __init__(self, max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN):
        super().__init__()
        self.vocab_size = len(language_to_index)
        self.max_sequence_length = max_sequence_length
        # create embedding of dmodel from input tokens
        self.embedding = nn.Embedding(self.vocab_size, d_model)
        self.language_to_index = language_to_index
        self.position_encoder = RoPEEmbedding(d_model)
        self.dropout = nn.Dropout(p=0.1)
        self.START_TOKEN = START_TOKEN
        self.END_TOKEN = END_TOKEN
        self.PADDING_TOKEN = PADDING_TOKEN

    def batch_tokenize(self,batch,start,end):
      def tokenize(sentence):
        words = sentence.split()  # ✅ Now splits by words
        # for below The .get() method of a dictionary tries to find the token in language_to_index.If token exists in the dictionary, it returns the corresponding index.If token does not exist in the dictionary, it returns language_to_index[PADDING_TOKEN],
        sentence_word_indices = [self.language_to_index.get(token, self.language_to_index[self.PADDING_TOKEN]) for token in words]
        # ✅ Apply truncation BEFORE adding special tokens
        sentence_word_indices = sentence_word_indices[:self.max_sequence_length - (1 if start else 0) - (1 if end else 0)]

        # ✅ Add special tokens
        if start:
            sentence_word_indices.insert(0, self.language_to_index[self.START_TOKEN])

        if end:
            sentence_word_indices.append(self.language_to_index[self.END_TOKEN])


        # Apply padding if needed (padding should be added after truncation in case after trunction needs padding)
        if len(sentence_word_indices) < self.max_sequence_length:
            padding_length = self.max_sequence_length - len(sentence_word_indices)
            sentence_word_indices.extend([self.language_to_index[self.PADDING_TOKEN]] * padding_length)

        return torch.tensor(sentence_word_indices)

      return torch.stack([tokenize(sentence) for sentence in batch])


    def forward(self, x,start,end): # sentence
        x = self.batch_tokenize(x,start,end)
        (print('x',x))
        x = self.embedding(x)
        pos = self.position_encoder(x)
        x = self.dropout(x + pos)
        return x


#rope embedding
class RoPEEmbedding(torch.nn.Module):
    def __init__(self, embedding_dim):
        super().__init__()
        assert embedding_dim % 2 == 0, "Embedding dimension must be even for RoPE"
        self.embedding_dim = embedding_dim

    def forward(self, x):
        """
        Forward pass for Rotary Position Embedding.

        Args:
        - x: Tensor of shape (batch_size, seq_len, embedding_dim)

        Returns:
        - Tensor with RoPE applied to the last two dimensions.
        """
        seq_len = x.shape[1]

        # Generate position indices
        position_ids = torch.arange(seq_len, dtype=torch.float32, device=x.device)

        # Compute the rotary angles
        freqs = 1.0 / (10000 ** (torch.arange(0, self.embedding_dim, 2, dtype=torch.float32, device=x.device) / self.embedding_dim))
        angles = torch.einsum('i,j->ij', position_ids, freqs)

        # Create the rotation matrix for sin and cos embeddings
        sin = torch.sin(angles).repeat_interleave(2, dim=-1)
        cos = torch.cos(angles).repeat_interleave(2, dim=-1)

        # Apply rotation using cos and sin embeddings
        x1 = x * cos + self.rotate_half(x) * sin
        return x1


    def rotate_half(self,x):
        """
        Rotate the last dimension of the input tensor by swapping odd and even elements and negating one.

        Args:
        - x: Tensor of shape (..., embedding_dim)

        Returns:
        - Rotated tensor of the same shape.
        """
        x1, x2 = x[..., ::2], x[..., 1::2]  # Split into even and odd dimensions
        return torch.cat((-x2, x1), dim=-1)

#sine embedding
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

    def forward(self, x):
        batch_size, seq_len, _ = x.size()

        # Generate the positional encoding
        even_i = torch.arange(0, self.d_model, 2).float()
        denominator = torch.pow(10000, even_i/self.d_model)
        position = torch.arange(self.max_sequence_length).reshape(self.max_sequence_length, 1)
        even_PE = torch.sin(position / denominator)
        odd_PE = torch.cos(position / denominator)
        stacked = torch.stack([even_PE, odd_PE], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)

        # Ensure the PE matches the batch size and sequence length
        PE = PE.unsqueeze(0).expand(batch_size, seq_len, self.d_model)

        return PE + x




In [None]:
#testing


# === Special Tokens ===
START_TOKEN = "<START>"
END_TOKEN = "<END>"
PADDING_TOKEN = "<PAD>"
eng_to_index = {
    "Browse": 2, "the": 1, "various": 4, "methods": 3, "of": 5, "current": 7, "accessible": 6,
    "Hide": 11, "private": 10, "attributes": 12, "Method": 9,
    START_TOKEN: 12, END_TOKEN: 13, PADDING_TOKEN: 0  # Special tokens
}
hindi_to_index = {
    "इस": 1, "समय": 2, "जिसे": 3, "प्राप्त": 4, "किया": 5, "गया": 6, "हो": 7, "विभिन्न": 8,
    "विधियों": 9, "में": 10, "विचरण": 11, "करें": 12, "निजी": 13, "गुणों": 14, "को": 15,
    "छिपाएं": 16, "विधि": 17,
    START_TOKEN: 18, END_TOKEN: 19, PADDING_TOKEN: 0  # Special tokens
}

# === Example Bilingual Sentences (English ↔ Hindi) ===
bilingual_batch = [
    ("Browse the various methods of the current accessible", "इस समय जिसे प्राप्त किया गया हो, उसकी विभिन्न विधियों (मेथड) में विचरण करें"),
    ("Hide private attributes", "निजी गुणों को छिपाएं"),
    ("Method", "विधि")
]

# === Separate English and Hindi Sentences ===
english_sentences = [pair[0] for pair in bilingual_batch]
hindi_sentences = [pair[1] for pair in bilingual_batch]

index_to_hindi = {v: k for k, v in hindi_to_index.items()}

# === Parameters ===
max_seq_length = 5  # ✅ Limit to 5 words per sentence

# === Separate English and Hindi Sentences ===
english_sentences = [pair[0] for pair in bilingual_batch]
hindi_sentences = [pair[1] for pair in bilingual_batch]

d_model = 2
batch_size = 3
ffn_hidden = 2048
num_heads = 8
drop_prob = 0.1
num_layers = 1
max_sequence_length = 5
hn_vocab_size = len(hindi_to_index)
eng_vocab_size = len(eng_to_index)
torch.manual_seed(2)
tokenization=SentenceEmbedding(max_sequence_length, d_model, hindi_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
#engtoken=tokenization(english_sentences,start=True,end=True)
#print('engtoken',engtoken)
sample='इस'
sample.split()
print('hindi_sentences',sample.split())
hintoken=tokenization(sample.split(),start=True,end=True)

#print('hintoken',hintoken)

hindi_sentences ['इस']
x tensor([[18,  1, 19,  0,  0]])


In [None]:
#below is for char by char embedding

In [2]:
sample='इस'
sample.split()

['इस']

In [69]:


class SentenceEmbedding(nn.Module):
    "For a given sentence, create an embedding"
    def __init__(self, max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN):
        super().__init__()
        self.vocab_size = len(language_to_index)
        self.max_sequence_length = max_sequence_length
        self.embedding = nn.Embedding(self.vocab_size, d_model)
        self.language_to_index = language_to_index
        self.position_encoder = RoPEEmbedding(d_model)
        self.dropout = nn.Dropout(p=0.1)
        self.START_TOKEN = START_TOKEN
        self.END_TOKEN = END_TOKEN
        self.PADDING_TOKEN = PADDING_TOKEN

    def batch_tokenize(self, batch, start_token=True, end_token=True):
      def tokenize(sentence, start_token=True, end_token=True):
          # Convert sentence to list of word indices
          sentence_word_indices = [self.language_to_index[token] for token in list(sentence)]

          # Add start and end tokens if needed
          if start_token:
              sentence_word_indices.insert(0, self.language_to_index[self.START_TOKEN])
          if end_token:
              sentence_word_indices.append(self.language_to_index[self.END_TOKEN])

          # Pad the sentence to max_sequence_length
          while len(sentence_word_indices) < self.max_sequence_length:
              sentence_word_indices.append(self.language_to_index[self.PADDING_TOKEN])

          # Ensure the sequence length doesn't exceed max_sequence_length
          sentence_word_indices = sentence_word_indices[:self.max_sequence_length]

          return torch.tensor(sentence_word_indices)

      tokenized = []
      for sentence_num in range(len(batch)):
          tokenized.append(tokenize(batch[sentence_num], start_token, end_token))

      # All sentences are now padded to the same length, so stack them
      tokenized = torch.stack(tokenized)  # All tensors will have the same size now
      return tokenized


    def forward(self, x,start_token, end_token=True): # sentence
        x = self.batch_tokenize(x ,start_token,end_token)
        x = self.embedding(x)
        pos = self.position_encoder(x)
        x = self.dropout(x + pos)
        return x

class RoPEEmbedding(torch.nn.Module):
    def __init__(self, embedding_dim):
        super().__init__()
        assert embedding_dim % 2 == 0, "Embedding dimension must be even for RoPE"
        self.embedding_dim = embedding_dim

    def forward(self, x):
        """
        Forward pass for Rotary Position Embedding.

        Args:
        - x: Tensor of shape (batch_size, seq_len, embedding_dim)

        Returns:
        - Tensor with RoPE applied to the last two dimensions.
        """
        seq_len = x.shape[1]

        # Generate position indices
        position_ids = torch.arange(seq_len, dtype=torch.float32, device=x.device)

        # Compute the rotary angles
        freqs = 1.0 / (10000 ** (torch.arange(0, self.embedding_dim, 2, dtype=torch.float32, device=x.device) / self.embedding_dim))
        angles = torch.einsum('i,j->ij', position_ids, freqs)

        # Create the rotation matrix for sin and cos embeddings
        sin = torch.sin(angles).repeat_interleave(2, dim=-1)
        cos = torch.cos(angles).repeat_interleave(2, dim=-1)

        # Apply rotation using cos and sin embeddings
        x1 = x * cos + self.rotate_half(x) * sin
        return x1
    def rotate_half(self,x):
          """
          Rotate the last dimension by swapping adjacent components and negating the correct ones.
          """
          x1 = x[..., ::2]  # Elements at even positions: x1, x3, x5
          x2 = x[..., 1::2]  # Elements at odd positions: x2, x4, x6
          return torch.flatten(torch.stack([-x2, x1], dim=-1), start_dim=-2)  # Interleave and negate correctly



In [70]:
#testing


# === Special Tokens ===
START_TOKEN = "<START>"
END_TOKEN = "<END>"
PADDING_TOKEN = "<PAD>"

START_TOKEN = '<start>'
PADDING_TOKEN = '<padding>'
END_TOKEN = '<end>'

english_vocabulary = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
                      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', '@',
                      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                      'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                      'Y', 'Z', '[', '\\', ']', '^', '_', '`',
                      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                      'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
                      'y', 'z', '{', '|', '}', '~', PADDING_TOKEN, END_TOKEN]



hindi_vocabulary = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
                    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', 'ँ', 'ं', 'ः',
                    'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ऌ', 'ए', 'ऐ', 'ओ', 'औ',
                    'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण',
                    'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह',
                    '़', 'ऽ', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'ॄ', 'ॅ', 'े', 'ै', 'ॉ', 'ो', 'ौ', '्', 'ॐ',
                    '०', '१', '२', '३', '४', '५', '६', '७', '८', '९', PADDING_TOKEN, END_TOKEN]





index_to_hindi = {k:v for k,v in enumerate(hindi_vocabulary)}
hindi_to_index = {v:k for k,v in enumerate(hindi_vocabulary)}
index_to_english = {k:v for k,v in enumerate(english_vocabulary)}
english_to_index = {v:k for k,v in enumerate(english_vocabulary)}
# === Example Bilingual Sentences (English ↔ Hindi) ===


# === Separate English and Hindi Sentences ===
english_sentences =batch[0]
hindi_sentences = batch[1]



d_model = 2
batch_size = 3
ffn_hidden = 2048
num_heads = 8
drop_prob = 0.1
num_layers = 1
max_sequence_length = 3
torch.manual_seed(2)
engtokenization=SentenceEmbedding(max_sequence_length, d_model, english_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
engtoken=engtokenization(batch[0],start_token=True,end_token=True)
print('engtoken',engtoken.shape)


engtoken torch.Size([3, 3, 2])


In [71]:
max_sequence_length=3
sample=['इस','निजी गुणों']
print('hindi_sentences',sample)
hintokenization=SentenceEmbedding(max_sequence_length, d_model, hindi_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
hintoken=hintokenization(sample,start_token=True,end_token=True)
print('hintoken',hintoken.shape)

hindi_sentences ['इस', 'निजी गुणों']
hintoken torch.Size([2, 3, 2])


In [72]:
hintoken

tensor([[[ 2.0528, -2.7078],
         [ 2.1669, -0.7417],
         [-1.0729,  1.6896]],

        [[ 2.0528, -2.7078],
         [-4.9444,  1.3839],
         [ 0.0000, -0.0000]]], grad_fn=<MulBackward0>)

In [1]:
#break into batches

In [None]:
import pandas as pd



# Read the CSV file
df = pd.read_csv('Deeplearning/LLMs from scratch/hindi_english_parallel.csv')

# Check the data
df.head()


FileNotFoundError: [Errno 2] No such file or directory: 'Deeplearning\\LLMs_from_scratch\\hindi_english_parallel.csv'

In [104]:
df.isnull().sum()

Unnamed: 0,0
English,2
Hindi,312


In [105]:
df[df.isnull()].head()

Unnamed: 0,English,Hindi
0,,
1,,
2,,
3,,
4,,


In [106]:
df.shape

(130476, 2)

In [107]:
df.dropna(inplace=True)

In [108]:
df.isnull().sum()

Unnamed: 0,0
English,0
Hindi,0


In [122]:
sample=df[:100000].copy()

In [123]:
engsen=sample['English'].to_list()
hindisen=sample['Hindi'].to_list()

In [124]:
engsen[:10],hindisen[:10]

(['Help!',
  'Jump.',
  'Jump.',
  'Jump.',
  'Hello!',
  'Hello!',
  'Cheers!',
  'Cheers!',
  'Got it?',
  "I'm OK."],
 ['बचाओ!',
  'उछलो.',
  'कूदो.',
  'छलांग.',
  'नमस्ते।',
  'नमस्कार।',
  'वाह-वाह!',
  'चियर्स!',
  'समझे कि नहीं?',
  'मैं ठीक हूँ।'])

In [165]:
engsen1=engsen[:10000]
hindisen1=hindisen[:10000]

In [166]:

max_sequence_length = 200
# to check if a token or character/alphabet ins engsen or hindi is present in about hindi/eng vocab pf charceter
def is_valid_tokens(sentence, vocab):
    for token in list(set(sentence)):
        if token not in vocab:
            return False
    return True

#to check if engsend or hindisen each sent has max 200 charcers
def is_valid_length(sentence, max_sequence_length):
    return len(list(sentence)) < (max_sequence_length - 1) # need to re-add the end token so leaving 1 space

valid_sentence_indicies = []
for index in range(len(hindisen1)):
    hindi_sentence, english_sentence = hindisen1[index], engsen1[index]
    if is_valid_length(hindi_sentence, max_sequence_length) \
      and is_valid_length(english_sentence, max_sequence_length) \
      and is_valid_tokens(hindi_sentence, hindi_vocabulary):
        valid_sentence_indicies.append(index)

print(f"Number of sentences: {len(hindisen1)}")
print(f"Number of valid sentences: {len(valid_sentence_indicies)}")

Number of sentences: 10000
Number of valid sentences: 4966


In [167]:
max_sequence_length = 200  # Maximum allowed characters
min_sequence_length = 40   # Minimum required characters

# Function to check if a sentence contains only valid tokens
def is_valid_tokens(sentence, vocab):
    for token in list(set(sentence)):  # Ensure unique characters are checked
        if token not in vocab:
            return False
    return True

# Function to check if sentence length is within min & max limits
def is_valid_length(sentence, min_length, max_length):
    sentence_length = len(sentence)  # No need to convert to list explicitly
    return min_length <= sentence_length < max_length  # Ensuring it fits the range

valid_sentence_indices = []
for index in range(len(hindisen1)):
    hindi_sentence, english_sentence = hindisen1[index], engsen1[index]

    if (is_valid_length(hindi_sentence, min_sequence_length, max_sequence_length) and
        is_valid_length(english_sentence, min_sequence_length, max_sequence_length) and
        is_valid_tokens(hindi_sentence, hindi_vocabulary)):

        valid_sentence_indices.append(index)

print(f"Total sentences: {len(hindisen1)}")
print(f"Valid sentences: {len(valid_sentence_indices)}")


Total sentences: 10000
Valid sentences: 2700


In [168]:
hindisen1 = [hindisen1[i] for i in valid_sentence_indices]
engsen1 = [engsen1[i] for i in valid_sentence_indices]


In [169]:
engsen1[:10]

['How much time do we have to finish this?',
 'Can you identify the man in this picture?',
 'Will you take us for a drive next Sunday?',
 'Could you send someone up to make the bed?',
 'Do you know the man standing on the bridge?',
 "You aren't leaving Japan for good, are you?",
 'Do you know this part of the city very well?',
 'If it rains tomorrow, will you stay at home?',
 'Are you going to cut down all the trees here?',
 'Did it not occur to you to close the windows?']

In [170]:
hindisen1[:10]

['हमारे पास इस काम को खतम करने के लिए कितना समय है?',
 'क्या तुम उस आदमी को उसकी तस्वीर से पहचान सकते हो?',
 'हमें अगले हफ़्ते ड्राईव पर लेजाओगे क्या?',
 'आप किसी को बिस्तर बनाने के लिए भेज सकते हैं क्या?',
 'तुम ब्रिज पर खड़े हुए आदमी को जानते हो क्या?',
 'तुम हमेशा के लिए तो जापान नहीं जा रहे हो ना?',
 'तुम शहर के इस हिस्से को अच्छी तरह से जानते हो क्या?',
 'अगर कल बारिश हुई तो तुम घर में रहोगे क्या?',
 'तुम यहाँ सारे के सारे पेड़ काट डालोगे क्या?',
 'तुम्हें खिड़कियाँ बंद करने की नहीं सूझी?']

In [171]:
#to craeted cuomt dataset pytorch inbuitl method used in our own csutom datas set class

from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):

    def __init__(self, english_sentences, lang_sentences):
        self.english_sentences = english_sentences
        self.lang_sentences = lang_sentences
    #rturn number of eng/hind or sentences in a list
    def __len__(self):
        return len(self.english_sentences)
    #return 1:1 mapping of one lang to other
    def __getitem__(self, idx):
        return self.english_sentences[idx], self.lang_sentences[idx]


dataset = TextDataset(engsen1, hindisen1)

In [172]:
dataset.english_sentences[0],dataset.lang_sentences[0]

('How much time do we have to finish this?',
 'हमारे पास इस काम को खतम करने के लिए कितना समय है?')

In [173]:
dataset[0]

('How much time do we have to finish this?',
 'हमारे पास इस काम को खतम करने के लिए कितना समय है?')

In [184]:
#this code will create batches
batch_size = 2
train_loader = DataLoader(dataset, batch_size)
iterator = iter(train_loader)

for batch_num, batch in enumerate(iterator):
    print(batch)
    if batch_num > 1:
        break

[('How much time do we have to finish this?', 'Can you identify the man in this picture?'), ('हमारे पास इस काम को खतम करने के लिए कितना समय है?', 'क्या तुम उस आदमी को उसकी तस्वीर से पहचान सकते हो?')]
[('Will you take us for a drive next Sunday?', 'Could you send someone up to make the bed?'), ('हमें अगले हफ़्ते ड्राईव पर लेजाओगे क्या?', 'आप किसी को बिस्तर बनाने के लिए भेज सकते हैं क्या?')]
[('Do you know the man standing on the bridge?', "You aren't leaving Japan for good, are you?"), ('तुम ब्रिज पर खड़े हुए आदमी को जानते हो क्या?', 'तुम हमेशा के लिए तो जापान नहीं जा रहे हो ना?')]


In [177]:
batch

[('Do you know this part of the city very well?',
  'If it rains tomorrow, will you stay at home?'),
 ('तुम शहर के इस हिस्से को अच्छी तरह से जानते हो क्या?',
  'अगर कल बारिश हुई तो तुम घर में रहोगे क्या?')]

In [178]:
eng_batch, ln_batch = batch

In [179]:
eng_batch

('Do you know this part of the city very well?',
 'If it rains tomorrow, will you stay at home?')

In [180]:
ln_batch

('तुम शहर के इस हिस्से को अच्छी तरह से जानते हो क्या?',
 'अगर कल बारिश हुई तो तुम घर में रहोगे क्या?')

In [185]:
#now combinig with sentence ebedding
batch_size = 2
train_loader = DataLoader(dataset, batch_size)
iterator = iter(train_loader)
d_model = 2
ffn_hidden = 2048
num_heads = 8
drop_prob = 0.1
num_layers = 1
max_sequence_length = 3
torch.manual_seed(2)
engtokenization=SentenceEmbedding(max_sequence_length, d_model, english_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
hintokenization=SentenceEmbedding(max_sequence_length, d_model, hindi_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)


for batch_num, batch in enumerate(iterator):
    print('\nbatch_num',batch_num+1)
    print('batch')
    print(batch)
    eng_batch, ln_batch = batch
    engtoken=engtokenization(eng_batch,start_token=True,end_token=True)
    print('engtoken',engtoken.shape)
    hintoken=hintokenization(ln_batch,start_token=True,end_token=True)
    print('hintoken',hintoken.shape)
    if batch_num > 1:
        break


batch_num 1
batch
[('How much time do we have to finish this?', 'Can you identify the man in this picture?'), ('हमारे पास इस काम को खतम करने के लिए कितना समय है?', 'क्या तुम उस आदमी को उसकी तस्वीर से पहचान सकते हो?')]
engtoken torch.Size([2, 3, 2])
hintoken torch.Size([2, 3, 2])

batch_num 2
batch
[('Will you take us for a drive next Sunday?', 'Could you send someone up to make the bed?'), ('हमें अगले हफ़्ते ड्राईव पर लेजाओगे क्या?', 'आप किसी को बिस्तर बनाने के लिए भेज सकते हैं क्या?')]
engtoken torch.Size([2, 3, 2])
hintoken torch.Size([2, 3, 2])

batch_num 3
batch
[('Do you know the man standing on the bridge?', "You aren't leaving Japan for good, are you?"), ('तुम ब्रिज पर खड़े हुए आदमी को जानते हो क्या?', 'तुम हमेशा के लिए तो जापान नहीं जा रहे हो ना?')]
engtoken torch.Size([2, 3, 2])
hintoken torch.Size([2, 3, 2])


In [None]:
#above works perfectly