### Get Data


In [1]:
train_en_path = "../train.en/train.en"
train_hi_path = "../train.hi/train.hi"

with open(train_en_path, 'r') as file:
    data_en = file.readlines()

with open(train_hi_path, 'r') as file:
    data_hi = file.readlines()

In [2]:
len(data_en), len(data_hi)

(8568307, 8568307)

In [21]:
START_TOKEN = '<START>'
PADDING_TOKEN = '<PADDING>'
END_TOKEN = '<END>'

english_vocabulary = [
    START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', 
    ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', '@', 
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 
    'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 
    'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', PADDING_TOKEN, END_TOKEN
]


hindi_vocabulary = [
    START_TOKEN, PADDING_TOKEN, END_TOKEN, '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    # Adding Devanagari characters (vowels and consonants)
    *(chr(code) for code in range(0x0900, 0x097F)),
    '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', '<', '=', '>', '?', '@', 
    '[', '\\', ']', '^', '_', '`', '।', '“', '”', '{', '|', '}', '~'
]



# for char in hindi_vocabulary:
#     print(char.encode('utf-8').decode('utf-8'))
len(english_vocabulary), len(hindi_vocabulary)


(97, 174)

In [22]:
len(set(english_vocabulary)), len(set(hindi_vocabulary))

(97, 173)

In [5]:
english_vocabulary = list(set(english_vocabulary))
hindi_vocabulary = list(set(hindi_vocabulary))

In [6]:
index_to_hindi = {k:v for k,v in enumerate(hindi_vocabulary)}
hindi_to_index = {v:k for k,v in enumerate(hindi_vocabulary)}
index_to_english = {k:v for k,v in enumerate(english_vocabulary)}
english_to_index = {v:k for k,v in enumerate(english_vocabulary)}

### Processing


In [7]:
Total_Sentences = 500

data_en, data_hi = data_en[:Total_Sentences], data_hi[:Total_Sentences]
data_en = [sentence.rstrip() for sentence in data_en]
data_hi = [sentence.rstrip() for sentence in data_hi]

In [8]:
max(len(x) for x in data_en), max(len(x) for x in data_hi)

(492, 576)

In [9]:
def valid_length(max_sequence_length, en, hi):
    return len(en)<= max_sequence_length - 2  and len(hi)<= max_sequence_length - 2  # we may add start token and end token

def valid_vocab(en, hi, english_vocabulary, hindi_vocabulary):
    hindi_vocabulary = set(hindi_vocabulary)
    english_vocabulary = set(english_vocabulary)
    
    for chr in en:
        if chr not in english_vocabulary:
            return False
        
    for chr in hi:
        if chr not in hindi_vocabulary:
            return False
        
    return True

In [10]:
max_sequence_length = 200
final_sentences = 20

en_sentences = []
hi_sentences = []
total = 0
bad_ones = 0

for index, (en, hi) in enumerate(zip(data_en, data_hi)):
    if valid_length(max_sequence_length, en, hi) and valid_vocab(en, hi, english_vocabulary, hindi_vocabulary):
        en_sentences.append(en)
        hi_sentences.append(hi)
        total += 1
    else:
        bad_ones += 1        

    if total == final_sentences:
        break
    
len(hi_sentences), len(en_sentences), bad_ones
    

(20, 20, 4)

In [11]:
max(len(x) for x in en_sentences), max(len(x) for x in hi_sentences)

(173, 189)

### Masks


In [12]:
import torch
nil = -1e9

In [13]:
nil = -1e9

encoder_self_attention_mask = torch.full([3, 5, 5], 0.0)
encoder_self_attention_mask[0, 3:, : ] = nil
encoder_self_attention_mask[0, :, 3: ] = nil
print(encoder_self_attention_mask[0])

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00, -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00, -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00, -1.0000e+09, -1.0000e+09],
        [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
        [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09]])


In [14]:
nil = -1e9

def get_masks(en_batch, hi_batch):
    num_of_sentences = len(en_batch)
    
    decoder_self_attention_mask = torch.full([num_of_sentences, max_sequence_length, max_sequence_length] , nil)
    decoder_self_attention_mask = torch.triu(decoder_self_attention_mask , diagonal = 1)
    
    encoder_self_attention_mask = torch.full([num_of_sentences, max_sequence_length, max_sequence_length], 0.0)
    encoder_decoder_attention_mask = torch.full([num_of_sentences, max_sequence_length, max_sequence_length], 0.0)
    
    for index in range(num_of_sentences):
        num_of_en_tokens, num_of_hi_tokens = len(en_batch[index]), len(hi_batch[index])
        
        encoder_self_attention_mask[index, num_of_en_tokens:, : ] = nil
        encoder_self_attention_mask[index, :, num_of_en_tokens: ] = nil
        
        encoder_decoder_attention_mask[index, num_of_hi_tokens: , : ] = nil
        encoder_decoder_attention_mask[index, : , num_of_en_tokens: ] = nil
        
    return decoder_self_attention_mask,encoder_self_attention_mask, encoder_decoder_attention_mask
        

In [15]:
en_batch = torch.rand(1,3)
hi_batch = torch.rand(1,2)
ds,es,eds = get_masks(en_batch, hi_batch)
ds[0]

tensor([[ 0.0000e+00, -1.0000e+09, -1.0000e+09,  ..., -1.0000e+09,
         -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00, -1.0000e+09,  ..., -1.0000e+09,
         -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0000e+09,
         -1.0000e+09, -1.0000e+09],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
         -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]])

### Tokenization


In [16]:
import torch

def tokenize(sentences, language_to_index, start_token=False, end_token=False):
    sentence_indices = [language_to_index[token] for token in list(sentences)]
    
    if start_token:
        sentence_indices.insert(0, language_to_index[START_TOKEN])
    if end_token:
        sentence_indices.append(language_to_index[END_TOKEN])
        
    while len(sentence_indices) < max_sequence_length:
        sentence_indices.append(language_to_index[PADDING_TOKEN])
            
    return torch.tensor(sentence_indices)

def get_tokenized_sentences(sentences, language_to_index, start_token=False, end_token=False):
    
    for sentence in sentences:
        yield tokenize(sentence, language_to_index, start_token, end_token)
    
en_tokenized = list(get_tokenized_sentences(en_sentences, english_to_index, start_token=False, end_token=False))
hi_tokenized = list(get_tokenized_sentences(hi_sentences, hindi_to_index, start_token=True, end_token=True))

### Dataloader


In [17]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, en_sentences, hi_sentences):
        super().__init__()
        self.en_sentences = en_sentences
        self.hi_sentences = hi_sentences
    
    def __len__(self):
        return len(self.en_sentences)
    
    def __getitem__(self, index):
        return self.en_sentences[index], self.hi_sentences[index]
        
dataset = TextDataset(en_sentences, hi_sentences)
dataset[0]

('In reply, Pakistan got off to a solid start.',
 'जिसके जवाब में पाक ने अच्छी शुरुआत की थी.')

In [18]:
dataloader = DataLoader(dataset = dataset, batch_size= 4, shuffle=True, num_workers=4)
iterator = iter(dataloader)
