In [None]:
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import numpy as np

In [None]:
# Special tokens used by the tokenizer further down: START_TOKEN is inserted
# at the front of a sentence and END_TOKEN appended when requested, and
# PADDING_TOKEN fills the remaining positions up to the maximum sequence length.
START_TOKEN = '<START>'
PADDING_TOKEN = '<PADDING>'
END_TOKEN = '<END>'

In [None]:
# Hindi vocabulary; a token's position in this list becomes its integer id.
# The literal '<START>', '<PADDING>' and '<END>' entries must stay equal to
# START_TOKEN / PADDING_TOKEN / END_TOKEN, because those constants are the keys
# looked up in the index map built from this list.
# NOTE(review): sentences are tokenized one code point at a time, so entries
# made of several code points (the conjunct and vowel+sign entries) look
# unreachable by that tokenizer -- confirm whether they are intentional.
hindi_vocabulary = ['<START>', 'ॽ', 'ॡ', 'श्र', '>', '+', 'ढ', '$', 'ॄ', 'य़', '७', '॒', 'ज', '0', '॓', 'श', '!', '"', 'ी', 'ो', 'झ', 'ॱ', '5', 'स', 'े', 'ॐ', 'य', 'ऌ', 'ग़', 'ई', 'ं', 'त', 'ॅ', 'क़', 'च', '%', '़', 'ऐ', 'ू', 'ऱ', "'", ':', '(', 'घ', 'ऍ', 'ट', 'ऊ', '/', 'ण', '*', 'ः', 'र', 'प', '4', 'त्र', 'ग', 'ˌ', 'ज्ञ', '6', 'ठ', 'ा', 'ञ', '7', ',', 'ॅ्', '9', 'ॆ', '-', '<', 'ऺ', '९', '्', 'भ', 'ङ', 'ड़', '॥', 'ज़', '२', '1', '&', 'ध', 'फ', 'ळ', 'फ़', ';', 'ख', 'ऩ', 'व', 'ि', 'ै', '=', 'ए', 'ड', '५', 'ख़', 'ऎ', ' ', '#', 'अः', 'ु', 'ऒ', 'छ', 'द', '@', 'ढ़', '।', 'उ', 'ऽ', 'ॠ', 'म', '१', 'ल', '॰', 'ँ', '?', '६', 'ौ', 'ॢ', 'ॹ', 'न', 'ऑ', '3', 'ओ', '॑', 'ष', '३', 'ॉ', 'ऋ', 'ॲ', 'इ', '॔', 'ॣ', '८', '.', '०', '8', 'अं', 'थ', ')', '४', '2', 'औ', 'ब', 'ऴ', 'ृ', 'क', 'अ', 'ॊ', '₹', 'ह', 'आ', '<PADDING>', '<END>']

In [None]:
# Character-level English vocabulary: START_TOKEN, then every printable ASCII
# character from ' ' (0x20) to '~' (0x7E) in code-point order, then
# PADDING_TOKEN and END_TOKEN. A token's position in the list is its integer id.
# ';' was previously missing between ':' and '<', so any English sentence
# containing a semicolon could not be tokenized.
# NOTE(review): inserting ';' shifts the id of every later token by one; any
# weights trained against the old list would need to be retrained.
english_vocabulary = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
                        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                        ':', ';', '<', '=', '>', '?', '@',

                        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                        'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                        'Y', 'Z',

                        '[', '\\', ']', '^', '_', '`',

                        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                        'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
                        'y', 'z',

                        '{', '|', '}', '~', PADDING_TOKEN, END_TOKEN]

In [None]:
# Lookup tables in both directions for each language: id -> token follows the
# list order of the vocabulary, token -> id is the inverse mapping.
index_to_hindi = dict(enumerate(hindi_vocabulary))
hindi_to_index = {token: position for position, token in enumerate(hindi_vocabulary)}
index_to_english = dict(enumerate(english_vocabulary))
english_to_index = {token: position for position, token in enumerate(english_vocabulary)}

In [None]:
english_file = '../data/english_to_hindi/train.en'
hindi_file = '../data/english_to_hindi/train.hi'

# Read both sides of the parallel corpus, one sentence per line.
# The encoding is given explicitly because open() otherwise falls back to the
# locale's preferred encoding, which cannot be relied on to decode Devanagari
# text (the corpus files are assumed to be UTF-8).
with open(english_file, 'r', encoding='utf-8') as file:
    english_sentences = file.readlines()
with open(hindi_file, 'r', encoding='utf-8') as file:
    hindi_sentences = file.readlines()

In [None]:
# Upper bound on how many sentence pairs are kept from the corpus
TOTAL_SENTENCES = 200000

# Keep the first TOTAL_SENTENCES lines of each side and drop the trailing
# new-line characters left behind by readlines(), in a single pass per language.
english_sentences = [line.rstrip('\n') for line in english_sentences[:TOTAL_SENTENCES]]
hindi_sentences = [line.rstrip('\n') for line in hindi_sentences[:TOTAL_SENTENCES]]

In [None]:
# Sanity check: display the first ten English sentences after new-line stripping.
english_sentences[:10]

In [None]:
# Sanity check: display the first ten Hindi sentences after new-line stripping.
hindi_sentences[:10]

In [None]:
# Longest sentence, measured in characters, on each side: (Hindi, English).
max(map(len, hindi_sentences)), max(map(len, english_sentences))

In [None]:
# Character length below which PERCENTILE percent of the sentences fall, per
# language (presumably a guide for choosing max_sequence_length -- confirm).
PERCENTILE = 97
print( f"{PERCENTILE}th percentile length hindi: {np.percentile([len(x) for x in hindi_sentences], PERCENTILE)}" )
print( f"{PERCENTILE}th percentile length English: {np.percentile([len(x) for x in english_sentences], PERCENTILE)}" )

In [None]:
# Model Parameters
d_model = 512  # width of each embedding vector; passed to SentenceEmbedding
batch_size = 3  # sentence pairs per DataLoader batch
ffn_hidden = 2048  # not used in this part of the notebook; presumably the feed-forward width -- TODO confirm
num_heads = 8  # not used in this part of the notebook; presumably attention heads -- TODO confirm
drop_prob = 0.1  # not used in this part of the notebook; NOTE(review): SentenceEmbedding hard-codes its own dropout p=0.1
num_layers = 1  # not used in this part of the notebook
max_sequence_length = 300  # length each tokenized sentence is padded to; also the bound used when filtering sentences

In [None]:
def is_valid_tokens(sentence, vocab):
    """Tell whether every distinct character of ``sentence`` occurs in ``vocab``.

    Args:
        sentence: iterable of tokens; for a string, its individual characters.
        vocab: container supporting ``in`` membership tests.

    Returns:
        True when no character falls outside ``vocab`` (also for an empty
        sentence), otherwise False.
    """
    return all(token in vocab for token in set(sentence))

def is_valid_length(sentence, max_sequence_length):
    """Tell whether ``sentence`` is short enough to be tokenized.

    Returns True when the sentence has strictly fewer than
    ``max_sequence_length - 1`` elements, i.e. at most
    ``max_sequence_length - 2``.
    """
    # One position is held back for the end token that is added back later.
    usable_length = max_sequence_length - 1
    return len(list(sentence)) < usable_length

# Collect the positions of the sentence pairs that can be tokenized on BOTH
# sides: each sentence must fit the length budget and contain only characters
# from its own vocabulary. The English character check matters because the
# tokenizer does a plain dictionary lookup per character and raises KeyError
# on anything that is not in the vocabulary.
valid_sentence_indicies = []
for index, hindi_sentence in enumerate(hindi_sentences):
    english_sentence = english_sentences[index]
    if is_valid_length(hindi_sentence, max_sequence_length) \
      and is_valid_length(english_sentence, max_sequence_length) \
      and is_valid_tokens(hindi_sentence, hindi_vocabulary) \
      and is_valid_tokens(english_sentence, english_vocabulary):
        valid_sentence_indicies.append(index)

print(f"Number of sentences: {len(hindi_sentences)}")
print(f"Number of valid sentences: {len(valid_sentence_indicies)}")

In [None]:
# Narrow both corpora down to the pairs selected in valid_sentence_indicies.
# NOTE: this rebinds the original names, so the cell is not idempotent -- the
# indices refer to positions in the unfiltered lists, and running it a second
# time without recomputing them would select the wrong sentences.
hindi_sentences, english_sentences = (
    [hindi_sentences[position] for position in valid_sentence_indicies],
    [english_sentences[position] for position in valid_sentence_indicies],
)

In [None]:
class TextDataset(Dataset):
    """Map-style dataset over aligned English/Hindi sentence pairs.

    Item ``idx`` is the tuple ``(english_sentences[idx], hindi_sentences[idx])``.
    """

    def __init__(self, english_sentences, hindi_sentences):
        """Store the two aligned sentence sequences.

        Args:
            english_sentences: sequence of English sentences.
            hindi_sentences: sequence of Hindi sentences, aligned index by
                index with ``english_sentences``.

        Raises:
            ValueError: if the two sequences differ in length. Without this
                check a mismatch would either be silently truncated or only
                surface later as an IndexError while iterating.
        """
        if len(english_sentences) != len(hindi_sentences):
            raise ValueError(
                "english_sentences and hindi_sentences must have the same length, "
                f"got {len(english_sentences)} and {len(hindi_sentences)}"
            )
        self.english_sentences = english_sentences
        self.hindi_sentences = hindi_sentences

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.english_sentences)

    def __getitem__(self, idx):
        """Return the ``(english, hindi)`` pair stored at position ``idx``."""
        return self.english_sentences[idx], self.hindi_sentences[idx]

In [None]:
# Wrap the filtered, index-aligned sentence lists as (english, hindi) pairs.
dataset = TextDataset(english_sentences, hindi_sentences)

In [None]:
# Number of sentence pairs available in the dataset.
len(dataset)

In [None]:
# Spot check: the (english, hindi) pair at index 1.
dataset[1]

In [None]:
# Spot check: the (english, hindi) pair at index 2.
dataset[2]

In [None]:
def get_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)

class PositionalEncoding(nn.Module):
    """Sinusoidal positional-encoding table.

    For position ``pos`` and even column ``2i`` the table holds
    ``sin(pos / 10000 ** (2i / d_model))``; the following odd column holds the
    cosine of the same angle.

    Args:
        d_model: number of columns of the table (the embedding width).
        max_sequence_length: number of rows (positions) of the table.
    """
    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

    def forward(self):
        """Build the table from scratch and return it.

        Returns:
            Float tensor of shape ``(max_sequence_length, d_model)``.
        """
        even_i = torch.arange(0, self.d_model, 2).float()
        denominator = torch.pow(10000, even_i / self.d_model)
        position = (torch.arange(self.max_sequence_length)
                          .reshape(self.max_sequence_length, 1))
        angles = position / denominator
        # Interleave sin and cos so that columns alternate sin, cos, sin, cos, ...
        stacked = torch.stack([torch.sin(angles), torch.cos(angles)], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)
        # With an odd d_model the interleaving yields d_model + 1 columns; drop
        # the surplus cosine column so the table always matches the embedding
        # width. For an even d_model this slice keeps every column.
        return PE[:, :self.d_model]

class SentenceEmbedding(nn.Module):
    """Embed raw sentences character by character and add positional encodings.

    Args:
        max_sequence_length: number of positions every sentence is padded to.
        d_model: width of each embedding vector.
        language_to_index: mapping from character (and special token) to id.
        START_TOKEN, END_TOKEN, PADDING_TOKEN: keys of ``language_to_index``
            used for the start marker, the end marker and the padding.
    """
    def __init__(self, max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN):
        super().__init__()
        self.vocab_size = len(language_to_index)
        self.max_sequence_length = max_sequence_length
        self.embedding = nn.Embedding(self.vocab_size, d_model)
        self.language_to_index = language_to_index
        self.position_encoder = PositionalEncoding(d_model, max_sequence_length)
        self.dropout = nn.Dropout(p=0.1)
        self.START_TOKEN = START_TOKEN
        self.END_TOKEN = END_TOKEN
        self.PADDING_TOKEN = PADDING_TOKEN

    def _tokenize(self, sentence, start_token, end_token):
        """Convert one sentence into a 1-D tensor of ``max_sequence_length`` ids."""
        token_ids = [self.language_to_index[token] for token in list(sentence)]
        if start_token:
            token_ids.insert(0, self.language_to_index[self.START_TOKEN])
        if end_token:
            token_ids.append(self.language_to_index[self.END_TOKEN])
        if len(token_ids) > self.max_sequence_length:
            # Fail here with a clear message rather than later with an opaque
            # shape error when stacking or adding the positional table.
            raise ValueError(
                f"sentence needs {len(token_ids)} tokens but max_sequence_length "
                f"is {self.max_sequence_length}"
            )
        padding_id = self.language_to_index[self.PADDING_TOKEN]
        token_ids.extend([padding_id] * (self.max_sequence_length - len(token_ids)))
        return torch.tensor(token_ids)

    def batch_tokenize(self, batch, start_token, end_token):
        """Tokenize every sentence of ``batch`` into one padded id tensor.

        Args:
            batch: sequence of sentences (strings).
            start_token: truthy to insert the start marker in front.
            end_token: truthy to append the end marker.

        Returns:
            Integer tensor of shape ``(len(batch), max_sequence_length)`` on
            the same device as the embedding weights.

        Raises:
            KeyError: if a character is not in ``language_to_index``.
            ValueError: if a sentence does not fit in ``max_sequence_length``.
        """
        tokenized = torch.stack(
            [self._tokenize(sentence, start_token, end_token) for sentence in batch]
        )
        # Follow the module's own device: the embedding lookup needs the ids
        # and the weights on the same device, whether or not the module has
        # been moved to a GPU.
        return tokenized.to(self.embedding.weight.device)

    def forward(self, x, start_token, end_token): # sentence
        """Return dropout(embedding + positional encoding) for a batch of sentences.

        ``x`` is a sequence of raw sentences; ``start_token`` / ``end_token``
        are the truthy flags described in ``batch_tokenize``. The result has
        shape ``(len(x), max_sequence_length, d_model)``.
        """
        x = self.batch_tokenize(x, start_token, end_token)
        x = self.embedding(x)
        pos = self.position_encoder().to(x.device)
        x = self.dropout(x + pos)
        return x

# # Model Parameters (Already initialized above.) Here only for reference to help debug errors.
# d_model = 512
# batch_size = 3
# ffn_hidden = 2048
# num_heads = 8
# drop_prob = 0.1
# num_layers = 1
# max_sequence_length = 300

# Instantiate the Hindi embedding module and move its weights to the device
# returned by get_device(), so that token ids and embedding weights end up on
# the same device (the GPU when one is available).
sentence_embedding = SentenceEmbedding(max_sequence_length, d_model, hindi_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN).to(get_device())

In [None]:
# Batch the sentence pairs. Only the dataset and batch_size are passed, so
# every other DataLoader option keeps its default.
train_loader = DataLoader(dataset, batch_size)
iterator = iter(train_loader)

# Run the Hindi side of each batch through the embedding module. In the code
# shown here `out` is overwritten on every iteration.
for batch_num, batch in enumerate(iterator):

    # Get English and Hindi Batches
    eng_batch, hn_batch = batch
    # START_TOKEN / END_TOKEN are non-empty strings, so they act as truthy
    # start_token / end_token flags: both markers are added to each sentence.
    out = sentence_embedding(hn_batch, START_TOKEN, END_TOKEN)