In [1]:
import spacy
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize import indic_normalize
import pickle
from tqdm.notebook import tqdm
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

In [2]:
# Fixed sequence length (in tokens) that encoded sentences are padded/truncated to.
MAX_LENGTH = 64
# Number of sequences per batch.
BATCH_SIZE = 32

In [3]:
class Lang():
  """Word-level vocabulary built with a spaCy-style tokenizer.

  Indices 0-3 are reserved for <SOS>, <EOS>, <PAD> and <UNK>; every new word
  receives the next free index starting at 4.

  Attributes:
    name: Label for the language.
    word2index: Maps token text -> integer index.
    index2word: Maps integer index -> token text.
    word2count: Maps token text -> number of times it was added.
    n_words: Vocabulary size, reserved tokens included.
    tokenizer: Callable returning an iterable of objects with a `.text` attribute.
  """

  def __init__(self, name, spacy_tokenizer):
    self.name = name
    self.word2index = {"<SOS>":0, '<EOS>': 1, "<PAD>": 2, '<UNK>': 3}
    self.index2word = {0: "<SOS>", 1: "<EOS>", 2: "<PAD>", 3: '<UNK>'}
    self.word2count = {}
    self.n_words = 4
    self.tokenizer = spacy_tokenizer

  def add_word(self, word):
    """Register `word` (assigning a new index if unseen) and bump its count."""
    if word not in self.word2index:
      self.word2index[word] = self.n_words
      self.index2word[self.n_words] = word
      self.n_words += 1

    # Reserved tokens live in word2index but are never seeded in word2count,
    # so a plain `+= 1` would raise KeyError for them; .get() covers both cases.
    self.word2count[word] = self.word2count.get(word, 0) + 1

  def add_sentence(self, sentence):
    """Tokenize `sentence` and add every resulting token to the vocabulary."""
    for token in self.tokenize_sentence(sentence):
      self.add_word(token)

  def tokenize_sentence(self, sentence):
    """Return the token texts of the lower-cased sentence."""
    tokens = [token.text for token in self.tokenizer(sentence.lower())]
    return tokens

  def __len__(self):
    """Vocabulary size, including the four reserved tokens."""
    return self.n_words

In [4]:
class Hindi_lang():
  """Word-level Hindi vocabulary using Indic NLP normalization + tokenization.

  Indices 0-3 are reserved for <SOS>, <EOS>, <PAD> and <UNK>; every new word
  receives the next free index starting at 4.

  Attributes:
    name: Label for the language.
    word2index: Maps token -> integer index.
    index2word: Maps integer index -> token.
    word2count: Maps token -> number of times it was added.
    n_words: Vocabulary size, reserved tokens included.
    normalizer: DevanagariNormalizer configured with remove_nuktas=True.
  """

  def __init__(self, name):
    self.name = name
    self.word2index = {"<SOS>":0, '<EOS>': 1, "<PAD>": 2, '<UNK>': 3}
    self.index2word = {0: "<SOS>", 1: "<EOS>", 2: "<PAD>", 3: '<UNK>'}
    self.word2count = {}
    self.n_words = 4
    self.normalizer = indic_normalize.DevanagariNormalizer(lang='hi', remove_nuktas=True)

  def add_word(self, word):
    """Register `word` (assigning a new index if unseen) and bump its count."""
    if word not in self.word2index:
      self.word2index[word] = self.n_words
      self.index2word[self.n_words] = word
      self.n_words += 1

    # Reserved tokens live in word2index but are never seeded in word2count,
    # so a plain `+= 1` would raise KeyError for them; .get() covers both cases.
    self.word2count[word] = self.word2count.get(word, 0) + 1

  def add_sentence(self, sentence):
    """Tokenize `sentence` and add every resulting token to the vocabulary."""
    for token in self.tokenize_sentence(sentence):
      self.add_word(token)

  def tokenize_sentence(self, sentence):
    """Normalize the sentence, then split it with the trivial Indic tokenizer."""
    # first normalize the sentence, then tokenize
    norm_sent = self.normalizer.normalize(sentence)
    tokens = indic_tokenize.trivial_tokenize(norm_sent)
    return tokens

  def __len__(self):
    """Vocabulary size, including the four reserved tokens."""
    return self.n_words

In [17]:
def tensorFromSentenceEval(lang_vocab, sent, max_length=MAX_LENGTH):
    """Encode one sentence as a fixed-length tensor of vocabulary indices.

    Args:
        lang_vocab: Vocabulary exposing `tokenize_sentence` and `word2index`.
        sent: Raw sentence string.
        max_length: Length the index sequence is truncated / right-padded to.

    Returns:
        torch.long tensor holding the token indices; tokens missing from the
        vocabulary map to <UNK>, trailing positions are filled with <PAD>.
    """
    word2index = lang_vocab.word2index

    # Out-of-vocabulary tokens fall back to the <UNK> index.
    indexes = [
        word2index[token] if token in word2index else word2index['<UNK>']
        for token in lang_vocab.tokenize_sentence(sent)
    ]

    pad_index = word2index['<PAD>']

    # Truncate first (a no-op for short inputs), then right-pad to max_length.
    indexes = indexes[:max_length]
    indexes += [pad_index] * (max_length - len(indexes))

    return torch.tensor(indexes, dtype=torch.long)
    # (max_length,)

In [5]:
class TestSet(Dataset):
    """Inference-only dataset: wraps a sequence of pre-encoded inputs (no targets)."""

    def __init__(self, test_sent_tensor_list):
        super(TestSet, self).__init__()
        # Kept by reference; items are handed out unchanged.
        self.inp = test_sent_tensor_list

    def __len__(self):
        """Number of wrapped items."""
        n_items = len(self.inp)
        return n_items

    def __getitem__(self, index):
        """Return the item stored at `index`."""
        item = self.inp[index]
        return item

In [10]:
# Load the test inputs. header=None: every row is data and columns get integer labels.
test_df = pd.read_csv('../datasets/eng_Hindi_data_test_X.csv', header=None)
print(test_df.shape)

(20000, 1)


In [11]:
# Preview the first rows to confirm the file parsed as a single column of Hindi text.
test_df.head()

Unnamed: 0,0
0,(लूत की सुनते काहे को) ग़रज़ सूरज निकलते निकलत...
1,कि अब तो पकड़े गए मूसा ने कहा हरगिज़ नहीं क्यो...
2,खरीदारी सूची बनाएँ (S)
3,"और जब तुमसे मेरे बन्दे मेरे सम्बन्ध में पूछें,..."
4,"और जब वह लौटता है, तो धरती में इसलिए दौड़-धूप ..."


In [12]:
# Pull the first (only) column out as a plain Python list of sentences.
test_sent_list = test_df.iloc[:, 0].tolist()
print(len(test_sent_list))

20000


In [14]:
# Restore the pickled Hindi vocabulary object.
# NOTE(review): presumably a Hindi_lang instance, in which case that class must be
# defined in the kernel before unpickling — confirm. pickle.load can execute
# arbitrary code, so only load files you produced yourself.
with open('../new/hindi_input_vocab.pkl', 'rb') as f: 
    hindi_input_vocab = pickle.load(f)

In [15]:
# Size of the loaded vocabulary object as reported by its __len__.
print(len(hindi_input_vocab))

27939


In [18]:
# Encode every test sentence into a fixed-length tensor of Hindi vocabulary ids.
test_tensor_list = [
    tensorFromSentenceEval(hindi_input_vocab, sentence)
    for sentence in tqdm(test_sent_list)
]

  0%|          | 0/20000 [00:00<?, ?it/s]

In [27]:
# Spot-check one encoded sentence: token ids followed by <PAD> (index 2) fill.
test_tensor_list[8]

tensor([14546,    12,  1169,    53, 12933,   184,  3404,   184,  8728,    15,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2])

In [29]:
# Wrap the encoded sentences in a Dataset so they can be batched by a DataLoader.
test_set = TestSet(test_tensor_list)
print(len(test_set))

20000


In [30]:
# Check the element dtype of one item (tensors are created with dtype=torch.long).
test_set[0].dtype

torch.int64

In [31]:
# Batch the test set using the notebook-wide BATCH_SIZE instead of a repeated literal.
# shuffle is left at its default (False) so predictions stay aligned with the CSV rows.
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE)

In [51]:
# Persist the DataLoader object itself with pickle.
# NOTE(review): presumably whoever unpickles this needs the TestSet class
# importable under the same name — confirm with the consuming notebook.
with open('../new/test_loader.pkl', 'wb') as f: 
    pickle.dump(test_loader, f)

In [None]:
def inference(model, batch, output_vocab):
    """Translate a batch of encoded source sentences into output strings.

    Args:
        model: Callable as ``model(batch, None)``; its result is argmax-ed over
            the last dimension to get token ids (presumably scores of shape
            (batch_size, seq_len, vocab_size) — confirm against the model).
        batch: Tensor of source token ids, shape (batch_size, seq_len).
        output_vocab: Vocabulary exposing ``index2word`` (index -> token).

    Returns:
        list[str]: one space-joined token string per batch row, cut at the
        first <EOS> or <PAD>, whichever comes first.
    """
    # batch.shape --> (batch_size, seq_len)

    # Move the batch to wherever the model's weights live rather than relying
    # on a global `device`; a model without parameters leaves the batch as is.
    first_param = next(iter(model.parameters()), None) if hasattr(model, 'parameters') else None
    if first_param is not None:
        batch = batch.to(first_param.device)

    # No gradients are needed for decoding.
    with torch.no_grad():
        output_tensor = model(batch, None)
    output_tensor = output_tensor.argmax(dim=-1)
    # (batch_size, seq_len)

    stop_tokens = ('<EOS>', '<PAD>')

    output_list = []
    # tolist() yields plain ints; 0-dim tensors cannot be used as index2word keys.
    for row in output_tensor.tolist():
        output_tokens = []
        for idx in row:
            word = output_vocab.index2word[idx]
            if word in stop_tokens:
                break
            output_tokens.append(word)

        output_list.append(' '.join(output_tokens))

    return output_list

In [47]:
# Pull the first batch from the loader and check its shape.
i = next(iter(test_loader))
print(i.shape)

torch.Size([32, 64])


In [42]:
# Shape of a single encoded sentence (one row of a batch).
test_tensor_list[0].shape

torch.Size([64])

In [34]:
# Load the parallel training corpus (no header row; columns get integer labels).
# NOTE(review): this path is relative to the working directory, unlike the test CSV
# which is read from '../datasets/' — confirm where the notebook is meant to run.
df = pd.read_csv('eng_Hindi_data_train.csv',header=None )
df.head()

Unnamed: 0,0,1
0,and deliver us by Thy mercy from the people of...,और अपनी रहमत से हमें इन काफ़िर लोगों (के नीचे)...
1,Transformed position of fourth point,चौथे बिन्दु का रूपांतरित स्थान
2,"Oh, woe to me; I wish I never took so - and - ...",हाए अफसोस काश मै फला शख्स को अपना दोस्त न बनाता
3,The PS file is to be translated into a PDF fil...,पीएस2पीडीएफ के इस्तेमाल से पीएस फ़ाइल को पीडीए...
4,Receiving LDAP search results...,LDAP खोज परिणाम पा रहा है...


In [8]:
# Column 0 is stored as the *output* sentences (the English vocabulary is built from this list).
output_sent_all = df.iloc[:, 0].tolist()
print(output_sent_all[0])
print(len(output_sent_all))

and deliver us by Thy mercy from the people of the unbelievers. '
140000


In [9]:
# Column 1 is stored as the *input* sentences (the Hindi vocabulary is built from this list).
input_sent_all = df.iloc[:, 1].tolist()
print(len(input_sent_all))
print(input_sent_all[0])

140000
और अपनी रहमत से हमें इन काफ़िर लोगों (के नीचे) से नजात दे


In [16]:
# Build the Hindi vocabulary from every training input sentence.
hindi_input_vocab = Hindi_lang("hindi")

for my_sent in tqdm(input_sent_all):
  hindi_input_vocab.add_sentence(my_sent)

print(len(hindi_input_vocab))

# Persist the vocabulary object so later cells can reload it instead of rebuilding.
with open('new/hindi_input_vocab.pkl', 'wb') as f: 
  pickle.dump(hindi_input_vocab, f)

  0%|          | 0/140000 [00:00<?, ?it/s]

27939


In [12]:
# The loaded spaCy pipeline object is handed to Lang as its tokenizer callable.
nlp_english = spacy.load("en_core_web_sm")
english_output_vocab = Lang("english", nlp_english)

# Build the English vocabulary from every training output sentence.
for my_sent in tqdm(output_sent_all):
  english_output_vocab.add_sentence(my_sent)

print(len(english_output_vocab))

# Save the English vocabulary (the pickle includes the tokenizer stored on the object).

with open('new/english_output_vocab.pkl', 'wb') as f: 
  pickle.dump(english_output_vocab, f)

  0%|          | 0/140000 [00:00<?, ?it/s]

24261


In [59]:
# Reload both vocabularies from the pickles written by the vocabulary-building cells.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open('new/hindi_input_vocab.pkl', 'rb') as f: 
    hindi_input_vocab = pickle.load(f)

with open('new/english_output_vocab.pkl', 'rb') as f: 
    english_output_vocab = pickle.load(f)

In [60]:
# Reload the cached sentence lists.
# NOTE(review): the file names look crossed — 'out_sent.pkl' feeds input_sent_all and
# 'inp_sent.pkl' feeds output_sent_all. Presumably the pickles were named for the
# opposite translation direction; confirm before relying on this.
with open('out_sent.pkl', 'rb') as f: 
    input_sent_all = pickle.load(f)

with open('inp_sent.pkl', 'rb') as f: 
    output_sent_all = pickle.load(f)

In [65]:
# Special-token indices, read from the Hindi vocabulary. Lang and Hindi_lang
# both initialise these tokens with the same indices.
EOS_TOKEN_INDEX = hindi_input_vocab.word2index['<EOS>']
PAD_TOKEN_INDEX = hindi_input_vocab.word2index['<PAD>']
SOS_TOKEN_INDEX = hindi_input_vocab.word2index['<SOS>']

print(f"EOS = {EOS_TOKEN_INDEX}, PAD = {PAD_TOKEN_INDEX}, SOS = {SOS_TOKEN_INDEX}")

EOS = 1, PAD = 2, SOS = 0


In [62]:
# Re-declaration of the two constants with the same values they are given near the
# top of the notebook (BATCH_SIZE = 32, MAX_LENGTH = 64).
BATCH_SIZE = 32
MAX_LENGTH = 64

In [64]:
input_seqs = []
output_seqs = []

# Each side is padded with its own vocabulary's <PAD> index.
input_pad_index = hindi_input_vocab.word2index['<PAD>']
output_pad_index = english_output_vocab.word2index['<PAD>']

# zip() has no length, so tell tqdm how many pairs to expect.
n_pairs = min(len(input_sent_all), len(output_sent_all))

for input_sent, output_sent in tqdm(zip(input_sent_all, output_sent_all), total=n_pairs):
  # Direct lookups: a KeyError here means a token is missing from the vocabulary.
  input_seq = [hindi_input_vocab.word2index[word] for word in hindi_input_vocab.tokenize_sentence(input_sent)]
  output_seq = [english_output_vocab.word2index[word] for word in english_output_vocab.tokenize_sentence(output_sent)]

  # Targets are framed as <SOS> ... <EOS>; inputs are left unframed.
  output_seq = [SOS_TOKEN_INDEX] + output_seq + [EOS_TOKEN_INDEX]

  # Truncate to MAX_LENGTH (was a hard-coded 64), then right-pad short inputs.
  input_seq = input_seq[:MAX_LENGTH]
  input_seq += [input_pad_index] * (MAX_LENGTH - len(input_seq))

  if len(output_seq) < MAX_LENGTH:
    output_seq += [output_pad_index] * (MAX_LENGTH - len(output_seq))
  else:
    output_seq = output_seq[:MAX_LENGTH]
    # Truncation may have dropped <EOS>; force it back into the last slot.
    output_seq[-1] = EOS_TOKEN_INDEX

  input_seqs.append(torch.tensor(input_seq, dtype=torch.long))
  output_seqs.append(torch.tensor(output_seq, dtype=torch.long))

print(len(input_seqs))
print(input_seqs[0].shape)

print(len(output_seqs))
print(output_seqs[0].shape)

0it [00:00, ?it/s]

140000
torch.Size([64])
140000
torch.Size([64])


In [67]:
# Inspect one target: <SOS> (0) first, <EOS> (1) after the last word, then <PAD> (2) fill.
output_seqs[0]

tensor([ 0,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 11, 14, 15, 16,  1,  2,  2,
         2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
         2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
         2,  2,  2,  2,  2,  2,  2,  2,  2,  2])

In [68]:
class Mydat(Dataset):
  """Paired dataset: item `i` is the tuple (input_seqs[i], output_seqs[i])."""

  def __init__(self, input_seqs, output_seqs):
    super(Mydat, self).__init__()
    self.input_seqs, self.output_seqs = input_seqs, output_seqs

  def __len__(self):
    """Number of examples; only the input side is measured."""
    n_examples = len(self.input_seqs)
    return n_examples

  def __getitem__(self, index):
    """Return the (input, output) pair stored at `index`."""
    source = self.input_seqs[index]
    target = self.output_seqs[index]
    return source, target

In [69]:
# Pair the encoded inputs and targets into a single training Dataset.
my_dataset = Mydat(input_seqs, output_seqs)
print(len(my_dataset))

140000


In [15]:
# Persist the whole Dataset object with pickle.
# NOTE(review): presumably the Mydat class must be importable under the same name
# wherever this file is unpickled — confirm with the consuming code.
with open('new/dataset.pkl', 'wb') as f: 
    pickle.dump(my_dataset, f)

# Random scratch experiments

In [2]:
# Scratch check: slicing with 0:1 (instead of indexing with 0) keeps the leading dimension.
a = torch.randn(32, 64)
b = a[0:1, :]
print(b.shape)

torch.Size([1, 64])


In [24]:
# Scratch: wraps the raw test sentences with no targets.
# NOTE(review): Mydat.__getitem__ indexes output_seqs, so fetching an item from
# this object would fail on None — looks like an abandoned experiment; confirm.
temp = Mydat(test_sent_list, None)

In [26]:
# The dataset argument was missing (a syntax error); `temp` from the preceding
# scratch cell is the dataset this loader was presumably meant to wrap.
temp_loader = DataLoader(temp, batch_size=32)

In [49]:
# Scratch check: argmax over the last dimension removes it, (3, 4, 5) -> (3, 4).
a = torch.randn(3, 4, 5)
b = a.argmax(dim=-1)

print(b.shape)

torch.Size([3, 4])
