In [1]:
# Mount Google Drive at /content/drive; later cells read the dataset, the
# pickled Language objects and lang.py from paths under this mount point.
from google.colab import drive
# NOTE(review): force_remount=True is passed on every run — presumably to
# refresh a stale mount from an earlier session; confirm this is intended.
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [2]:
!git clone "https://github.com/anoopkunchukuttan/indic_nlp_library"
!git clone "https://github.com/anoopkunchukuttan/indic_nlp_resources.git"
!pip install Morfessor

Cloning into 'indic_nlp_library'...
remote: Enumerating objects: 1396, done.[K
remote: Counting objects: 100% (177/177), done.[K
remote: Compressing objects: 100% (69/69), done.[K
remote: Total 1396 (delta 133), reused 119 (delta 105), pack-reused 1219[K
Receiving objects: 100% (1396/1396), 9.57 MiB | 10.76 MiB/s, done.
Resolving deltas: 100% (743/743), done.
Cloning into 'indic_nlp_resources'...
remote: Enumerating objects: 139, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 139 (delta 2), reused 2 (delta 0), pack-reused 126[K
Receiving objects: 100% (139/139), 149.77 MiB | 26.03 MiB/s, done.
Resolving deltas: 100% (53/53), done.
Updating files: 100% (28/28), done.
Collecting Morfessor
  Downloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Installing collected packages: Morfessor
Successfully installed Morfessor-2.0.6


In [3]:
import json
# Read the corpus with an explicit encoding: the file contains Devanagari text,
# so relying on the platform default encoding is fragile.
with open("/content/drive/MyDrive/neural_machine_translation/train_data1.json", "r", encoding="utf-8") as file:
  data = json.load(file)

# English-Hindi
# Parallel lists: position k in each list refers to the same sentence pair.
eng_hi_source_sent_train = []
eng_hi_target_sent_train = []
eng_hi_id_train = []

lang_pair = "English-Hindi"
if lang_pair in data:
  print(f"Language pair: {lang_pair}")
  for d_type, d_entry in data[lang_pair].items():
    print(f"  Data type: {d_type}")
    # Only the "Train" split is collected; other splits are listed but skipped.
    if d_type != "Train":
      continue
    # `sent_id` instead of `id` so the builtin id() is not shadowed notebook-wide.
    for sent_id, pair in d_entry.items():
      eng_hi_source_sent_train.append(pair["source"])
      eng_hi_target_sent_train.append(pair["target"])
      eng_hi_id_train.append(sent_id)

Language pair: English-Hindi
  Data type: Train


In [4]:
# Locations of the repositories cloned earlier in the notebook.
INDIC_NLP_LIB_HOME = "/content/indic_nlp_library"
INDIC_NLP_RESOURCES = "/content/indic_nlp_resources"
import sys
# Add the library checkout to the import path once; the guard keeps sys.path
# free of duplicate entries when this cell is re-run.
if INDIC_NLP_LIB_HOME not in sys.path:
  sys.path.append(INDIC_NLP_LIB_HOME)
# Must come after the sys.path change above, since indicnlp is imported from
# the cloned checkout rather than from an installed package.
from indicnlp import common, loader
common.set_resources_path(INDIC_NLP_RESOURCES)
loader.load()

In [5]:
# Make the project's code directory importable so that lang.py can be used.
LANG_MODULE_DIR = "/content/drive/MyDrive/neural_machine_translation/code"
# Guarded so that re-running the cell does not append the same entry again.
if LANG_MODULE_DIR not in sys.path:
  sys.path.append(LANG_MODULE_DIR)

In [6]:
from lang import Language

In [7]:
# Spot-check one aligned sentence pair: source first, then its target.
SAMPLE_IDX = 200
for sents in (eng_hi_source_sent_train, eng_hi_target_sent_train):
  print(sents[SAMPLE_IDX])

The attractive women dance and sing with the background music of Dholki (drum), Manjeera (cymbals), Tuntuni (a single string instrument), Daf (a tambourine like instrument with a single leather surface) and harmonium.
आकर्षक महिलाओं को नृत्य और संगीत ढोलक(ड्रम),मंजीरा (सिम्बाल्स),तुन्तुनी (एक स्ट्रिंग यंत्र ) , डैफ (एक चमड़े के सतह की तरह का यंत्र ) और हारमोनियम की पृष्ठभूमि के साथ होता है |


### DATALOADERS

In [8]:
import torch
from torch.utils.data import Dataset, DataLoader

In [9]:
import torch
from torch.utils.data import Dataset

class TranslationDataset(Dataset):
  """Parallel-corpus dataset yielding (source, target) index tensors.

  Args:
    source_lang: object exposing ``idx_from_sentence(sentence)`` for the
      source side.
    target_lang: object exposing ``idx_from_sentence(sentence)`` for the
      target side.
    source_sents: indexable collection of source sentences.
    target_sents: indexable collection of target sentences, aligned
      position-by-position with ``source_sents``.

  Raises:
    ValueError: if the two sentence collections differ in length.
  """

  def __init__(self, source_lang, target_lang, source_sents, target_sents):
    # __len__ only looks at the source side, so a length mismatch would
    # otherwise surface late as an IndexError or silently drop targets.
    if len(source_sents) != len(target_sents):
      raise ValueError(
          "source_sents and target_sents must have the same length, got "
          f"{len(source_sents)} and {len(target_sents)}"
      )
    self.source_lang = source_lang
    self.target_lang = target_lang
    self.source_sents = source_sents
    self.target_sents = target_sents

  def __len__(self):
    """Return the number of sentence pairs."""
    return len(self.source_sents)

  def __getitem__(self, idx):
    """Return the pair at ``idx`` as two 1-D ``torch.long`` tensors."""
    source_ids = self.source_lang.idx_from_sentence(self.source_sents[idx])
    target_ids = self.target_lang.idx_from_sentence(self.target_sents[idx])

    # Explicit dtype: torch.tensor([]) would otherwise default to float32
    # when a sentence maps to an empty index list.
    return (
        torch.tensor(source_ids, dtype=torch.long),
        torch.tensor(target_ids, dtype=torch.long),
    )

In [10]:
# Languages
# NOTE(review): these two fresh instances are replaced by the unpickled objects
# a few lines below — confirm whether constructing them is still needed.
en_lang = Language(lang="en")
hi_lang = Language(lang="hi")

import pickle


def _load_pickled(path):
  """Return the object stored in the pickle file at ``path``."""
  with open(path, 'rb') as f:
    return pickle.load(f)


# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
# Load the en_lang instance
en_lang = _load_pickled('/content/drive/MyDrive/neural_machine_translation/saves/language_instances/en_lang.pkl')

# Load the hi_lang instance
hi_lang = _load_pickled('/content/drive/MyDrive/neural_machine_translation/saves/language_instances/hi_lang.pkl')

In [11]:
# Sanity check of the loaded vocabularies: the word stored at index 200 in each.
print(*(lang.idx2word[200] for lang in (en_lang, hi_lang)))

first बनाया


In [12]:
# COLLATE FUNCTION
# def pad_sequences(batch):
#   sorted_batch = sorted(batch, key=lambda x: x.size(0), reverse=True)
#   seq_padded = torch.nn.utils.rnn.pad_sequence(sorted_batch, padding_value=1)
#   # <EOS> is 1
#   lengths = torch.LongTensor([len(x) for x in sorted_batch])
#   return seq_padded, lengths

def collate_fn(batch, padding_value=1):
  """Pad a list of (source, target) tensor pairs into batch tensors.

  Pairs are reordered by source length, longest first, and the target side is
  reordered with the same permutation so pairs stay aligned.
  NOTE(review): the descending order is presumably for packed-sequence RNN
  input — confirm against the model code.

  Args:
    batch: non-empty list of ``(source_tensor, target_tensor)`` pairs of 1-D
      tensors.
    padding_value: index used to fill positions past the end of a sequence.
      Defaults to 1, which this notebook uses as ``<EOS>``.

  Returns:
    Tuple ``(source_padded, source_lengths, target_padded, target_lengths)``.
    The padded tensors have shape ``(max_len, batch_size)``; the length
    tensors are ``LongTensor`` of shape ``(batch_size,)`` holding the unpadded
    lengths in the reordered batch order.

  Raises:
    ValueError: if ``batch`` is empty.
  """
  if not batch:
    raise ValueError("collate_fn received an empty batch")

  source_batch, target_batch = zip(*batch)

  # sorted() is stable, so pairs with equal source length keep their order.
  order = sorted(
      range(len(source_batch)),
      key=lambda pos: source_batch[pos].size(0),
      reverse=True,
  )
  sorted_source_batch = [source_batch[pos] for pos in order]
  sorted_target_batch = [target_batch[pos] for pos in order]

  pad_sequence = torch.nn.utils.rnn.pad_sequence
  source_padded = pad_sequence(sorted_source_batch, padding_value=padding_value)
  target_padded = pad_sequence(sorted_target_batch, padding_value=padding_value)

  source_lengths = torch.LongTensor([seq.size(0) for seq in sorted_source_batch])
  target_lengths = torch.LongTensor([seq.size(0) for seq in sorted_target_batch])

  return source_padded, source_lengths, target_padded, target_lengths

In [13]:
from torch.utils.data import DataLoader

# English -> Hindi training pairs wrapped as index-tensor samples.
dataset = TranslationDataset(
    source_lang=en_lang,
    target_lang=hi_lang,
    source_sents=eng_hi_source_sent_train,
    target_sents=eng_hi_target_sent_train
)

BATCH_SIZE = 64
SEED = 42

# A dedicated, seeded generator makes the shuffle order reproducible after a
# kernel restart, without touching the global torch RNG.
shuffle_generator = torch.Generator()
shuffle_generator.manual_seed(SEED)

dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
    generator=shuffle_generator
)

In [14]:
# Report tensor shapes for the first few batches, then stop iterating.
MAX_DEMO_BATCHES = 3  # Limiting it to 3 batches for demonstration purposes
for batch_no, (src, src_lengths, trg, trg_lengths) in enumerate(dataloader, start=1):
    report_lines = [
        f"Batch {batch_no}",
        f"Source batch shape: {src.shape}",
        f"Source lengths shape: {src_lengths.shape}",
        f"Target batch shape: {trg.shape}",
        f"Target lengths shape: {trg_lengths.shape}",
        "------",
    ]
    print("\n".join(report_lines))

    if batch_no == MAX_DEMO_BATCHES:
        break


Batch 1
Source batch shape: torch.Size([56, 64])
Source lengths shape: torch.Size([64])
Target batch shape: torch.Size([61, 64])
Target lengths shape: torch.Size([64])
------
Batch 2
Source batch shape: torch.Size([54, 64])
Source lengths shape: torch.Size([64])
Target batch shape: torch.Size([64, 64])
Target lengths shape: torch.Size([64])
------
Batch 3
Source batch shape: torch.Size([62, 64])
Source lengths shape: torch.Size([64])
Target batch shape: torch.Size([72, 64])
Target lengths shape: torch.Size([64])
------


In [15]:
# Pull one batch from a fresh iterator over the dataloader
src_batch, src_lengths, trg_batch, trg_lengths = next(iter(dataloader))

# Each column of a padded batch is one sequence; show the first three columns
# together with their recorded lengths
for col in range(3):
    seq_no = col + 1
    src_column = src_batch[:, col]
    trg_column = trg_batch[:, col]
    print(f"Source sequence {seq_no}: {src_column} (Length: {src_lengths[col]})")
    print(f"Target sequence {seq_no}: {trg_column} (Length: {trg_lengths[col]})")
    print("------")

Source sequence 1: tensor([ 1617,    37,  2878, 39069,    22,  1522, 13592,    22,    16,  8139,
         1123,    33,    16, 39070,    22,  1170,    24,  1161,     5,  2024,
           54,    75,  8817,    57,    37,  9172, 39071, 39072,    22,    12,
           16,  2633,    33,    16,   934,    18,  1103,    77, 39073,    37,
          282,   619,  8784, 19585,   467,    16,  1999,    33,   170,  1287,
         4477,    44]) (Length: 52)
Target sequence 1: tensor([ 3703, 13029,    16,  8156,    20,   301,    30, 21996,    11,   277,
           16,  2139,    30,  1205,    11,  3597,     5,  9024,    16,  1667,
          239, 25889, 42012, 42013,    13,   751,  1575,   495,    20,    37,
         9024,   933, 42014,    30,  4102,    77, 42015,    13,   176,  1829,
           20,    37,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1]) (Length: 42)
------
Source sequence 2: tensor([ 6087

In [16]:
# Pull one batch from a fresh iterator over the dataloader
src_batch, src_lengths, trg_batch, trg_lengths = next(iter(dataloader))

# Map the first three columns back to text through each language's
# sentence_from_idx; whole columns are decoded, padding positions included
for col in range(3):
    seq_no = col + 1
    src_ids = src_batch[:, col].tolist()
    trg_ids = trg_batch[:, col].tolist()
    src_sent = dataset.source_lang.sentence_from_idx(src_ids)
    trg_sent = dataset.target_lang.sentence_from_idx(trg_ids)

    print(f"Source sentence {seq_no}: {src_sent}")
    print(f"Target sentence {seq_no}: {trg_sent}")
    print("------")


Source sentence 1: Most 6 published clinical studies that have demonstrated reductions in HAIs with the use of alcoholbased hand rubs have been associated with products that contain at least 70 % alcohol ( isopropanol ) , 0.5 % chlorhexidine and a skin emollient ( Grayson and Russo , 2009 ) .
Target sentence 1: अधिकांश 6 प्रकाशित नैदानिक ​​अध्ययनों ने एचएआई में अल्कोहलयुक्त हैंड रब के उपयोग के साथ कटौती का प्रदर्शन किया है , जो ऐसे उत्पादों से जुड़े हैं जिनमें कम से कम 70 % अल्कोहल ( आइसोप्रोपेनॉल ) , 0.5 % क्लोरहेक्सिडिन और एक त्वचा इमोलिएंट ( ग्रेसन और रूसो , 2009 ) है । <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
------
Source sentence 2: In a survey among 205 children of a school , 68 percent had difficulty in keeping up with the studies after the tragedy , 48 percent of them had been rendered unconscious by inhaling leaked methyl isocyanine gas . <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
Target sentence 2: यूनियन कार्बाइड कारखाने से लगी हुई एक कॉल