In [1]:
# Colab-only: mount Google Drive at /content/drive; the training data is read
# from a path below this mount point.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
import json

# The corpus contains Indic-script text; state the encoding explicitly so the
# read does not depend on the platform's default locale encoding.
with open("/content/drive/MyDrive/neural_machine_translation/train_data1.json", "r", encoding="utf-8") as file:
  data = json.load(file)

In [3]:
# Top-level keys of the corpus are the language-pair names.
for pair_name in data:
  print(pair_name)

English-Bengali
English-Gujarati
English-Hindi
English-Kannada
English-Malayalam
English-Tamil
English-Telgu


In [4]:
# English-Hindi: collect the training sentences, their translations and ids.
eng_hi_source_sent_train = []
eng_hi_target_sent_train = []
eng_hi_id_train = []

# Look the pair up directly instead of scanning every language pair; if the key
# is absent the three lists simply stay empty, as before.
lang_pair = "English-Hindi"
if lang_pair in data:
  lang_data = data[lang_pair]
  print(f"Language pair: {lang_pair}")
  for d_type, d_entry in lang_data.items():
    print(f"  Data type: {d_type}")
    # Only the "Train" split feeds these lists; decide once per split rather
    # than once per sentence.
    if d_type != "Train":
      continue
    # Named `sent_id` (not `id`) so the builtin id() is not shadowed for the
    # remainder of the kernel session.
    for sent_id, pair in d_entry.items():
      eng_hi_source_sent_train.append(pair["source"])
      eng_hi_target_sent_train.append(pair["target"])
      eng_hi_id_train.append(sent_id)

Language pair: English-Hindi
  Data type: Train


In [5]:
# Sentences and ids are appended together, so the two counts are expected to match.
print("number of sentence pair: {}, {}".format(len(eng_hi_source_sent_train), len(eng_hi_id_train)))

number of sentence pair: 80797, 80797


In [6]:
# First aligned pair: English source, then its Hindi target.
print(eng_hi_source_sent_train[0], eng_hi_target_sent_train[0], sep="\n")

cancel everything on my calendar
मेरे कैलेंडर पर सब कुछ रद्द करें


In [7]:
!git clone "https://github.com/anoopkunchukuttan/indic_nlp_library"
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git
!pip install Morfessor

Cloning into 'indic_nlp_library'...
remote: Enumerating objects: 1396, done.[K
remote: Counting objects: 100% (177/177), done.[K
remote: Compressing objects: 100% (69/69), done.[K
remote: Total 1396 (delta 133), reused 119 (delta 105), pack-reused 1219[K
Receiving objects: 100% (1396/1396), 9.57 MiB | 5.38 MiB/s, done.
Resolving deltas: 100% (743/743), done.
Cloning into 'indic_nlp_resources'...
remote: Enumerating objects: 139, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 139 (delta 2), reused 2 (delta 0), pack-reused 126[K
Receiving objects: 100% (139/139), 149.77 MiB | 6.51 MiB/s, done.
Resolving deltas: 100% (53/53), done.
Updating files: 100% (28/28), done.
Collecting Morfessor
  Downloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Installing collected packages: Morfessor
Successfully installed Morfessor-2.0.6


In [8]:
# Locations of the two repositories cloned into /content by the previous cell.
INDIC_NLP_LIB_HOME = "/content/indic_nlp_library"  # library source, added to sys.path below
INDIC_NLP_RESOURCES = "/content/indic_nlp_resources"  # passed to common.set_resources_path below

In [9]:
import sys

# Make the cloned library importable. The membership check keeps repeated runs
# of this cell from appending the same entry again.
if INDIC_NLP_LIB_HOME not in sys.path:
  sys.path.append(INDIC_NLP_LIB_HOME)

In [10]:
from indicnlp import common, loader
# Point the library at the cloned resources directory, then run its loader.
# NOTE(review): presumably this must happen before other indicnlp modules are
# used — confirm against the library documentation.
common.set_resources_path(INDIC_NLP_RESOURCES)
loader.load()

### TEXT NORMALIZATION, SENTENCE TOKENIZATION and WORD TOKENIZATION

In [11]:
from indicnlp.normalize.indic_normalize import BaseNormalizer, IndicNormalizerFactory

# Two ways of obtaining a Hindi normalizer, both asked to strip nuktas.
factory = IndicNormalizerFactory()
fact_norm = factory.get_normalizer("hi", remove_nuktas=True)
base_norm = BaseNormalizer("hi", remove_nuktas=True)

# Precomposed QA (U+0958), a space, then KA followed by a combining nukta
# (U+0915 U+093C): the same letter in two encodings.
input_text = "\u0958 \u0915\u093c"
output_fact_text = fact_norm.normalize(input_text)
output_base_text = base_norm.normalize(input_text)

print("Input: {}".format(input_text))
print("Factory normalize: {}".format(output_fact_text))
print("Base normalize: {}".format(output_base_text))

Input: क़ क़
Factory normalize: क क
Base normalize: क़ क़


In [12]:
from indicnlp.tokenize import sentence_tokenize

# One paragraph written across three source lines; the trailing backslashes
# join them so the string itself contains no newlines.
hindi_string = """तो क्या विश्व कप 2019 में मैच का बॉस टॉस है? यानी मैच में हार-जीत में \
टॉस की भूमिका अहम है? आप ऐसा सोच सकते हैं। विश्वकप के अपने-अपने पहले मैच में बुरी तरह हारने वाली एशिया की दो टीमों \
पाकिस्तान और श्रीलंका के कप्तान ने हालांकि अपने हार के पीछे टॉस की दलील तो नहीं दी, लेकिन यह जरूर कहा था कि वह एक अहम टॉस हार गए थे।"""

# Split the paragraph into sentences and show one per line.
sents = sentence_tokenize.sentence_split(hindi_string, "hi")
for sentence in sents:
  print(sentence)

तो क्या विश्व कप 2019 में मैच का बॉस टॉस है?
यानी मैच में हार-जीत में टॉस की भूमिका अहम है?
आप ऐसा सोच सकते हैं।
विश्वकप के अपने-अपने पहले मैच में बुरी तरह हारने वाली एशिया की दो टीमों पाकिस्तान और श्रीलंका के कप्तान ने हालांकि अपने हार के पीछे टॉस की दलील तो नहीं दी, लेकिन यह जरूर कहा था कि वह एक अहम टॉस हार गए थे।


In [13]:
from indicnlp.tokenize import indic_tokenize

hindi_string = 'सुनो, कुछ आवाज़ आ रही है। फोन?'
# Tokenize once; the same result serves the type display and the listing.
hindi_tokens = indic_tokenize.trivial_tokenize(hindi_string)

print(f'Input String: {hindi_string}')
print('Tokens: ')
print(type(hindi_tokens))
for tok in hindi_tokens:
  print(tok)

Input String: सुनो, कुछ आवाज़ आ रही है। फोन?
Tokens: 
<class 'list'>
सुनो
,
कुछ
आवाज़
आ
रही
है
।
फोन
?


In [14]:
guj_string = "વીતેલા દિવસોમાં આપણે કેટલાય ઉત્સવો ઉજવ્યા. હજી ગઇકાલે જ પૂરા હિંદુસ્તાનમાં શ્રીકૃષ્ણ જન્મોત્સવ ઉજવવામાં આવ્યો."
# Same tokenizer as for Hindi, applied to Gujarati text; tokenize once and
# show the list first, then one token per line.
guj_tokens = indic_tokenize.trivial_tokenize(guj_string)

print(f'Input String: {guj_string}')
print('Tokens: ')
print(guj_tokens)
for tok in guj_tokens:
  print(tok)

Input String: વીતેલા દિવસોમાં આપણે કેટલાય ઉત્સવો ઉજવ્યા. હજી ગઇકાલે જ પૂરા હિંદુસ્તાનમાં શ્રીકૃષ્ણ જન્મોત્સવ ઉજવવામાં આવ્યો.
Tokens: 
['વીતેલા', 'દિવસોમાં', 'આપણે', 'કેટલાય', 'ઉત્સવો', 'ઉજવ્યા', '.', 'હજી', 'ગઇકાલે', 'જ', 'પૂરા', 'હિંદુસ્તાનમાં', 'શ્રીકૃષ્ણ', 'જન્મોત્સવ', 'ઉજવવામાં', 'આવ્યો', '.']
વીતેલા
દિવસોમાં
આપણે
કેટલાય
ઉત્સવો
ઉજવ્યા
.
હજી
ગઇકાલે
જ
પૂરા
હિંદુસ્તાનમાં
શ્રીકૃષ્ણ
જન્મોત્સવ
ઉજવવામાં
આવ્યો
.


In [15]:
from indicnlp.tokenize import indic_detokenize

# Input with every token, punctuation included, separated by spaces.
indic_string='" सुनो , कुछ आवाज़ आ रही है . " , उसने कहा । '

print('Input String: {}'.format(indic_string))
# Detokenize for Hindi; the output shows punctuation re-attached to the words.
output_string = indic_detokenize.trivial_detokenize(indic_string, lang='hi')
print('Detokenized String: {}'.format(output_string))

Input String: " सुनो , कुछ आवाज़ आ रही है . " , उसने कहा । 
Detokenized String: "सुनो, कुछ आवाज़ आ रही है.", उसने कहा। 


### Transliteration

In [16]:
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator

# Script conversion from language code "ml" to language code "ta".
# Other inputs that were tried here: 'राजस्थान', 'රාජස්ථාන'
input_text = 'രാജസ്ഥാന'
converted_text = UnicodeIndicTransliterator.transliterate(input_text, "ml", "ta")
print(converted_text)

ராஜஸ்தாந


In [17]:
from indicnlp.transliterate.unicode_transliterate import ItransTransliterator

# Native script -> ITRANS romanization. (An earlier trial input: 'राजस्थान'.)
lang = 'ta'
input_text = 'ஆசிரியர்கள்'

itrans_text = ItransTransliterator.to_itrans(input_text, lang)
print(itrans_text)

aachiriyarkald


In [18]:
from indicnlp.transliterate.unicode_transliterate import ItransTransliterator

# Opposite direction: ITRANS romanization -> native script for language 'hi'.
lang = 'hi'
input_text = 'kahaa jaanaa hai?'

x = ItransTransliterator.from_itrans(input_text, lang)
print(x)

कहा जाना है?


LEXICAL SIMILARITY

In [19]:
from indicnlp.script import indic_scripts as isc
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator

lang1, lang2 = 'hi', 'gu'
lang1_str = 'पिछले दिनों हम लोगों ने कई उत्सव मनाये. कल, हिन्दुस्तान भर में श्री कृष्ण जन्म-महोत्सव मनाया गया.'
lang2_str = 'વીતેલા દિવસોમાં આપણે કેટલાય ઉત્સવો ઉજવ્યા. હજી ગઇકાલે જ પૂરા હિંદુસ્તાનમાં શ્રીકૃષ્ણ જન્મોત્સવ ઉજવવામાં આવ્યો.'

# lcsr_indic returns three values; only the first (the ratio) is displayed.
# NOTE(review): len1/len2 are presumably the compared string lengths — confirm.
lcsr, len1, len2 = isc.lcsr_indic(lang1_str, lang2_str, lang1, lang2)

# Render the second string in the first language's script so both lines can be
# compared by eye.
lang2_in_lang1_script = UnicodeIndicTransliterator.transliterate(lang2_str, lang2, lang1)

print(f'{lang1} string: {lang1_str}')
print(f'{lang2} string: {lang2_in_lang1_script}')
print('Both strings are shown in Devanagari script using script conversion for readability.')
print(f'LCSR: {lcsr}')

hi string: पिछले दिनों हम लोगों ने कई उत्सव मनाये. कल, हिन्दुस्तान भर में श्री कृष्ण जन्म-महोत्सव मनाया गया.
gu string: वीतेला दिवसोमां आपणे केटलाय उत्सवो उजव्या. हजी गइकाले ज पूरा हिंदुस्तानमां श्रीकृष्ण जन्मोत्सव उजववामां आव्यो.
Both strings are shown in Devanagari script using script conversion for readability.
LCSR: 0.5545454545454546


SYLLABIFICATION

In [20]:
from indicnlp.syllable import syllabifier

lang = 'hi'
w = 'जगदीशचंद्र'

# Break the word into orthographic syllables and show them space-separated.
syllables = syllabifier.orthographic_syllabify(w, lang)
print(' '.join(syllables))

ज ग दी श च ंद्र


WORD-SEGMENTATION (MORPHOLOGY)

In [20]:
from indicnlp.morph import unsupervised_morph
from indicnlp import common

# Unsupervised morphological analyzer for language code 'mr' (Marathi).
analyzer = unsupervised_morph.UnsupervisedMorphAnalyzer('mr')

indic_string = 'आपल्या हिरड्यांच्या आणि दातांच्यामध्ये जीवाणू असतात .'

# The analyzer takes a list of words, so split the sentence on single spaces.
words = indic_string.split(' ')
analyzes_tokens = analyzer.morph_analyze_document(words)

for segment in analyzes_tokens:
    print(segment)

In [21]:
# Install spaCy and download the en_core_web_sm model that is loaded with
# spacy.load("en_core_web_sm") in the text-processing section.
# NOTE(review): neither install is version-pinned; the recorded output shows
# en-core-web-sm 3.6.0 — consider pinning for reproducibility.
!pip install spacy
!python -m spacy download en_core_web_sm

2023-09-08 13:34:23.302935: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


## TEXT PROCESSING

### English

In [30]:
import spacy
# Module-level spaCy pipeline; the Language class below calls eng_nlp.tokenizer
# for English text.
eng_nlp = spacy.load("en_core_web_sm")
# Tokenizer used by the Language class for every non-English language code.
from indicnlp.tokenize import indic_tokenize

class Language:
  """Word-level vocabulary for one language of a translation pair.

  Keeps a word -> index map, the inverse index -> word map, the vocabulary
  size and a per-word frequency count. Indices 0, 1 and 2 are reserved for the
  special tokens <SOS>, <EOS> and <UNK>.

  Text is tokenized with the module-level spaCy pipeline `eng_nlp` when the
  language code is "en", and with `indic_tokenize.trivial_tokenize` for any
  other code.

  Instances of this class are pickled and reloaded elsewhere in the notebook,
  so the set of instance attributes is kept exactly as it was; new behavior is
  added through methods only.
  """

  def __init__(self, lang="en"):
    """Create a vocabulary that holds only the three special tokens.

    Args:
      lang: language code; "en" selects the spaCy tokenizer, anything else the
        Indic tokenizer.
    """
    self.language = lang
    self.word2idx = {"<SOS>": 0, "<EOS>": 1, "<UNK>": 2}
    self.idx2word = {0: "<SOS>", 1: "<EOS>", 2: "<UNK>"}
    self.n_words = 3
    # Frequencies of words passed to add_word; the special tokens start
    # without an entry.
    self.word_count = {}

  def _tokenize(self, sent):
    """Return the list of tokens of `sent` for this vocabulary's language."""
    if self.language == "en":
      return [token.text for token in eng_nlp.tokenizer(sent)]
    return list(indic_tokenize.trivial_tokenize(sent))

  def add_word(self, word):
    """Register one occurrence of `word`, assigning a new index if unseen."""
    if word not in self.word2idx:
      self.word2idx[word] = self.n_words
      self.idx2word[self.n_words] = word
      self.n_words += 1
    # .get() rather than `+= 1`: the special tokens are in word2idx from the
    # start but have no word_count entry, so the old `else` branch raised
    # KeyError when one of them was added.
    self.word_count[word] = self.word_count.get(word, 0) + 1

  def add_sentence(self, sent):
    """Tokenize `sent` and add every token to the vocabulary."""
    for token in self._tokenize(sent):
      self.add_word(token)

  def idx_from_sentence(self, sent):
    """Tokenize `sent` and return the list of vocabulary indices.

    Tokens that are not in the vocabulary map to the index of <UNK>.
    """
    unk_idx = self.word2idx["<UNK>"]
    return [self.word2idx.get(token, unk_idx) for token in self._tokenize(sent)]

  def sentence_from_idx(self, indices):
    """Join the words for `indices` with single spaces.

    This is a plain join, not a detokenizer: tokens such as "n't" stay
    separated from the preceding word.

    Raises:
      KeyError: if an index is not present in the vocabulary.
    """
    return ' '.join(self.idx2word[idx] for idx in indices)

In [31]:
# Fresh vocabularies holding only the special tokens. Building them from the
# training sentences is left disabled below; populated instances are read back
# from pickle files further down in the notebook.
en_lang, hi_lang = Language(lang="en"), Language(lang="hi")
# for sent in eng_hi_source_sent_train:
#   en_lang.add_sentence(sent)
# for sent in eng_hi_target_sent_train:
#   hi_lang.add_sentence(sent)

In [32]:
# NOTE(review): with the vocabulary-building loops in the preceding cell
# commented out, a fresh run prints 3 for both languages; the sizes recorded in
# this cell's output presumably come from an earlier session — confirm.
print(
  f"Vocab size of english train set in eng_hi: {en_lang.n_words}",
  f"Vocab size of hindi train set in eng_hi: {hi_lang.n_words}",
  sep="\n",
)

Vocab size of english train set in eng_hi: 67284
Vocab size of hindi train set in eng_hi: 75578


In [33]:
import pickle

# # Save the en_lang instance
# with open('/content/drive/MyDrive/neural_machine_translation/saves/language_instances/en_lang.pkl', 'wb') as f:
#     pickle.dump(en_lang, f)

# # Save the hi_lang instance
# with open('/content/drive/MyDrive/neural_machine_translation/saves/language_instances/hi_lang.pkl', 'wb') as f:
#     pickle.dump(hi_lang, f)


In [34]:
# Restore the previously saved vocabularies instead of rebuilding them.
# The Language class must already be defined in this session for unpickling to
# work, and pickle files should only be loaded from a trusted location.
import pickle

# English vocabulary
with open('/content/drive/MyDrive/neural_machine_translation/saves/language_instances/en_lang.pkl', 'rb') as en_file:
    en_lang = pickle.load(en_file)

# Hindi vocabulary
with open('/content/drive/MyDrive/neural_machine_translation/saves/language_instances/hi_lang.pkl', 'rb') as hi_file:
    hi_lang = pickle.load(hi_file)

In [35]:
# Spot-check of the restored English vocabulary: the word stored at index 156.
en_lang.idx2word[156]

'inscriptions'

In [36]:
# Spot-check of the restored Hindi vocabulary: the word stored at index 156.
hi_lang.idx2word[156]

'पद'

In [None]:
# Vocabulary sizes of en_lang / hi_lang as they stand after the pickle load above.
print(
  f"Vocab size of english train set in eng_hi: {en_lang.n_words}",
  f"Vocab size of hindi train set in eng_hi: {hi_lang.n_words}",
  sep="\n",
)

Vocab size of english train set in eng_hi: 67284
Vocab size of hindi train set in eng_hi: 75578


In [None]:
# Sample English training sentence used by the cells that follow.
eng_hi_source_sent_train[1000]

"i don't need light"

In [None]:
# spaCy tokenization of the sample sentence; the output shows the contraction
# "don't" split into two tokens.
[t.text for t in eng_nlp.tokenizer(eng_hi_source_sent_train[1000])]

['i', 'do', "n't", 'need', 'light']

In [None]:
# Go through the vocabulary's own lookup so tokenization matches what the
# Language class does, and an out-of-vocabulary token maps to <UNK> instead of
# raising KeyError.
for token_idx in en_lang.idx_from_sentence(eng_hi_source_sent_train[1000]):
  print(token_idx)

513
414
415
110
910


In [29]:
# Round trip sentence -> indices -> sentence using the Language API on both
# legs; idx_from_sentence maps out-of-vocabulary tokens to <UNK> rather than
# raising KeyError as the hand-written lookup did.
en_lang.sentence_from_idx(en_lang.idx_from_sentence(eng_hi_source_sent_train[1000]))

"i do n't need light"