# English to Marathi Neural Machine Translation

## 1. Tokenizer & Dataset Utilities

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the parallel corpus from disk. Judging by the preview that follows,
# `src` holds English sentences and `tgt` holds their Marathi translations.
corpus_csv = "samantar_dataset.csv"
dataset = pd.read_csv(corpus_csv)

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
dataset.head(n=5)

Unnamed: 0,src,tgt
0,Political suggestions,राजकीय पक्षांच्या सूचना
1,He said this in a public meeting.,यावेळी सभेत बोलताना त्यांनी ही घोषणा केली.
2,Few close friends and family members attended ...,तसेच त्यांच्या लग्नासोहळ्यासाठी काही नातेवाईक ...
3,Supreme Court closes a contempt plea filed by ...,‘चौकीदार चोर है’ या राहुल गांधी याच्या वक्तव्य...
4,The growth of our economy depends on PSBs abil...,सार्वजनिक क्षेत्रातील बँकांची बाजारात वित्त पु...


In [4]:
import sentencepiece as spm

In [6]:
# Dump every sentence (English and Marathi interleaved, one per line) into a
# single text file that is later fed to the SentencePiece trainer.
#
# pandas reads empty CSV cells as float NaN, and NaN has no `.strip()`, so rows
# with a missing side are dropped first; `str(...)` guards against any other
# non-string cell (e.g. a sentence that was parsed as a number).
sentence_pairs = dataset[["src", "tgt"]].dropna()

with open("smp_input.txt", "w", encoding="utf-8") as f:
    for en, mr in zip(sentence_pairs["src"], sentence_pairs["tgt"]):
        f.write(str(en).strip() + "\n")
        f.write(str(mr).strip() + "\n")

In [7]:
import sentencepiece as spm

# Train one shared subword model on the combined English + Marathi text file.
# Output files are written next to the notebook using `model_prefix`
# (the later cells read `en_mr_unigram.model` and `en_mr_unigram.vocab`).
trainer_options = {
    "input": "smp_input.txt",
    "model_prefix": "en_mr_unigram",
    "vocab_size": 32000,            # total number of pieces, special tokens included
    "model_type": "unigram",        # unigram language-model segmentation
    "character_coverage": 0.9995,   # share of corpus characters the model must cover
    # Fixed ids for the special tokens: <pad>=0, <unk>=1, <s>=2, </s>=3.
    "pad_id": 0,
    "unk_id": 1,
    "bos_id": 2,
    "eos_id": 3,
}

spm.SentencePieceTrainer.train(**trainer_options)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: smp_input.txt
  input_format: 
  model_prefix: en_mr_unigram
  model_type: UNIGRAM
  vocab_size: 32000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 1
  bos_id: 2
  eos_id: 3
  pad_id: 0
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
 

In [8]:
# Load the trained tokenizer; `load` is left as the cell's last expression so
# its return value is displayed as the cell output.
tokenizer_model = "en_mr_unigram.model"
sp = spm.SentencePieceProcessor()
sp.load(tokenizer_model)

True

In [9]:
# Round-trip sanity check: encode a Marathi sentence to piece ids, then decode
# the ids back and print both so the two can be compared by eye.
sample_sentence = "वृत्तसंस्थेने दिलेल्या माहितीनुसार, विरोध करण्याऱ्या शेतकऱ्यांची दिशाभूल केली जात असल्याचा आरोप प्रगतीशील शेतकरी संघटना, सेनीपतचे अध्यक्ष कंवलसिंग चौहान यांनी केला."

tokens = sp.encode(sample_sentence, out_type=int)
print(tokens)

text = sp.decode(tokens)
print(text)

[24075, 1184, 2935, 5, 1399, 4686, 3754, 11607, 16264, 71, 340, 2098, 546, 2627, 3588, 1451, 3644, 5, 9903, 44, 8870, 35, 366, 10159, 9241, 4152, 9416, 70, 92, 4]
वृत्तसंस्थेने दिलेल्या माहितीनुसार, विरोध करण्याऱ्या शेतकऱ्यांची दिशाभूल केली जात असल्याचा आरोप प्रगतीशील शेतकरी संघटना, सेनीपतचे अध्यक्ष कंवलसिंग चौहान यांनी केला.


In [10]:
# Show the first 10 entries of the generated vocabulary file
# (each line is "<piece>\t<score>").
with open("en_mr_unigram.vocab", "r", encoding="utf-8") as f:
    # zip() against range(10) caps the preview at 10 lines and stops early if
    # the file is shorter, instead of printing empty strings past end of file.
    for _, line in zip(range(10), f):
        # Lines read from a file keep their trailing "\n"; strip it so print()
        # does not emit a blank line after every entry.
        print(line.rstrip("\n"))

<pad>	0

<unk>	0

<s>	0

</s>	0

.	-2.78317

,	-3.64751

▁the	-3.87328

▁of	-4.58389

▁आहे	-4.60313

▁	-4.68035

