# Translator using Bi-gram Model

## Prepare everything!

In [5]:
import nltk
import pandas as pd
import numpy as np
import hazm
import matplotlib.pyplot as plt
from deep_translator import GoogleTranslator
from nltk.corpus import reuters, stopwords
from nltk.stem import PorterStemmer
import itertools

from LoadData import LoadData
from WordCounts import WordCounts
from BigramProbability import BigramProbability

# Fetch the NLTK resources this notebook relies on. Each call only reports
# "already up-to-date" when the package is present locally.
# NOTE(review): 'punkt'/'punkt_tab' are presumably needed by
# nltk.word_tokenize, which is called in later cells -- confirm.
nltk.download('punkt')
nltk.download('punkt_tab')
# Corpus imported above as `reuters`.
nltk.download('reuters')
# Word lists behind the `stopwords` import above.
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Amirhosein\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Amirhosein\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\Amirhosein\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Amirhosein\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Preview the raw text of the first Reuters document.
# `next(..., None)` replaces the `for ... break` idiom and leaves nothing to
# print when the corpus has no files.
file_id = next(iter(reuters.fileids()), None)

if file_id is not None:
    # Close the corpus stream deterministically instead of leaving it to the
    # garbage collector.
    with reuters.open(file_id) as stream:
        txt = stream.read()
    print(txt)

ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT
  Mounting trade friction between the
  U.S. And Japan has raised fears among many of Asia's exporting
  nations that the row could inflict far-reaching economic
  damage, businessmen and officials said.
      They told Reuter correspondents in Asian capitals a U.S.
  Move against Japan might boost protectionist sentiment in the
  U.S. And lead to curbs on American imports of their products.
      But some exporters said that while the conflict would hurt
  them in the long-run, in the short-term Tokyo's loss might be
  their gain.
      The U.S. Has said it will impose 300 mln dlrs of tariffs on
  imports of Japanese electronics goods on April 17, in
  retaliation for Japan's alleged failure to stick to a pact not
  to sell semiconductors on world markets at below cost.
      Unofficial Japanese estimates put the impact of the tariffs
  at 10 billion dlrs and spokesmen for major electronics firms
  said they would virtually halt exports

In [3]:
# Load the corpus through the project helper; the result is used below as a
# sliceable sequence of tokens.
doc = LoadData()

doc_size = len(doc)
leading_words = doc[:10]

# Report the corpus size and show the first ten tokens as a sanity check.
print("length of doc:", doc_size, "word ")
print("first 10 word:\n", leading_words)

length of doc: 983789 word 
first 10 word:
 ['<START>', 'asian', 'export', 'fear', 'damag', 'japan', 'rift', 'mount', 'trade', 'friction']


## Build Model

In [4]:
# `WordCounts` returns two tables: `swc` (iterated as single words in a later
# cell) and `pwc` (keyed by word pairs, as the preview below shows).
swc, pwc = WordCounts(doc)

print("words:", "\t\t\t  count:")

# Preview the first six pair counts.
preview_pairs = list(itertools.islice(pwc, 6))
for pair in preview_pairs:
    print(pair, "\t\t", pwc[pair])

words: 			  count:
('<START>', 'asian') 		 4
('asian', 'export') 		 1
('export', 'fear') 		 3
('fear', 'damag') 		 1
('damag', 'japan') 		 1
('japan', 'rift') 		 1


In [6]:
# Turn the raw counts into unigram and bigram probability tables.
unigram_prob, bigram_prob = BigramProbability(swc, pwc)

print("bigram:", "\t\t  probability:")

# Show only the first six bigram entries.
for pair in itertools.islice(bigram_prob, 6):
    print(pair, "\t\t", bigram_prob[pair])

bigram: 		  probability:
('<START>', 'asian') 		 7.31047591198187e-05
('asian', 'export') 		 0.015384615384615385
('export', 'fear') 		 0.0010893246187363835
('fear', 'damag') 		 0.006211180124223602
('damag', 'japan') 		 0.0038022813688212928
('japan', 'rift') 		 0.0005291005291005291


In [9]:
# Calculate the likelihood matrix.
#
# For every ordered pair (a, b) of words in `swc`:
#     likelyhood[(a, b)] = bigram_prob[(a, b)] / unigram_prob[a]
# A pair absent from `bigram_prob` counts as 0, and so does every pair whose
# first word has a zero (or missing) unigram probability.
likelyhood = {}

vocabulary = list(swc)

for a in vocabulary:
    # Depends only on `a`, so look it up once per outer iteration. A missing
    # entry is treated like a zero probability instead of dividing by None.
    prob_a = unigram_prob.get(a, 0)
    for b in vocabulary:
        prob_ab = bigram_prob.get((a, b), 0)
        likelyhood[(a, b)] = prob_ab / prob_a if prob_a else 0

In [10]:
# Print a header and then the first six entries of the likelihood table.
print("bigram:", "\t\t likelyhood:")

for position, pair in enumerate(likelyhood):
    if position > 5:
        break
    print(pair, "\t\t", likelyhood[pair])

bigram: 		 likelyhood:
('<START>', '<START>') 		 0.0
('<START>', 'asian') 		 0.0013144173161365475
('<START>', 'export') 		 0.06473505281972496
('<START>', 'fear') 		 0.004271856277443779
('<START>', 'damag') 		 0.002628834632273095
('<START>', 'japan') 		 0.10219594632961655


## Translation

In [20]:
# Source sentence fed to the translation cells below (the translator there is
# configured with source='fa').
# NOTE(review): this name shadows the builtin `input()` for the rest of the
# session; a later cell reads it, so rename it in both places together.
input = "پردازش زبان طبیعی رشته‌ای از هوش مصنوعی است."

In [21]:
# Normalise the source sentence with hazm, then split it into word tokens.
normalizer = hazm.Normalizer()
input_words = hazm.word_tokenize(normalizer.normalize(input))

# Bare expression so the notebook displays the token list.
input_words

['پردازش', 'زبان', 'طبیعی', 'رشته\u200cای', 'از', 'هوش', 'مصنوعی', 'است', '.']

In [24]:
# Translate each source word on its own, then reduce the English results to
# stems, bracketed by sentence markers.
translator = GoogleTranslator(source='fa', target='en')
stemmer = PorterStemmer()

# Words for which the translator returns None are dropped.
translated = (translator.translate(word) for word in input_words)
eng_tokens = [text for text in translated if text is not None]

tokens = ['<START>']
for text in eng_tokens:
    # Tokenise first and stem each token. Stemming the whole translated
    # string treats a multi-word translation as one word, so only its final
    # word would be stemmed.
    tokens.extend(stemmer.stem(word) for word in nltk.word_tokenize(text))
tokens.append('<END>')

# Bare expression so the notebook displays the token list.
tokens

['<START>',
 'process',
 'languag',
 'natur',
 'string',
 'from',
 'intellig',
 'artifici',
 'is',
 '<END>']

In [27]:
# Collect the stemmed vocabulary of the first ten Reuters documents and give
# every token an integer index.
fileids = reuters.fileids()
uniq_tokens = set()

for file_id in fileids[:10]:
    for w in reuters.words(file_id):
        uniq_tokens.update(nltk.word_tokenize(stemmer.stem(w)))

# Indices 0 and 1 are reserved for the sentence markers.
reverse_idx = {'<START>': 0, '<END>': 1}

# Number the tokens in sorted order. Iterating the set directly follows string
# hashes, which are randomised per interpreter run, so the same token would
# get a different index on every run.
reverse_idx.update(
    (token, idx) for idx, token in enumerate(sorted(uniq_tokens), start=2)
)

# First unused index, kept for any code that extends the table later.
next_idx = 2 + len(uniq_tokens)

In [28]:
def _ordering_score(sequence):
    """Return the product of bigram likelihoods over adjacent word pairs.

    Pairs in which either word is absent from ``reverse_idx`` are skipped, so
    an out-of-vocabulary word does not force the whole score to zero. A known
    pair that is missing from ``likelyhood`` contributes 0.
    """
    score = 1.0
    for current, following in zip(sequence, sequence[1:]):
        # Membership test rather than truthiness: '<START>' maps to index 0,
        # which a falsy check would wrongly treat as unknown.
        if current not in reverse_idx or following not in reverse_idx:
            continue
        # `likelyhood` is keyed by (word, word) tuples, so it must be queried
        # with the words themselves; integer index pairs never match a key.
        score *= likelyhood.get((current, following), 0)
    return score


# Only the words between the sentence markers are reordered. '<START>' and
# '<END>' stay at the ends, which also shrinks the search from n! to (n - 2)!
# candidate orderings.
head, *inner, tail = tokens
permutations = [(head, *order, tail) for order in itertools.permutations(inner)]
probs = np.array([_ordering_score(candidate) for candidate in permutations])

# np.argmax returns the first maximum, so ties resolve to the earliest
# candidate; the first candidate is the input order.
z = np.argmax(probs)
' '.join(permutations[z])

'<START> process languag natur string from intellig artifici is <END>'