In [10]:
import random

In [40]:
"""Utility for context-aware removal of ZWNJ and ZWJ in Bangla text.
"""

from __future__ import unicode_literals
import io
import re


# One or more ZWNJ/ZWJ characters sitting between RA (U+09B0) and the
# sequence HASANTA + YA (U+09CD U+09AF).
STANDARDIZE_ZW = re.compile(r'(?<=\u09b0)[\u200c\u200d]+(?=\u09cd\u09af)')
# A single ZWNJ/ZWJ that is neither preceded by RA nor followed by
# HASANTA + YA.
DELETE_ZW = re.compile(r'(?<!\u09b0)[\u200c\u200d](?!\u09cd\u09af)')


def RemoveOptionalZW(text):
    """Normalise zero-width (non-)joiners in ``text``.

    Two passes are applied in order:

    1. Every run of ZWNJ/ZWJ between RA and HASANTA + YA is collapsed to a
       single ZWJ (U+200D).
    2. Every ZWNJ/ZWJ that is neither preceded by RA nor followed by
       HASANTA + YA is removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    standardized = STANDARDIZE_ZW.sub('\u200D', text)
    return DELETE_ZW.sub('', standardized)

In [6]:
googlex = {}  # Google lexicon: word -> pronunciation

# The lexicon contains Bangla text, so decode it explicitly instead of relying
# on the platform's default encoding (the file is assumed to be UTF-8).
with open('lexicon.tsv', encoding='utf-8') as f:
    for line in f:
        # Lines starting with '#' are skipped.
        if line.startswith('#'):
            continue

        items = line.strip().split('\t')
        if len(items) >= 2:
            # Only the first two tab-separated fields are used. A word that
            # occurs more than once keeps the pronunciation seen last.
            word, pron = items[0], items[1]
            googlex[word] = pron

print('total words in google lexicon:', len(googlex))

total words in google lexicon: 64970


In [7]:
sustlex = {}  # SUST lexicon: word -> pronunciation

# The lexicon contains Bangla text, so decode it explicitly instead of relying
# on the platform's default encoding (the file is assumed to be UTF-8).
with open('bn_lexicon_final_sorted.txt', encoding='utf-8') as f:
    for line in f:
        items = line.strip().split()

        # Report and skip any line that does not have exactly two
        # whitespace-separated fields.
        if len(items) != 2:
            print(items)
            continue

        word, pron = items
        sustlex[word] = pron

print('total words in sust lexicon:', len(sustlex))

total words in sust lexicon: 135760


In [41]:
trans = {}  # transcription map: source symbol -> space-joined target symbols

# The map contains Bangla text, so decode it explicitly instead of relying on
# the platform's default encoding (the file is assumed to be UTF-8).
with open('conversion_map.txt', encoding='utf-8') as f:
    for line in f:
        items = line.strip().split()
        if len(items) >= 2:
            # The key is passed through RemoveOptionalZW; every remaining
            # field on the line becomes part of the mapped value.
            trans[RemoveOptionalZW(items[0])] = ' '.join(items[1:])

print('total transcription rules:', len(trans))

total transcription rules: 56


* Issues to handle:

** The implicit (inherent) vowel O is not emitted yet. Example: ক -> k O

In [53]:
def transcribe(word, rules=None):
    """Transcribe ``word`` character by character using a lookup table.

    Args:
        word: The string to transcribe.
        rules: Optional mapping from a single character to its transcription.
            When omitted, the module-level ``trans`` dictionary is used.

    Returns:
        The transcriptions of the individual characters joined by single
        spaces. Characters with no entry in the table are skipped.
    """
    if rules is None:
        rules = trans

    phonemes = []
    i = 0
    n = len(word)

    while i < n:
        # HASANTA + YA + AA-kar (U+09CD U+09AF U+09BE, i.e. "্যা") is emitted
        # as the single symbol 'E' and consumes all three code points.
        if word.startswith('\u09cd\u09af\u09be', i):
            phonemes.append('E')
            i += 3
            continue

        c = word[i]
        # A HASANTA (U+09CD) in word-final position produces no output.
        is_final_hasanta = c == '\u09cd' and i == n - 1
        if c in rules and not is_final_hasanta:
            phonemes.append(rules[c])

        i += 1

    return ' '.join(phonemes)


In [49]:
# Scratch cell: split a sample pronunciation string into its individual code
# points to see what transcribe() iterates over.
word = 'পাকস্থলির্'
[ch for ch in word]

['প', 'া', 'ক', 'স', '্', 'থ', 'ল', 'ি', 'র', '্']

In [47]:
# random.sample() requires a sequence. A dict items view is set-like and is
# rejected with TypeError on Python 3.11+, so materialise it as a list first.
sample_sustlex = random.sample(list(sustlex.items()), 20)

In [48]:
# For every sampled SUST entry that also exists in the Google lexicon, print
# the word, the SUST pronunciation, the Google pronunciation, and the output
# of transcribe() applied to the SUST pronunciation.
for sust_word, sust_pron in sample_sustlex:
    if sust_word not in googlex:
        continue
    transcription = transcribe(sust_pron)
    print(sust_word, '\t', sust_pron, '\t', googlex[sust_word], '\t', transcription)

ক্যাম্পাস 	 ক্যাম্পাস্ 	 k E m . p a s 	 k E m . p a s
থামলেন 	 থাম্লেন্ 	 th a m . l e n 	 th a m . l e n
পাকস্থলীর 	 পাকস্থলির্ 	 p a . k o s . th o . l i r 	 p a k s . th l i r
আওতা 	 আও্তা 	 a o^ . t a 	 a o . t a
ক্যাভিটি 	 ক্যাভিটি 	 k E . bh i . T i 	 k E bh i T i


In [50]:
# Scratch cell: initialise two names to the empty string.
a = ''
b = ''

In [52]:
# Scratch cell: display the current value of `b`.
b

''