# Indic Text Transliterator

To transliterate text in Indic languages from one script to another (including IPA and ISO transliteration schemes)

## SIL Converter
We use [SIL Converter](#) at this stage for the transliteration. Later we can use the same base rules for generating a node library and python library for the transliteration.

In [5]:
header_metadata = """
; This file was created by <beniza> using TECkitMappingEditorU.exe v4.0.0.0 on 12/17/2019.
;   Conversion Type = Legacy_to_from_Unicode
;   Left-hand side font = Gautami;18
;   Right-hand side font = Charis SIL;15.75
;   Main Window Position = 0,0,658,713
;   Left-hand side Character Map Window Position = 658,0,457,447
;   Right-hand side Character Map Window Position = 658,447,457,413
"""

In [6]:
header = """
EncodingName            "IndicTextTransliterator"
DescriptiveName         "A multi-directional transliteration tool for text in Indic Scripts"
Version                 "1"
Contact                 "mailto:beniza@gmail.com"
RegistrationAuthority   "New Life Computer Institute"
RegistrationName        "in.nlci.encodingconverter.indictransliterator"
Copyright               "© 2019 NLCI. CC-BY-SA."
LHSFlags                ()
RHSFlags                ()
"""

In [9]:
import unicodedata

In [45]:
def getScriptRange(scriptNum):
    '''Returns the start and end codepoint of the script specified by the scriptnum'''
    scriptStart = 2304 + scriptNum * 128 # Devanagari (the first of the Indic Script blocks). Each Indic Unicode block is of the size 128
    return(scriptStart, scriptStart+128)

In [31]:
def teckitizeChar(ch):
    '''returns the unicode name of `ch` in lower case, joined with underscores'''
    # Teckit maps requires unicode names in lower case and connected using underscores
    try:
        teckitizedName = "_".join(unicodedata.name(chr(ch)).lower().split())
    except:
        return False
    return teckitizedName

In [84]:
global reservedCodePoints
reservedCodePoints = []

In [87]:
def generateTeckitMap(scriptNum):
    '''Autogenerate the mappings according to the teckit language'''
    ipaMapping = {'telugu_letter_a':  'a','telugu_letter_aa':  'aː','telugu_letter_i':  'i',\
         'telugu_letter_ii':  'iː','telugu_letter_u':  'u','telugu_letter_uu':  'uː',\
         'telugu_letter_vocalic_r':  'r̯','telugu_letter_vocalic_l':  'l̯','telugu_letter_e':  'e',\
         'telugu_letter_ee':  'eː','telugu_letter_ai':  'aⁱ','telugu_letter_o':  'o',\
         'telugu_letter_oo':  'oː','telugu_letter_au':  'aᵘ',\
         'telugu_letter_ka':  'k', 'telugu_letter_kha':  'kʰ','telugu_letter_ga':  'ɡ','telugu_letter_gha':  'ɡʱa', 'telugu_letter_nga':  'ŋa',\
         'telugu_letter_ca':  'tʃa','telugu_letter_cha':  'tʃʰa','telugu_letter_ja':  'dʒa','telugu_letter_jha':  'dʒʱa','telugu_letter_nya':  'ɲa',\
         'telugu_letter_tta':  'ṭa','telugu_letter_ttha':  'ʈʰa','telugu_letter_dda':  'ɖa','telugu_letter_ddha':  'ɖʱa','telugu_letter_nna':  'ɳa',\
         'telugu_letter_ta':  'ta','telugu_letter_tha':  'tʰa','telugu_letter_da':  'da','telugu_letter_dha':  'dʱa','telugu_letter_na':  'na',\
         'telugu_letter_pa':  'pa','telugu_letter_pha':  'pʰa','telugu_letter_ba':  'ba','telugu_letter_bha':  'bʱa','telugu_letter_ma':  'ma',\
         'telugu_letter_ya':  'ya','telugu_letter_ra':  'ja','telugu_letter_rra':  'ra','telugu_letter_la':  'la',\
         'telugu_letter_lla':  'ɭa','telugu_letter_llla':  'ɽa','telugu_letter_va':  'ʋa','telugu_letter_sha':  'ʃa',\
         'telugu_letter_ssa':  'ʂa','telugu_letter_sa':  'sa','telugu_letter_ha':  'ha','telugu_sign_avagraha':  '',\
         'telugu_vowel_sign_aa':  'aː','telugu_vowel_sign_i':  'i','telugu_vowel_sign_ii':  'iː','telugu_vowel_sign_u':  'u',\
         'telugu_vowel_sign_uu':  'uː','telugu_vowel_sign_vocalic_r':  'r̯','telugu_vowel_sign_vocalic_rr':  'l̯',\
         'telugu_vowel_sign_e':  'e','telugu_vowel_sign_ee':  'eː','telugu_vowel_sign_ai':  'aⁱ','telugu_vowel_sign_o':  'o',\
         'telugu_vowel_sign_oo':  'oː','telugu_vowel_sign_au':  'aᵘ','telugu_sign_virama':  '','telugu_length_mark':  '',\
         'telugu_ai_length_mark':  '','telugu_letter_tsa':  '','telugu_letter_dza':  '','telugu_letter_rrra':  '',\
         'telugu_letter_vocalic_rr':  'r̯ː','telugu_letter_vocalic_ll':  'l̯ː','telugu_vowel_sign_vocalic_l':  'r̯ː',\
         'telugu_vowel_sign_vocalic_ll':  'l̯ː','telugu_digit_zero':  '0','telugu_digit_one':  '1','telugu_digit_two':  '2',\
         'telugu_digit_three':  '3','telugu_digit_four':  '4','telugu_digit_five':  '5','telugu_digit_six':  '6',\
         'telugu_digit_seven':  '7','telugu_digit_eight':  '8','telugu_digit_nine':  '9'}
    startCodePoint, endCodePoint = getScriptRange(scriptNum)
    for codePoint in range(startCodePoint, endCodePoint):
        teckName = teckitizeChar(codePoint)
        
        if teckName:
            # ipaChar = teckName.split("_")[-1] # this is not ipa at this point. TODO: Write a function
            ipaChar = ipaMapping.get(teckName, False)
            if(codePoint in range(2325 + scriptNum * 128, 2325 + scriptNum * 128+21)):
               print('{} {} "{}"'.format(teckName, ' /_ [vowelSigns] > ', ipaChar))
               print('{} <> "{}{}" ;{}'.format(teckName, ipaChar, "a", chr(codePoint)))
            else:
                print('{} <> "{}" ;{}'.format(teckName, ipaChar, chr(codePoint)))
            # print('{}'.format(teckName))
        else:
            reservedCodePoints.append(hex(codePoint))
print("Warning! {}: {}".format("Couldn't find a name for the following codepoints. May be they're <reserved> codepoints in the block", ", ".join(reservedCodePoints)))



In [89]:
generateTeckitMap(6)

telugu_sign_combining_candrabindu_above <> "False" ;ఀ
telugu_sign_candrabindu <> "False" ;ఁ
telugu_sign_anusvara <> "False" ;ం
telugu_sign_visarga <> "False" ;ః
telugu_sign_combining_anusvara_above <> "False" ;ఄ
telugu_letter_a <> "a" ;అ
telugu_letter_aa <> "aː" ;ఆ
telugu_letter_i <> "i" ;ఇ
telugu_letter_ii <> "iː" ;ఈ
telugu_letter_u <> "u" ;ఉ
telugu_letter_uu <> "uː" ;ఊ
telugu_letter_vocalic_r <> "r̯" ;ఋ
telugu_letter_vocalic_l <> "l̯" ;ఌ
telugu_letter_e <> "e" ;ఎ
telugu_letter_ee <> "eː" ;ఏ
telugu_letter_ai <> "aⁱ" ;ఐ
telugu_letter_o <> "o" ;ఒ
telugu_letter_oo <> "oː" ;ఓ
telugu_letter_au <> "aᵘ" ;ఔ
telugu_letter_ka  / [vowelSigns] >  "k"
telugu_letter_ka <> "ka" ;క
telugu_letter_kha  / [vowelSigns] >  "kʰ"
telugu_letter_kha <> "kʰa" ;ఖ
telugu_letter_ga  / [vowelSigns] >  "ɡ"
telugu_letter_ga <> "ɡa" ;గ
telugu_letter_gha  / [vowelSigns] >  "ɡʱa"
telugu_letter_gha <> "ɡʱaa" ;ఘ
telugu_letter_nga  / [vowelSigns] >  "ŋa"
telugu_letter_nga <> "ŋaa" ;ఙ
telugu_letter_ca  / [vowelSigns] >  "t

In [66]:
0x0915

2325

In [69]:
d["telugu_letter_a"]

'a'