In [69]:
# Code-point offset between a hiragana letter and its katakana counterpart:
# U+30A1 (KATAKANA LETTER SMALL A) minus U+3041 (HIRAGANA LETTER SMALL A) = 0x60.
katakana_minus_hiragana = ord("\u30a1") - ord("\u3041")

def is_hirgana(char):
    """Return True when the first character of ``char`` is a hiragana letter.

    Only ``char[0]`` is inspected; its code point must fall in the inclusive
    range U+3041..U+3096. An empty string raises IndexError.
    """
    code_point = ord(char[0])
    return 0x3041 <= code_point <= 0x3096

def is_katakana(char):
    """Return True when the first character of ``char`` is a katakana letter.

    Only ``char[0]`` is inspected; its code point must fall in the inclusive
    range U+30A1..U+30F6. An empty string raises IndexError.
    """
    code_point = ord(char[0])
    return 0x30a1 <= code_point <= 0x30f6

def hiragana_to_katakana(hiragana_text):
    """Convert the leading run of hiragana in ``hiragana_text`` to katakana.

    Scanning stops at the first character that is not hiragana.

    Returns:
        A ``(katakana, count)`` tuple: the converted prefix and the number of
        characters that were converted.
    """
    converted = []
    for symbol in hiragana_text:
        if not is_hirgana(symbol):
            break
        # Shift the code point by the fixed hiragana -> katakana offset.
        converted.append(chr(ord(symbol) + katakana_minus_hiragana))
    return "".join(converted), len(converted)


def katakana_to_hiragana(katakana_text):
    """Convert the leading run of katakana in ``katakana_text`` to hiragana.

    Scanning stops at the first character that is not katakana.

    Returns:
        A ``(hiragana, count)`` tuple: the converted prefix and the number of
        characters that were converted.
    """
    pieces = []
    for symbol in katakana_text:
        if not is_katakana(symbol):
            break
        # Shift the code point back by the fixed hiragana -> katakana offset.
        pieces.append(chr(ord(symbol) - katakana_minus_hiragana))
    return "".join(pieces), len(pieces)


In [70]:
def test_H2K():
    """Check hiragana_to_katakana against known inputs.

    Each entry maps a hiragana input to the expected ``(katakana, length)``
    result.
    """
    expected_by_input = {
        u"かんたん": (u"カンタン", 4),
        u"にゃ": (u"ニャ", 2),
        u"っき": (u"ッキ", 2),
        u"っふぁ": (u"ッファ", 3),
        u"しつもん": (u"シツモン", 4),
        u"ちがい": (u"チガイ", 3),
    }
    for source, expected in expected_by_input.items():
        assert hiragana_to_katakana(source) == expected
        
def test_K2H():
    """Check katakana_to_hiragana against known inputs.

    Each entry maps a katakana input to the expected ``(hiragana, length)``
    result.
    """
    expected_by_input = {
        u"カンタン": (u"かんたん", 4),
        u"ニャ": (u"にゃ", 2),
        u"ッキ": (u"っき", 2),
        u"ッファ": (u"っふぁ", 3),
        u"シツモン": (u"しつもん", 4),
        u"チガイ": (u"ちがい", 3),
    }
    for source, expected in expected_by_input.items():
        assert katakana_to_hiragana(source) == expected
    
# Run both kana-conversion checks; a failing case raises AssertionError,
# and a clean pass is silent.
test_H2K()
test_K2H()

('かんたん', 4)
('にゃ', 2)
('っき', 2)
('っふぁ', 3)
('しつもん', 4)
('ちがい', 3)


In [52]:
def read_hiragana_mappings(filename):
    """Load a kana -> romaji table from a whitespace-separated UTF-8 text file.

    Each data line holds two fields, ``<romaji> <kana>``. Lines starting with
    ``;;`` are comments; blank or whitespace-only lines are skipped.

    Args:
        filename: Path of the mapping file.

    Returns:
        dict mapping each kana string to its romaji string. If a kana string
        appears on several lines, the last one wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a data line does not contain exactly two fields.
    """
    kana_to_romaji = {}
    # Decode explicitly: the default text encoding is platform dependent and
    # would corrupt or reject the kana on non-UTF-8 locales. "utf-8-sig" also
    # drops a leading byte-order mark so a first-line ";;" comment is still
    # recognised.
    with open(filename, encoding="utf-8-sig") as fin:
        for line in fin:
            if line.startswith(';;'):
                continue
            fields = line.split()
            if not fields:
                # Blank line: nothing to record.
                continue
            romaji, kana = fields
            kana_to_romaji[kana] = romaji
    return kana_to_romaji

In [74]:
# Load one kana -> romaji lookup table per data file. The paths are relative,
# so they resolve against the notebook's current working directory.
hiragana_to_hepburn_mappings = read_hiragana_mappings('data/hepburnhira.utf8')
hiragana_to_passport_mappings = read_hiragana_mappings('data/passporthira.utf8')
hiragana_to_kunrei_mappings = read_hiragana_mappings('data/kunreihira.utf8')

# NOTE(review): the *dict.utf8 files are presumably keyed by katakana, going by
# the variable names — confirm against the data files. They use the same
# "<romaji> <kana>" loader as the hiragana tables above.
katakana_to_hepburn_mappings = read_hiragana_mappings('data/hepburndict.utf8')
katakana_to_passport_mappings = read_hiragana_mappings('data/passportdict.utf8')
katakana_to_kunrei_mappings = read_hiragana_mappings('data/kunreidict.utf8')


In [75]:
def hiragana_to_romaji(hiragana_text, dictionary, max_key_len=4):
    """Romanize the longest dictionary key that is a prefix of ``hiragana_text``.

    Args:
        hiragana_text: Text to match, starting at its first character.
        dictionary: Mapping from kana strings to romaji strings.
        max_key_len: Longest prefix length to try. Defaults to 4, the largest
            key size the lookup tables are expected to contain.

    Returns:
        ``(romaji, length)`` for the longest matching prefix, where ``length``
        is the number of input characters consumed, or ``("", -1)`` when no
        prefix is a key.
    """
    # Prefixes of up to max_key_len characters are tried (inclusive). The
    # previous range(min(len + 1, 4)) stopped at 3, so 4-character keys could
    # never match.
    longest = min(len(hiragana_text), max_key_len)
    # Try the longest prefix first so the first hit is the longest match.
    # Length 0 is still probed so an empty-string key behaves as before.
    for size in range(longest, -1, -1):
        prefix = hiragana_text[:size]
        if prefix in dictionary:
            return dictionary[prefix], size
    return "", -1

def katakana_to_romaji(katakana_text, dictionary, max_key_len=4):
    """Romanize the longest dictionary key that is a prefix of ``katakana_text``.

    Args:
        katakana_text: Text to match, starting at its first character.
        dictionary: Mapping from kana strings to romaji strings.
        max_key_len: Longest prefix length to try. Defaults to 4, the largest
            key size the lookup tables are expected to contain.

    Returns:
        ``(romaji, length)`` for the longest matching prefix, where ``length``
        is the number of input characters consumed, or ``("", -1)`` when no
        prefix is a key.
    """
    # Prefixes of up to max_key_len characters are tried (inclusive). The
    # previous range(min(len + 1, 4)) stopped at 3, so 4-character keys could
    # never match.
    longest = min(len(katakana_text), max_key_len)
    # Try the longest prefix first so the first hit is the longest match.
    # Length 0 is still probed so an empty-string key behaves as before.
    for size in range(longest, -1, -1):
        prefix = katakana_text[:size]
        if prefix in dictionary:
            return dictionary[prefix], size
    return "", -1

In [89]:
def test_H2A():
    """Check hiragana_to_romaji with the Hepburn table.

    Each entry maps a hiragana input to the expected ``(romaji, length)`` of
    its longest leading match.
    """
    expected_by_input = {
        u"かんたん": ("ka", 1),
        u"にゃ": ("nya", 2),
        u"っき": ("kki", 2),
        u"っふぁ": ("ffa", 3),
        u"しつもん": ("shi", 1),
        u"ちがい": ("chi", 1),
    }
    for source, expected in expected_by_input.items():
        actual = hiragana_to_romaji(source, hiragana_to_hepburn_mappings)
        assert actual == expected

def test_K2A():
    """Check katakana_to_romaji with the Hepburn table.

    Each entry maps a katakana input to the expected ``(romaji, length)`` of
    its longest leading match.
    """
    expected_by_input = {
        u"カンタン": ("ka", 1),
        u"ニャ": ("nya", 2),
        u"ッキ": ("kki", 2),
        u"ッファ": ("ffa", 3),
        u"シツモン": ("shi", 1),
        u"チガイ": ("chi", 1),
        u"ジ": ("ji", 1),
    }
    for source, expected in expected_by_input.items():
        actual = katakana_to_romaji(source, katakana_to_hepburn_mappings)
        assert actual == expected
        
# Run both romaji-lookup checks; a failing case raises AssertionError,
# and a clean pass is silent.
test_H2A()
test_K2A()

In [None]:
def convert(text):
    
    output_text = ""
    i = 0
    while i <= len(text):
        