Skip to content

Commit

Permalink
Code clean up
Browse files Browse the repository at this point in the history
make comply with PEP440.
remove deprecated function
  • Loading branch information
bluedisk committed Feb 1, 2017
1 parent f0e0d9b commit 4d421ee
Show file tree
Hide file tree
Showing 10 changed files with 148 additions and 96 deletions.
33 changes: 18 additions & 15 deletions hgtk/checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,11 @@ def is_jamo(letter):
def is_hanja(phrase):
    """Return True if every character of ``phrase`` is a Hanja character.

    A character qualifies when its code point lies in
    [FIRST_HANJA_UNICODE, LAST_HANJA_UNICODE] or in
    [FIRST_HANJA_EXT_A_UNICODE, LAST_HANJA_EXT_A_UNICODE].
    An empty ``phrase`` yields True because no character is rejected.
    """
    for unicode_value in map(ord, phrase):
        in_base = FIRST_HANJA_UNICODE <= unicode_value <= LAST_HANJA_UNICODE
        in_ext_a = (FIRST_HANJA_EXT_A_UNICODE <= unicode_value
                    <= LAST_HANJA_EXT_A_UNICODE)
        # Reject as soon as one character falls outside both ranges.
        if not (in_base or in_ext_a):
            return False
    return True


def is_latin1(phrase):
for unicode_value in map(lambda letter:ord(letter), phrase):
if unicode_value < FIRST_LATIN1_UNICODE or unicode_value > LAST_LATIN1_UNICODE:
Expand All @@ -62,21 +63,23 @@ def has_jongsung(letter):
code = lt.hangul_index(letter)
return code % NUM_JONG > 0


def has_batchim(letter):
    """Alias of has_jongsung(): delegate to it and return its result."""
    return has_jongsung(letter)

def has_approximant(letter):
    """Return True if the vowel of ``letter`` is a y- or w-based complex vowel.

    Deprecated: calling this function emits a DeprecationWarning.

    An approximant makes complex vowels, such as the ones starting with
    y or w.  Korean also has the approximant eu (ㅡ) that makes ui (ㅢ), but
    ㅢ does not cause many irregularities, so it is not counted here.

    Raises:
        Exception: if ``letter`` is not exactly one character long.
        NotHangulException: if ``letter`` is not Hangul.
    """
    import warnings

    warnings.warn(
        'has_approximant() is deprecated.',
        DeprecationWarning,
        stacklevel=2,
    )

    if len(letter) != 1:
        raise Exception('The target string must be one letter.')
    if not is_hangul(letter):
        raise NotHangulException('The target string must be Hangul')

    jaso = lt.decompose(letter)
    # 'ㅢ' is left out on purpose; vowels with umlauts (ㅐ, ㅔ, ㅚ, ㅟ) are not
    # considered complex vowels either.
    diphthong = (u'ㅑ', u'ㅒ', u'ㅕ', u'ㅖ', u'ㅘ', u'ㅙ', u'ㅛ', u'ㅝ', u'ㅞ', u'ㅠ')
    # jaso[1] is the middle element of the decomposition, i.e. the vowel.
    return jaso[1] in diphthong
23 changes: 18 additions & 5 deletions hgtk/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,28 @@
################################################################################

# Code = 0xAC00 + (Chosung_index * NUM_JOONG * NUM_JONG) + (Joongsung_index * NUM_JONG) + (Jongsung_index)
# Chosung (initial) jamo, in index order.
CHO = (
    u'ㄱ', u'ㄲ', u'ㄴ', u'ㄷ', u'ㄸ', u'ㄹ', u'ㅁ', u'ㅂ', u'ㅃ', u'ㅅ',
    u'ㅆ', u'ㅇ', u'ㅈ', u'ㅉ', u'ㅊ', u'ㅋ', u'ㅌ', u'ㅍ', u'ㅎ',
)

# Joongsung (medial) jamo, in index order.
JOONG = (
    u'ㅏ', u'ㅐ', u'ㅑ', u'ㅒ', u'ㅓ', u'ㅔ', u'ㅕ', u'ㅖ', u'ㅗ', u'ㅘ',
    u'ㅙ', u'ㅚ', u'ㅛ', u'ㅜ', u'ㅝ', u'ㅞ', u'ㅟ', u'ㅠ', u'ㅡ', u'ㅢ', u'ㅣ',
)

# Jongsung (final) jamo, in index order; index 0 is the empty string.
JONG = (
    u'', u'ㄱ', u'ㄲ', u'ㄳ', u'ㄴ', u'ㄵ', u'ㄶ', u'ㄷ', u'ㄹ', u'ㄺ',
    u'ㄻ', u'ㄼ', u'ㄽ', u'ㄾ', u'ㄿ', u'ㅀ', u'ㅁ', u'ㅂ', u'ㅄ', u'ㅅ',
    u'ㅆ', u'ㅇ', u'ㅈ', u'ㅊ', u'ㅋ', u'ㅌ', u'ㅍ', u'ㅎ',
)

# Every jamo of the three tables; JONG[1:] skips the empty first entry.
JAMO = CHO + JOONG + JONG[1:]

# Table sizes used as radices in the code-point formula above.
NUM_CHO = 19
NUM_JOONG = 21
NUM_JONG = 28

FIRST_HANGUL_UNICODE = 0xAC00  # '가'
LAST_HANGUL_UNICODE = 0xD7A3  # '힣'
2 changes: 2 additions & 0 deletions hgtk/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@
class NotHangulException(Exception):
    """Raised when input that must be Hangul is not Hangul."""


class NotLetterException(Exception):
    """Error type for non-letter input.

    NOTE(review): meaning inferred from the name; no raising site is
    visible in this module -- confirm against the callers.
    """


class NotWordException(Exception):
    """Error type for non-word input.

    NOTE(review): meaning inferred from the name; no raising site is
    visible in this module -- confirm against the callers.
    """
21 changes: 12 additions & 9 deletions hgtk/josa.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,22 @@
from . import letter

################################################################################
# Josa functions
# Josa Type Parameters
################################################################################

# Each josa type is a dict with the keys 'not', 'has' and 'except'.
# Every string value carries the u prefix so that all tables hold the same
# text type (the 'has' values previously lacked it, unlike EURO_RO/RYUL_YUL).
EUN_NEUN = {'not': u'은', 'has': u'는', 'except': None}
I_GA = {'not': u'이', 'has': u'가', 'except': None}
EUL_REUL = {'not': u'을', 'has': u'를', 'except': None}
GWA_WA = {'not': u'과', 'has': u'와', 'except': None}
IDA_DA = {'not': u'이다', 'has': u'다', 'except': None}

EURO_RO = {'not': u'으로', 'has': u'로', 'except': u'ㄹ'}
RYUL_YUL = {'not': u'률', 'has': u'율', 'except': u'ㄴ'}


class Josa:
    """Backward-compatible namespace for the josa type tables.

    Code that still spells ``Josa.EUN_NEUN`` (such as the default argument
    of attach() below) keeps working; every attribute is the very same
    object as the module-level table of that name.
    """

    EUN_NEUN = EUN_NEUN
    I_GA = I_GA
    EUL_REUL = EUL_REUL
    GWA_WA = GWA_WA
    IDA_DA = IDA_DA

    EURO_RO = EURO_RO
    RYUL_YUL = RYUL_YUL

################################################################################
# Josa functions
################################################################################


def attach(word, josa=Josa.EUN_NEUN):
Expand Down
22 changes: 12 additions & 10 deletions hgtk/letter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@
from __future__ import division

from .const import CHO, JOONG, JONG, FIRST_HANGUL_UNICODE, NUM_CHO, NUM_JOONG, NUM_JONG
from .exception import NotHangulException
from .exception import NotHangulException, NotLetterException

from six import unichr

################################################################################
# Decomposition & Combination
################################################################################


def compose(chosung, joongsung, jongsung=u''):
"""This function returns a Hangul letter by composing the specified chosung, joongsung, and jongsung.
@param chosung
Expand All @@ -25,21 +26,23 @@ def compose(chosung, joongsung, jongsung=u''):
joongsung_index = JOONG.index(joongsung)
jongsung_index = JONG.index(jongsung)
except Exception:
raise NotHangulException('No valid Hangul character can be generated using given combination of chosung, joongsung, and jongsung.')
raise NotHangulException('No valid Hangul character index')

return unichr(0xAC00 + chosung_index * NUM_JOONG * NUM_JONG + joongsung_index * NUM_JONG + jongsung_index)


def hangul_index(letter):
    """Return the offset of ``letter``'s code point from FIRST_HANGUL_UNICODE."""
    return ord(letter) - FIRST_HANGUL_UNICODE


def decompose_index(code):
    """Split a syllable index into a ``(cho, joong, jong)`` index tuple.

    ``code`` is taken apart like a mixed-radix number: the lowest digit
    (radix NUM_JONG) is the jongsung index, the next digit (radix
    NUM_JOONG) is the joongsung index, and the remainder is the chosung
    index.
    """
    jong = int(code % NUM_JONG)
    # True division (this module imports ``division`` from __future__);
    # the int() calls below truncate the fractional part again.
    code /= NUM_JONG
    joong = int(code % NUM_JOONG)
    code /= NUM_JOONG
    cho = int(code)

    return cho, joong, jong


def decompose(hangul_letter):
Expand All @@ -53,14 +56,13 @@ def decompose(hangul_letter):
raise NotHangulException('')

if hangul_letter in CHO:
return (hangul_letter, '', '')
return hangul_letter, '', ''

if hangul_letter in JOONG:
return ('', hangul_letter, '')
return '', hangul_letter, ''

if hangul_letter in JONG:
return ('', '', hangul_letter)

return '', '', hangul_letter

code = hangul_index(hangul_letter)
cho, joong, jong = decompose_index(code)
Expand All @@ -69,8 +71,8 @@ def decompose(hangul_letter):
cho = 0

try:
return (CHO[cho], JOONG[joong], JONG[jong])
return CHO[cho], JOONG[joong], JONG[jong]
except:
print ("%d / %d / %d"%(cho, joong, jong))
print ("%s / %s " %( (JOONG[joong].encode("utf8"), JONG[jong].encode('utf8'))))
print("%d / %d / %d"%(cho, joong, jong))
print("%s / %s " %( JOONG[joong].encode("utf8"), JONG[jong].encode('utf8')))
raise Exception()
49 changes: 29 additions & 20 deletions hgtk/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,15 @@

# Use Unicode characters instead of indices, for coding efficiency and readability (by bluedisk)
JONG_COMP = {
u'ㄱ':{
u'ㄱ': {
u'ㄱ': u'ㄲ',
u'ㅅ': u'ㄳ',
},
u'ㄴ':{
u'ㄴ': {
u'ㅈ': u'ㄵ',
u'ㅎ': u'ㄶ',
},
u'ㄹ':{
u'ㄹ': {
u'ㄱ': u'ㄺ',
u'ㅁ': u'ㄻ',
u'ㅂ': u'ㄼ',
Expand All @@ -30,10 +30,12 @@

DEFAULT_COMPOSE_CODE = u'ᴥ'


################################################################################
# Hangul Automata functions by bluedisk@gmail.com
################################################################################


def decompose(text, latin_filter=True, compose_code=DEFAULT_COMPOSE_CODE):
result=u""

Expand All @@ -55,76 +57,83 @@ def decompose(text, latin_filter=True, compose_code=DEFAULT_COMPOSE_CODE):

return result


STATUS_CHO = 0
STATUS_JOONG = 1
STATUS_JONG1 = 2
STATUS_JONG2 = 3


def compose(text, compose_code=DEFAULT_COMPOSE_CODE):
    """Compose a sequence of jamo in ``text`` back into Hangul syllables.

    The text is scanned one character at a time by a four-state automaton:

    * STATUS_CHO   -- waiting for a chosung.
    * STATUS_JOONG -- a chosung is pending, waiting for a joongsung.
    * STATUS_JONG1 -- chosung and joongsung are pending, waiting for a
      jongsung.
    * STATUS_JONG2 -- a jongsung that may start a compound jongsung (a key
      of JONG_COMP) is pending, waiting for its second half.

    Characters that do not fit the pending syllable are copied to the
    result unchanged, except ``compose_code``, which only terminates the
    pending syllable and is never copied.

    NOTE: a syllable that is still pending when ``text`` ends is not
    emitted; nothing is flushed after the loop.

    @param text          the jamo sequence to compose
    @param compose_code  the syllable terminator character
    @return              the composed text
    """
    res_text = u""

    status = STATUS_CHO

    for c in text:

        if status == STATUS_CHO:

            if c in CHO:
                chosung = c
                status = STATUS_JOONG
            else:
                if c != compose_code:
                    res_text = res_text + c

        elif status == STATUS_JOONG:

            if c != compose_code and c in JOONG:
                joongsung = c
                status = STATUS_JONG1
            else:
                # No vowel followed: the lone chosung is emitted as it is.
                res_text = res_text + chosung

                if c in CHO:
                    chosung = c
                    status = STATUS_JOONG
                else:
                    if c != compose_code:
                        res_text = res_text + c
                    status = STATUS_CHO

        elif status == STATUS_JONG1:

            if c != compose_code and c in JONG:
                jongsung = c

                if c in JONG_COMP:
                    # May still combine with the next jamo; hold it back.
                    status = STATUS_JONG2
                else:
                    res_text = res_text + letter.compose(chosung, joongsung, jongsung)
                    status = STATUS_CHO

            else:
                # Syllable without jongsung.
                res_text = res_text + letter.compose(chosung, joongsung)

                if c in CHO:
                    chosung = c
                    status = STATUS_JOONG
                else:
                    if c != compose_code:
                        res_text = res_text + c

                    status = STATUS_CHO

        elif status == STATUS_JONG2:

            if c != compose_code and c in JONG_COMP[jongsung]:
                jongsung = JONG_COMP[jongsung][c]
                c = compose_code  # prevent the jongsung from being emitted again

            res_text = res_text + letter.compose(chosung, joongsung, jongsung)

            if c != compose_code:
                res_text = res_text + c

            status = STATUS_CHO

    return res_text
18 changes: 14 additions & 4 deletions tests/test_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,34 +5,44 @@

import hgtk


# hangul test - true/false
def test_is_hangul_1():
    """A phrase made up only of Hangul is reported as Hangul."""
    assert hgtk.checker.is_hangul('한글입니다')


def test_is_hangul_2():
    """A phrase mixing Latin letters with Hangul is not reported as Hangul."""
    assert not hgtk.checker.is_hangul('no한글입니다')


# hanja test - true/false
def test_is_hanja_1():
    """A phrase made up only of Hanja is reported as Hanja."""
    assert hgtk.checker.is_hanja('大韓民國')


def test_is_hanja_2():
    """A phrase mixing Hanja with Hangul is not reported as Hanja."""
    assert not hgtk.checker.is_hanja('大한민국')


# latin test - true/false
def test_is_latin1_1():
    """The lowercase ASCII alphabet is reported as Latin-1."""
    assert hgtk.checker.is_latin1('abcdefghijklmnopqrstuvwxyz')


def test_is_latin1_2():
    """A phrase mixing Hangul with Latin letters is not reported as Latin-1."""
    assert not hgtk.checker.is_latin1('한글latin1한')


# batchim test - true/false
def test_has_batchim_1():
    """'한' is reported as having a batchim."""
    assert hgtk.checker.has_batchim('한')


def test_has_batchim_2():
    """'하' is reported as having no batchim."""
    assert not hgtk.checker.has_batchim('하')


# DEPRECATED! - not a general function
# def test_has_approximant_1():
# assert hgtk.checker.has_approximant('롹')

Expand Down
Loading

0 comments on commit 4d421ee

Please sign in to comment.