In [3]:
from chin_dict.chindict import ChinDict

cd = ChinDict()

char_result = cd.lookup_char("泪")

print()
print("泪 components:")
print()

for component in char_result.components:
	print(component.character + ":", component.meaning)

# 氵: ['"water" radical in Chinese characters (Kangxi radical 85), occurring in 没, 法, 流 etc', 'see also 三點水|三点水[san1 dian3 shui3]']
# 目: ['eye', 'item', 'section', 'list', 'catalogue', 'table of contents', 'order (taxonomy)', 'goal', 'name', 'title']

print()

word_result = cd.lookup_word("发")

print("Translations for 发:")
print()
for word in word_result:
	print(f'{word.pinyin} {word.meaning}')

# Simplified: 发
# Traditional: 發
# Pinyin: fa1
# Meaning: ['to send out', "to show (one's feeling)", 'to issue', 'to develop', 'to make a bundle of money', 'classifier for gunshots (rounds)']

# Simplified: 发
# Traditional: 髮
# Pinyin: fa4
# Meaning: ['hair', 'Taiwan pr. [fa3]']



泪 components:

氵: ['"water" radical in Chinese characters (Kangxi radical 85), occurring in 没, 法, 流 etc', 'see also 三點水|三点水[san1 dian3 shui3]']
目: ['eye', 'item', 'section', 'list', 'catalogue', 'table of contents', 'order (taxonomy)', 'goal', 'name', 'title']

Translations for 发:

fa1 ['to send out', "to show (one's feeling)", 'to issue', 'to develop', 'to make a bundle of money', 'classifier for gunshots (rounds)']
fa4 ['hair', 'Taiwan pr. [fa3]']


In [34]:
from chin_dict.chindict import ChinDict

PC_NEW_LINE = chr(0xEAB1)
PC_HANVIET_MARK = "HÁN VIỆT"
PC_RELATED_MARK = "LIÊN QUAN"
PC_VIDU_OLD_MARK = "Ví dụ:"
PC_VIDU_NEW_MARK = "VÍ DỤ"
PC_DIAMOND = "❖"
PC_ARROW = "»"
PC_TRIANGLE = "▶"  # ►
PC_DIAMOND_SUIT = "♦"
PC_HEART_SUIT = "♥"
PC_CLUB_SUIT = "♣"
PC_SPADE_SUIT = "♠"

cd = ChinDict()

wordset = set()

with open('dic_words_set.txt', 'r', encoding='utf-8') as fread:
    wordset.update(fread.read())

print(f'{len(wordset)=}')

wordlist = sorted(list(wordset))
import hanzidentifier

fwrite = open('char_info_dict.txt', 'w', encoding='utf-8')
for word in wordlist[:50]:
    if not hanzidentifier.is_simplified(word):
        continue
    
    string = ''

    char_result = cd.lookup_char(word)
    if hasattr(char_result, 'components'):
        pinyin = char_result.pinyin[-1]
        string += f"{word}\t{pinyin}\t{'/'.join(char_result.meaning)}\n{text if text else ''}\n"
        # print(f"{word} {pinyin} meaning: {'/'.join(char_result.meaning)}\n{text if text else ''}")

        tree = char_result.tree(show=False)
        if tree:
            string += f'{tree}\n'

        for component in char_result.components:
            # print(f'\tcomponent  : {component.character} {"/".join(component.meaning) if component.meaning else "(No meaning)"}')
            string += f'  {component.character} {"/".join(component.meaning) if component.meaning else "(No meaning)"}'
        print(string)
        string = string.replace('\n', PC_NEW_LINE)
        fwrite.write(f'{string}\n')
    else:
        print(f"{word} has no components:")



len(wordset)=7873
○ has no components:
〇 has no components:
㗎 has no components:
一 has no components:


In [31]:
fwrite.close()

In [3]:
import regex as re

# pattern = re.compile(r'([\p{IsHan}\p{IsBopo}\p{IsHira}\p{IsKatakana}]+)', re.UNICODE)

pattern = re.compile(r'([\p{Block=CJK_Compatibility}\p{Block=CJK_Compatibility_Forms}\p{Block=CJK_Compatibility_Ideographs}\p{Block=CJK_Compatibility_Ideographs_Supplement}\p{Block=CJK_Radicals_Supplement}\p{Block=CJK_Strokes}\p{Block=CJK_Symbols_And_Punctuation}\p{Block=CJK_Unified_Ideographs}\p{Block=CJK_Unified_Ideographs_Extension_A}\p{Block=CJK_Unified_Ideographs_Extension_B}\p{Block=CJK_Unified_Ideographs_Extension_C}\p{Block=CJK_Unified_Ideographs_Extension_D}\p{Block=CJK_Unified_Ideographs_Extension_E}\p{Block=CJK_Unified_Ideographs_Extension_F}\p{Block=Enclosed_CJK_Letters_And_Months}]+)', re.UNICODE)


input = '''举
├── 㐄 (2)
│   ├── 丨 (3)
│   └── 二 (4)
└── 兴 (1)
CharResult(㇇)'''

matches = re.findall(pattern, input)

print(matches)

['举', '㐄', '丨', '二', '兴', '㇇']


In [2]:
import json
try:
    with open('char_dict.json', "r", encoding="utf-8") as fread:
        char_dict = json.load(fread)
except:
    print(f'No file {'char_dict.json'}')

wordset = set()

with open('dic_words_set.txt', 'r', encoding='utf-8') as fread:
    wordset.update(fread.read())

In [8]:
len(wordset)
char_dict_set = set(list(char_dict.keys()))

In [12]:
wordset - char_dict_set

{'\n',
 '○',
 '〇',
 '㗎',
 '倣',
 '傢',
 '僊',
 '儁',
 '兎',
 '兕',
 '勅',
 '勩',
 '匟',
 '厤',
 '叡',
 '呪',
 '咲',
 '唕',
 '唸',
 '啓',
 '啗',
 '喆',
 '喎',
 '喒',
 '嗰',
 '噉',
 '嚮',
 '囓',
 '坵',
 '埜',
 '堃',
 '塼',
 '塿',
 '墠',
 '壈',
 '壻',
 '妬',
 '姙',
 '媿',
 '嫺',
 '嬃',
 '嬭',
 '嬾',
 '孻',
 '寃',
 '尅',
 '尟',
 '廐',
 '廼',
 '彄',
 '恠',
 '愬',
 '慴',
 '憖',
 '懃',
 '懽',
 '戹',
 '扞',
 '拏',
 '掽',
 '搆',
 '搤',
 '搥',
 '搾',
 '撦',
 '擕',
 '擧',
 '攩',
 '攷',
 '敺',
 '斲',
 '昇',
 '曏',
 '柟',
 '椗',
 '椷',
 '楥',
 '榘',
 '槼',
 '樑',
 '橤',
 '毘',
 '汎',
 '洩',
 '溼',
 '澁',
 '澂',
 '澣',
 '濆',
 '灨',
 '煖',
 '燄',
 '燐',
 '燖',
 '燻',
 '牀',
 '牋',
 '牓',
 '犇',
 '獧',
 '玁',
 '玅',
 '瑲',
 '璊',
 '璫',
 '璿',
 '甎',
 '甖',
 '畊',
 '疎',
 '疘',
 '疿',
 '皙',
 '皜',
 '睍',
 '瞖',
 '矙',
 '砦',
 '硜',
 '碁',
 '碪',
 '磠',
 '祕',
 '秊',
 '秪',
 '稬',
 '穇',
 '穉',
 '穤',
 '穨',
 '窵',
 '竈',
 '筴',
 '箒',
 '箠',
 '篘',
 '簑',
 '籔',
 '粧',
 '紃',
 '紬',
 '絅',
 '絏',
 '絪',
 '絰',
 '絺',
 '綌',
 '綪',
 '綯',
 '縕',
 '繖',
 '繻',
 '纁',
 '纆',
 '纇',
 '缾',
 '罋',
 '羣',
 '翫',
 '翽',
 '肧',
 '臈',
 '臙',
 '臝',
 '艣',
 '艸',
 '菓

In [19]:
from chin_dict.chindict import ChinDict

cd = ChinDict()
c = cd.lookup_char( '龢')

pass


In [9]:
PATTERN_ZH = r'([\p{Block=CJK_Unified_Ideographs}\p{Block=CJK_Compatibility}\p{Block=CJK_Compatibility_Forms}\p{Block=CJK_Compatibility_Ideographs}\p{Block=CJK_Compatibility_Ideographs_Supplement}\p{Block=CJK_Radicals_Supplement}\p{Block=CJK_Strokes}\p{Block=CJK_Symbols_And_Punctuation}\p{Block=CJK_Unified_Ideographs}\p{Block=CJK_Unified_Ideographs_Extension_A}\p{Block=CJK_Unified_Ideographs_Extension_B}\p{Block=CJK_Unified_Ideographs_Extension_C}\p{Block=CJK_Unified_Ideographs_Extension_D}\p{Block=CJK_Unified_Ideographs_Extension_E}\p{Block=CJK_Unified_Ideographs_Extension_F}\p{Block=Enclosed_CJK_Letters_And_Months}])'
PATTERN_ZH1 = r'([\p{Block=CJK_Unified_Ideographs}\p{Block=CJK_Compatibility}\p{Block=CJK_Compatibility_Forms}' \
    r'\p{Block=CJK_Compatibility_Ideographs}\p{Block=CJK_Compatibility_Ideographs_Supplement}' \
    r'\p{Block=CJK_Radicals_Supplement}\p{Block=CJK_Strokes}\p{Block=CJK_Symbols_And_Punctuation}' \
    r'\p{Block=CJK_Unified_Ideographs}\p{Block=CJK_Unified_Ideographs_Extension_A}' \
    r'\p{Block=CJK_Unified_Ideographs_Extension_B}\p{Block=CJK_Unified_Ideographs_Extension_C}'\
    r'\p{Block=CJK_Unified_Ideographs_Extension_D}\p{Block=CJK_Unified_Ideographs_Extension_E}' \
    r'\p{Block=CJK_Unified_Ideographs_Extension_F}\p{Block=Enclosed_CJK_Letters_And_Months}])'


In [10]:
PATTERN_ZH==PATTERN_ZH1

True

In [3]:
a

True

In [4]:
print(a)

True
