In [103]:
from chin_dict.chindict import ChinDict

cd = ChinDict()

char_result = cd.lookup_char("泪")

print()
print("泪 components:")
print()

for component in char_result.components:
	print(component.character + ":", component.meaning)

# 氵: ['"water" radical in Chinese characters (Kangxi radical 85), occurring in 没, 法, 流 etc', 'see also 三點水|三点水[san1 dian3 shui3]']
# 目: ['eye', 'item', 'section', 'list', 'catalogue', 'table of contents', 'order (taxonomy)', 'goal', 'name', 'title']

print()

word_result = cd.lookup_word("发")

print("Translations for 发:")
print()
for word in word_result:
	print(f'{word.pinyin} {word.meaning}')

# Simplified: 发
# Traditional: 發
# Pinyin: fa1
# Meaning: ['to send out', "to show (one's feeling)", 'to issue', 'to develop', 'to make a bundle of money', 'classifier for gunshots (rounds)']

# Simplified: 发
# Traditional: 髮
# Pinyin: fa4
# Meaning: ['hair', 'Taiwan pr. [fa3]']



泪 components:

氵: ['"water" radical in Chinese characters (Kangxi radical 85), occurring in 没, 法, 流 etc', 'see also 三點水|三点水[san1 dian3 shui3]']
目: ['eye', 'item', 'section', 'list', 'catalogue', 'table of contents', 'order (taxonomy)', 'goal', 'name', 'title']

Translations for 发:

fa1 ['to send out', "to show (one's feeling)", 'to issue', 'to develop', 'to make a bundle of money', 'classifier for gunshots (rounds)']
fa4 ['hair', 'Taiwan pr. [fa3]']


In [104]:
from chin_dict.chindict import ChinDict

PC_NEW_LINE = chr(0xEAB1)
PC_HANVIET_MARK = "HÁN VIỆT"
PC_RELATED_MARK = "LIÊN QUAN"
PC_VIDU_OLD_MARK = "Ví dụ:"
PC_VIDU_NEW_MARK = "VÍ DỤ"
PC_DIAMOND = "❖"
PC_ARROW = "»"
PC_TRIANGLE = "▶"  # ►
PC_DIAMOND_SUIT = "♦"
PC_HEART_SUIT = "♥"
PC_CLUB_SUIT = "♣"
PC_SPADE_SUIT = "♠"

cd = ChinDict()

wordset = set()

with open('dic_words_set.txt', 'r', encoding='utf-8') as fread:
    wordset.update(fread.read())

print(f'{len(wordset)=}')

wordlist = sorted(list(wordset))
import hanzidentifier

fwrite = open('char_info_dict.txt', 'w', encoding='utf-8')
for word in wordlist[:50]:
    if not hanzidentifier.is_simplified(word):
        continue
    
    string = ''

    char_result = cd.lookup_char(word)
    if hasattr(char_result, 'components'):
        pinyin = char_result.pinyin[-1]
        string += f"{word}\t{pinyin}\t{'/'.join(char_result.meaning)}\n{text if text else ''}\n"
        # print(f"{word} {pinyin} meaning: {'/'.join(char_result.meaning)}\n{text if text else ''}")

        tree = char_result.tree(show=False)
        if tree:
            string += f'{tree}\n'

        for component in char_result.components:
            # print(f'\tcomponent  : {component.character} {"/".join(component.meaning) if component.meaning else "(No meaning)"}')
            string += f'  {component.character} {"/".join(component.meaning) if component.meaning else "(No meaning)"}'
        print(string)
        string = string.replace('\n', PC_NEW_LINE)
        fwrite.write(f'{string}\n')
    else:
        print(f"{word} has no components:")



len(wordset)=7873
○ has no components:
〇 has no components:
㗎 has no components:
一 has no components:


NameError: name 'text' is not defined

In [None]:
fwrite.close()

In [None]:
import regex as re

# pattern = re.compile(r'([\p{IsHan}\p{IsBopo}\p{IsHira}\p{IsKatakana}]+)', re.UNICODE)

pattern = re.compile(r'([\p{Block=CJK_Compatibility}\p{Block=CJK_Compatibility_Forms}\p{Block=CJK_Compatibility_Ideographs}\p{Block=CJK_Compatibility_Ideographs_Supplement}\p{Block=CJK_Radicals_Supplement}\p{Block=CJK_Strokes}\p{Block=CJK_Symbols_And_Punctuation}\p{Block=CJK_Unified_Ideographs}\p{Block=CJK_Unified_Ideographs_Extension_A}\p{Block=CJK_Unified_Ideographs_Extension_B}\p{Block=CJK_Unified_Ideographs_Extension_C}\p{Block=CJK_Unified_Ideographs_Extension_D}\p{Block=CJK_Unified_Ideographs_Extension_E}\p{Block=CJK_Unified_Ideographs_Extension_F}\p{Block=Enclosed_CJK_Letters_And_Months}]+)', re.UNICODE)


input = '''举
├── 㐄 (2)
│   ├── 丨 (3)
│   └── 二 (4)
└── 兴 (1)
CharResult(㇇)'''

matches = re.findall(pattern, input)

print(matches)

['举', '㐄', '丨', '二', '兴', '㇇']


In [3]:
import json
try:
    with open('char_dict.json', "r", encoding="utf-8") as fread:
        char_dict = json.load(fread)
except:
    print(f"No file {'char_dict.json'}")

wordset = set()

with open('dic_words_set.txt', 'r', encoding='utf-8') as fread:
    wordset.update(fread.read())

empty = [key for key in char_dict if not char_dict[key]['meaning'] ]

from tools_configs import Radicals
rads = Radicals()
rads.load_radical_data()

In [15]:

print(empty)
print(f'{len(empty)}=')
    
not_rads = [key for key in empty if not rads.is_radical_variant(key)]
print(f'{len(not_rads)}=')
len(char_dict)

['?', '⺀', '⺁', '⺈', '⺊', '⺌', '⺍', '⺕', '⺙', '⺧', '⺪', '⺮', '⺳', '⺶', '⺺', '⺻', '⺼', '⻊', '⻭', '㇀', '㇅', '㇆', '㇇', '㇈', '㇉', '㇎', '㇏', '㇒', '㇖', '㇗', '㇙', '㇜', '㇝', '㇠', '㐄', '㐆', '㐌', '㐬', '㐱', '㑒', '㒸', '㓞', '㔾', '㕡', '㕣', '㕵', '㗊', '㚘', '㝉', '㝴', '㝵', '㞢', '㠯', '㡀', '㢆', '㣎', '㣺', '㧜', '㨨', '㩅', '㬎', '㳟', '㸒', '㸚', '㼌', '䀠', '䂞', '䏌', '䏍', '䏎', '䒑', '䖝', '䖭', '䖵', '䧹', '䩗', '䩭', '並', '亙', '仝', '來', '倉', '倠', '僕', '兒', '兓', '兩', '冎', '冝', '刅', '勽', '匃', '匛', '區', '厈', '厽', '參', '叚', '吂', '咼', '員', '啇', '啚', '喬', '單', '嗇', '囙', '圡', '圤', '圼', '執', '堯', '壯', '壽', '夃', '夋', '夎', '夐', '夒', '夲', '夵', '夾', '妟', '媷', '孨', '寽', '將', '專', '尒', '尞', '屚', '屬', '屰', '嵒', '巸', '帀', '帶', '幹', '幾', '弔', '弜', '強', '從', '復', '忩', '怱', '恆', '悤', '惢', '愛', '慮', '憂', '戓', '戔', '戶', '敄', '敫', '旉', '旲', '昗', '昬', '昷', '曇', '會', '東', '枼', '桼', '條', '棥', '榮', '槀', '欮', '歲', '氶', '氾', '汒', '洰', '湯', '滈', '無', '煩', '爲', '爾', '犾', '狊', '獻', '玨', '甤', '畐', '畕', '畢', '畧', '畫', '異', '畱', '當', '疌', '皅', '盙', '監',

8429

In [12]:
len(wordset)
char_dict_set = set(list(char_dict.keys()))

In [13]:
from chin_dict.chindict import ChinDict

cd = ChinDict()
c = cd.lookup_char( '龢')

pass


In [None]:
a

True

In [None]:
print(a)

True


# Look up chars by radicals


In [None]:
def make_list(list_items):
    if list_items == None:
        return []
    else: 
        return list_items

In [None]:
char_by_radical = {}
char_dict_radical_only = {}

for char in char_dict:
    components = make_list(char_dict[char]['components'])
    if not components:
        continue

    char_dict_radical_only[char] = set([alternatives[comp] for comp in components if comp in alternatives])

    for comp in components:
        if comp in alternatives:
            alt = alternatives[comp]

            if alt not in char_by_radical:
                char_by_radical[alt] = set([char])
            else:
                char_by_radical[alt].add(char)

len(char_by_radical)

211

In [None]:
char = '龟'
print(char_dict[char]['tree'])
print(char_dict_radical_only[char])


龟
├── ⺈ (1)
└── 电 (2)
    ├── 乚 (4)
    └── 曰 (3)
{'⼄', '⼑', '⽈'}


In [None]:
components = ['⼄', '⼑', '⽈']

alt_comps = [alternatives[comp] for comp in components]

item0 = components[0]
alt = alternatives[item0]

chars = char_by_radical[alt]

for comp in components[1:]:
    alti = alternatives[comp]
    chars = chars.intersection(char_by_radical[alti])

# chars = char_by_radical[components[0]] & char_by_radical[components[1]]
print(f"Radicals {' '.join(components)}")
print(f"Kangxi norminals {' '.join(alt_comps)}")
print(f"Contains these components {' '.join(chars)} ")
for char in chars:
    if char_dict_radical_only[char] == set(alt_comps):
        print(f'Contains ONLY these components {char} ')

Radicals ⼄ ⼑ ⽈
Kangxi norminals ⼄ ⼑ ⽈
Contains these components 阄 龟 
Contains ONLY these components 龟 


In [None]:
output_dict = {}

key_set = set()

MAX_DEPTH = 5
depth = 0

while depth < MAX_DEPTH:
    
    depth += 1
while depth < MAX_DEPTH:
    for rad in radical_set:
        item = radical_set[rad]
        output_dict[rad] = {"meaning": item["meaning"],
                "number": item['number'],
                "pinyin": item['pinyin'],
                'characters': char_by_radical[rad]}
        


214

# Search by components
Components 艮 钅
Contains these components 银 锒 
Contains ONLY these components 银 

---

Components 寸 钅
Contains these components 锵 铸 锊 锝 
Contains ONLY these components 铸 

---

Components 车 钅
Contains these components 链 
Contains ONLY these components 链 

--- 

Components 一 丷 木 米 钅
Contains these components 抨 兰 鲆 夹 栟 尊 峡 饼 踯 呯 鲞 迸 頬 粞 摒 秤 郑 蓱 荚 樽 蛱 猷 瓶 枰 浃 奠 駦 来 烂 颊 坪 陕 屏 醚 涞 苹 洴 铼 硖 掷 狭 撙 徕 并 姘 朕 睐 遵 泙 萍 拦 拼 铗 骈 糟 评 莱 郏 伻 送 挟 砰 楢 遒 栏 赉 鳟 侠 腾 平 酋 联 玶 蝤 蹲 怦 关 醾 
Contains ONLY these components 铼 

--- 

Components 木 米 钅
Contains these components 粲 谜 蕃 籼 糯 屎 迷 糅 粘 鳞 粹 懊 籹 掬 番 菊 窸 潘 奧 璨 粞 类 敉 粢 屡 麟 糙 奥 喽 篓 楼 粝 磻 数 粑 噢 糖 咪 糒 擞 麋 澳 褛 搂 糇 糕 粗 麴 糈 粿 偻 来 瘘 璠 籴 旛 醚 糔 涞 藩 宩 燠 娄 精 蒌 眯 辚 糊 粦 蟠 隩 粉 粳 磷 粒 铼 薮 糁 糠 釆 徕 糗 僳 悉 睐 蝼 墦 籽 鄱 粽 料 蹯 糱 粪 踘 糜 嶙 粟 粮 宷 糟 莱 播 粥 粼 膰 粱 粕 缕 嶓 鞠 赉 遴 窭 镂 匊 髅 鬻 燔 皤 繙 释 翻 醾 幡 蟋 



In [None]:
import re
with open("C:/Users/it.fsoft/OneDrive - FPT Software/Personal/Playground/new_make_pleco_dicts/wordlists/radicals-useful_info.txt", 'r', encoding='utf-8') as fread:
    for line in fread.readlines():
        char, pinyin, define = line.strip().split('\t')

        notes = match.group(1) if (match:= re.search(r'Notes:(.+?)', define)) else ''
        distinguish = (match.group(1) if (match:= re.search(r'Distinguish From: (.+)', define)) else '').split(" ")
        variants = (match.group(1) if (match:= re.search(r'Variants:(.+?)', define)) else '').split(" ")
        rank = match.group(1) if (match:= re.search(r'RANK: (.+?)', define)) else ''
        mnemonic = match.group(1) if (match:= re.search(r'Mnemonic:(.+?)', define)) else ''

        print(f'{char} Notes {notes}\n{distinguish=}\t{variants=}\t{rank=}\t{mnemonic=}')
        pass




一 Notes  Characters are classified here when they have a horizontal stroke and don't fit under any other radicals
distinguish=['乛', '亠', '冖', '宀']	variants=['']	rank='88.5%'	mnemonic=''
丨 Notes  Ancient pronunciation and meaning
distinguish=['亅']	variants=['']	rank='66.3%'	mnemonic=''
丨 Notes  Now simply called shu4 - 竖, which means vertical stroke, see gun3
distinguish=['亅']	variants=['']	rank='66.3%'	mnemonic=''
丶 Notes 
distinguish=['']	variants=['']	rank='63.1%'	mnemonic=''
丿 Notes  general stroke, sometimes means falling
distinguish=['']	variants=['乀乁']	rank='76.3%'	mnemonic=''
乀 Notes 
distinguish=['']	variants=['丿乁']	rank='0.0%'	mnemonic=''
乁 Notes  No modern meaning, except possibly as a phonetic.  Ancient meaning is similar to 移 or 及, meaning to shift, change, or reach.
distinguish=['']	variants=['丿乀']	rank='0.0%'	mnemonic=''
⺄ Notes  Not sure of the origin of this stroke, listed in the CJK radicals block
distinguish=['']	variants=['']	rank='0.0%'	mnemonic=''
乙 Notes 
distingu

In [None]:
ord('来')

26469

In [54]:
import hanzipy

from hanzipy.decomposer import HanziDecomposer

decomposer = HanziDecomposer()


decomposition = decomposer.decompose("圭", decomposition_type=2)
from pinyin import get as get_pinyin
from tools_configs import Radicals

radical_database = Radicals()
radical_database.load_radical_data()


INFO:root:Done compiling 12040 characters


In [67]:


print(decomposition)
print(decomposer.characters["圭"])
for rad in decomposer.radicals:
	pinyin = get_pinyin(rad)
	radical = rad

	if rad == pinyin:
		if radical_database.is_radical_variant(rad):
			pinyin = radical_database.lookup(rad)['pinyin']
			radical = radical_database.norminal(rad)
	
	print(f'{radical} {pinyin}')

	 
	

{'character': '圭', 'components': ['土']}
{'decomposition_type': 'rd', 'components': ['土']}
一 yī
丨 gǔn
丶 zhǔ
丿 piě
乙 yǐ
⼄ yǐ
乚 yǐn
⼄ yǐ
亅 jué
二 èr
亠 tóu
人 rén
亻 rén
儿 ér
入 rù
八 bā


  return regex.sub("Xem thêm \d+ ví dụ nữa", "", text)
  number = int(regex.search("(\d+)", rad_number).group(1))
  if (match := regex.search("Kangxi Radical (\d+)", rad_number))


KeyboardInterrupt: 

In [62]:
print(r)

BaseChar(圭)


In [63]:
decomposition = decomposer.decompose("圭", decomposition_type=2)
print(decomposition)
print(decomposer.characters["圭"])


{'character': '圭', 'components': ['土']}
{'decomposition_type': 'rd', 'components': ['土']}


In [64]:
import csv

filepath = r"C:\Python311\envs\tudien\Lib\site-packages\hanzipy\data\chinese_charfreq_simpl_trad.csv"
with open(filepath, encoding="utf-8") as freq_file:
    csvreader = csv.reader(freq_file)
    next(csvreader, None)  # skip the headers

    fout = open('wordlists/chinese_charfreq_simpl_trad.txt', 'w', encoding="utf-8")
    fout.write('# chinese_charfreq_simpl_trad in hanzipy package\n')

    for row in csvreader:
        character = row[1]
        fout.write(f'{character}\n')
    fout.close()

In [65]:
import json
hanzi_radicals = {}
radical_filepath = r"C:\Python311\envs\tudien\Lib\site-packages\hanzipy\data\radical_with_meanings_updated.json"
with open(radical_filepath, encoding="utf-8") as radicals_file:
    hanzi_radicals = json.load(radicals_file)

from tools_configs import Radicals
rad_database = Radicals()
rad_database.load_unicode_data()
my_radicals = rad_database.radicals()

new_items = {} 
print(f'{len(my_radicals)=}')
print(f'{len(hanzi_radicals)=}')

for r in my_radicals:
    if r not in hanzi_radicals:
        item = rad_database.lookup(r)
        hanzi_radicals[r] = item['meaning']

print(f'{len(hanzi_radicals)=}')

for r in my_radicals:
    for v in rad_database.variants(r):
        if v not in hanzi_radicals:
            # if r in hanzi_radicals:
            new_items[v] = hanzi_radicals[r]
print(new_items)
print(f'{len(new_items)=}')
hanzi_radicals.update(new_items)
print(f'{len(hanzi_radicals)=}')


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Python311\\envs\\tudien\\Lib\\site-packages\\hanzipy\\data\\radical_with_meanings_updated.json'

In [None]:
from tools_configs import Radicals

rads = Radicals()

rads.load_unicode_data()

# rads.save_radical_data()

Wrong line 2E9A	 █ 	<RESERVED>	→ 2F46 ⽆ kangxi radical not

More than 6


True

In [None]:
# Adds an item to a list if it does not already exist 
def append_if_not_exists(list_items, new_item):
    if new_item not in list_items:
        list_items.append(new_item)

# Adds an item to a list if it does not already exist 
def extend_if_not_exists(list_items, new_list_items):
    for item in new_list_items:
        append_if_not_exists(list_items, item)

print(f'{list1}=', )

list1 = [1, 2, 3, 4,]
list2 = [4,5,6]

append_if_not_exists(list1, 1)
print(f'{list1}=', )

append_if_not_exists(list1, 5)
print(f'{list1}=', )

extend_if_not_exists(list1, list2)
print(f'{list1}=', )


[1, 2, 3, 4, 5]=
[1, 2, 3, 4]=
[1, 2, 3, 4, 5]=
[1, 2, 3, 4, 5, 6]=


In [18]:
from tools_configs import Radicals
rads = Radicals()
rads.load_radical_data()

In [28]:
from tinyunicodeblock import block

all_blocks = set()

for num, rad in enumerate(rads.radicals()):
    item = rads.lookup(rad)

    blocks = [block(v) for v in item['variants']] if item['variants'] else []
    all_blocks.update(blocks)
    all_blocks.add(block(rad))
    
    print(f'{rad}\t{item['number']}\t{block(rad)}\t{'\t'.join(blocks)}')

print(all_blocks)


⼀	1	Kangxi Radicals	CJK Unified Ideographs
⼁	2	Kangxi Radicals	CJK Unified Ideographs
⼂	3	Kangxi Radicals	CJK Radicals Supplement	CJK Unified Ideographs
⼃	4	Kangxi Radicals	CJK Unified Ideographs	CJK Unified Ideographs
⼄	5	Kangxi Radicals	CJK Radicals Supplement	CJK Radicals Supplement	CJK Radicals Supplement	CJK Unified Ideographs	CJK Unified Ideographs	CJK Unified Ideographs
⼅	6	Kangxi Radicals	CJK Unified Ideographs
⼆	7	Kangxi Radicals	CJK Unified Ideographs
⼇	8	Kangxi Radicals	CJK Unified Ideographs
⼈	9	Kangxi Radicals	CJK Radicals Supplement	CJK Unified Ideographs	CJK Unified Ideographs
⼉	10	Kangxi Radicals	CJK Unified Ideographs
⼊	11	Kangxi Radicals	CJK Unified Ideographs
⼋	12	Kangxi Radicals	CJK Unified Ideographs	CJK Unified Ideographs
⼌	13	Kangxi Radicals	CJK Radicals Supplement	CJK Unified Ideographs
⼍	14	Kangxi Radicals	CJK Unified Ideographs
⼎	15	Kangxi Radicals	CJK Unified Ideographs
⼏	16	Kangxi Radicals	CJK Radicals Supplement	CJK Unified Ideographs	CJK Unified Ideographs

In [29]:

with open("wordlists/Top_100_radicals.txt", "r", encoding="utf-8") as fread:
    next(fread)
    kangxi_unicode_set = {}

    for line in fread:
        # print(line)
        simplified,	tradictional, variant, meaning, pinyin, examples, comments, colloquial = line.strip().split("\t")

        if simplified != rads.norminal(simplified):
            print(simplified, pinyin)
        pass

人 rén
口 kǒu
土 tǔ
女 nǚ
心 xīn
手 shǒu
日 rì
月 yuè
木 mù
氵 shuǐ
火 huǒ
纟 (mì)
艹 
讠 yán
辶 (chuò)
钅 jīn
刂 dāo
宀 (mián)
贝 bèi
一 yī
力 lì
又 yòu
犭 (quǎn)
禾 (he)
⺮ zhú
虫 chóng
阜 (fù)
大 dà
广 guǎng
田 tián
目 mù
石 shí
衤 yī
足 zú
马 mǎ
页 yè
巾 (jīn)
米 mǐ
车 chē
八 bā
尸 shī
寸 cùn
山 shān
攵 (pū)
彳 (chí)
十 shí
工 gōng
方 fāng
门 mén
饣 shí
欠 qiàn
儿 ér
冫 bīng
子 zǐ
疒 (nè)
隹 (zhuī)
斤 (jīn)
亠 (tóu)
王 yù, wáng
白 bái
立 lì
羊 yáng
艮 (gèn)
冖 (mì)
厂 (hàn)
皿 (mǐn)
礻 shì
穴 xuè
走 zǒu
雨 yǔ
囗 (wéi)
小 xiǎo
戈 (gē)
几 jī
舌 shé
干 gān
殳 (shū)
夕 xī
止 zhǐ
牜 niú
皮 pí
耳 ěr
辛 xīn
阝 (yì)
酉 (yǒu)
青 qīng
鸟 niǎo
弓 gōng
厶 sī
户 hù
羽 yǔ
舟 chuán
里 lǐ
匕 (bǐ)
夂 (suī)
见 jiàn
卩 (jié)
罒 wǎng
士 shì
勹 (bāo)
