In [70]:
from chin_dict.chindict import ChinDict

cd = ChinDict()

char_result = cd.lookup_char("泪")

print()
print("泪 components:")
print()

for component in char_result.components:
	print(component.character + ":", component.meaning)

# 氵: ['"water" radical in Chinese characters (Kangxi radical 85), occurring in 没, 法, 流 etc', 'see also 三點水|三点水[san1 dian3 shui3]']
# 目: ['eye', 'item', 'section', 'list', 'catalogue', 'table of contents', 'order (taxonomy)', 'goal', 'name', 'title']

print()

word_result = cd.lookup_word("发")

print("Translations for 发:")
print()
for word in word_result:
	print(f'{word.pinyin} {word.meaning}')

# Simplified: 发
# Traditional: 發
# Pinyin: fa1
# Meaning: ['to send out', "to show (one's feeling)", 'to issue', 'to develop', 'to make a bundle of money', 'classifier for gunshots (rounds)']

# Simplified: 发
# Traditional: 髮
# Pinyin: fa4
# Meaning: ['hair', 'Taiwan pr. [fa3]']



泪 components:

氵: ['"water" radical in Chinese characters (Kangxi radical 85), occurring in 没, 法, 流 etc', 'see also 三點水|三点水[san1 dian3 shui3]']
目: ['eye', 'item', 'section', 'list', 'catalogue', 'table of contents', 'order (taxonomy)', 'goal', 'name', 'title']

Translations for 发:

fa1 ['to send out', "to show (one's feeling)", 'to issue', 'to develop', 'to make a bundle of money', 'classifier for gunshots (rounds)']
fa4 ['hair', 'Taiwan pr. [fa3]']


In [71]:
from chin_dict.chindict import ChinDict

PC_NEW_LINE = chr(0xEAB1)
PC_HANVIET_MARK = "HÁN VIỆT"
PC_RELATED_MARK = "LIÊN QUAN"
PC_VIDU_OLD_MARK = "Ví dụ:"
PC_VIDU_NEW_MARK = "VÍ DỤ"
PC_DIAMOND = "❖"
PC_ARROW = "»"
PC_TRIANGLE = "▶"  # ►
PC_DIAMOND_SUIT = "♦"
PC_HEART_SUIT = "♥"
PC_CLUB_SUIT = "♣"
PC_SPADE_SUIT = "♠"

cd = ChinDict()

wordset = set()

with open('dic_words_set.txt', 'r', encoding='utf-8') as fread:
    wordset.update(fread.read())

print(f'{len(wordset)=}')

wordlist = sorted(list(wordset))
import hanzidentifier

fwrite = open('char_info_dict.txt', 'w', encoding='utf-8')
for word in wordlist[:50]:
    if not hanzidentifier.is_simplified(word):
        continue
    
    string = ''

    char_result = cd.lookup_char(word)
    if hasattr(char_result, 'components'):
        pinyin = char_result.pinyin[-1]
        string += f"{word}\t{pinyin}\t{'/'.join(char_result.meaning)}\n{text if text else ''}\n"
        # print(f"{word} {pinyin} meaning: {'/'.join(char_result.meaning)}\n{text if text else ''}")

        tree = char_result.tree(show=False)
        if tree:
            string += f'{tree}\n'

        for component in char_result.components:
            # print(f'\tcomponent  : {component.character} {"/".join(component.meaning) if component.meaning else "(No meaning)"}')
            string += f'  {component.character} {"/".join(component.meaning) if component.meaning else "(No meaning)"}'
        print(string)
        string = string.replace('\n', PC_NEW_LINE)
        fwrite.write(f'{string}\n')
    else:
        print(f"{word} has no components:")



len(wordset)=7873
○ has no components:
〇 has no components:
㗎 has no components:
一 has no components:


NameError: name 'text' is not defined

In [None]:
fwrite.close()

In [None]:
import regex as re

# pattern = re.compile(r'([\p{IsHan}\p{IsBopo}\p{IsHira}\p{IsKatakana}]+)', re.UNICODE)

pattern = re.compile(r'([\p{Block=CJK_Compatibility}\p{Block=CJK_Compatibility_Forms}\p{Block=CJK_Compatibility_Ideographs}\p{Block=CJK_Compatibility_Ideographs_Supplement}\p{Block=CJK_Radicals_Supplement}\p{Block=CJK_Strokes}\p{Block=CJK_Symbols_And_Punctuation}\p{Block=CJK_Unified_Ideographs}\p{Block=CJK_Unified_Ideographs_Extension_A}\p{Block=CJK_Unified_Ideographs_Extension_B}\p{Block=CJK_Unified_Ideographs_Extension_C}\p{Block=CJK_Unified_Ideographs_Extension_D}\p{Block=CJK_Unified_Ideographs_Extension_E}\p{Block=CJK_Unified_Ideographs_Extension_F}\p{Block=Enclosed_CJK_Letters_And_Months}]+)', re.UNICODE)


input = '''举
├── 㐄 (2)
│   ├── 丨 (3)
│   └── 二 (4)
└── 兴 (1)
CharResult(㇇)'''

matches = re.findall(pattern, input)

print(matches)

['举', '㐄', '丨', '二', '兴', '㇇']


In [None]:
import json
try:
    with open('char_dict.json', "r", encoding="utf-8") as fread:
        char_dict = json.load(fread)
except:
    print(f"No file {'char_dict.json'}")

wordset = set()

with open('dic_words_set.txt', 'r', encoding='utf-8') as fread:
    wordset.update(fread.read())

empty = [key for key in char_dict if not char_dict[key]['meaning'] ]

from tools_configs import Radicals
rads = Radicals()
rads.load_radical_data()

In [None]:

print(empty)
print(f'{len(empty)}=')
    
not_rads = [key for key in empty if not rads.is_radical_variant(key)]
print(f'{len(not_rads)}=')
len(char_dict)

['?', '⺀', '⺁', '⺈', '⺊', '⺌', '⺍', '⺕', '⺙', '⺧', '⺪', '⺮', '⺳', '⺶', '⺺', '⺻', '⺼', '⻊', '⻭', '㇀', '㇅', '㇆', '㇇', '㇈', '㇉', '㇎', '㇏', '㇒', '㇖', '㇗', '㇙', '㇜', '㇝', '㇠', '㐄', '㐆', '㐌', '㐬', '㐱', '㑒', '㒸', '㓞', '㔾', '㕡', '㕣', '㕵', '㗊', '㚘', '㝉', '㝴', '㝵', '㞢', '㠯', '㡀', '㢆', '㣎', '㣺', '㧜', '㨨', '㩅', '㬎', '㳟', '㸒', '㸚', '㼌', '䀠', '䂞', '䏌', '䏍', '䏎', '䒑', '䖝', '䖭', '䖵', '䧹', '䩗', '䩭', '並', '亙', '仝', '來', '倉', '倠', '僕', '兒', '兓', '兩', '冎', '冝', '刅', '勽', '匃', '匛', '區', '厈', '厽', '參', '叚', '吂', '咼', '員', '啇', '啚', '喬', '單', '嗇', '囙', '圡', '圤', '圼', '執', '堯', '壯', '壽', '夃', '夋', '夎', '夐', '夒', '夲', '夵', '夾', '妟', '媷', '孨', '寽', '將', '專', '尒', '尞', '屚', '屬', '屰', '嵒', '巸', '帀', '帶', '幹', '幾', '弔', '弜', '強', '從', '復', '忩', '怱', '恆', '悤', '惢', '愛', '慮', '憂', '戓', '戔', '戶', '敄', '敫', '旉', '旲', '昗', '昬', '昷', '曇', '會', '東', '枼', '桼', '條', '棥', '榮', '槀', '欮', '歲', '氶', '氾', '汒', '洰', '湯', '滈', '無', '煩', '爲', '爾', '犾', '狊', '獻', '玨', '甤', '畐', '畕', '畢', '畧', '畫', '異', '畱', '當', '疌', '皅', '盙', '監',

8429

In [None]:
len(wordset)
char_dict_set = set(list(char_dict.keys()))

In [None]:
from chin_dict.chindict import ChinDict

cd = ChinDict()
c = cd.lookup_char( '龢')

pass


In [None]:
a

True

In [None]:
print(a)

True


# Look up chars by radicals


In [None]:
def make_list(list_items):
    if list_items == None:
        return []
    else: 
        return list_items

In [None]:
char_by_radical = {}
char_dict_radical_only = {}

for char in char_dict:
    components = make_list(char_dict[char]['components'])
    if not components:
        continue

    char_dict_radical_only[char] = set([alternatives[comp] for comp in components if comp in alternatives])

    for comp in components:
        if comp in alternatives:
            alt = alternatives[comp]

            if alt not in char_by_radical:
                char_by_radical[alt] = set([char])
            else:
                char_by_radical[alt].add(char)

len(char_by_radical)

211

In [None]:
char = '龟'
print(char_dict[char]['tree'])
print(char_dict_radical_only[char])


龟
├── ⺈ (1)
└── 电 (2)
    ├── 乚 (4)
    └── 曰 (3)
{'⼄', '⼑', '⽈'}


In [None]:
components = ['⼄', '⼑', '⽈']

alt_comps = [alternatives[comp] for comp in components]

item0 = components[0]
alt = alternatives[item0]

chars = char_by_radical[alt]

for comp in components[1:]:
    alti = alternatives[comp]
    chars = chars.intersection(char_by_radical[alti])

# chars = char_by_radical[components[0]] & char_by_radical[components[1]]
print(f"Radicals {' '.join(components)}")
print(f"Kangxi norminals {' '.join(alt_comps)}")
print(f"Contains these components {' '.join(chars)} ")
for char in chars:
    if char_dict_radical_only[char] == set(alt_comps):
        print(f'Contains ONLY these components {char} ')

Radicals ⼄ ⼑ ⽈
Kangxi norminals ⼄ ⼑ ⽈
Contains these components 阄 龟 
Contains ONLY these components 龟 


In [None]:
output_dict = {}

key_set = set()

MAX_DEPTH = 5
depth = 0

while depth < MAX_DEPTH:
    
    depth += 1
while depth < MAX_DEPTH:
    for rad in radical_set:
        item = radical_set[rad]
        output_dict[rad] = {"meaning": item["meaning"],
                "number": item['number'],
                "pinyin": item['pinyin'],
                'characters': char_by_radical[rad]}
        


214

# Search by components
Components 艮 钅
Contains these components 银 锒 
Contains ONLY these components 银 

---

Components 寸 钅
Contains these components 锵 铸 锊 锝 
Contains ONLY these components 铸 

---

Components 车 钅
Contains these components 链 
Contains ONLY these components 链 

--- 

Components 一 丷 木 米 钅
Contains these components 抨 兰 鲆 夹 栟 尊 峡 饼 踯 呯 鲞 迸 頬 粞 摒 秤 郑 蓱 荚 樽 蛱 猷 瓶 枰 浃 奠 駦 来 烂 颊 坪 陕 屏 醚 涞 苹 洴 铼 硖 掷 狭 撙 徕 并 姘 朕 睐 遵 泙 萍 拦 拼 铗 骈 糟 评 莱 郏 伻 送 挟 砰 楢 遒 栏 赉 鳟 侠 腾 平 酋 联 玶 蝤 蹲 怦 关 醾 
Contains ONLY these components 铼 

--- 

Components 木 米 钅
Contains these components 粲 谜 蕃 籼 糯 屎 迷 糅 粘 鳞 粹 懊 籹 掬 番 菊 窸 潘 奧 璨 粞 类 敉 粢 屡 麟 糙 奥 喽 篓 楼 粝 磻 数 粑 噢 糖 咪 糒 擞 麋 澳 褛 搂 糇 糕 粗 麴 糈 粿 偻 来 瘘 璠 籴 旛 醚 糔 涞 藩 宩 燠 娄 精 蒌 眯 辚 糊 粦 蟠 隩 粉 粳 磷 粒 铼 薮 糁 糠 釆 徕 糗 僳 悉 睐 蝼 墦 籽 鄱 粽 料 蹯 糱 粪 踘 糜 嶙 粟 粮 宷 糟 莱 播 粥 粼 膰 粱 粕 缕 嶓 鞠 赉 遴 窭 镂 匊 髅 鬻 燔 皤 繙 释 翻 醾 幡 蟋 



In [None]:
import re
with open("C:/Users/it.fsoft/OneDrive - FPT Software/Personal/Playground/new_make_pleco_dicts/wordlists/radicals-useful_info.txt", 'r', encoding='utf-8') as fread:
    for line in fread.readlines():
        char, pinyin, define = line.strip().split('\t')

        notes = match.group(1) if (match:= re.search(r'Notes:(.+?)', define)) else ''
        distinguish = (match.group(1) if (match:= re.search(r'Distinguish From: (.+)', define)) else '').split(" ")
        variants = (match.group(1) if (match:= re.search(r'Variants:(.+?)', define)) else '').split(" ")
        rank = match.group(1) if (match:= re.search(r'RANK: (.+?)', define)) else ''
        mnemonic = match.group(1) if (match:= re.search(r'Mnemonic:(.+?)', define)) else ''

        print(f'{char} Notes {notes}\n{distinguish=}\t{variants=}\t{rank=}\t{mnemonic=}')
        pass




一 Notes  Characters are classified here when they have a horizontal stroke and don't fit under any other radicals
distinguish=['乛', '亠', '冖', '宀']	variants=['']	rank='88.5%'	mnemonic=''
丨 Notes  Ancient pronunciation and meaning
distinguish=['亅']	variants=['']	rank='66.3%'	mnemonic=''
丨 Notes  Now simply called shu4 - 竖, which means vertical stroke, see gun3
distinguish=['亅']	variants=['']	rank='66.3%'	mnemonic=''
丶 Notes 
distinguish=['']	variants=['']	rank='63.1%'	mnemonic=''
丿 Notes  general stroke, sometimes means falling
distinguish=['']	variants=['乀乁']	rank='76.3%'	mnemonic=''
乀 Notes 
distinguish=['']	variants=['丿乁']	rank='0.0%'	mnemonic=''
乁 Notes  No modern meaning, except possibly as a phonetic.  Ancient meaning is similar to 移 or 及, meaning to shift, change, or reach.
distinguish=['']	variants=['丿乀']	rank='0.0%'	mnemonic=''
⺄ Notes  Not sure of the origin of this stroke, listed in the CJK radicals block
distinguish=['']	variants=['']	rank='0.0%'	mnemonic=''
乙 Notes 
distingu

In [None]:
ord('来')

26469

In [None]:
import hanzipy

from hanzipy.decomposer import HanziDecomposer

decomposer = HanziDecomposer()


decomposition = decomposer.decompose("圭", decomposition_type=2)
from pinyin import get as get_pinyin
from tools_configs import Radicals

radical_database = Radicals()
radical_database.load_radical_data()


INFO:root:Done compiling 12040 characters


In [None]:


print(decomposition)
print(decomposer.characters["圭"])
for rad in decomposer.radicals:
	pinyin = get_pinyin(rad)
	radical = rad

	if rad == pinyin:
		if radical_database.is_radical_variant(rad):
			pinyin = radical_database.lookup(rad)['pinyin']
			radical = radical_database.norminal(rad)
	
	print(f'{radical} {pinyin}')

	 
	

{'character': '圭', 'components': ['土']}
{'decomposition_type': 'rd', 'components': ['土']}
一 yī
丨 gǔn
丶 zhǔ
丿 piě
乙 yǐ
⼄ yǐ
乚 yǐn
⼄ yǐ
亅 jué
二 èr
亠 tóu
人 rén
亻 rén
儿 ér
入 rù
八 bā


  return regex.sub("Xem thêm \d+ ví dụ nữa", "", text)
  number = int(regex.search("(\d+)", rad_number).group(1))
  if (match := regex.search("Kangxi Radical (\d+)", rad_number))


KeyboardInterrupt: 

In [None]:
print(r)

BaseChar(圭)


In [None]:
decomposition = decomposer.decompose("圭", decomposition_type=2)
print(decomposition)
print(decomposer.characters["圭"])


{'character': '圭', 'components': ['土']}
{'decomposition_type': 'rd', 'components': ['土']}


In [None]:
import csv

filepath = r"C:\Python311\envs\tudien\Lib\site-packages\hanzipy\data\chinese_charfreq_simpl_trad.csv"
with open(filepath, encoding="utf-8") as freq_file:
    csvreader = csv.reader(freq_file)
    next(csvreader, None)  # skip the headers

    fout = open('wordlists/chinese_charfreq_simpl_trad.txt', 'w', encoding="utf-8")
    fout.write('# chinese_charfreq_simpl_trad in hanzipy package\n')

    for row in csvreader:
        character = row[1]
        fout.write(f'{character}\n')
    fout.close()

In [None]:
import json
hanzi_radicals = {}
radical_filepath = r"C:\Python311\envs\tudien\Lib\site-packages\hanzipy\data\radical_with_meanings_updated.json"
with open(radical_filepath, encoding="utf-8") as radicals_file:
    hanzi_radicals = json.load(radicals_file)

from tools_configs import Radicals
rad_database = Radicals()
rad_database.load_unicode_data()
my_radicals = rad_database.radicals()

new_items = {} 
print(f'{len(my_radicals)=}')
print(f'{len(hanzi_radicals)=}')

for r in my_radicals:
    if r not in hanzi_radicals:
        item = rad_database.lookup(r)
        hanzi_radicals[r] = item['meaning']

print(f'{len(hanzi_radicals)=}')

for r in my_radicals:
    for v in rad_database.variants(r):
        if v not in hanzi_radicals:
            # if r in hanzi_radicals:
            new_items[v] = hanzi_radicals[r]
print(new_items)
print(f'{len(new_items)=}')
hanzi_radicals.update(new_items)
print(f'{len(hanzi_radicals)=}')


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Python311\\envs\\tudien\\Lib\\site-packages\\hanzipy\\data\\radical_with_meanings_updated.json'

In [None]:
from tools_configs import Radicals

rads = Radicals()

rads.load_unicode_data()

# rads.save_radical_data()

Wrong line 2E9A	 █ 	<RESERVED>	→ 2F46 ⽆ kangxi radical not

More than 6


True

In [None]:
# Adds an item to a list if it does not already exist 
def append_if_not_exists(list_items, new_item):
    if new_item not in list_items:
        list_items.append(new_item)

# Adds an item to a list if it does not already exist 
def extend_if_not_exists(list_items, new_list_items):
    for item in new_list_items:
        append_if_not_exists(list_items, item)

print(f'{list1}=', )

list1 = [1, 2, 3, 4,]
list2 = [4,5,6]

append_if_not_exists(list1, 1)
print(f'{list1}=', )

append_if_not_exists(list1, 5)
print(f'{list1}=', )

extend_if_not_exists(list1, list2)
print(f'{list1}=', )


[1, 2, 3, 4, 5]=
[1, 2, 3, 4]=
[1, 2, 3, 4, 5]=
[1, 2, 3, 4, 5, 6]=


In [None]:
from tools_configs import Radicals
rads = Radicals()
rads.load_radical_data()
from tinyunicodeblock import block
from hanzipy.dictionary import HanziDictionary
from chin_dict.chindict import ChinDict

In [None]:
rads = Radicals()
rads.load_unicode_data()

Wrong line 2E9A	 █ 	<RESERVED>	→ 2F46 ⽆ kangxi radical not



True

In [None]:
list_standalones = ['⺌',
'⺜',
'⺝',
'⻗',
'一',
'业',
'丸',
'乙',
'乡',
'二',
'亡',
'人',
'儿',
'兀',
'入',
'八',
'几',
'刀',
'刁',
'力',
'匕',
'十',
'卜',
'厂',
'又',
'口',
'土',
'士',
'夕',
'大',
'天',
'女',
'子',
'寸',
'小',
'尚',
'尸',
'尺',
'山',
'川',
'工',
'己',
'已',
'巳',
'巾',
'干',
'弓',
'心',
'戈',
'户',
'手',
'才',
'支',
'文',
'斗',
'斤',
'方',
'无',
'日',
'曰',
'月',
'木',
'欠',
'止',
'歹',
'毋',
'母',
'比',
'毛',
'氏',
'气',
'水',
'火',
'爪',
'父',
'片',
'牙',
'牛',
'犬',
'玄',
'玉',
'王',
'瓜',
'瓦',
'甘',
'生',
'用',
'甩',
'田',
'由',
'甲',
'申',
'电',
'白',
'皮',
'目',
'矛',
'矢',
'石',
'示',
'穴',
'立',
'米',
'网',
'羊',
'羽',
'老',
'而',
'耳',
'肉',
'臣',
'自',
'至',
'舌',
'色',
'虎',
'虫',
'血',
'行',
'衣',
'西',
'见',
'角',
'言',
'谷',
'豆',
'贝',
'赤',
'走',
'足',
'身',
'车',
'辛',
'辰',
'邑',
'酉',
'里',
'金',
'门',
'隶',
'雨',
'青',
'非',
'面',
'革',
'韦',
'音',
'页',
'风',
'飞',
'食',
'首',
'香',
'马',
'骨',
'高',
'鬼',
'鱼',
'鸟',
'鹿',
'麦',
'麻',
'黑',
'鼎',
'鼓',
'鼠',
'鼻',
'齐',
'齿',
'龙',
'龟',

]

In [None]:
# list_standalones.remove('丸')
# list_standalones.remove('业')
# list_standalones.remove('尺')

standalone_set = {}

for stand in set(list_standalones):
    if not rads.is_radical_variant(stand):
        print(f'Not a radical: {stand}')
        continue

    rad = rads.norminal(stand)
    item = rads.lookup(rad)

    blocks = []
    
    found =False
    for v in item['variants']:
        name = block(v)

        if (name == 'CJK Unified Ideographs'):
            print(f'{v}\t{rad}')
            found = True
            standalone_set[rad] = v

    if not found:
        print(f'Cant find standalone version for {rad}')

print('Standalons')

for r,s in standalone_set.items():
    print(f'{r}\t{hex(ord(r))}\t{s}\t{hex(ord(s))}')

非	⾮
十	⼗
邑	⾢
斗	⽃
韋	⾱
韦	⾱
犬	⽝
犭	⽝
走	⾛
赱	⾛
鹿	⿅
干	⼲
田	⽥
由	⽥
甲	⽥
申	⽥
电	⽥
甘	⽢
Not a radical: 亡
弓	⼸
辰	⾠
斉	⿑
齊	⿑
齐	⿑
曰	⽈
己	⼰
已	⼰
巳	⼰
支	⽀
匕	⼔
月	⽉
Not a radical: 尚
小	⼩
小	⼩
山	⼭
夕	⼣
二	⼆
羊	⽺
羋	⽺
色	⾊
足	⾜
石	⽯
卜	⼘
Not a radical: 尺
爪	⽖
爫	⽖
月	⽉
至	⾄
貝	⾙
贝	⾙
刀	⼑
刁	⼑
刂	⼑
止	⽌
几	⼏
靑	⾭
青	⾭
大	⼤
己	⼰
已	⼰
巳	⼰
生	⽣
心	⼼
忄	⼼
自	⾃
飛	⾶
飞	⾶
玄	⽞
見	⾒
见	⾒
首	⾸
玉	⽟
玊	⽟
王	⽟
矛	⽭
手	⼿
扌	⼿
才	⼿
龵	⼿
田	⽥
由	⽥
甲	⽥
申	⽥
电	⽥
毛	⽑
士	⼠
方	⽅
風	⾵
风	⾵
穴	⽳
羽	⽻
耳	⽿
谷	⾕
雨	⾬
鼓	⿎
牛	⽜
牜	⽜
門	⾨
门	⾨
戈	⼽
言	⾔
訁	⾔
讠	⾔
黑	⿊
黒	⿊
鼠	⿏
鼡	⿏
寸	⼨
Not a radical: 天
辛	⾟
麥	⿆
麦	⿆
乙	⼄
乚	⼄
乛	⼄
兀	⼪
尢	⼪
尣	⼪
网	⽹
罒	⽹
罓	⽹
頁	⾴
页	⾴
刀	⼑
刁	⼑
刂	⼑
入	⼊
气	⽓
父	⽗
又	⼜
尸	⼫
虍	⾌
虎	⾌
老	⽼
耂	⽼
田	⽥
由	⽥
甲	⽥
申	⽥
电	⽥
革	⾰
力	⼒
隶	⾪
鼻	⿐
衣	⾐
衤	⾐
臣	⾂
白	⽩
麻	⿇
音	⾳
玉	⽟
玊	⽟
王	⽟
豆	⾖
牙	⽛
皮	⽪
田	⽥
由	⽥
甲	⽥
申	⽥
电	⽥
身	⾝
血	⾎
襾	⾑
西	⾑
覀	⾑
欠	⽋
酉	⾣
子	⼦
里	⾥
巛	⼮
巜	⼮
川	⼮
戶	⼾
户	⼾
戸	⼾
冃	⽇
日	⽇
Not a radical: 丸
巾	⼱
毋	⽏
毌	⽏
母	⽏
鬼	⿁
土	⼟
比	⽐
米	⽶
工	⼯
木	⽊
朩	⽊
香	⾹
火	⽕
灬	⽕
食	⾷
飠	⾷
饣	⾷
人	⼈
亻	⼈
面	⾯
靣	⾯
馬	⾺
马	⾺
歯	⿒
齒	⿒
齿	⿒
氏	⽒
民	⽒
雨	⾬
虫	⾍
矢	⽮
魚	⿂
鱼	⿂
立	⽴
厂	⼚
骨	⾻
肉	⾁
丷	⼋
八	⼋
儿	⼉
乡	⼳
幺	⼳
亀	⿔
龜	⿔
龟	⿔
車	⾞
车	⾞
目	⽬
角	⾓
斤	⽄
手	⼿
扌	⼿
才	⼿
龵	⼿
女	⼥

In [None]:
import json
cd = ChinDict()

searcher = HanziDictionary()
with open(r"C:\Users\it.fsoft\AppData\Local\Tony Narlock\unihan_etl\unihan.json", 'r', encoding='utf-8') as fread:
    unihan = json.load(fread)

In [None]:
pass

unihan_dict = {}

for i in unihan:
    unihan_dict[i['char']] = i

In [None]:


all_blocks = set()
standalone_set = {}

for num, rad in enumerate(rads.radicals()):
    item = rads.lookup(rad)

    blocks = []
    
    for v in item['variants']:
        name = block(v)
        blocks.append(name)

        if (name == 'CJK Unified Ideographs'):
            try:
                result = cd.lookup_char(v)

                if result:

                    u = unihan_dict[v]
                    definitions = ', '.join(u['kDefinition'])
                    
                    if definitions.find('radical') >= 0 or definitions.find('kwukyel') >= 0 or definitions.find('Radical') >= 0:
                        continue
                    standalone_set[rad] = v
                    # print(f'{rad}\t{hex(ord(rad))}\t{v}\t{hex(ord(v))}\t{definitions}')

                    pass
            except:
                pass

    all_blocks.update(blocks)
    all_blocks.add(block(rad))
    
    # print(f'{rad}\t{item['number']}\t{block(rad)}\t{'\t'.join(blocks)}\t{'\t'.join(item['variants'])}')

for r,s in standalone_set.items():
    print(f'{r}\t{hex(ord(r))}\t{s}\t{hex(ord(s))}\t{definitions}')


⼀	0x2f00	一	0x4e00	flute, pipe, ancient measure, Kangxi radical 214
⼃	0x2f03	乀	0x4e40	flute, pipe, ancient measure, Kangxi radical 214
⼄	0x2f04	乚	0x4e5a	flute, pipe, ancient measure, Kangxi radical 214
⼆	0x2f06	二	0x4e8c	flute, pipe, ancient measure, Kangxi radical 214
⼈	0x2f08	人	0x4eba	flute, pipe, ancient measure, Kangxi radical 214
⼊	0x2f0a	入	0x5165	flute, pipe, ancient measure, Kangxi radical 214
⼋	0x2f0b	八	0x516b	flute, pipe, ancient measure, Kangxi radical 214
⼏	0x2f0f	几	0x51e0	flute, pipe, ancient measure, Kangxi radical 214
⼑	0x2f11	刁	0x5201	flute, pipe, ancient measure, Kangxi radical 214
⼔	0x2f14	匕	0x5315	flute, pipe, ancient measure, Kangxi radical 214
⼗	0x2f17	十	0x5341	flute, pipe, ancient measure, Kangxi radical 214
⼘	0x2f18	卜	0x535c	flute, pipe, ancient measure, Kangxi radical 214
⼜	0x2f1c	又	0x53c8	flute, pipe, ancient measure, Kangxi radical 214
⼝	0x2f1d	口	0x53e3	flute, pipe, ancient measure, Kangxi radical 214
⼞	0x2f1e	囗	0x56d7	flute, pipe, ancient measure, Kangxi radical

In [None]:
print(block('青'))
print(hex(ord('青')))
chr(0x9752)

CJK Unified Ideographs
0x9752


'青'

In [None]:
print(hex(ord('酉')))
print(hex(ord('⾣')))


0x9149
0x2fa3


In [None]:
my_list = [
'人',
'口',
'土',
'女',
'心',
'手',
'日',
'月',
'木',
'氵',
'火',
'纟',
'艹',
'讠',
'辶',
'钅',
'刂',
'宀',
'贝',
'一',
'力',
'又',
'犭',
'禾',
'⺮',
'虫',
'阜',
'大',
'广',
'田',
'目',
'石',
'衤',
'足',
'马',
'页',
'巾',
'米',
'车',
'八',
'尸',
'寸',
'山',
'攵',
'彳',
'十',
'工',
'方',
'门',
'饣',
'欠',
'儿',
'冫',
'子',
'疒',
'隹',
'斤',
'亠',
'王',
'白',
'立',
'羊',
'艮',
'冖',
'厂',
'皿',
'礻',
'穴',
'走',
'雨',
'囗',
'小',
'戈',
'几',
'舌',
'干',
'殳',
'夕',
'止',
'牜',
'皮',
'耳',
'辛',
'阝',
'酉',
'青',
'鸟',
'弓',
'厶',
'户',
'羽',
'舟',
'里',
'匕',
'夂',
'见',
'卩',
'罒',
'士',
'勹',
]

for i in my_list:
    print(rads.norminal(i))
    # print(f'{hex(ord(i))} {hex(ord(rads.norminal(i)))}')

⼈
⼝
⼟
⼥
⼼
⼿
⽇
⽉
⽊
⽔
⽕
⽷
⾋
⾔
⾡
⾦
⼑
⼧
⾙
⼀
⼒
⼜
⽝
⽲
⽵
⾍
⾩
⼤
⼴
⽥
⽬
⽯
⾐
⾜
⾺
⾴
⼱
⽶
⾞
⼋
⼫
⼨
⼭
⽁
⼻
⼗
⼯
⽅
⾨
⾷
⽋
⼉
⼎
⼦
⽧
⾫
⽄
⼇
⽟
⽩
⽴
⽺
⾉
⼍
⼚
⽫
⽰
⽳
⾛
⾬
⼞
⼩
⼽
⼏
⾆
⼲
⽎
⼣
⽌
⽜
⽪
⽿
⾟
⾩
⾣
⾭
⿃
⼸
⼛
⼾
⽻
⾈
⾥
⼔
⼡
⾒
⼙
⽹
⼠
⼓


In [None]:
my_wiki = [
'一 ',
'丨 ',
'丶 ',
'丿',
'乙',
'亅 ',
'二 ',
'亠 ',
'人',
'儿 ',
'入 ',
'八 ',
'冂 ',
'冖 ',
'冫 ',
'几 ',
'凵 ',
'刀 ',
'力 ',
'勹 ',
'匕 ',
'匚 ',
'匸 ',
'十 ',
'卜 ',
'卩',
'厂 ',
'厶 ',
'又 ',
'口 ',
'囗 ',
'土 ',
'士 ',
'夂 ',
'夊 ',
'夕 ',
'大 ',
'女 ',
'子 ',
'宀 ',
'寸 ',
'小 ',
'尢',
'尸 ',
'屮 ',
'山 ',
'巛',
'工 ',
'己 ',
'巾 ',
'干 ',
'幺',
'广 ',
'廴 ',
'廾 ',
'弋 ',
'弓 ',
'彐',
'彡 ',
'彳 ',
'心',
'戈 ',
'戶',
'手 ',
'支 ',
'攴',
'文 ',
'斗 ',
'斤 ',
'方 ',
'无',
'日 ',
'曰 ',
'月 ',
'木 ',
'欠 ',
'止 ',
'歹',
'殳 ',
'毋',
'比 ',
'毛 ',
'氏 ',
'气 ',
'水',
'火',
'爪',
'父 ',
'爻 ',
'爿',
'片 ',
'牙 ',
'牛 ',
'犬',
'玄 ',
'玉',
'瓜 ',
'瓦 ',
'甘 ',
'生 ',
'用 ',
'田 ',
'疋 ',
'疒 ',
'癶 ',
'白 ',
'皮 ',
'皿 ',
'目',
'矛 ',
'矢 ',
'石 ',
'示',
'禸 ',
'禾 ',
'穴 ',
'立 ',
'竹',
'米 ',
'糸',
'缶 ',
'网 ',
'羊 ',
'羽 ',
'老 ',
'而 ',
'耒 ',
'耳 ',
'聿 ',
'肉 ',
'臣 ',
'自 ',
'至 ',
'臼 ',
'舌 ',
'舛 ',
'舟 ',
'艮 ',
'色 ',
'艸 ',
'虍 ',
'虫 ',
'血 ',
'行 ',
'衣 ',
'襾 ',
'見 ',
'角 ',
'言 ',
'谷 ',
'豆 ',
'豕 ',
'豸 ',
'貝 ',
'赤 ',
'走 ',
'足 ',
'身 ',
'車 ',
'辛 ',
'辰 ',
'辵 ',
'邑 ',
'酉 ',
'釆 ',
'里 ',
'金 ',
'長 ',
'門 ',
'阜 ',
'隶 ',
'隹 ',
'雨 ',
'靑 ',
'非 ',
'面 ',
'革 ',
'韋 ',
'韭 ',
'音 ',
'頁 ',
'風 ',
'飛 ',
'食 ',
'首 ',
'香 ',
'馬 ',
'骨 ',
'高 ',
'髟 ',
'鬥 ',
'鬯 ',
'鬲 ',
'鬼 ',
'魚 ',
'鳥 ',
'鹵 ',
'鹿 ',
'麥 ',
'麻 ',
'黃 ',
'黍 ',
'黑 ',
'黹 ',
'黽 ',
'鼎 ',
'鼓 ',
'鼠 ',
'鼻 ',
'齊 ',
'齒 ',
'龍 ',
'龜 ',
'龠 ',
]

for i in my_list:
    # print(rads.norminal(i))
    print(f'{hex(ord(i))} {hex(ord(rads.norminal(i)))}')

0x4eba 0x2f08
0x53e3 0x2f1d
0x571f 0x2f1f
0x5973 0x2f25
0x5fc3 0x2f3c
0x624b 0x2f3f
0x65e5 0x2f47
0x6708 0x2f49
0x6728 0x2f4a
0x6c35 0x2f54
0x706b 0x2f55
0x7e9f 0x2f77
0x8279 0x2f8b
0x8ba0 0x2f94
0x8fb6 0x2fa1
0x9485 0x2fa6
0x5202 0x2f11
0x5b80 0x2f27
0x8d1d 0x2f99
0x4e00 0x2f00
0x529b 0x2f12
0x53c8 0x2f1c
0x72ad 0x2f5d
0x79be 0x2f72
0x2eae 0x2f75
0x866b 0x2f8d
0x961c 0x2fa9
0x5927 0x2f24
0x5e7f 0x2f34
0x7530 0x2f65
0x76ee 0x2f6c
0x77f3 0x2f6f
0x8864 0x2f90
0x8db3 0x2f9c
0x9a6c 0x2fba
0x9875 0x2fb4
0x5dfe 0x2f31
0x7c73 0x2f76
0x8f66 0x2f9e
0x516b 0x2f0b
0x5c38 0x2f2b
0x5bf8 0x2f28
0x5c71 0x2f2d
0x6535 0x2f41
0x5f73 0x2f3b
0x5341 0x2f17
0x5de5 0x2f2f
0x65b9 0x2f45
0x95e8 0x2fa8
0x9963 0x2fb7
0x6b20 0x2f4b
0x513f 0x2f09
0x51ab 0x2f0e
0x5b50 0x2f26
0x7592 0x2f67
0x96b9 0x2fab
0x65a4 0x2f44
0x4ea0 0x2f07
0x738b 0x2f5f
0x767d 0x2f69
0x7acb 0x2f74
0x7f8a 0x2f7a
0x826e 0x2f89
0x5196 0x2f0d
0x5382 0x2f1a
0x76bf 0x2f6b
0x793b 0x2f70
0x7a74 0x2f73
0x8d70 0x2f9b
0x96e8 0x2fac
0x56d7 0x2f1e
0x5c0f

In [None]:
s1='⺕'
s2='⺕⺕'

from tools_configs import  (PATTERN_ZH, PATTERN_ZH_MUL)

import regex

m = regex.search(PATTERN_ZH, s1)
print(m)

<regex.Match object; span=(0, 1), match='⺕'>


In [None]:
from tools_configs import Radicals
rads = Radicals()
rads.load_radical_data()

list_from_variants = {
'⼀': '一', '⼃': '乀', '⼄': '乚', '⼆': '二', '⼈': '人', '⼊': '入', '⼋': '八', '⼏': '几', '⼑': '刁', '⼔': '匕', '⼗': '十', '⼘': '卜', '⼜': '又', '⼝': '口', '⼞': '囗', '⼟': '土', '⼠': '士', '⼣': '夕', '⼤': '大', '⼦': '子', '⼩': '小', '⼪': '兀', '⼭': '山', '⼮': '川', '⼯': '工', '⼰': '巳', '⼲': '干', '⼳': '幺', '⼴': '广', '⼶': '廿', '⼷': '弋', '⼻': '彳', '⼼': '心', '⼽': '戈', '⼾': '戸', '⼿': '才', '⽀': '支', '⽂': '文', '⽅': '方', '⽆': '旡', '⽇': '日', '⽊': '木', '⽌': '止', '⽍': '歺', '⽎': '殳', '⽏': '母', '⽐': '比', '⽑': '毛', '⽒': '民', '⽔': '水', '⽕': '火', '⽖': '爪', '⽘': '爻', '⽙': '丬', '⽚': '片', '⽛': '牙', '⽟': '王', '⽡': '瓦', '⽢': '甘', '⽣': '生', '⽤': '甩', '⽥': '电', '⽦': '疋', '⽩': '白', '⽪': '皮', '⽫': '皿', '⽮': '矢', '⽲': '禾', '⽴': '立', '⽶': '米', '⽺': '羋', '⾌': '虎', '⾒': '见', '⾙': '贝', '⾛': '赱', '⾞': '车', '⾧': '长', '⾨': '门', '⾴': '页', '⾵': '风', '⾶': '飞', '⾼': '髙', '⿄': '卤', '⿈': '黄', '⿊': '黒', '⿌': '黾', '⿏': '鼡', '⿑': '齐', '⿓': '龙', '⿔': '龟',}

list_from_chardict = {
    '⼀': '一', '⼄': '乙', '⼆': '二', '⼈': '人', '⼉': '儿', '⼊': '入', '⼋': '八', '⼏': '几', '⼑': '刀', '⼒': '力', '⼔': '匕', '⼕': '匚', '⼗': '十', '⼘': '卜', '⼚': '厂', '⼜': '又', '⼝': '口', '⼞': '囗', '⼟': '土', '⼠': '士', '⼣': '夕', '⼤': '大', '⼥': '女', '⼦': '子', '⼨': '寸', '⼩': '小', '⼪': '兀', '⼫': '尸', '⼭': '山', '⼮': '川', '⼯': '工', '⼰': '己', '⼱': '巾', '⼲': '干', '⼳': '幺', '⼴': '广', '⼶': '廿', '⼷': '弋', '⼸': '弓', '⼺': '彡', '⼼': '心', '⼽': '戈', '⼾': '户', '⼿': '手', '⽀': '支', '⽁': '攴', '⽂': '文', '⽃': '斗', '⽄': '斤', '⽅': '方', '⽆': '无', '⽇': '日', '⽈': '曰', '⽉': '月', '⽊': '木', '⽋': '欠', '⽌': '止', '⽍': '歹', '⽎': '殳', '⽏': '毋', '⽐': '比', '⽑': '毛', '⽒': '氏', '⽓': '气', '⽔': '水', '⽕': '火', '⽖': '爪', '⽗': '父', '⽘': '爻', '⽚': '片', '⽛': '牙', '⽜': '牛', '⽝': '犬', '⽞': '玄', '⽟': '玉', '⽠': '瓜', '⽡': '瓦', '⽢': '甘', '⽣': '生', '⽤': '用', '⽥': '田', '⽦': '疋', '⽩': '白', '⽪': '皮', '⽫': '皿', '⽬': '目', '⽭': '矛', '⽮': '矢', '⽯': '石', '⽰': '示', '⽲': '禾', '⽳': '穴', '⽴': '立', '⽵': '竹', '⽶': '米', '⽸': '缶', '⽹': '网', '⽺': '羊', '⽻': '羽', '⽼': '老', '⽽': '而', '⽾': '耒', '⽿': '耳', '⾀': '聿', '⾁': '肉', '⾂': '臣', '⾃': '自', '⾄': '至', '⾅': '臼', '⾆': '舌', '⾇': '舛', '⾈': '舟', '⾉': '艮', '⾊': '色', '⾌': '虎', '⾍': '虫', '⾎': '血', '⾏': '行', '⾐': '衣', '⾑': '襾', '⾒': '见', '⾓': '角', '⾔': '言', '⾕': '谷', '⾖': '豆', '⾗': '豕', '⾙': '贝', '⾚': '赤', '⾛': '走', '⾜': '足', '⾝': '身', '⾞': '车', '⾟': '辛', '⾠': '辰', '⾡': '辵', '⾢': '邑', '⾣': '酉', '⾤': '釆', '⾥': '里', '⾦': '金', '⾧': '长', '⾨': '门', '⾩': '阜', '⾪': '隶', '⾫': '隹', '⾬': '雨', '⾭': '青', '⾮': '非', '⾯': '面', '⾰': '革', '⾱': '韦', '⾲': '韭', '⾳': '音', '⾴': '页', '⾵': '风', '⾶': '飞', '⾷': '食', '⾸': '首', '⾹': '香', '⾺': '马', '⾻': '骨', '⾼': '高', '⾽': '髟', '⾿': '鬯', '⿀': '鬲', '⿁': '鬼', '⿂': '鱼', '⿃': '鸟', '⿄': '卤', '⿅': '鹿', '⿆': '麦', '⿇': '麻', '⿈': '黄', '⿉': '黍', '⿊': '黑', '⿋': '黹', '⿌': '黾', '⿍': '鼎', '⿎': '鼓', '⿏': '鼠', '⿐': '鼻', '⿑': '齐', '⿒': '齿', '⿓': '龙', '⿔': '龟', '⿕': '龠', 


}
pass


In [None]:
set(list_from_chardict.keys()) -set(list_from_variants.keys()) 

print(f'{len(list_from_chardict)=}')

len(list_from_chardict)=186


In [None]:
norm_list_from_chardict = {}

for rad in list_from_chardict:
    norm_list_from_chardict[rads.norminal(rad)] = list_from_chardict[rad]

for rad in rads.radicals():
    if rad in norm_list_from_chardict:
        print(f'{rad} {norm_list_from_chardict[rad]}')
        rads.radical_set[rad]['standalone']=norm_list_from_chardict[rad]
        
    else:
        rads.radical_set[rad]['standalone']=''
        # print(f'{rad} No standalone')
        pass

pass

rads.save_radical_data()

⼀ 一
⼄ 乙
⼆ 二
⼈ 人
⼉ 儿
⼊ 入
⼋ 八
⼏ 几
⼑ 刀
⼒ 力
⼔ 匕
⼕ 匚
⼗ 十
⼘ 卜
⼚ 厂
⼜ 又
⼝ 口
⼞ 囗
⼟ 土
⼠ 士
⼣ 夕
⼤ 大
⼥ 女
⼦ 子
⼨ 寸
⼩ 小
⼪ 兀
⼫ 尸
⼭ 山
⼮ 川
⼯ 工
⼰ 己
⼱ 巾
⼲ 干
⼳ 幺
⼴ 广
⼶ 廿
⼷ 弋
⼸ 弓
⼺ 彡
⼼ 心
⼽ 戈
⼾ 户
⼿ 手
⽀ 支
⽁ 攴
⽂ 文
⽃ 斗
⽄ 斤
⽅ 方
⽆ 无
⽇ 日
⽈ 曰
⽉ 月
⽊ 木
⽋ 欠
⽌ 止
⽍ 歹
⽎ 殳
⽏ 毋
⽐ 比
⽑ 毛
⽒ 氏
⽓ 气
⽔ 水
⽕ 火
⽖ 爪
⽗ 父
⽘ 爻
⽚ 片
⽛ 牙
⽜ 牛
⽝ 犬
⽞ 玄
⽟ 玉
⽠ 瓜
⽡ 瓦
⽢ 甘
⽣ 生
⽤ 用
⽥ 田
⽦ 疋
⽩ 白
⽪ 皮
⽫ 皿
⽬ 目
⽭ 矛
⽮ 矢
⽯ 石
⽰ 示
⽲ 禾
⽳ 穴
⽴ 立
⽵ 竹
⽶ 米
⽸ 缶
⽹ 网
⽺ 羊
⽻ 羽
⽼ 老
⽽ 而
⽾ 耒
⽿ 耳
⾀ 聿
⾁ 肉
⾂ 臣
⾃ 自
⾄ 至
⾅ 臼
⾆ 舌
⾇ 舛
⾈ 舟
⾉ 艮
⾊ 色
⾌ 虎
⾍ 虫
⾎ 血
⾏ 行
⾐ 衣
⾑ 襾
⾒ 见
⾓ 角
⾔ 言
⾕ 谷
⾖ 豆
⾗ 豕
⾙ 贝
⾚ 赤
⾛ 走
⾜ 足
⾝ 身
⾞ 车
⾟ 辛
⾠ 辰
⾡ 辵
⾢ 邑
⾣ 酉
⾤ 釆
⾥ 里
⾦ 金
⾧ 长
⾨ 门
⾩ 阜
⾪ 隶
⾫ 隹
⾬ 雨
⾭ 青
⾮ 非
⾯ 面
⾰ 革
⾱ 韦
⾲ 韭
⾳ 音
⾴ 页
⾵ 风
⾶ 飞
⾷ 食
⾸ 首
⾹ 香
⾺ 马
⾻ 骨
⾼ 高
⾽ 髟
⾿ 鬯
⿀ 鬲
⿁ 鬼
⿂ 鱼
⿃ 鸟
⿄ 卤
⿅ 鹿
⿆ 麦
⿇ 麻
⿈ 黄
⿉ 黍
⿊ 黑
⿋ 黹
⿌ 黾
⿍ 鼎
⿎ 鼓
⿏ 鼠
⿐ 鼻
⿑ 齐
⿒ 齿
⿓ 龙
⿔ 龟
⿕ 龠


# Process IDS database

In [145]:
from tools_configs import *
import regex
char_decompositions = {}

full_char_decompositions = {}
radical_found = set()
radical_norminal_found = set()

rad_db = Radicals()
rad_db.load_radical_data()

non_rad_components = {}

layouts = set(['⿰', '⿱', '⿲', '⿳', '⿴', '⿵', '⿶', '⿷', '⿸', '⿹', '⿺', '⿻', ])

with open("./wordlists/IDS_dictionary.txt", "r", encoding="utf-8") as fread:
    lines = fread.readlines()

    for line in lines:
        line = line.strip()
        if not line:
            continue

        # print(line)
        head, expression = line.split(":")

        if not rad_db.is_radical_variant(head):
            char_decompositions[head] = expression

        else:
            print(f'{head} is a radical already')
            radical_found.add(head)
            radical_norminal_found.add(rad_db.norminal(head))

        if len(expression) < 2:
            print(f'Single {head}\t{expression}')

        pass

    for key in char_decompositions:
        components = char_decompositions[key].split(' ')

        for comp in components:
            if not comp:
                continue

            if comp not in layouts and comp not in char_decompositions and not rad_db.is_radical_variant(comp) and comp[0] != '&':
                non_rad_components.setdefault(comp, set())
                non_rad_components[comp].add(head)

print('Non-radical tokens found')
print('\n'.join(sorted(non_rad_components.keys())))
pass

full_char_decompositions = char_decompositions

round = [0, 0]

# Replaces 2 times to make sure all items are replaced
for i in range(0,2):
    for key in full_char_decompositions:
        expression = full_char_decompositions[key]
        matches = regex.findall(PATTERN_ZH, expression)
        changed = False

        for char in matches:
            if char in full_char_decompositions:
                sub = full_char_decompositions[char]
                if sub == char:
                    continue
                
                round[i] += 1
                print(f'{key}: {char} => {sub}')
                changed = True

                expression = expression.replace(char, sub)

        full_char_decompositions[key] = expression
        
        pass

print(round)


鬲 is a radical already
里 is a radical already
廴 is a radical already
Single 廴	廴
Single 丑	丑
王 is a radical already
Single 丈	丈
毛 is a radical already
Single 为	为
门 is a radical already
Single 门	门
衣 is a radical already
士 is a radical already
Single 戊	戊
勹 is a radical already
Single 勹	勹
豸 is a radical already
Single 豸	豸
黃 is a radical already
攵 is a radical already
Single 攵	攵
缶 is a radical already
Single 缶	缶
麦 is a radical already
戈 is a radical already
兀 is a radical already
子 is a radical already
Single 子	子
丬 is a radical already
Single 丬	丬
Single 乍	乍
行 is a radical already
舟 is a radical already
Single 舟	舟
皿 is a radical already
Single 皿	皿
香 is a radical already
Single 香	⾹
玄 is a radical already
Single 丩	丩
山 is a radical already
Single 山	山
龠 is a radical already
Single 冘	冘
邑 is a radical already
止 is a radical already
Single 止	止
Single 且	且
Single 书	书
戶 is a radical already
户 is a radical already
由 is a radical already
Single 由	由
弋 is a radical already
聿 is a radical already
Single 朿	朿


In [146]:
from tinyunicodeblock import block

c = '𠘧'
print(f'{hex(ord(c))} {block(c)}')

0x20627 CJK Unified Ideographs Extension B


In [147]:
print(f'{len(char_decompositions)}')
print(f'{len(full_char_decompositions)}')
print(f'{len(radical_found)}')
print(f'{len(radical_norminal_found)}')


27228
27228
323
214


In [152]:
with open("./wordlists/IDS_dictionary_radical_perfect.txt", "w", encoding="utf-8") as fwrite:
    items = full_char_decompositions.items()

    for head, expression in items:
        fwrite.write(f'{head}:{expression}\n')


In [153]:

wordlist = [
        "&",
        "-",
        "0",
        "1",
        "2",
        "4",
        "5",
        "6",
        "7",
        "8",
        "9",
        ";",
        "A",
        "B",
        "C",
        "D",
        "E",
        "F",
        "P",
        "③",
        "⑮",
        "△",
        "⿰",
        "⿱",
        "⿲",
        "⿳",
        "⿴",
        "⿵",
        "⿶",
        "⿷",
        "⿸",
        "⿹",
        "⿺",
        "⿻",
        "〢",
        "コ",
        "ス",
        "ユ",
        "㇀",
        "㇇",
        "㇉",
        "㇓",
        "㇣",
        "㐁",
        "㐄",
        "㐅",
        "㐆",
        "㐧",
        "㠯",
        "㱐",
        "㸦",
        "䍏",
        "丈",
        "丏",
        "丐",
        "丑",
        "专",
        "且",
        "世",
        "丘",
        "东",
        "丣",
        "丩",
        "丱",
        "丹",
        "为",
        "乄",
        "久",
        "乍",
        "乎",
        "乐",
        "乑",
        "乗",
        "乜",
        "九",
        "也",
        "书",
        "亊",
        "事",
        "于",
        "井",
        "亜",
        "以",
        "兂",
        "兆",
        "円",
        "冉",
        "册",
        "冘",
        "凸",
        "凹",
        "卌",
        "卍",
        "卐",
        "卝",
        "及",
        "发",
        "史",
        "央",
        "头",
        "孑",
        "孒",
        "孓",
        "尺",
        "州",
        "巨",
        "巴",
        "年",
        "戉",
        "戊",
        "我",
        "戼",
        "承",
        "曱",
        "曲",
        "曳",
        "未",
        "末",
        "本",
        "朱",
        "朿",
        "束",
        "来",
        "東",
        "柬",
        "永",
        "為",
        "熏",
        "爲",
        "疌",
        "禹",
        "禺",
        "粛",
        "肃",
        "肅",
        "重",
        "龴",
        "龶",
        "\ue816",
        "\ue817",
        "\ue818",
        "\ue81e",
        "\ue826",
        "\ue82b",
        "\ue82c",
        "\ue831",
        "艹",
        "",
        "'",
        " ",
        ",",
        "'",
        "",
        "",
        "'",
        " ",
        ",",
        "'",
        "",
        "",
        "'",
        " ",
        ",",
        "'",
        "",
        "",
        "'",
        " ",
        ",",
        "'",
        "",
        "",
        "'",
        " ",
        ",",
        "'",
        "",
         '𧰨', '𩰊', '𩰋', '𫝀', '𫠣']

from hanzipy.decomposer import HanziDecomposer

decomposer = HanziDecomposer()



INFO:root:Done compiling 12040 characters


In [154]:
import json

new_decompositions = {}

for word in wordlist:
    if not word:
        continue
    res = decomposer.tree(word)

    print(res['tree'])

    if res['tree'] != word:
        new_decompositions[word] = res['tree']
    pass

with open('new_decompositions.json', 'w', encoding='utf-8') as fwrite:
    json.dump(new_decompositions, fwrite, indent=4, ensure_ascii=False)

&
-
0
1
2
4
5
6
7
8
9
;
A
B
C
D
E
F
P
③
⑮
△
⿰
⿱
⿲
⿳
⿴
⿵
⿶
⿷
⿸
⿹
⿺
⿻
〢
コ
ス
ユ
㇀
㇇
㇉
㇓
㇣
㐁
├── 丙
│   ├── 一
│   └── 内
│       ├── 人
│       └── 冂
└── 一
㐄
├── 十
└── 60954
    ├── ㇗
    └── 丨
㐅
└── 乂
㐆
├── 38263
│   ├── 𠁣
│   │   ├── 丨
│   │   └── 彐
│   └── ㇆
└── 丶
㐧
├── 才
│   ├── 𠂇
│   │   └── 十
│   └── ㇒
└── 丶
㠯
└── 𤕪
㱐
├── 㱏
│   ├── 二
│   └── 止
└── 丶
㸦
├── 38379
│   ├── 𡕒
│   │   ├── 丅
│   │   │   ├── 一
│   │   │   └── 丨
│   │   └── ㇜
│   └── 亅
└── ㇏
䍏
├── 59349
│   ├── 10001
│   │   └── 丨
│   └── 一
└── 冂
丈
├── 十
└── 乂
    └── ㇒
丏
├── 37756
│   ├── 丅
│   │   ├── 一
│   │   └── 丨
│   └── ㇜
└── ㇆
丐
├── 下
│   ├── 一
│   └── 卜
└── ㇉
丑
├── 刀
└── 二
专
├── 37024
│   ├── 二
│   └── 丨
└── 龴
    └── 厶
且
├── 月
└── 一
世
├── ㇗
└── 廿
丘
├── 斤
└── 一
东
├── 七
│   ├── 乚
│   └── 一
└── 小
丣
├── 一
└── 37740
    └── 60091
丩
├── 丨
└── ㇙
丱
└── 丩
丹
├── 冂
└── 亠
为
├── 力
└── ⺀
乄
├── ㇢
└── 丶
久
├── 勹
└── ㇏
乍
├── 37416
│   ├── 𠂉
│   │   ├── 丿
│   │   └── 一
│   └── 丨
└── 二
乎
├── ㇒
└── 38259
    ├── 𠂇
    │   └── 十
    └── 丷
乐


In [155]:
len(wordlist)

182