In [8]:
from bs4 import BeautifulSoup

import glob
import json
from pinyin import get as pinyinget 

from tools_configs import *

import re

def remove_chinese_bracket(text):
    """Drop every pair of 【 】 brackets from *text* but keep what they enclose.

    Only non-empty bracket pairs on a single line are affected, because the
    pattern needs at least one character between the brackets and ``.`` does
    not match a newline.
    """
    bracketed = re.compile('【(.+?)】')
    return bracketed.sub(lambda found: found.group(1), text)

def remove_traditional_text(text):
    """Delete every 【...】 group (brackets and contents) from *text*.

    In the scraped pages the bracketed part carries the traditional-character
    variant, e.g. '液体的小滴【液體的小滴】' becomes '液体的小滴'.
    """
    # Splitting on the bracketed groups and gluing the remaining pieces back
    # together removes exactly the matched spans.
    kept_pieces = re.split('【.+?】', text)
    return ''.join(kept_pieces)

def remove_see_more_examples(text):
    """Strip the "Xem thêm N ví dụ nữa" ("see N more examples") label from *text*.

    Args:
        text: String that may contain one or more such labels.

    Returns:
        *text* with every occurrence of the label removed; everything else,
        including surrounding whitespace, is left untouched.
    """
    # Raw string: in a plain literal '\d' is an invalid escape sequence, which
    # recent Python versions report with a warning at compile time.
    return re.sub(r'Xem thêm \d+ ví dụ nữa', '', text)

# Load the JSON word list referenced by TOP_WORDS_24K into a set so that the
# per-file membership test further down is a constant-time lookup.
with open(TOP_WORDS_24K, mode='r', encoding='utf-8') as fin:
    top_words_24k = set(json.loads(fin.read()))

import re
# Captures any single character that is NOT one of: tab, '↑', space, ',', ';',
# an ASCII digit or letter, '(' or ')', or a CJK ideograph in the range 一-龥.
# The tab is spelled '\t' rather than typed literally so that it stays visible
# and survives editors that convert tabs to spaces.
NORMAL = r'([^\t↑ ,; 0-9a-zA-Z()一-龥])'

# Header line written at the top of the generated dictionary file.
PC_DICT_NAME = '//Trung-Viet Dict'

# Pleco private-use control character for a line break inside an entry.
PC_NEW_LINE = chr(0xEAB1)
# Section labels (Vietnamese) used inside an entry.
PC_HANVIET_MARK = 'HÁN VIỆT'  # Sino-Vietnamese reading
PC_GOIY_MARK = 'GỢI Ý'        # suggestions / related words
PC_VIDU_OLD_MARK = 'Ví dụ:'   # "Example:" (older label)
PC_VIDU_NEW_MARK = 'VÍ DỤ'    # "EXAMPLE" (label currently emitted)
# Bullet characters: diamond before a word-class heading, arrow before a
# suggested word.
PC_DIAMOND = '❖'
PC_ARROW = '»'

def pc_make_bold(text):
    """Wrap *text* in Pleco's bold markers (U+EAB2 before, U+EAB3 after)."""
    bold_on, bold_off = chr(0xEAB2), chr(0xEAB3)
    return bold_on + format(text) + bold_off

def pc_make_blue(text):
    """Return *text* unchanged; blue colouring is currently switched off.

    The disabled variant wrapped the text as
    ``chr(0xEAC1) + text + chr(0xEAC2)``.
    """
    return text

def pc_make_grey(text):
    """Return *text* unchanged; grey colouring is currently switched off.

    The disabled variant wrapped the text as
    ``chr(0x3C8F) + text + chr(0xEAC2)``.
    """
    return text

def pc_make_italic(text):
    """Wrap *text* in Pleco's italic markers (U+EAB4 before, U+EAB5 after)."""
    italic_on, italic_off = chr(0xEAB4), chr(0xEAB5)
    return italic_on + format(text) + italic_off

def pc_make_link(text):
    """Wrap *text* in Pleco's hyperlink markers (U+EAB8 before, U+EABB after)."""
    link_on, link_off = chr(0xEAB8), chr(0xEABB)
    return link_on + format(text) + link_off

# Convert the saved dictionary pages in HTML_FOLDER into one Pleco
# user-dictionary text file ('hanzii_pleco.txt'), logging pages with missing
# parts to 'loglig.txt'.
#
# Narrow the pattern (e.g. '人*.html') when debugging a single headword.
files = glob.glob(f'{HTML_FOLDER}/*.html')

# The dictionary header goes first; one line per converted entry follows.
pleco_string_list = [f'{PC_DICT_NAME}\n']

MAX_ITEMS = 100  # upper bound on the number of files examined in one run

# The log is opened in a `with` block so it is flushed and closed even when the
# run is interrupted or a page fails to parse.
with open('loglig.txt', 'w', encoding='utf-8') as log_file:
    for num, filepath in enumerate(files):
        if num >= MAX_ITEMS:
            break

        print(f'Processing {num+1}: {filepath}...')
        pleco_string = ''

        # The headword is the file name without its extension.
        headword = os.path.splitext(os.path.basename(filepath))[0]

        # Only headwords from the top-word list are converted (an empty
        # headword is let through, as before).
        if headword and headword not in top_words_24k:
            continue

        # NOTE(review): the result is unused; the call is kept in case
        # headword_to_url has side effects - confirm and remove if it is pure.
        url = headword_to_url(headword)

        with open(filepath, 'r', encoding='utf-8') as fin:
            html = fin.read()

        # NOTE(review): no parser is named, so bs4 picks whichever is installed;
        # the three-way unpacking below depends on the resulting tree shape.
        soup = BeautifulSoup(html)

        content_result = soup.find("div", class_="content-result")

        # Expects exactly three children: suggestions, word details and a
        # third section that is not used here.
        goiy, chitiettu, tukhoahot = list(content_result.next.children)

        # --- Word details: headword, pinyin and Sino-Vietnamese reading ---
        char_pron = chitiettu.find("div", class_="box-word")
        if not char_pron:
            print(f'{filepath=} has no character pronunciation')
            log_file.write(f'No pronunciation\t{filepath}\n')
            continue

        chinese_s = char_pron.find('span', class_='simple-tradition-wrap')
        chinese = remove_traditional_text(chinese_s.text) if chinese_s else ''

        # [1:-1] drops the first and last character of the scraped text
        # (presumably enclosing brackets - TODO confirm against the HTML).
        pinyin_s = char_pron.find('span', class_='txt-pinyin')
        pinyin = pinyin_s.text[1:-1].lower() if pinyin_s else ''

        viet_s = char_pron.find('span', class_='txt-cn_vi')
        viet = viet_s.text[1:-1].lower() if viet_s else ''

        if not chinese_s:
            log_file.write(f'No Chinese characters\t{filepath}\n')

        pleco_string += f'{chinese}\t{pinyin}\t'

        if viet:
            pleco_string += f'{pc_make_bold(PC_HANVIET_MARK)} {viet}{PC_NEW_LINE}'

        # --- Definitions, grouped by word class ---
        tuloai_list = [item.text.strip() for item in chitiettu.find_all("span", class_="word-kind")]

        for tuloai in chitiettu.find_all("div", class_="box-content"):
            tuloai_text_s = tuloai.find('div', class_="kind-word")

            if tuloai_text_s:
                # Heading text is read from the first <div> of the box.
                tuloai_text = tuloai.div.text.strip()
            else:
                # No explicit heading: fall back to all word classes on the page.
                tuloai_text = '/'.join(tuloai_list)

            tuloai_text = tuloai_text.upper()

            pleco_string += f'{PC_DIAMOND} {pc_make_grey(pc_make_bold(tuloai_text))}{PC_NEW_LINE}'

            definitions = tuloai.find_all("div", class_="item-content")

            # A dedicated 1-based counter; the outer loop's `num` is left alone.
            for def_index, definition in enumerate(definitions, start=1):
                mean_viet = definition.find("span", class_="simple-tradition-wrap").text.replace('\n', ' ')

                mean_chinese_s = definition.find("div", class_="txt-mean-explain")
                mean_chinese = mean_chinese_s.text if mean_chinese_s else ''

                if not mean_chinese_s:
                    log_file.write(f'No Chinese meening\t{filepath}\n')

                pleco_string += pc_make_bold(str(def_index)) + ' '
                pleco_string += f'{pc_make_blue(mean_chinese)} {mean_viet}{PC_NEW_LINE}{PC_NEW_LINE}'

                example_s = definition.find("div", class_="box-example")
                if example_s:
                    example_chinese = example_s.find('p', class_='ex-word').text
                    example_chinese = remove_traditional_text(example_chinese)

                    example_pron = example_s.find('p', class_='ex-phonetic').text
                    example_meaning = example_s.find('p', class_='ex-mean').text

                    pleco_string += f'{pc_make_bold(PC_VIDU_NEW_MARK)}{PC_NEW_LINE} '
                    pleco_string += f'{pc_make_bold(example_chinese)} {pc_make_italic(example_pron)} {example_meaning}{PC_NEW_LINE}{PC_NEW_LINE}'

        # --- Suggested words (the heading is emitted even when there are none) ---
        recommendations = goiy.find_all("div", class_="box-item")
        pleco_string += f'{pc_make_bold(PC_GOIY_MARK)}{PC_NEW_LINE}'

        for recommendation in recommendations:
            rec_mean = recommendation.find('div', class_="box-mean").text.replace('\n', ' ')

            rec_chinese = remove_traditional_text(recommendation.find('span', class_="simple-tradition-wrap").text).replace('\n', ' ')

            # Use the page's pinyin when present, otherwise generate it.
            rec_pinyin_s = recommendation.find('div', class_="txt-pinyin")
            if rec_pinyin_s:
                rec_pinyin = remove_chinese_bracket(rec_pinyin_s.text).lower()
            else:
                rec_pinyin = pinyinget(rec_chinese)

            pleco_string += f'{PC_ARROW} {pc_make_bold(rec_chinese)} {pc_make_italic(rec_pinyin)} {rec_mean}{PC_NEW_LINE}{PC_NEW_LINE}'

        # Each entry must stay on one physical line, so any remaining real
        # newline becomes the Pleco line-break character.
        pleco_string = pleco_string.replace('\n', PC_NEW_LINE)

        pleco_string_list.append(f'{pleco_string}\n')

with open('hanzii_pleco.txt', 'w', encoding='utf-8') as fout:
    fout.writelines(pleco_string_list)

print(''.join(pleco_string_list))


Processing 1: html\10来.html...
Processing 2: html\2012来.html...
Processing 3: html\20来.html...
Processing 4: html\〇.html...
Processing 5: html\一.html...


KeyboardInterrupt: 

In [None]:
# Debugging aid: show the text of whatever `definition` was last bound to by
# the conversion loop.
definition.text


In [None]:
# Debugging aid: show the text of whatever `definition` was last bound to by
# the conversion loop.
definition.text


In [None]:
# Close the log file by hand, e.g. after the conversion cell stopped early;
# closing an already-closed file is a no-op.
log_file.close()

In [None]:
# Quick check: the bracketed traditional-character variant should be removed.
print(remove_traditional_text('(点儿) 液体的小滴【(點兒)液體的小滴】', ))

(点儿) 液体的小滴


In [None]:
# Quick check: only the 【 】 brackets should be removed, not their content.
remove_chinese_bracket('【(點兒)液體的小滴】')


'(點兒)液體的小滴'

In [None]:
# Debugging aid: text of the second "slider-content" <div> inside `chitiettu`
# (the word-details section last bound by the conversion loop).
chitiettu.find_all("div", class_="slider-content")[1].text

' Danh từ '

In [14]:
      
'''
https://plecoforums.com/threads/multiple-new-lines-in-user-defined-flashcards.5916/#post-44863

|2756| Diamond
EAB1 = new line
EAB2/EAB3 = bold
EAB4/EAB5 = italic
EAB8/EABB = "copy-whatever's-in-this-to-the-Input-Field hyperlinks"
coloured text:
"EAC1 followed by two characters with the highest-order bit 1 and the lowest-order 12 bits representing the first/second halves of a 24-bit RGB color value to start the range, EAC2 to end. So to render a character in green, for example, you'd want EAC1 800F 8F00, then the character, then EAC2."
---
UTF-8: U+EAB1 = '\xee\xaa\xb1'


EAB2/EAB3 = bold
EAB4/EAB5 = italic

eabe ... eabf: Bold
eab8 ... eabb|: Hyperlink

一		one |2756| floor; ceiling|eab1| |eab1||eabe|PINYIN|eabf| y|12b||eab1| |eab1||eabe|FRAME|eabf| 1, |eabe|LESSON|eabf| 1, |eabe|BOOK|eabf| 1, |eabe|PAGE|eabf|  19|eab1| |eab1||eabe|NAVIGATION|eabf| ↑Lesson 1↑ (|eab8|本书1第1课|eabb|) |bb|two|bb| (|eab8|二|eabb|)|eab1| |eab1||eabe|SUBTLEX|eabf| |eab8|一|eabb|, |eab8|一个|eabb|, |eab8|一起|eabb|, |eab8|一样|eabb|, |eab8|一下|eabb|, |eab8|一直|eabb|, |eab8|一切|eabb|, |eab8|一点|eabb|, |eab8|一定|eabb|, |eab8|第一|eabb|, |eab8|一些|eabb|, |eab8|唯一|eabb|, |eab8|一会儿|eabb|, |eab8|一旦|eabb|, |eab8|之一|eabb|, |eab8|一半|eabb|, |eab8|一边|eabb|, |eab8|一般|eabb|, |eab8|一生|eabb|, |eab8|一刻|eabb|, |eab8|一辈子|eabb|, |eab8|一一|eabb|, |eab8|一致|eabb|, |eab8|一会|eabb|, |eab8|一路|eabb|, |eab8|万一|eabb|, |eab8|一分|eabb|, |eab8|一点儿|eabb|, |eab8|一团糟|eabb|, |eab8|一整天|eabb|, |eab8|一面|eabb|, |eab8|一百|eabb|, |eab8|一无所知|eabb|, |eab8|一两|eabb|, |eab8|进一步|eabb|, |eab8|一家|eabb|, |eab8|一百万|eabb|, |eab8|一时|eabb|, |eab8|一千|eabb|, |eab8|一模一样|eabb|, |eab8|一阵子|eabb|, |eab8|一向|eabb|, |eab8|一共|eabb|, |eab8|一阵|eabb|, |eab8|同一个|eabb|, |eab8|一万|eabb|, |eab8|一番|eabb|, |eab8|以防万一|eabb|, |eab8|一下子|eabb|, |eab8|星期一|eabb|, |eab8|一无所有|eabb||eab1| |eab1||eabe|OLDHSK|eabf| |eab8|第(第一)|eabb|, |eab8|一|eabb|, |eab8|一般|eabb|, |eab8|一边|2026|一边|2026||eabb|, |eab8|一点儿|eabb|, |eab8|一定|eabb|, |eab8|一共|eabb|, |eab8|一会儿|eabb|, |eab8|一|2026|就|2026||eabb|, |eab8|一块儿|eabb|, |eab8|一起|eabb|, |eab8|一切|eabb|, |eab8|一下儿|eabb|, |eab8|一些|eabb|, |eab8|一样|eabb|, |eab8|一|2026|也|2026||eabb|, |eab8|一直|eabb|, |eab8|不一定|eabb|, |eab8|差一点儿|eabb|, |eab8|进一步|eabb|, |eab8|统一|eabb|, |eab8|一|eabb|, |eab8|一半|eabb|, |eab8|一边|eabb|, |eab8|一道|eabb|, |eab8|一方面|2026|一方面|2026||eabb|, |eab8|一齐|eabb|, |eab8|一生|eabb|, |eab8|一时|eabb|, |eab8|一同|eabb|, |eab8|一下子|eabb|, |eab8|一致|eabb|, |eab8|一|2026|也|eabb|, |eab8|有(一)点儿|eabb|, |eab8||2026|之一|eabb|, |eab8|万一|eabb|, |eab8|一一|eabb|, |eab8|一带|eabb|, |eab8|一路平安|eabb|, |eab8|一路顺风|eabb|, |eab8|一面|2026|一面|eabb|, |eab8|一系列|eabb|, |eab8|一下儿|eabb|, |eab8|一向|eabb|, |eab8|一再|eabb|, |eab8|一阵|eabb|, 
|eab8|一口气|eabb|, |eab8|一连|eabb|, |eab8|一旁|eabb|, |eab8|一心|eabb|, |eab8|一行|eabb|, |eab8|有(一)些|eabb|, |eab8|这样一来|eabb|, |eab8|老一辈|eabb|, |eab8|同一|eabb|, |eab8|唯一|eabb|, |eab8|一辈子|eabb|, |eab8|一旦|eabb|, |eab8|一度|eabb|, |eab8|一概|eabb|, |eab8|一概而论|eabb|, |eab8|一个劲儿|eabb|, |eab8|一贯|eabb|, |eab8|一哄而散|eabb|, |eab8|一会儿|2026|一会儿|eabb|, |eab8|一技之长|eabb|, |eab8|一律|eabb|, |eab8|一帆风顺|eabb|, |eab8|一干二净|eabb|, |eab8|一举|eabb|, |eab8|一毛不拔|eabb|, |eab8|一身|eabb|, |eab8|一手|eabb|, |eab8|一头|eabb||eab1| |eab1||eabe|HSK|eabf| |eab8|一|eabb|, |eab8|一点儿|eabb|, |eab8|第一|eabb|, |eab8|一起|eabb|, |eab8|一下|eabb|, |eab8|一般|eabb|, |eab8|一边|eabb|, |eab8|一定|eabb|, |eab8|一共|eabb|, |eab8|一会儿|eabb|, |eab8|一样|eabb|, |eab8|一直|eabb|, |eab8|一切|eabb|, |eab8|统一|eabb|, |eab8|万一|eabb|, |eab8|唯一|eabb|, |eab8|一辈子|eabb|, |eab8|一旦|eabb|, |eab8|一律|eabb|, |eab8|一再|eabb|, |eab8|一致|eabb|, |eab8|不屑一顾|eabb|, |eab8|一度|eabb|, |eab8|一帆风顺|eabb|, |eab8|一贯|eabb|, |eab8|一举两得|eabb|, |eab8|一流|eabb|, |eab8|一目了然|eabb|, |eab8|一如既往|eabb|, |eab8|一丝不苟|eabb|, |eab8|一向|eabb||eab1|


'''
def convert_to_num(match):
    """Replacement callback for ``re.sub``: make non-Latin-1 characters visible.

    Group 1 of *match* must hold a single character.  Characters with a code
    point below 256 are returned unchanged; any other character is returned as
    its lowercase hexadecimal code point between vertical bars, e.g. '|eab1|'.
    """
    char = match.group(1)
    code_point = ord(char)

    if code_point >= 256:
        return '|' + format(code_point, 'x') + '|'
    return char



In [27]:
# Sample 1: an existing Pleco flashcard entry. It is overwritten by the next
# assignment, so only sample 2 is actually converted below.
s='一		one ❖ floor; ceiling PINYIN yī FRAME 1, LESSON 1, BOOK 1, PAGE  19 NAVIGATION ↑Lesson 1↑ (本书1第1课) »two» (二) SUBTLEX 一, 一个, 一起, 一样, 一下, 一直, 一切, 一点, 一定, 第一, 一些, 唯一, 一会儿, 一旦, 之一, 一半, 一边, 一般, 一生, 一刻, 一辈子, 一一, 一致, 一会, 一路, 万一, 一分, 一点儿, 一团糟, 一整天, 一面, 一百, 一无所知, 一两, 进一步, 一家, 一百万, 一时, 一千, 一模一样, 一阵子, 一向, 一共, 一阵, 同一个, 一万, 一番, 以防万一, 一下子, 星期一, 一无所有 OLDHSK 第(第一), 一, 一般, 一边…一边…, 一点儿, 一定, 一共, 一会儿, 一…就…, 一块儿, 一起, 一切, 一下儿, 一些, 一样, 一…也…, 一直, 不一定, 差一点儿, 进一步, 统一, 一, 一半, 一边, 一道, 一方面…一方面…, 一齐, 一生, 一时, 一同, 一下子, 一致, 一…也, 有(一)点儿, …之一, 万一, 一一, 一带, 一路平安, 一路顺风, 一面…一面, 一系列, 一下儿, 一向, 一再, 一阵, 一口气, 一连, 一旁, 一心, 一行, 有(一)些, 这样一来, 老一辈, 同一, 唯一, 一辈子, 一旦, 一度, 一概, 一概而论, 一个劲儿, 一贯, 一哄而散, 一会儿…一会儿, 一技之长, 一律, 一帆风顺, 一干二净, 一举, 一毛不拔, 一身, 一手, 一头 HSK 一, 一点儿, 第一, 一起, 一下, 一般, 一边, 一定, 一共, 一会儿, 一样, 一直, 一切, 统一, 万一, 唯一, 一辈子, 一旦, 一律, 一再, 一致, 不屑一顾, 一度, 一帆风顺, 一贯, 一举两得, 一流, 一目了然, 一如既往, 一丝不苟, 一向'
# Sample 2: the start of an entry in the format produced by this notebook.
s='一	yī	HÁN VIỆT nhấtSỐ TỪ① 数目，最小的正整数参看〖数字〗 số một; nhất; một'
# Rewrite every character captured by NORMAL through convert_to_num, which
# turns code points of 256 and above into '|hex|' so control characters show.
converted = re.sub(NORMAL, convert_to_num, s)
print(converted)

一	y|12b|	|eab2|HÁN VI|1ec6|T|eab3| |eab4|nh|1ea5|t|eab5||eab1||eab2|S|1ed0| T|1eea||eab3||eab1||eab2||2460||eab3| |eab2||eac1||ec00||ec00||ecbf||ecff|数目|ff0c|最小的正整数参看|3016|数字|3017||eac2||eab3| s|1ed1| m|1ed9|t; nh|1ea5|t; m|1ed9|t|eab1||eab1|
