In [43]:
import pickle
import json

data = pickle.load(open('wiki.pkl', 'rb'))
#word_table = pickle.load(open('word_table.pkl', 'rb'))

In [105]:
import mysql.connector
from mysql.connector import Error

def get_conn():
    try:
        # Connect to the MySQL database
        connection = mysql.connector.connect(
            host='localhost',       # Replace with your host name
            port=3306,              # Replace with your port number if different
            user='root',   # Replace with your username
            password='debang', # Replace with your password
            database='hyzd' # Replace with your database name
        )
        
        if connection.is_connected():
            db_info = connection.get_server_info()
            #print("Connected to MySQL Server version ", db_info)
            return connection
        
    except Error as e:
        print("Error while connecting to MySQL", e)

def close_conn(connection):
    if connection.is_connected():
        connection.cursor().close()
        connection.close()
        #print("MySQL connection is closed")

def load_word_table_into_db():
    conn = get_conn()
    cursor = conn.cursor()

    data = []
    
    for key, value in word_table.items():
        for v in value:
            data.append((key, v))

    sql = "INSERT INTO char_table (ch, id) VALUES (%s, %s)"
    
    cursor.executemany(sql, data)
    conn.commit()

    close_conn(conn)
        
    #records = cursor.fetchall()

def get_id_from_char(char):

    conn = get_conn()
    cursor = conn.cursor()

    cursor.execute('SELECT id FROM char_table WHERE ch = %s', (char,))
    rows = cursor.fetchall()
    
    id_values = [row[0] for row in rows]
    
    close_conn(conn)

    return id_values

In [106]:
def get_sounds(input_char):
    for i in get_id_from_char(input_char):
        print('INDEX: ' + str(i))
        if 'sounds' in data[i]:
            sounds = data[i]['sounds']
            for s in sounds:
                if 'tags' in s:
                    if ('Mandarin' in s['tags'] and 'Pinyin' in s['tags'] and ( len(s['tags']) == 2 or 'standard' in s['tags'])) and not any(x in s['zh-pron'] for x in ['⁰', '¹', '²', '³', '⁴', '⁵']):
                        print('PY: ' + s['zh-pron'])
                    if 'Cantonese' in s['tags'] and 'Jyutping' in s['tags']:
                        print('JP: ' + s['zh-pron'])
                    if 'Middle-Chinese' in s['tags']:
                        print('MC: ' + s['zh-pron'])
         

In [107]:
from pprint import pprint
from hanziconv import HanziConv

def get_emin(sounds, ipa=True):
    '''
    Returns a list of unique min dong (eastern min) pronunciations
    '''
    # Extract zh-pron and ipa where tags contain Min-Dong
    zh_pron_values = [(entry.get('zh-pron', None), entry.get('ipa', None)) for entry in sounds if 'Min-Dong' in entry.get('tags', [])]
    
    # Filter out None values
    zh_pron_values = [(zh_pron, ipa) for zh_pron, ipa in zh_pron_values if zh_pron is not None or ipa is not None]

    pronunciations = []
    
    for zh_pron, ipa_pron in zh_pron_values:
        if zh_pron is not None:
            pronunciations.append(zh_pron)
        if ipa == True and ipa_pron is not None:
            pronunciations.append(ipa_pron)
    
    return list(set(pronunciations))

def get_pinyin(sounds):
    for s in sounds:
        if 'tags' in s:
            if ('Mandarin' in s['tags'] and 'Pinyin' in s['tags'] and ( len(s['tags']) == 2 or 'standard' in s['tags'])) and not any(x in s['zh-pron'] for x in ['⁰', '¹', '²', '³', '⁴', '⁵']):
                return [s['zh-pron'].lower()]

def get_jp(sounds):
    for s in sounds:
        if 'tags' in s:
            if 'Cantonese' in s['tags'] and 'Jyutping' in s['tags']:
                return [s['zh-pron'].lower().replace('¹', '1').replace('²', '2').replace('³', '3')
                       .replace('⁴', '4').replace('⁵', '5').replace('⁶', '6')]

def get_mc(sounds):
    for s in sounds:
        if 'tags' in s:
            if 'Middle-Chinese' in s['tags']:
                return [s['zh-pron'].lower()]

import chinese_converter

def get_prons(ch):
    ch = HanziConv.toTraditional(ch)
    char_prons = dict()
    i = 0 
    for index in get_id_from_char(ch):

        if 'sounds' in data[index]:
        
            sounds = data[index]['sounds']
            
            emin = get_emin(sounds, ipa=False)
            py = get_pinyin(sounds)
            jp = get_jp(sounds)
            mc = get_mc(sounds)
            
            # None if exact pronunciation already exists
            char_prons[i] = {
                'md': emin if emin not in [entry.get('md') for entry in char_prons.values()] else None,
                'py': py if py not in [entry.get('py') for entry in char_prons.values()] else None,
                'jp': jp if jp not in [entry.get('jp') for entry in char_prons.values()] else None,
                'mc': mc if mc not in [entry.get('mc') for entry in char_prons.values()] else None
            }
            
            i += 1
    
    filtered_char_prons = dict()
    
    i = 0
    
    for key, entry_list in char_prons.items():
        non_none_values = [(k, v) for k, v in entry_list.items() if v]
        if non_none_values:
            temp_dict = {'md': None, 'py': None, 'jp': None, 'mc': None}
            for k, v in non_none_values:
                 temp_dict[k] = v
            filtered_char_prons[i] = temp_dict
            i += 1
        
    return filtered_char_prons

def romanize(str):
    lo_romanization = []
    for char in str:
        lo_romanization.append(get_prons(char))

    return lo_romanization

def get_pron_with_py(ch, py):
    prons = get_prons(ch)
    for key, item in prons.items():
        if item['py']:
            if py in item['py']:
                return item

    return {'jp': None, 'mc': None, 'md': None, 'py': None}
    

In [139]:
#get_prons('和')
get_pron_with_py('和', 'hú')

{'md': None, 'py': ['hú'], 'jp': ['wu4'], 'mc': None}

In [147]:
#A parser for the CC-Cedict. Convert the Chinese-English dictionary into a list of python dictionaries with "traditional","simplified", "pinyin", and "english" keys.

#Make sure that the cedict_ts.u8 file is in the same folder as this file, and that the name matches the file name on line 13.

#Before starting, open the CEDICT text file and delete the copyright information at the top. Otherwise the program will try to parse it and you will get an error message.

#Characters that are commonly used as surnames have two entries in CC-CEDICT. This program will remove the surname entry if there is another entry for the character. If you want to include the surnames, simply delete lines 59 and 60.

#This code was written by Franki Allegra in February 2020.

#open CEDICT file

with open('cedict_ts.u8') as file:
    text = file.read()
    lines = text.split('\n')
    dict_lines = list(lines)

#define functions

    def parse_line(line):
        parsed = {}
        if line == '':
            dict_lines.remove(line)
            return 0
        line = line.rstrip('/')
        line = line.split('/')
        if len(line) <= 1:
            return 0
        english = line[1]
        char_and_pinyin = line[0].split('[')
        characters = char_and_pinyin[0]
        characters = characters.split()
        traditional = characters[0]
        simplified = characters[1]
        pinyin = char_and_pinyin[1]
        pinyin = pinyin.rstrip()
        pinyin = pinyin.rstrip("]")
        parsed['traditional'] = traditional
        parsed['simplified'] = simplified
        parsed['pinyin'] = pinyin
        parsed['english'] = english
        list_of_dicts.append(parsed)

    def remove_surnames():
        for x in range(len(list_of_dicts)-1, -1, -1):
            if "surname " in list_of_dicts[x]['english']:
                if list_of_dicts[x]['traditional'] == list_of_dicts[x+1]['traditional']:
                    list_of_dicts.pop(x)
            
    def main():

        #make each line into a dictionary
        print("Parsing dictionary . . .")
        for line in dict_lines:
                parse_line(line)
        
        #remove entries for surnames from the data (optional):

        #print("Removing Surnames . . .")
        #remove_surnames()

        return list_of_dicts


        #If you want to save to a database as JSON objects, create a class Word in the Models file of your Django project:

        # print("Saving to database (this may take a few minutes) . . .")
        # for one_dict in list_of_dicts:
        #     new_word = Word(traditional = one_dict["traditional"], simplified = one_dict["simplified"], english = one_dict["english"], pinyin = one_dict["pinyin"], hsk = one_dict["hsk"])
        #     new_word.save()
        print('Done!')

list_of_dicts = []
parsed_dict = main()
print('done')

Parsing dictionary . . .
done


In [153]:
for entry in parsed_dict:
    if '𫠜' == entry['simplified']:
        print(entry)

{'traditional': '齯', 'simplified': '𫠜', 'pinyin': 'ni2', 'english': 'teeth grown in old age'}


In [111]:
import chinese_converter

#HanziConv.toTraditional('启')
HanziConv.toSimplified('𨭎')

'𨭎'

In [141]:
import pycantonese

def get_jp_from_pycanto(char):
    # Generate jyutping for the word, filtering out None values
    jyutping_chunks = [
        chunk[1] for chunk in pycantonese.characters_to_jyutping(HanziConv.toTraditional(char))
        if chunk[1] is not None
    ]
    jyutping = ' '.join(jyutping_chunks)

    return jyutping

def get_jp_with_py_pron(cur_char, pron):

    jyutping = ''
    
    if not pron or 'jp' not in pron:
        jyutping = get_jp_from_pycanto(cur_char)
    elif pron['jp']:
        jyutping = pron['jp'][0]
    else:
        jyutping = get_jp_from_pycanto(cur_char)

    return jyutping

pron = get_pron_with_py(HanziConv.toTraditional('和'), 'hú')    
print(get_jp_with_py_pron('和', pron)) # should be wo6

wu4


In [154]:
import genanki
import json
import csv

jp_and_mc_and_english = {}

with open('xhzd_corrected.csv', newline='', encoding='utf-8') as csvfile:

    spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')

    for row in spamreader:

        jyutping = ''
        mc = ''
        cur_char = ''
        english = ''
        
        if row[0][0] == ' ':
            cur_char = row[0][1]
        else:
            cur_char = row[0][0]
        
        pron = get_pron_with_py(HanziConv.toTraditional(cur_char), row[3])
        
        jyutping = get_jp_with_py_pron(cur_char, pron)
    
        if pron['mc']:
            mc = pron['mc'][0]

        for entry in parsed_dict:
            if cur_char == entry['simplified']:
                english = entry['english']

        if cur_char not in jp_and_mc_and_english:
            jp_and_mc_and_english[cur_char] = {'jp': jyutping,'mc': mc, 'en': english}
        else:
            j = 2
            while cur_char + str(j) in jp_and_mc_and_english:
                j += 1
            
            jp_and_mc_and_english[cur_char + str(j)] = {'jp': jyutping,'mc': mc, 'en': english}

        if not jyutping:
            print(cur_char + ': ' + row[3])


𫘤: ái
𮩝: ài
𩽾: ān
鲃: bā
𫜨: bà
𫖔: bài
𨭉: bān
𠳐: bāng
𬇙: bèi
𮤲: bì
𨚕: biàn
𬭛: bō
𬷕: bǔ
𥮾: cǎn
镵: chán
𬊤: chǎn
𬬮: chǎng
𮧴: chàng
𬘭: chēn
铖: chéng
𫄨: chī
𫛶: chì
䝙: chū
𬸅: chú
𬺓: chǔ
𬭚: chún
𫚖: cǐ
酂: cuán
酂: cuó又cuán
𨱏: dā
鿎: dá
𫟼: dá
𫄤: da
𬘘: dǎn
𫢸: dàn
𣗋: dǎng
𬍡: dàng
𣱼: dāo
䃅: dī
𬱖: dí
𤧛: dì
䗖: dì
𦰏: diào
𫶇: dié
𬟽: dōng
𮙋: dú
𬭊: dù
𬭚: duì
𫫇: è
𫔍: fán
𬸪: fán
𬴂: fēi
𣸣: fén
𫓧: fū
𫖯: fǔ
𬮿: gài
鿍: gàng
鿔: gē
𬮤: gé
𮝴: gū
䯄: guā
𮉨: guā
鳤: guǎn
𨐈: guāng
鬶: guī
𭚦: guō
啯: guō
𬇹: guó
𬌗: hé
𮩝: hé
𬭶: hēi
𬣳: hén
𫟹: hóng
𭱊: hòng
𬭤: hóu
𬘫: huán
𮝹: huán
𤩽: huán
𮝹: huàn
𨱑: huáng
㧑: huī
𬭬: huì
𬤝: huì
𬴃: huō
锪: huō
𬯀: jī
𫓹: jī
𫌀: jī
𬶨: jì
𪟝: jì
𬶭: jì
𬂩: jiā
篯: jiān
𬣡: jiàn
𬬱: jīn
𬳶: jiōng
䌹: jiǒng
𬶋: jū
钜: jù
𪨗: juē
𫘝: jué
𫛞: jué
𩾌: kāng
𫸩: kōu
𫛭: kuáng
𫠆: kuǐ
𬶟: là
𬒗: lán
𨱍: láng
𫭼: láo
𫄥: lí
𫵷: lì
𫟷: lì
𬍛: lì
梿: lián
𬶠: liàn
𫟅: liáng
𬜯: liǎng
𮉧: liǎng
𪤗: liào
𫚭: liè
𬘭: lín
𬴊: lín
𬭸: lín
𬕂: lǒng
𪣻: lóu
𦝼: lóu又lǘ
硵: lǔ
𫘧: lù
𦝼: lǘ
𬬭: lún
𫭢: lún
𫌨: luó
㑩: luo
祃: mà
鿏: mài
𬜬: màn
𫞩: mén
𫑡: méng
𩾃: miǎn
𬙊: mò


## chars without jyutping w/ pinyin

𫘤: ái
𮩝: ài
𩽾: ān
鲃: bā
𫜨: bà
𫖔: bài
𨭉: bān
𠳐: bāng
𬇙: bèi
𮤲: bì
𨚕: biàn
𬭛: bō
𬷕: bǔ
𥮾: cǎn
镵: chán
𬊤: chǎn
𬬮: chǎng
𮧴: chàng
𬘭: chēn
铖: chéng
𫄨: chī
𫛶: chì
䝙: chū
𬸅: chú
𬺓: chǔ
𬭚: chún
𫚖: cǐ
酂: cuán
酂: cuó又cuán
𨱏: dā
鿎: dá
𫟼: dá
𫄤: da
𬘘: dǎn
𫢸: dàn
𣗋: dǎng
𬍡: dàng
𣱼: dāo
䃅: dī
𬱖: dí
𤧛: dì
䗖: dì
𦰏: diào
𫶇: dié
𬟽: dōng
𮙋: dú
𬭊: dù
𬭚: duì
𫫇: è
𫔍: fán
𬸪: fán
𬴂: fēi
𣸣: fén
𫓧: fū
𫖯: fǔ
𬮿: gài
鿍: gàng
鿔: gē
𬮤: gé
𮝴: gū
䯄: guā
𮉨: guā
鳤: guǎn
𨐈: guāng
鬶: guī
𭚦: guō
啯: guō
𬇹: guó
𬌗: hé
𮩝: hé
𬭶: hēi
𬣳: hén
𫟹: hóng
𭱊: hòng
𬭤: hóu
𬘫: huán
𮝹: huán
𤩽: huán
𮝹: huàn
𨱑: huáng
㧑: huī
𬭬: huì
𬤝: huì
𬴃: huō
锪: huō
𬯀: jī
𫓹: jī
𫌀: jī
𬶨: jì
𪟝: jì
𬶭: jì
𬂩: jiā
篯: jiān
𬣡: jiàn
𬬱: jīn
𬳶: jiōng
䌹: jiǒng
𬶋: jū
钜: jù
𪨗: juē
𫘝: jué
𫛞: jué
𩾌: kāng
𫸩: kōu
𫛭: kuáng
𫠆: kuǐ
𬶟: là
𬒗: lán
𨱍: láng
𫭼: láo
𫄥: lí
𫵷: lì
𫟷: lì
𬍛: lì
梿: lián
𬶠: liàn
𫟅: liáng
𬜯: liǎng
𮉧: liǎng
𪤗: liào
𫚭: liè
𬘭: lín
𬴊: lín
𬭸: lín
𬕂: lǒng
𪣻: lóu
𦝼: lóu又lǘ
硵: lǔ
𫘧: lù
𦝼: lǘ
𬬭: lún
𫭢: lún
𫌨: luó
㑩: luo
祃: mà
鿏: mài
𬜬: màn
𫞩: mén
𫑡: méng
𩾃: miǎn
𬙊: mò
𬭁: mǔ
𫐐: ní
𫠜: ní
鿭: nǐ
𨺙: nì
𫔶: niè
𬪩: nóng
𫭟: ōu
𬉼: ǒu
𬳵: pī
𬬫: pī
𡎚: piǎn
𬭯: piě
𬞟: pín
𨙸: qí
𬨂: qí
: qí
伣: qiàn
𬘬: qiàn
𬧀: qiāng
𬧀: qiàng
𨱇: qiú
𫭟: qū
𪨰: qū
𬒈: qué
𫐓: róu
𮉫: ruí
𢫬: sà
𫮃: shàn
𪨶: shē
𬳽: shēn
𬬹: shén
𫚕: shī
𬤊: shì
𬬸: shù
𬢊: sì
𫗧: sù
𫟦: suì
𬭼: suì
鿎: tǎ
𠉂: tà
𬍡: tāng（又）
𫘦: táo
𫘨: tí
𣨼: tì
𫍣: tóng
𬳿: tú
𬯎: tuí
𪨇: tuí
𬶍: tuó
𬇕: wàn
𣲗: wéi
𬶏: wéi
𬀩: wěi
𮧵: wěi
𫇭: wěi
𬱟: wěi
鳚: wèi
𬭬: wèi又huì
𫘜: wén
𬭩: wēng
𣲘: wǔ
㶉: xī
锡: xī
𫘬: xí
𬭳: xǐ
𬶮: xǐ
𫰰: xiān
𫍯: xián
𤞤: xiǎn
𬀪: xiàn
𬭣: xiàn
𬙋: xiāng
蚃: xiǎng
𫍲: xiǎo
敩: xiào
𫧯: xiè
𬹼: xiè
𤫉: xiè
𫷷: xīn
𫰛: xíng
𬣙: xū
媭: xū
𦈡: xū
𫓶: xuān
𫍽: xuān
𫠊: xuán
碹: xuàn
𫄸: xūn
𬩽: xún
㖊: xún
𬍤: xún
𬊈: xún
𬘓: xún
𫄧: yán
𬸘: yǎn
𬙂: yǎn
𪩘: yǎn
𫛩: yàn
𬺈: yǐ
𫖮: yǐ
𬬩: yì
𫄷: yì
𬟁: yì
𬘡: yīn
𬤇: yīn
𬮱: yīn
訚: yín
𦈠: yǐn
𮐨: yīng
𫘪: yuán
𫐄: yuè
𬸚: yuè
𫖳: yūn
筼: yún
酂: zàn
𥖨: zào
𫗴: zhān
𥇢: zhǎn
𬬿: zhāo
𬶐: zhào
𦭜: zhī
𬃊: zhì
㤘: zhòu
𬣞: zhǔ
𬘯: zhǔn
𬸦: zhuó
咨: zī
疭: zòng
𮉪: zōu
𨱔: zūn

### Pickle *dump* jp_mc_en file

In [155]:
# Store data (serialize)
with open('jp_mc_en.pkl', 'wb') as handle:
    pickle.dump(jp_and_mc_and_english, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Pickle *load* jp_mc_en file

In [156]:
import pickle
jp_and_mc_and_english = pickle.load(open('jp_mc_en.pkl', 'rb'))

In [157]:
jp_and_mc_and_english['和3']

{'jp': 'wu4', 'mc': '', 'en': 'old variant of 和[he2]'}

In [158]:
import genanki
import json
import pycantonese
import chinese_converter
import csv

# Create a new Anki deck with a unique ID and a name
my_deck = genanki.Deck(
  deck_id=123456790,
  name="hanzi",
)

answer_side = """{{FrontSide}}<hr id='answer'>
    {{Pinyin}}<br>
    {{Jyutping}}<br><br>
    {{MiddleChinese}}<br><br>
    {{Definition}}<br><br>
    {{English}}<br><br>
    {{Stroke}} {{Level}} {{Page}}
    """
# Define a model (template) for the cards
# A model specifies the fields and card format
my_model = genanki.Model(
  model_id=123456790,
  name="Basic Model",
  fields=[
    {"name": "Character"},
    {"name": "Pinyin"},
    {"name": "Jyutping"},
    {"name": "MiddleChinese"},
    {"name": "Definition"},
    {"name": "English"},
    {"name": "Stroke"},
    {"name": "Level"},
    {"name": "Page"},
    #{"name": "image"},
  ],templates=[
    {
      "name": "Card 1",
      "qfmt": "<span class='char'>{{Character}}</span>", # Question format
      "afmt": answer_side, # Answer format
      #"afmt": "{{FrontSide}}<hr id='answer'>{{Answer}}<br>{{image}}", # Answer format
    },
  ],
  css="""
  .card {
   font-family: arial;
   font-size: 20px;
   text-align: center;
   color: black;
   background-color: white;
  }

  .char {
    font-size: 200%;
  }
  """
)


with open('xhzd_corrected.csv', newline='', encoding='utf-8') as csvfile:

    spamreader = csv.reader(csvfile, delimiter=',')

    i = 0

    chars = set()
    
    for row in spamreader:

        cur_char = ''
        
        if row[0][0] == ' ':
            cur_char = row[0][1]
        else:
            cur_char = row[0][0]

        j = 0;

        if cur_char not in chars:
            chars.add(cur_char)
        else:
            j = 2
            while cur_char + str(j) in chars:
                j += 1
            cur_char = cur_char + str(j)
            chars.add(cur_char)

        if j == 0:
            j = ''

        
        prons = jp_and_mc_and_english[cur_char]

        jyutping = prons['jp']
        mc = prons['mc']
        english = prons['en']
        
        # Add a card to the deck
        note = genanki.Note(
            model=my_model,
            fields=[row[0] + str(j), row[3], jyutping, mc, row[6], english, row[5], row[4], row[1] + "页"] #fields=["What is the capital of France?", "Paris",'<img src="image.gif">'],) # add note to deck my_deck.add_note(note)
        )
        
        my_deck.add_note(note)

        if i % 1000 == 0:
            print(i)
    
        i += 1
    


# create package for deck
my_package = genanki.Package(my_deck)

# Optionally, add more cards here in a similar manner
#my_package.media_files = ['image.gif']

# Save the deck to a file
my_package.write_to_file('hanzi.apkg')

print("Deck has been created.")


0
1000
2000
3000
4000
5000
6000
7000
8000
9000
Deck has been created.


In [None]:
import genanki
import json
import pycantonese

# Create a new Anki deck with a unique ID and a name
my_deck = genanki.Deck(
  deck_id=123456789,
  name="idoms",
)

# Define a model (template) for the cards
# A model specifies the fields and card format
my_model = genanki.Model(
  model_id=123456789,
  name="Basic Model",
  fields=[
    {"name": "Question"},
    {"name": "Answer"},
    #{"name": "image"},
  ],
  templates=[
    {
      "name": "Card 1",
      "qfmt": "{{Question}}", # Question format
      "afmt": "{{FrontSide}}<hr id='answer'>{{Answer}}", # Answer format
      #"afmt": "{{FrontSide}}<hr id='answer'>{{Answer}}<br>{{image}}", # Answer format
    },
  ],
  css="""
  .card {
   font-family: arial;
   font-size: 20px;
   text-align: center;
   color: black;
   background-color: white;
  }
  """,
)

f = open('idiom.json',)
data = json.load(f)

for entry in data:
    # Add a card to the deck
    note = genanki.Note(
      model=my_model,
      fields=[entry['explanation'] + '<br><br>' + entry['example'], 
              chinese_converter.to_traditional(entry['word']) + '<br>' + entry['word'] + 
              '<br>' + entry['pinyin'] + '<br>' + pycantonese.characters_to_jyutping(entry['word'])[0][1] +
              '<br><br>' + entry['derivation']],
      #fields=["What is the capital of France?", "Paris",'<img src="image.gif">'],
    )

    # add note to deck
    my_deck.add_note(note)

# create package for deck
my_package = genanki.Package(my_deck)

# Optionally, add more cards here in a similar manner
#my_package.media_files = ['image.gif']

# Save the deck to a file
my_package.write_to_file('idoms.apkg')

print("Deck has been created.")


In [64]:
import re

# Sample dictionary data
dictionary_text = """
【六一儿童节】Liù－YīÉrtónɡJié全世界儿童的节日。国际民主妇女联合会为保障全世界儿童的权利，于1949年在莫斯科举行的会议上，决定以6月1日为国际儿童节。也叫六一国际儿童节、国际儿童节、儿童节。
【六艺】liùyì〈名〉❶古代指礼（礼仪）、乐（音乐）、射（射箭）、御（驾车）、书（识字）、数（计算）等六种科目。❷古代指《诗》、《书》、《礼》、《乐》、《易》、《春秋》六种儒家经书。
【六欲】liùyù〈名〉佛教指色欲、形貌欲等六种欲望，泛指人的各种欲望：七情～。
【六指儿】liùzhǐr〈名〉❶长了六个指头的手或脚。❷指手或脚上长有六个指头的人。
【陆】（陸）liù〈数〉“六”的大写。参看1271页【数字】。另见887页lù。
【碌】liù
"""

# Regex to parse an entry line
entry_pattern = re.compile(r'【(.*?)】(.*?)((?:〈.*?〉)+)(.*)')

def parse_entry(line):
    match = entry_pattern.match(line)
    if match:
        word = match.group(1)
        pinyin = match.group(2).strip().lower()
        types_defs = match.group(3)
        definitions_text = match.group(4)

        # Extract types
        types = re.findall(r'〈(.*?)〉', types_defs)

        # Handle cases where definitions might not be properly captured
        if not definitions_text:
            definitions_text = ""

        # Split definitions if they are numbered
        if any(num in definitions_text for num in '❶❷❸❹❺❻❼❽❾❿⓫⓬⓭⓮⓯⓰⓱⓲⓳⓴'):
            definitions = re.split(r'❶|❷|❸|❹|❺|❻|❼|❽|❾|❿|⓫|⓬|⓭|⓮|⓯|⓰|⓱|⓲|⓳|⓴', definitions_text)
        else:
            definitions = [definitions_text]

        # Clean definitions
        definitions = [defn.strip() for defn in definitions if defn.strip()]

        return {
            "word": word,
            "pinyin": pinyin,
            "types": types,
            "definitions": definitions
        }
    else:
        word = re.search(r'【(.*?)】',line)[1]

        if len(word) + 2 != len(line):
            pinyin = re.search(r'[a-zāáǎàōóǒòēéěèīíǐìūúǔùüǖǘǚǜêê̄ếê̌ềm̄ḿm̀ńňǹẑĉŝŋɡ]+',line)[0]
            definition = re.search(r'[a-zāáǎàōóǒòēéěèīíǐìūúǔùüǖǘǚǜêê̄ếê̌ềm̄ḿm̀ńňǹẑĉŝŋɡ]+(\s*(.*))',line)[1]
        else:
            pinyin = ''
            definition = ''
        
        if len(word) == 4:
            types = ['四']
        else:
            []
        
        return {"word": word, "pinyin": pinyin, "types": types, "definitions": definition}

# Parsing the dictionary
entries = []
for line in dictionary_text.strip().split('\n'):
    try:
        entry = parse_entry(line)
        if entry['word']:  # Only append if word is not empty
            entries.append(entry)
    except Exception as e:
        print(f"Failed to parse line: {line} with error {str(e)}")

# Print the parsed dictionary entries
from pprint import pprint
pprint(entries, sort_dicts=False)
print(len(entries))

Failed to parse line: 【六一儿童节】Liù－YīÉrtónɡJié全世界儿童的节日。国际民主妇女联合会为保障全世界儿童的权利，于1949年在莫斯科举行的会议上，决定以6月1日为国际儿童节。也叫六一国际儿童节、国际儿童节、儿童节。 with error local variable 'types' referenced before assignment
Failed to parse line: 【碌】liù with error local variable 'types' referenced before assignment
[{'word': '六艺',
  'pinyin': 'liùyì',
  'types': ['名'],
  'definitions': ['古代指礼（礼仪）、乐（音乐）、射（射箭）、御（驾车）、书（识字）、数（计算）等六种科目。',
                  '古代指《诗》、《书》、《礼》、《乐》、《易》、《春秋》六种儒家经书。']},
 {'word': '六欲',
  'pinyin': 'liùyù',
  'types': ['名'],
  'definitions': ['佛教指色欲、形貌欲等六种欲望，泛指人的各种欲望：七情～。']},
 {'word': '六指儿',
  'pinyin': 'liùzhǐr',
  'types': ['名'],
  'definitions': ['长了六个指头的手或脚。', '指手或脚上长有六个指头的人。']},
 {'word': '陆',
  'pinyin': '（陸）liù',
  'types': ['数'],
  'definitions': ['“六”的大写。参看1271页【数字】。另见887页lù。']}]
4
