In [1]:
import pickle
import json

data = pickle.load(open('wiki.pkl', 'rb'))
# word_table = pickle.load(open('word_table.pkl', 'rb'))

In [2]:
import mysql.connector
from mysql.connector import Error

def get_conn():
    try:
        # Connect to the MySQL database
        connection = mysql.connector.connect(
            host='localhost',       # Replace with your host name
            port=3306,              # Replace with your port number if different
            user='root',   # Replace with your username
            password='debang', # Replace with your password
            database='hyzd' # Replace with your database name
        )
        
        if connection.is_connected():
            db_info = connection.get_server_info()
            #print("Connected to MySQL Server version ", db_info)
            return connection
        
    except Error as e:
        print("Error while connecting to MySQL", e)

def close_conn(connection):
    if connection.is_connected():
        connection.cursor().close()
        connection.close()
        #print("MySQL connection is closed")

def load_word_table_into_db():
    conn = get_conn()
    cursor = conn.cursor()

    data = []
    
    for key, value in word_table.items():
        for v in value:
            data.append((key, v))

    sql = "INSERT INTO char_table (ch, id) VALUES (%s, %s)"
    
    cursor.executemany(sql, data)
    conn.commit()

    close_conn(conn)
        
    #records = cursor.fetchall()

def get_id_from_char(char):

    conn = get_conn()
    cursor = conn.cursor()

    cursor.execute('SELECT id FROM char_table WHERE ch = %s', (char,))
    rows = cursor.fetchall()
    
    id_values = [row[0] for row in rows]
    
    close_conn(conn)

    return id_values

In [3]:
def get_sounds(input_char):
    for i in get_id_from_char(input_char):
        print('INDEX: ' + str(i))
        if 'sounds' in data[i]:
            sounds = data[i]['sounds']
            for s in sounds:
                if 'tags' in s:
                    if ('Mandarin' in s['tags'] and 'Pinyin' in s['tags'] and ( len(s['tags']) == 2 or 'standard' in s['tags'])) and not any(x in s['zh-pron'] for x in ['⁰', '¹', '²', '³', '⁴', '⁵']):
                        print('PY: ' + s['zh-pron'])
                    if 'Cantonese' in s['tags'] and 'Jyutping' in s['tags']:
                        print('JP: ' + s['zh-pron'])
                    if 'Middle-Chinese' in s['tags']:
                        print('MC: ' + s['zh-pron'])
         

In [4]:
from pprint import pprint

def get_emin(sounds, ipa=True):
    '''
    Returns a list of unique min dong (eastern min) pronunciations
    '''
    # Extract zh-pron and ipa where tags contain Min-Dong
    zh_pron_values = [(entry.get('zh-pron', None), entry.get('ipa', None)) for entry in sounds if 'Min-Dong' in entry.get('tags', [])]
    
    # Filter out None values
    zh_pron_values = [(zh_pron, ipa) for zh_pron, ipa in zh_pron_values if zh_pron is not None or ipa is not None]

    pronunciations = []
    
    for zh_pron, ipa_pron in zh_pron_values:
        if zh_pron is not None:
            pronunciations.append(zh_pron)
        if ipa == True and ipa_pron is not None:
            pronunciations.append(ipa_pron)
    
    return list(set(pronunciations))

def get_pinyin(sounds):
    for s in sounds:
        if 'tags' in s:
            if ('Mandarin' in s['tags'] and 'Pinyin' in s['tags'] and ( len(s['tags']) == 2 or 'standard' in s['tags'])) and not any(x in s['zh-pron'] for x in ['⁰', '¹', '²', '³', '⁴', '⁵']):
                return [s['zh-pron'].lower()]

def get_jp(sounds):
    for s in sounds:
        if 'tags' in s:
            if 'Cantonese' in s['tags'] and 'Jyutping' in s['tags']:
                return [s['zh-pron'].lower()]

def get_mc(sounds):
    for s in sounds:
        if 'tags' in s:
            if 'Middle-Chinese' in s['tags']:
                return [s['zh-pron'].lower()]

import chinese_converter

def get_prons(ch):
    ch = chinese_converter.to_traditional(ch)
    char_prons = dict()
    i = 0 
    for index in get_id_from_char(ch):


        if 'sounds' in data[index]:
        
            sounds = data[index]['sounds']
            
            emin = get_emin(sounds, ipa=False)
            py = get_pinyin(sounds)
            jp = get_jp(sounds)
            mc = get_mc(sounds)
            
            # None if exact pronunciation already exists
            char_prons[i] = {
                'md': emin if emin not in [entry.get('md') for entry in char_prons.values()] else None,
                'py': py if py not in [entry.get('py') for entry in char_prons.values()] else None,
                'jp': jp if jp not in [entry.get('jp') for entry in char_prons.values()] else None,
                'mc': mc if mc not in [entry.get('mc') for entry in char_prons.values()] else None
            }
            
            i += 1
    
    filtered_char_prons = dict()
    
    i = 0
    
    for key, entry_list in char_prons.items():
        non_none_values = [(k, v) for k, v in entry_list.items() if v]
        if non_none_values:
            temp_dict = {'md': None, 'py': None, 'jp': None, 'mc': None}
            for k, v in non_none_values:
                 temp_dict[k] = v
            filtered_char_prons[i] = temp_dict
            i += 1
        
    return filtered_char_prons

def romanize(str):
    lo_romanization = []
    for char in str:
        lo_romanization.append(get_prons(char))

    return lo_romanization

In [65]:
pprint(romanize('唔'))

[{0: {'jp': ['m⁴'], 'mc': None, 'md': None, 'py': None},
  1: {'jp': None, 'mc': None, 'md': None, 'py': ['ńg']},
  2: {'jp': ['ng⁴'], 'mc': None, 'md': None, 'py': ['wú']},
  3: {'jp': None, 'mc': None, 'md': None, 'py': ['wù']}}]


In [None]:
import genanki
import json
import pycantonese

# Create a new Anki deck with a unique ID and a name
my_deck = genanki.Deck(
  deck_id=123456789,
  name="idoms",
)

# Define a model (template) for the cards
# A model specifies the fields and card format
my_model = genanki.Model(
  model_id=123456789,
  name="Basic Model",
  fields=[
    {"name": "Question"},
    {"name": "Answer"},
    #{"name": "image"},
  ],
  templates=[
    {
      "name": "Card 1",
      "qfmt": "{{Question}}", # Question format
      "afmt": "{{FrontSide}}<hr id='answer'>{{Answer}}", # Answer format
      #"afmt": "{{FrontSide}}<hr id='answer'>{{Answer}}<br>{{image}}", # Answer format
    },
  ],
  css="""
  .card {
   font-family: arial;
   font-size: 20px;
   text-align: center;
   color: black;
   background-color: white;
  }
  """,
)

f = open('idiom.json',)
data = json.load(f)

for entry in data:
    # Add a card to the deck
    note = genanki.Note(
      model=my_model,
      fields=[entry['explanation'] + '<br><br>' + entry['example'], 
              chinese_converter.to_traditional(entry['word']) + '<br>' + entry['word'] + 
              '<br>' + entry['pinyin'] + '<br>' + pycantonese.characters_to_jyutping(entry['word'])[0][1] +
              '<br><br>' + entry['derivation']],
      #fields=["What is the capital of France?", "Paris",'<img src="image.gif">'],
    )

    # add note to deck
    my_deck.add_note(note)

# create package for deck
my_package = genanki.Package(my_deck)

# Optionally, add more cards here in a similar manner
#my_package.media_files = ['image.gif']

# Save the deck to a file
my_package.write_to_file('idoms.apkg')

print("Deck has been created.")


In [64]:
import re

# Sample dictionary data
dictionary_text = """
【六一儿童节】Liù－YīÉrtónɡJié全世界儿童的节日。国际民主妇女联合会为保障全世界儿童的权利，于1949年在莫斯科举行的会议上，决定以6月1日为国际儿童节。也叫六一国际儿童节、国际儿童节、儿童节。
【六艺】liùyì〈名〉❶古代指礼（礼仪）、乐（音乐）、射（射箭）、御（驾车）、书（识字）、数（计算）等六种科目。❷古代指《诗》、《书》、《礼》、《乐》、《易》、《春秋》六种儒家经书。
【六欲】liùyù〈名〉佛教指色欲、形貌欲等六种欲望，泛指人的各种欲望：七情～。
【六指儿】liùzhǐr〈名〉❶长了六个指头的手或脚。❷指手或脚上长有六个指头的人。
【陆】（陸）liù〈数〉“六”的大写。参看1271页【数字】。另见887页lù。
【碌】liù
"""

# Regex to parse an entry line
entry_pattern = re.compile(r'【(.*?)】(.*?)((?:〈.*?〉)+)(.*)')

def parse_entry(line):
    match = entry_pattern.match(line)
    if match:
        word = match.group(1)
        pinyin = match.group(2).strip().lower()
        types_defs = match.group(3)
        definitions_text = match.group(4)

        # Extract types
        types = re.findall(r'〈(.*?)〉', types_defs)

        # Handle cases where definitions might not be properly captured
        if not definitions_text:
            definitions_text = ""

        # Split definitions if they are numbered
        if any(num in definitions_text for num in '❶❷❸❹❺❻❼❽❾❿⓫⓬⓭⓮⓯⓰⓱⓲⓳⓴'):
            definitions = re.split(r'❶|❷|❸|❹|❺|❻|❼|❽|❾|❿|⓫|⓬|⓭|⓮|⓯|⓰|⓱|⓲|⓳|⓴', definitions_text)
        else:
            definitions = [definitions_text]

        # Clean definitions
        definitions = [defn.strip() for defn in definitions if defn.strip()]

        return {
            "word": word,
            "pinyin": pinyin,
            "types": types,
            "definitions": definitions
        }
    else:
        word = re.search(r'【(.*?)】',line)[1]

        if len(word) + 2 != len(line):
            pinyin = re.search(r'[a-zāáǎàōóǒòēéěèīíǐìūúǔùüǖǘǚǜêê̄ếê̌ềm̄ḿm̀ńňǹẑĉŝŋɡ]+',line)[0]
            definition = re.search(r'[a-zāáǎàōóǒòēéěèīíǐìūúǔùüǖǘǚǜêê̄ếê̌ềm̄ḿm̀ńňǹẑĉŝŋɡ]+(\s*(.*))',line)[1]
        else:
            pinyin = ''
            definition = ''
        
        if len(word) == 4:
            types = ['四']
        else:
            []
        
        return {"word": word, "pinyin": pinyin, "types": types, "definitions": definition}

# Parsing the dictionary
entries = []
for line in dictionary_text.strip().split('\n'):
    try:
        entry = parse_entry(line)
        if entry['word']:  # Only append if word is not empty
            entries.append(entry)
    except Exception as e:
        print(f"Failed to parse line: {line} with error {str(e)}")

# Print the parsed dictionary entries
from pprint import pprint
pprint(entries, sort_dicts=False)
print(len(entries))

Failed to parse line: 【六一儿童节】Liù－YīÉrtónɡJié全世界儿童的节日。国际民主妇女联合会为保障全世界儿童的权利，于1949年在莫斯科举行的会议上，决定以6月1日为国际儿童节。也叫六一国际儿童节、国际儿童节、儿童节。 with error local variable 'types' referenced before assignment
Failed to parse line: 【碌】liù with error local variable 'types' referenced before assignment
[{'word': '六艺',
  'pinyin': 'liùyì',
  'types': ['名'],
  'definitions': ['古代指礼（礼仪）、乐（音乐）、射（射箭）、御（驾车）、书（识字）、数（计算）等六种科目。',
                  '古代指《诗》、《书》、《礼》、《乐》、《易》、《春秋》六种儒家经书。']},
 {'word': '六欲',
  'pinyin': 'liùyù',
  'types': ['名'],
  'definitions': ['佛教指色欲、形貌欲等六种欲望，泛指人的各种欲望：七情～。']},
 {'word': '六指儿',
  'pinyin': 'liùzhǐr',
  'types': ['名'],
  'definitions': ['长了六个指头的手或脚。', '指手或脚上长有六个指头的人。']},
 {'word': '陆',
  'pinyin': '（陸）liù',
  'types': ['数'],
  'definitions': ['“六”的大写。参看1271页【数字】。另见887页lù。']}]
4
