In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import urllib.parse
import concurrent.futures
import time
import tqdm.notebook 
import tqdm
from fontTools.ttLib import TTFont


# Latest update: 6/1/2025
# Unicode version: 16.0.0
# 5/27 notes: I'm thinking of having the ultimate df has ["char", "pinyin", "codepoint"] columns.
# But how can I transfer the unsupported(by Mac) characters to the match core? Obviously, I can't copy paste 
# Need to be done: 
#   V Optimization (500 characters take 20 mins -> 1 min)
#   V Add these characters and their pinyin into the dictionary df
#   - Check Damon's data and take them into the dictionary
#       V Add another Unicode.txt file. Format: unicode standard notation(USN), character
#       V Ex. U+4E03, 七
#       V Create a new Dictionary.txt sorted according to the USN, to replace the old one
#       V New target: Only update the characters that are being used in the FT: cjk_extension_A-H.csv 
#       V Use the code point to get URL encode. 
#       - Get the range of wanted characters and apply them into cjk-parser. It should work
#   - Optimize and organize the code

# Tasks:
# 1. V The current 500s: orginal_pinyin_update.txt
# 2. \ The CJK A-G extention (only get the pinyins into the csv file, not in the pinyin.txt yet)
# 3. X The remaining 5000s supported by BabelStone
# Note: The 2nd & 3rd tasks are not able to be done because there are index gaps between different CJK extensions and the index-driven parser can't read the characters well.

In [2]:
# Functions and Set-ups

# Unicode code point -> Char (Not all Chinese characters are supported display)
def get_character_from_codepoint(codepoint):
    int_unicode = int(codepoint[2:], 16) # Hex string -> int
    char = chr(int_unicode)
    # print(f"Character from codepoint {codepoint}: {char}")
    
    return char


# Symbol-toned Pinyin -> Number-toned Pinyin       
from random import uniform
def translate_toned_vowels(pinyin):
    tone_map = { 'ā':['a', 1],'á':['a', 2],'ǎ':['a', 3],'à':['a', 4],
                 'ē':['e', 1],'é':['e', 2],'ě':['e', 3],'è':['e', 4],
                 'ī':['i', 1],'í':['i', 2],'ǐ':['i', 3],'ì':['i', 4], 
                 'ō':['o', 1],'ó':['o', 2],'ǒ':['o', 3],'ò':['o', 4],
                 'ū':['u', 1],'ú':['u', 2],'ǔ':['u', 3],'ù':['u', 4] }
    for c in pinyin:
        match = tone_map.get(c)
        if match:
            pinyin = pinyin.replace(c, match[0]) + str(match[1])
    return pinyin   

In [3]:
# URL encoded Str -> Pinyin
def get_pinyin_from_shidian(chinese_char, encoded=False):
    time.sleep(uniform(0.2, 0.5))  # Random sleep to avoid hitting the server too hard
    encoded_str = urllib.parse.quote(chinese_char) if not encoded else chinese_char
    url = f"https://www.shidianguji.com/character/{encoded_str}#%E5%AD%97%E5%BD%A2%E4%BF%A1%E6%81%AF"
    res = requests.get(url)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    vo_div = soup.findAll("div", class_="pinyin")
    span = soup.find("span", class_="vo5tsmom", string=lambda t: t and t.strip() == "拼音")
    run = True
    while run:
        try:
            if vo_div:
                if len(vo_div) == 1:
                    chinese_text = vo_div[0].text.strip()
                    return translate_toned_vowels(chinese_text)
                else:
                    chinese_text_list = []
                    for vo in vo_div:
                        chinese_text = translate_toned_vowels(vo.text.strip()) 
                        chinese_text_list.append(chinese_text)
                        chinese_texts = ",".join(chinese_text_list)
                    return chinese_texts
            elif span: 
                chinese_text = span.find_next().text.strip()
                return translate_toned_vowels(chinese_text)
            else:
                # Can't find the pinyin information
                return None
        except Exception as e:
            if e.response.status_code == 444:
                print(f"Error at requesting from web: {e}")
                input("Paused due to frequent request. Wait a second and press Enter to retry...")
            else:
                print(f"Error at requesting from web: {e}")
                decision = input("Enter e to exit, or any other key to continue...")
                if decision.lower() == 'e':
                    run = False

              
# test
print(get_pinyin_from_shidian("𪛐")) # Expect output: ['chui1', 'chui4']
print(get_pinyin_from_shidian(r"%E9%AF%BB", encoded=True)) # Expect output: ['la4']
print(get_pinyin_from_shidian("䧌")) # Expect output: ['la4']

  vo_div = soup.findAll("div", class_="pinyin")


chui1,chui4
la4
sui1


In [4]:
# Return (idx, character, new_pinyin) from shidian
def process_character(row_data, encoded=False):
    idx = row_data.name
    char = row_data["character"]
    old_pinyin = row_data["pinyin"]
    try: 
        new_pinyin = get_pinyin_from_shidian(char, encoded=encoded)
        if new_pinyin:
            return (idx, char ,new_pinyin)
        return (idx, char, "")
    except Exception as e:
        print(f"Error processing character {char} at transiting data: {e}")
        return (idx, char, "")

# Build updated dataframe
def build_updated_dataframe(df, encoded=False):
    df_updated = df.copy()
    missing_pinyin_rows = [row for idx, row in df_updated[df_updated["pinyin"].isna()].iterrows()]
    row_results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_char = {executor.submit(process_character, row_data, encoded): row_data for row_data in missing_pinyin_rows}
        for future in tqdm.tqdm(concurrent.futures.as_completed(future_to_char), total=len(missing_pinyin_rows)):
            try: 
                result = future.result()
                if result:
                    row_results.append(result)
            except Exception as e:
                print(f"Error processing character at build df: {e}")

    for idx, char, new_pinyin in row_results:
        df_updated.at[idx, "pinyin"] = new_pinyin
        print(f"{idx}, {char}, updated pinyin: {df_updated.loc[idx, "pinyin"]}")
    
    print(f"Dataframe updation finished. Total characters processed: {len(row_results)}")
    return df_updated

In [5]:
# Task 1: Update the original missing pinyins in pinyin.txt

# Get exist dictionary from dictionary.txt
df_dictionary = pd.read_csv("dictionary.txt", sep="=", names=["character", "pinyin"])
df_dictionary["codepoint"] = df_dictionary["character"].apply(lambda x: f"U+{ord(x):04X}")
df_dictionary.sort_values(by="codepoint", inplace=True)
df_dictionary

Unnamed: 0,character,pinyin,codepoint
0,一,yi1,U+4E00
1,丁,"ding1,zheng1",U+4E01
2,丂,kao3,U+4E02
3,七,qi1,U+4E03
4,丄,"shang4,shang3",U+4E04
...,...,...,...
20897,龡,chui1,U+9FA1
20898,龢,he2,U+9FA2
20899,龣,jue2,U+9FA3
20900,龤,xie2,U+9FA4


In [None]:
# NaN counts in ["pinyin"]
df_dictionary["pinyin"].replace("", None, inplace=True)
df_dictionary[df_dictionary["pinyin"].isna()].shape

# Get the orginal missing pinyins
df_first_task = build_updated_dataframe(df_dictionary)
df_dictionary.to_csv("original_pinyin_update.txt", sep="=", index=False, header=False)
print("Original missing pinyins updated and saved to original_pinyin_update.txt")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_dictionary["pinyin"].replace("", None, inplace=True)
  vo_div = soup.findAll("div", class_="pinyin")
  6%|▌         | 26/465 [00:05<01:30,  4.83it/s]


In [6]:
# Task 2: CJK A-H extension characters
df_extension = pd.read_csv("cjk_extensions_A-H.csv", sep=",", names=["Extension", "character", "codepoint"], header=None, skiprows=1)
pinyin_col_position = df_extension.columns.get_loc("character") + 1
df_extension.insert(pinyin_col_position, "pinyin", None)
df_extension


Unnamed: 0,Extension,character,pinyin,codepoint
0,Extension A,㗳,,U+35F3
1,Extension A,䗁,,U+45C1
2,Extension A,㪟,,U+3A9F
3,Extension A,㤃,,U+3903
4,Extension A,㘓,,U+3613
...,...,...,...,...
13094,Extension H,𲈜,,U+3221C
13095,Extension H,𱜇,,U+31707
13096,Extension H,𱳮,,U+31CEE
13097,Extension H,𲈊,,U+3220A


In [7]:
df_extension_1 = df_extension.iloc[:4500]
df_extension_2 = df_extension.iloc[4501:9001]
df_extension_3 = df_extension.iloc[9002:]

In [None]:
df_extension_pinyin_1 = build_updated_dataframe(df_extension_1)
_ = input()
df_extension_pinyin_2 = build_updated_dataframe(df_extension_2)
_ = input()
df_extension_pinyin_3 = build_updated_dataframe(df_extension_3)
_ = input()
df_extension_pinyin = pd.concat([df_extension_pinyin_1, df_extension_pinyin_2, df_extension_pinyin_3], ignore_index=True)

  vo_div = soup.findAll("div", class_="pinyin")
100%|██████████| 4500/4500 [10:19<00:00,  7.27it/s]


2, 㪟, updated pinyin: dun1,dui4
0, 㗳, updated pinyin: ta3,da1
4, 㘓, updated pinyin: lan2
8, 䌌, updated pinyin: qie4,xi2
9, 㵩, updated pinyin: yi4
3, 㤃, updated pinyin: fang2
5, 䖲, updated pinyin: xun2,zong1
6, 䧟, updated pinyin: xian4
7, 䏎, updated pinyin: feng2
1, 䗁, updated pinyin: qi1,ji4
10, 䧌, updated pinyin: sui1
15, 䌣, updated pinyin: zuan3
17, 㧰, updated pinyin: lüe4,luo4
14, 䀥, updated pinyin: shuo4,li4
13, 㽓, updated pinyin: 
16, 䁎, updated pinyin: cheng2,ting2,cheng1
11, 㺓, updated pinyin: ze2
18, 䏰, updated pinyin: run4
12, 㶴, updated pinyin: chi3
26, 䈘, updated pinyin: ci2
23, 䌽, updated pinyin: cai3
19, 䡴, updated pinyin: chong1
20, 㮏, updated pinyin: nai4
21, 䄽, updated pinyin: tian3
22, 㝘, updated pinyin: yong2
25, 㣒, updated pinyin: ceng4
27, 㳋, updated pinyin: qiu1
24, 䟑, updated pinyin: yue4
33, 䳵, updated pinyin: chi4
32, 䋬, updated pinyin: tian1
31, 䣫, updated pinyin: li2
34, 㚽, updated pinyin: qiao3
30, 㩆, updated pinyin: jiu4
28, 㠌, updated pinyin: qi1
29, 䞞, upd

100%|██████████| 4499/4499 [09:35<00:00,  7.82it/s]


4509, 䇗, updated pinyin: zhong1
4502, 䬕, updated pinyin: xiang1,qiang3
4506, 㠴, updated pinyin: ren4
4507, 㯈, updated pinyin: su4
4512, 㭇, updated pinyin: yuan4
4505, 㜿, updated pinyin: xu4
4501, 䦶, updated pinyin: zheng4
4511, 䎙, updated pinyin: pin1
4504, 䁮, updated pinyin: qian2
4510, 䐳, updated pinyin: yu2
4508, 䙧, updated pinyin: xun1
4515, 䑤, updated pinyin: jin4
4513, 㴛, updated pinyin: zhi4
4503, 䞣, updated pinyin: che3,che4,qie4
4516, 㫝, updated pinyin: di1
4521, 㠊, updated pinyin: qu1
4514, 䊺, updated pinyin: hu4
4523, 䏍, updated pinyin: yuan4,yuan1
4517, 䍒, updated pinyin: mou3
4518, 䒞, updated pinyin: chen2,yin2
4524, 㔺, updated pinyin: shi4
4519, 䜜, updated pinyin: wei4
4520, 䳋, updated pinyin: tong2,tong1,xiao1
4522, 㚱, updated pinyin: qiu1
4529, 㵧, updated pinyin: ge2
4525, 㰣, updated pinyin: zi1
4526, 䋄, updated pinyin: wang3
4535, 䀠, updated pinyin: ju4
4528, 㿿, updated pinyin: ya3
4527, 㪫, updated pinyin: rui4
4532, 䈫, updated pinyin: na4
4530, 㞸, updated pinyin: sui4

 83%|████████▎ | 3387/4098 [06:52<01:37,  7.31it/s]

Error processing character 𪟘: 444 Client Error: Unknown for url: https://www.shidianguji.com/character/%F0%AA%9F%98#%E5%AD%97%E5%BD%A2%E4%BF%A1%E6%81%AF


 83%|████████▎ | 3393/4098 [06:53<01:02, 11.23it/s]

Error processing character 𫁟: 444 Client Error: Unknown for url: https://www.shidianguji.com/character/%F0%AB%81%9F#%E5%AD%97%E5%BD%A2%E4%BF%A1%E6%81%AF


 83%|████████▎ | 3397/4098 [06:53<01:03, 11.06it/s]

Error processing character 𫛩: 444 Client Error: Unknown for url: https://www.shidianguji.com/character/%F0%AB%9B%A9#%E5%AD%97%E5%BD%A2%E4%BF%A1%E6%81%AF
Error processing character 𫈭: 444 Client Error: Unknown for url: https://www.shidianguji.com/character/%F0%AB%88%AD#%E5%AD%97%E5%BD%A2%E4%BF%A1%E6%81%AF


 83%|████████▎ | 3400/4098 [06:53<01:00, 11.52it/s]

Error processing character 𪻛: 444 Client Error: Unknown for url: https://www.shidianguji.com/character/%F0%AA%BB%9B#%E5%AD%97%E5%BD%A2%E4%BF%A1%E6%81%AF
Error processing character 𫐉: 444 Client Error: Unknown for url: https://www.shidianguji.com/character/%F0%AB%90%89#%E5%AD%97%E5%BD%A2%E4%BF%A1%E6%81%AF
Error processing character 𪳆: 444 Client Error: Unknown for url: https://www.shidianguji.com/character/%F0%AA%B3%86#%E5%AD%97%E5%BD%A2%E4%BF%A1%E6%81%AF
Error processing character 𪟢: 444 Client Error: Unknown for url: https://www.shidianguji.com/character/%F0%AA%9F%A2#%E5%AD%97%E5%BD%A2%E4%BF%A1%E6%81%AF


 83%|████████▎ | 3405/4098 [06:54<00:49, 13.88it/s]

Error processing character 𪥼: 444 Client Error: Unknown for url: https://www.shidianguji.com/character/%F0%AA%A5%BC#%E5%AD%97%E5%BD%A2%E4%BF%A1%E6%81%AF
Error processing character 𫔀: 444 Client Error: Unknown for url: https://www.shidianguji.com/character/%F0%AB%94%80#%E5%AD%97%E5%BD%A2%E4%BF%A1%E6%81%AF
Error processing character 𪬸: 444 Client Error: Unknown for url: https://www.shidianguji.com/character/%F0%AA%AC%B8#%E5%AD%97%E5%BD%A2%E4%BF%A1%E6%81%AF


 83%|████████▎ | 3407/4098 [06:54<00:57, 12.11it/s]

Error processing character 𪶎: 444 Client Error: Unknown for url: https://www.shidianguji.com/character/%F0%AA%B6%8E#%E5%AD%97%E5%BD%A2%E4%BF%A1%E6%81%AF
Error processing character 𪼑: 444 Client Error: Unknown for url: https://www.shidianguji.com/character/%F0%AA%BC%91#%E5%AD%97%E5%BD%A2%E4%BF%A1%E6%81%AF
Error processing character 𫂱: 444 Client Error: Unknown for url: https://www.shidianguji.com/character/%F0%AB%82%B1#%E5%AD%97%E5%BD%A2%E4%BF%A1%E6%81%AF
Error processing character 𪧟: 444 Client Error: Unknown for url: https://www.shidianguji.com/character/%F0%AA%A7%9F#%E5%AD%97%E5%BD%A2%E4%BF%A1%E6%81%AF
Error processing character 𪴱: 444 Client Error: Unknown for url: https://www.shidianguji.com/character/%F0%AA%B4%B1#%E5%AD%97%E5%BD%A2%E4%BF%A1%E6%81%AF


100%|██████████| 4098/4098 [08:15<00:00,  8.28it/s]


9009, 𣲉, updated pinyin: 
9008, 𠅛, updated pinyin: si1
9007, 𨫠, updated pinyin: 
9004, 𡇳, updated pinyin: hong2
9002, 𩢼, updated pinyin: kuang1
9001, 𢌜, updated pinyin: ting2
9006, 𦈅, updated pinyin: die2
9005, 𥞁, updated pinyin: chen1
9010, 𩐳, updated pinyin: yun4
9012, 𡸯, updated pinyin: 
9003, 𠊧, updated pinyin: bing4
9011, 𧆾, updated pinyin: ju4
9013, 𦶸, updated pinyin: 
9015, 𤌁, updated pinyin: kang4,hang1
9016, 𡴯, updated pinyin: e4
9014, 𢌝, updated pinyin: chen4
9018, 𤧫, updated pinyin: jin，dui，jin114
9020, 𣠄, updated pinyin: ling2
9019, 𩒚, updated pinyin: guang1
9017, 𨔝, updated pinyin: dong4
9021, 𤊴, updated pinyin: he4,huo4
9022, 𩾵, updated pinyin: jiu4
9028, 𣓶, updated pinyin: 
9024, 𢓵, updated pinyin: xiu4
9025, 𦥏, updated pinyin: zhi4
9027, 𢰹, updated pinyin: 
9026, 𪆣, updated pinyin: 
9023, 𩉼, updated pinyin: shen1
9032, 𦛙, updated pinyin: sheng4
9029, 𤤆, updated pinyin: 
9030, 𨙉, updated pinyin: 
9031, 𨎴, updated pinyin: dang1
9037, 𥫸, updated pinyin: shui3
9033, 𧧰, upda

In [8]:
df_extension_pinyin_3 = build_updated_dataframe(df_extension_3)

  vo_div = soup.findAll("div", class_="pinyin")
100%|██████████| 4097/4097 [09:58<00:00,  6.85it/s]

9008, 𠅛, updated pinyin: si1
9009, 𣲉, updated pinyin: 
9006, 𦈅, updated pinyin: die2
9004, 𡇳, updated pinyin: hong2
9007, 𨫠, updated pinyin: 
9005, 𥞁, updated pinyin: chen1
9010, 𩐳, updated pinyin: yun4
9002, 𩢼, updated pinyin: kuang1
9011, 𧆾, updated pinyin: ju4
9003, 𠊧, updated pinyin: bing4
9013, 𦶸, updated pinyin: 
9012, 𡸯, updated pinyin: 
9014, 𢌝, updated pinyin: chen4
9018, 𤧫, updated pinyin: jin，dui，jin114
9015, 𤌁, updated pinyin: kang4,hang1
9016, 𡴯, updated pinyin: e4
9017, 𨔝, updated pinyin: dong4
9020, 𣠄, updated pinyin: ling2
9019, 𩒚, updated pinyin: guang1
9021, 𤊴, updated pinyin: he4,huo4
9025, 𦥏, updated pinyin: zhi4
9022, 𩾵, updated pinyin: jiu4
9024, 𢓵, updated pinyin: xiu4
9026, 𪆣, updated pinyin: 
9027, 𢰹, updated pinyin: 
9023, 𩉼, updated pinyin: shen1
9028, 𣓶, updated pinyin: 
9030, 𨙉, updated pinyin: 
9029, 𤤆, updated pinyin: 
9031, 𨎴, updated pinyin: dang1
9032, 𦛙, updated pinyin: sheng4
9033, 𧧰, updated pinyin: zhi3
9039, 𤩲, updated pinyin: ge2
9038, 𦅡, updated




In [None]:

df_extension_pinyin_3.to_csv("cjk_extension_pinyin_3.txt", sep=",", index=False, header=False)

In [None]:
# Task 2: Update the CJK A-G extension characters
# Issue: The CJK A-G extension codepoints are not continuous. It won't be easy to apply because the cjk-parser is index-drivened.

In [None]:
# Get supported characters from the BabelStoneHan font
# font_path = "/Users/callalilyleaf/Downloads/BabelStoneHan.ttf"
# font = TTFont(font_path)
# def is_char_supported(font_path, char):
#     cmap = None
#     for table in font['cmap'].tables:
#         if table.isUnicode():
#             cmap = table.cmap
#             break
#     if cmap is None:
#         return False
#     return ord(char) in cmap

# # Get supported characters from the BabelStoneHan font
# df_char_unicode_update = pd.read_csv("cjk_extensions_A-H.csv", sep=",", skiprows=1, names=["extention", "character", "codepoint"])
# df_char_unicode_update.drop(columns=["extention"], inplace=True)
# df_char_unicode_update.head(10)


In [None]:
# Convert codepoint column to integer values
codepoints = df_ult_sorted["codepoint"].apply(lambda x: int(x[2:], 16)).sort_values().reset_index(drop=True)

# Check for gaps in the sequence
gaps = codepoints.diff().fillna(1) != 1

if gaps.sum() == 1:
    print("The codepoints are continuous.")
else:
    print("There are gaps in the codepoints at the following indices:")
    print(codepoints[gaps].index.tolist())

In [None]:
# Get supported characters from the BabelStoneHan font
df_char_unicode_update = pd.read_csv("cjk_extensions_A-H.csv", sep=",", skiprows=1, names=["extention", "character", "codepoint"])
df_char_unicode_update.drop(columns=["extention"], inplace=True)
df_char_unicode_update.head(10)

# df_babel_supported = df_char_unicode_update[df_char_unicode_update["character"].apply(lambda x: is_char_supported(font_path, x))]
# df_babel_supported.drop(columns=["extention"], inplace=True)
# df_babel_supported["pinyin"] = None
# df_babel_supported

In [None]:
# Get unsupported characters from the BabelStoneHan font
df_babel_not_supported = df_char_unicode_update[~df_char_unicode_update["character"].apply(lambda x: is_char_supported(font_path, x))]
df_babel_not_supported.drop(columns=["extention"], inplace=True)
df_babel_not_supported["pinyin"] = None
df_babel_not_supported

In [None]:
# Merge df_babel_supported with df_dictionary without duplicates
df_supported_merged_raw = pd.concat([df_char_unicode_update, df_dictionary], ignore_index=True).sort_values("codepoint").drop_duplicates(subset=["codepoint"], keep="first")
df_supported_merged_raw


In [None]:
# Replace NaN in "pinyin" with None
df_supported_merged_raw["pinyin"].replace("", None, inplace=True)

In [None]:
# Duplicate check
print(f"Any duplicated rows: {df_supported_merged_raw["codepoint"].duplicated().any()}")

# NaN & empty string check
print(f"Na value rows and columns: {df_supported_merged_raw[df_supported_merged_raw["pinyin"].isna()].shape}")
print(f"Empty Strings in pinyin column: {df_supported_merged_raw["pinyin"].eq("").sum()}")

In [None]:
df_supported_merged_pinyin = build_updated_dataframe(df_supported_merged_raw)
df_supported_merged_pinyin

In [None]:
df_supported_merged_pinyin.head(40)

In [None]:
# Before
print("Rows with missing pinyin before:", df_supported_merged_raw[df_supported_merged_raw["pinyin"].isna()].shape[0])

# After
print("Rows with missing pinyin after:", df_supported_merged_pinyin[df_supported_merged_pinyin["pinyin"] == ""].shape[0])

In [None]:
# Get pinyin unsupported rows
df_unsupported_merged_pinyin = df_supported_merged_pinyin[df_supported_merged_pinyin["pinyin"] == ""] 
df_unsupported_merged_pinyin


In [None]:
# Get updated_dictionary.txt
df_supported_merged_pinyin_clean[["character", "pinyin"]].to_csv("updated_dictionary.txt", sep="=", index=False, header=False)

In [None]:
# Get unsupported characters in updated_unknown.txt
df_babel_not_supported_final = pd.concat([df_babel_not_supported, df_supported_merged_pinyin_cant], ignore_index=True).drop_duplicates(subset=["codepoint"], keep="first")
df_babel_not_supported_final["pinyin"] = None
df_babel_not_supported_final[["character", "pinyin"]].to_csv("updated_unknown.txt", sep="=", index=False, header=False)