In [1]:
import glob
from pathlib import Path

import pandas as pd

from utils.utils import get_repo_dir
from utils.dataset_utils import clean_definition, generate_and_save_character_images  # NOQA

In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported automatically before each cell execution while they are being edited.
%load_ext autoreload
%autoreload 2

In [3]:
# Root folder for every data asset in this notebook: the "data" directory
# underneath whatever location get_repo_dir() reports.
BASE_DATA_DIR = get_repo_dir() / Path("data")

# Sub-folders of the data root, as used by the cells below:
#   OUT_DATA_DIR - where generated images and metadata files are written
#   FONT_DIR     - searched for *.ttf fonts
#   UNIHAN_DIR   - searched for Unihan *.txt files
OUT_DATA_DIR = BASE_DATA_DIR.joinpath("data")
FONT_DIR = BASE_DATA_DIR.joinpath("fonts")
UNIHAN_DIR = BASE_DATA_DIR.joinpath("unihan")

In [4]:
# Read every Unihan *.txt file into one long-format frame with the columns
# "Unicode", "Key" and "Value" (one row per data line; '#' comment lines are skipped).
unihan_paths = sorted(glob.glob(str(UNIHAN_DIR / "*.txt")))
if not unihan_paths:
    # Fail here with a readable message rather than pd.concat's
    # "No objects to concatenate" further down.
    raise FileNotFoundError(f"No Unihan .txt files found in {UNIHAN_DIR}")

long_dfs_list = []
for file_path in unihan_paths:
    this_df_long = pd.read_table(
        file_path,
        comment="#",
        names=["Unicode", "Key", "Value"],
        # Treat every field as opaque text: without this, pandas infers a dtype per
        # file (all-digit values become numbers, so the concatenated "Value" column
        # ends up with mixed types) ...
        dtype=str,
        # ... and silently turns any field that happens to spell "NA", "null",
        # "nan", etc. into a missing value.
        keep_default_na=False,
    )
    long_dfs_list.append(this_df_long)
df_long = pd.concat(long_dfs_list, ignore_index=True)

In [5]:
def _codepoint_from_label(label):
    """Convert a 'U+XXXX' style label into its integer code point."""
    return int(str(label).removeprefix("U+"), 16)


# Reshape long -> wide: one row per "Unicode" label, one column per "Key".
wide = df_long.pivot(index="Unicode", columns="Key", values="Value").reset_index()

# Add the numeric code point and the character itself right after the label column.
codepoints = wide["Unicode"].map(_codepoint_from_label)
wide.insert(1, "Unicode Int", codepoints)
wide.insert(2, "Character", codepoints.map(chr))

# Order rows by numeric code point and renumber the index from zero.
df_full = wide.sort_values("Unicode Int", ignore_index=True)
display(df_full)

Key,Unicode,Unicode Int,Character,kAccountingNumeric,kAlternateTotalStrokes,kBigFive,kCCCII,kCNS1986,kCNS1992,kCangjie,...,kTotalStrokes,kTraditionalVariant,kUnihanCore2020,kVietnamese,kVietnameseNumeric,kXHC1983,kXerox,kZVariant,kZhuang,kZhuangNumeric
0,U+3400,13312,㐀,,,,,,,TM,...,5,,,,,,,,,
1,U+3401,13313,㐁,,,,,,,MOW,...,6,,,,,,,,,
2,U+3402,13314,㐂,,,,,,,PPP,...,6,,,,,,,,,
3,U+3403,13315,㐃,,,,,,,OML,...,3,,,,,,,,,
4,U+3404,13316,㐄,,,,,,,JV,...,3,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98677,U+323AB,205739,𲎫,,,,,,,,...,24,,,,,,,,,
98678,U+323AC,205740,𲎬,,,,,,,,...,15,,,,,,,,,
98679,U+323AD,205741,𲎭,,,,,,,,...,19,,,,,,,,,
98680,U+323AE,205742,𲎮,,,,,,,,...,11,,,,,,,,,


In [6]:
# Start from a fresh copy so re-running this cell never alters df_full.
df = df_full.copy()

# Alternative row filters, currently disabled:
#df = df.query(" or ".join(f"`{col}`.notna()" for col in ["kCihaiT", "kFennIndex", "kGSR", "kKarlgren", "kMatthews", "kSBGY", "kSMSZD2003Index", "kTGHZ2013", "kXHC1983"]))
#df = df.query("`kMandarin`.notna()")

# Run clean_definition over each non-missing kDefinition entry (missing ones stay
# missing) and store the result as the fourth column.
cleaned_definitions = df["kDefinition"].map(clean_definition, na_action="ignore")
df.insert(3, "Chinese Definition", cleaned_definitions)

# Keep only rows whose cleaned definition is a non-empty string; rows with a
# missing definition compare as False and are dropped as well.
has_definition = df["Chinese Definition"].str.len() > 0
df = df[has_definition]

# Image file name for each character: "<code point as integer>.png".
image_names = df["Unicode Int"].map(lambda val: f"{val}.png")
df.insert(4, "Filename", image_names)
display(df)

Key,Unicode,Unicode Int,Character,Chinese Definition,Filename,kAccountingNumeric,kAlternateTotalStrokes,kBigFive,kCCCII,kCNS1986,...,kTotalStrokes,kTraditionalVariant,kUnihanCore2020,kVietnamese,kVietnameseNumeric,kXHC1983,kXerox,kZVariant,kZhuang,kZhuangNumeric
0,U+3400,13312,㐀,hillock or mound,13312.png,,,,,,...,5,,,,,,,,,
1,U+3401,13313,㐁,"to lick; to taste, a mat, bamboo bark",13313.png,,,,,,...,6,,,,,,,,,
5,U+3405,13317,㐅,five,13317.png,,,,,,...,2,,,,,,,,,
6,U+3406,13318,㐆,"to follow, to trust to; to put confidence in; ...",13318.png,,,,,,...,6,,,,,,,,,
12,U+340C,13324,㐌,a tribe of savages in South China,13324.png,,,,,,...,5,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94468,U+31335,201525,𱌵,in the manner a toothless person chewing food,201525.png,,,,,,...,15,U+4DA8,,,,,,,,
94472,U+31339,201529,𱌹,uneven,201529.png,,,,,,...,17,U+9F75,,,,,,,,
94481,U+31342,201538,𱍂,flight of a dragon,201538.png,,,,,,...,10,U+9F96,,,,,,,,
94486,U+31347,201543,𱍇,a kind of turtle,201543.png,,,,,,...,12,U+4DB1,,,,,,,,


In [8]:
# For every *.ttf font in FONT_DIR, call generate_and_save_character_images on `df`
# (output folder: OUT_DATA_DIR/<font name>, 32x32 images, font size 25) and collect
# the frame it returns, trimmed down to the non-Unihan metadata columns.
font_paths = sorted(glob.glob(str(FONT_DIR / "*.ttf")))
if not font_paths:
    # Fail here with a readable message rather than pd.concat's
    # "No objects to concatenate" further down.
    raise FileNotFoundError(f"No .ttf fonts found in {FONT_DIR}")

out_data_dfs = []
for font_path in font_paths:
    font_name = Path(font_path).stem
    this_out_df = generate_and_save_character_images(df, font_path, OUT_DATA_DIR / font_name, image_size=(32, 32), font_size=25)

    # Paths stored in the metadata always use forward slashes (as_posix), so the
    # files written below are identical no matter which OS ran the notebook.
    relative_paths = this_out_df["Filename"].map(lambda val: (Path(font_name) / val).as_posix())
    colab_paths = relative_paths.map(lambda val: (Path("/content/data") / val).as_posix())

    # assign() builds a new frame instead of writing into the one returned by the
    # helper; "Filename" keeps its position and "Filename GC" is appended last.
    this_out_df = this_out_df.assign(**{"Filename": relative_paths, "Filename GC": colab_paths})

    # Drop every column whose name starts with "k" (the raw Unihan property columns).
    this_out_df = this_out_df[[c for c in this_out_df.columns if not c.startswith("k")]]
    out_data_dfs.append(this_out_df)
out_df = pd.concat(out_data_dfs, ignore_index=True)
display(out_df)

  0%|          | 0/22460 [00:00<?, ?it/s]

100%|██████████| 22460/22460 [00:00<00:00, 23043.15it/s]


Key,Unicode,Unicode Int,Character,Chinese Definition,Filename,Filename GC
0,U+3400,13312,㐀,hillock or mound,NotoSansSC-Regular/13312.png,/content/data/NotoSansSC-Regular/13312.png
1,U+3401,13313,㐁,"to lick; to taste, a mat, bamboo bark",NotoSansSC-Regular/13313.png,/content/data/NotoSansSC-Regular/13313.png
2,U+3405,13317,㐅,five,NotoSansSC-Regular/13317.png,/content/data/NotoSansSC-Regular/13317.png
3,U+3406,13318,㐆,"to follow, to trust to; to put confidence in; ...",NotoSansSC-Regular/13318.png,/content/data/NotoSansSC-Regular/13318.png
4,U+340C,13324,㐌,a tribe of savages in South China,NotoSansSC-Regular/13324.png,/content/data/NotoSansSC-Regular/13324.png
...,...,...,...,...,...,...
20394,U+2CE23,183843,𬸣,to soar,NotoSansSC-Regular/183843.png,/content/data/NotoSansSC-Regular/183843.png
20395,U+2CE26,183846,𬸦,phoenix,NotoSansSC-Regular/183846.png,/content/data/NotoSansSC-Regular/183846.png
20396,U+2CE88,183944,𬺈,bite,NotoSansSC-Regular/183944.png,/content/data/NotoSansSC-Regular/183944.png
20397,U+30EDD,200413,𰻝,biangbiang noodles,NotoSansSC-Regular/200413.png,/content/data/NotoSansSC-Regular/200413.png


In [9]:
# Write the combined metadata as JSON Lines (one JSON record per row) into OUT_DATA_DIR.
metadata_path = OUT_DATA_DIR / "metadata.jsonl"
metadata_jsonl = out_df.to_json(orient="records", lines=True)
with metadata_path.open("w") as sink:
    sink.write(metadata_jsonl)

In [10]:
# Write a smaller JSON Lines file holding only the first 1000 rows of out_df
# (rows are taken by position, in the frame's current order).
top1000_path = OUT_DATA_DIR / "metadata_top1000.jsonl"
top1000_jsonl = out_df.head(1000).to_json(orient="records", lines=True)
with top1000_path.open("w") as sink:
    sink.write(top1000_jsonl)

In [None]:
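# Disabled: uncomment to also pickle `df` (including its Unihan property columns)
# to OUT_DATA_DIR / "full_metadata.pkl".
# df.to_pickle(OUT_DATA_DIR / "full_metadata.pkl")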