In [None]:
import glob
from pathlib import Path

import pandas as pd

from core.utils.dataset_utils import generate_and_save_paired_character_images  # NOQA
from core.utils.repo_utils import get_repo_dir

In [2]:
# Re-import edited modules automatically before each cell executes, so changes to the
# project helpers imported from `core.utils` are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Every path used by this notebook hangs off <repo>/data.
BASE_DATA_DIR = get_repo_dir() / Path("data")

# Inputs: the *.ttf fonts to render with and the Unihan *.txt database files.
FONT_DIR = BASE_DATA_DIR.joinpath("fonts")
UNIHAN_DIR = BASE_DATA_DIR.joinpath("unihan")

# Output: one sub-directory of images per font, plus the metadata*.jsonl files.
OUT_DATA_DIR = BASE_DATA_DIR.joinpath("data_char2char")

In [4]:
# Load every Unihan text file (tab-separated, no header row) as long-format
# (Unicode, Key, Value) records and stack them into a single frame.
# `comment="#"` skips lines that begin with "#" and drops the remainder of any line after a "#".
# NOTE(review): that truncation and read_table's default quote handling also apply to the
# Value field -- confirm that no Unihan value contains "#" or starts with a double quote.
unihan_files = sorted(glob.glob(str(UNIHAN_DIR / "*.txt")))
long_dfs_list = [
    pd.read_table(unihan_file, comment="#", names=["Unicode", "Key", "Value"])
    for unihan_file in unihan_files
]
df_long = pd.concat(long_dfs_list, ignore_index=True)

In [5]:
# Long -> wide: one row per code point, one column per Unihan key.
df_full = df_long.pivot(index="Unicode", columns="Key", values="Value").reset_index()

# "U+3400" -> 13312 -> "㐀"
df_full["Unicode Int"] = [int(str(code).removeprefix("U+"), 16) for code in df_full["Unicode"]]
df_full["Character"] = [chr(code_point) for code_point in df_full["Unicode Int"]]
df_full = df_full.sort_values("Unicode Int", ignore_index=True)

# Put the derived (non-"k*") columns first, alphabetically, followed by the Unihan
# "k*" columns in their existing order.
leading_cols = sorted(col for col in df_full.columns if not col.startswith("k"))
unihan_cols = [col for col in df_full.columns if col.startswith("k")]
df_full = df_full[leading_cols + unihan_cols]
display(df_full)

Key,Character,Unicode,Unicode Int,kAccountingNumeric,kAlternateTotalStrokes,kBigFive,kCCCII,kCNS1986,kCNS1992,kCangjie,kCantonese,kCheungBauer,kCheungBauerIndex,kCihaiT,kCompatibilityVariant,kCowles,kDaeJaweon,kDefinition,kEACC,kFanqie,kFenn,kFennIndex,kFourCornerCode,kGB0,kGB1,kGB3,kGB5,kGB7,kGB8,kGSR,kGradeLevel,kHDZRadBreak,kHKGlyph,kHanYu,kHangul,kHanyuPinlu,kHanyuPinyin,kIBMJapan,kIICore,kIRGDaeJaweon,...,kKarlgren,kKorean,kKoreanEducationHanja,kKoreanName,kLau,kMainlandTelegraph,kMandarin,kMatthews,kMeyerWempe,kMojiJoho,kMorohashi,kNelson,kOtherNumeric,kPhonetic,kPrimaryNumeric,kPseudoGB1,kRSAdobe_Japan1_6,kRSUnicode,kSBGY,kSMSZD2003Index,kSMSZD2003Readings,kSemanticVariant,kSimplifiedVariant,kSpecializedSemanticVariant,kSpoofingVariant,kStrange,kTGH,kTGHZ2013,kTaiwanTelegraph,kTang,kTotalStrokes,kTraditionalVariant,kUnihanCore2020,kVietnamese,kVietnameseNumeric,kXHC1983,kXerox,kZVariant,kZhuang,kZhuangNumeric
0,㐀,U+3400,13312,,,,,,,TM,jau1,,,,,,,(same as 丘) hillock or mound,,,,,,,,,,,,,,,,10015.030,,,,,,,...,,,,,,,qiū,,,MJ000004,00034,,,,,,,1.4,,,,U+4E18,,,,,,,,,5,,,,,,,,,
1,㐁,U+3401,13313,,,,,,,MOW,,,,37.103,,,,"to lick; to taste, a mat, bamboo bark",,他紺 他念,,,,,,,,,,,,,,10019.020,,,10019.020:tiàn,,,,...,,,,,,,tiàn,,,MJ000005,00039,,,,,,,1.5,442.07 444.28,,,,,,,,,,,,6,,,,,,,,,
2,㐂,U+3402,13314,,,,,,,PPP,,,,,,,,"(non-standard Japanese variant of 喜), to like,...",,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,MJ000006,H001,0265,,,,,C+13698+1.1.5 V+13697+21.2.4 V+13699+1.1.5,1.5,,,,,,,,,,,,,6,,,,,,,,,
3,㐃,U+3403,13315,,,,,,,OML,zim1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,2.2,,,,,,,,O,,,,,3,,,,,,,,,
4,㐄,U+3404,13316,,,,,,,JV,kwaa1,,,,,,,,,,,,,,,,,,,,,,,10009.060,,,,,,,...,,,,,,,kuà,,,MJ068055 MJ000008:E0100 MJ000007:E0101 MJ06805...,00072:E0101,,,,,,,2.2,310.04 424.03,,,,,,,,,,,,3,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98677,𲎫,U+323AB,205739,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,211'.16,,,,,,,,,,,,,24,,,,,,,,,
98678,𲎬,U+323AC,205740,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,212''.5,,,,,,,,,,,,,15,,,,,,,,,
98679,𲎭,U+323AD,205741,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,212''.9,,,,,,,,,,,,,19,,,,,,,,,
98680,𲎮,U+323AE,205742,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,213'.4,,,,,,,,,,,,,11,,,,,,,,,


In [6]:
# Keep only the code points that carry a simplified and/or a traditional variant link.
has_variant = df_full["kSimplifiedVariant"].notna() | df_full["kTraditionalVariant"].notna()
df = df_full[has_variant].reset_index(drop=True)

# A variant field may hold several space-separated code points. When a field is missing,
# the character's own code point is used for that side.
df["Unicode (S)"] = df["kSimplifiedVariant"].fillna(df["Unicode"]).str.split(" ")
df["Unicode (T)"] = df["kTraditionalVariant"].fillna(df["Unicode"]).str.split(" ")

# One row per (simplified, traditional) combination.
df = df.explode("Unicode (S)", ignore_index=True).explode("Unicode (T)", ignore_index=True)

# Drop combinations whose two sides are the same code point.
# NOTE(review): if Unihan records a link in both directions (simplified -> traditional and
# traditional -> simplified), the same (S, T) pair survives twice under different Filenames --
# confirm whether that duplication is intended.
is_distinct_pair = df["Unicode (S)"] != df["Unicode (T)"]
df = df[is_distinct_pair].reset_index(drop=True)

# "U+508C" -> 20620 -> "傌", for both sides of the pair.
df["Unicode Int (S)"] = [int(str(code).removeprefix("U+"), 16) for code in df["Unicode (S)"]]
df["Unicode Int (T)"] = [int(str(code).removeprefix("U+"), 16) for code in df["Unicode (T)"]]
df["Character (S)"] = [chr(code_point) for code_point in df["Unicode Int (S)"]]
df["Character (T)"] = [chr(code_point) for code_point in df["Unicode Int (T)"]]

# Filename = "<source code point>_<n>.png", where n counts the rows that share a source
# character (0, 1, ... in row order).
nth_for_source = df.groupby("Unicode").cumcount()
df["Filename"] = [f"{code_point}_{nth}.png" for code_point, nth in zip(df["Unicode Int"], nth_for_source)]

# Put the derived (non-"k*") columns first, alphabetically, followed by the Unihan
# "k*" columns in their existing order.
leading_cols = sorted(col for col in df.columns if not col.startswith("k"))
unihan_cols = [col for col in df.columns if col.startswith("k")]
df = df[leading_cols + unihan_cols]
display(df)

Key,Character,Character (S),Character (T),Filename,Unicode,Unicode (S),Unicode (T),Unicode Int,Unicode Int (S),Unicode Int (T),kAccountingNumeric,kAlternateTotalStrokes,kBigFive,kCCCII,kCNS1986,kCNS1992,kCangjie,kCantonese,kCheungBauer,kCheungBauerIndex,kCihaiT,kCompatibilityVariant,kCowles,kDaeJaweon,kDefinition,kEACC,kFanqie,kFenn,kFennIndex,kFourCornerCode,kGB0,kGB1,kGB3,kGB5,kGB7,kGB8,kGSR,kGradeLevel,kHDZRadBreak,kHKGlyph,...,kKarlgren,kKorean,kKoreanEducationHanja,kKoreanName,kLau,kMainlandTelegraph,kMandarin,kMatthews,kMeyerWempe,kMojiJoho,kMorohashi,kNelson,kOtherNumeric,kPhonetic,kPrimaryNumeric,kPseudoGB1,kRSAdobe_Japan1_6,kRSUnicode,kSBGY,kSMSZD2003Index,kSMSZD2003Readings,kSemanticVariant,kSimplifiedVariant,kSpecializedSemanticVariant,kSpoofingVariant,kStrange,kTGH,kTGHZ2013,kTaiwanTelegraph,kTang,kTotalStrokes,kTraditionalVariant,kUnihanCore2020,kVietnamese,kVietnameseNumeric,kXHC1983,kXerox,kZVariant,kZhuang,kZhuangNumeric
0,㐷,㐷,傌,13367_0.png,U+3437,U+3437,U+508C,13367,13367,20620,,,,,,,ONVM,maa6,,,,,,,"a kind of punishment in Han Dynasty, name of c...",,,,,,,,,,,,,,,,...,,,,,,,mǎ,,,,,,,863*,,,,9.3,,,,,,,,,,,,,5,U+508C,,,,,,,,
1,㐹,㐹,㑶,13369_0.png,U+3439,U+3439,U+3476,13369,13369,13430,,,,,,,OOMN,hei3,,,,,,,"(standard form of 仡) strong; valiant, a minori...",,,,,,,,,,,,,,,,...,,,,,,,yì,,,MJ000039,00406,,,,,,,9.4,,,,,,,,,,,,,6,U+3476,,,,,,,,
2,㐽,㐽,偑,13373_0.png,U+343D,U+343D,U+5051,13373,13373,20561,,,,,,,OHNK,fung1,,,,,,,"(simplified form of 偑) name of a place, last name",,,,,,,,,,,,,,,,...,,,,,,,fēng,,,,,,,,,,,9.4,,,,,,,,,,,,,6,U+5051,,,,,,,,
3,㑇,㑇,㑳,13383_0.png,U+3447,U+3447,U+3473,13383,13383,13427,,,,,,,ONSM,zau3,,,,,,,"clever, ingenious; cute; pretty",,,,,2727.7,,,,,,,,,,,...,,,,,,,zhòu,,,,,,,,,,,9.5,,26.07,,,,,,,2013:6602,482.140:zhòu,,,7,U+3473,G,,,1506.160:zhòu,,,,
4,㑈,㑈,倲,13384_0.png,U+3448,U+3448,U+5032,13384,13384,20530,,,,,,,OKD,dung1,,,,,,,"(simplified form) rude; barbarous, stupid; dul...",,,,,,,,,,,,,,,,...,,,,,,,dòng,,,,,,,,,,,9.5,,,,,,,,,,,,,7,U+5032,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13088,𱍈,𱍈,龞,201544_0.png,U+31348,U+31348,U+9F9E,201544,201544,40862,,,,,,,,bit3,,,,,,,turtle,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,213'.11,,,,,,,,,,,,,18,U+9F9E,,,,,,,,
13089,𱍉,𱍉,𪛕,201545_0.png,U+31349,U+31349,U+2A6D5,201545,201545,173781,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,214.7,,,,,,,,,,,,,24,U+2A6D5,,,,,,,,
13090,𱙋,𱙋,𮱚,202315_0.png,U+3164B,U+3164B,U+2EC5A,202315,202315,191578,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,38.5,,,,,,,,,,,,,8,U+2EC5A,,,,,,,lo*,
13091,𱵭,𮵚,𱵭,204141_0.png,U+31D6D,U+2ED5A,U+31D6D,204141,191834,204141,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,113.13,,,,,U+2ED5A,,,,,,,,18,,,,,,,,,


In [8]:
# Render the character pairs in `df` once per *.ttf font in FONT_DIR and collect one
# metadata frame per font; the frames are stacked into `out_df`.
font_paths = sorted(glob.glob(str(FONT_DIR / "*.ttf")))
if not font_paths:
    # Name the real cause instead of letting pd.concat fail with "No objects to concatenate".
    raise FileNotFoundError(f"No .ttf fonts found in {FONT_DIR}")

# Root under which the images are expected on the remote machine.
# NOTE(review): presumably Google Colab, given the /content prefix -- confirm.
remote_data_root = Path("/content/data")

out_data_dfs = []
for font_path in font_paths:
    font_name = Path(font_path).stem
    # Project helper from core.utils.dataset_utils. The frame it returns must contain a
    # "Filename" column (used below).
    # NOTE(review): the "Simplified"/"Traditional" sub-directories below presumably mirror
    # the layout the helper writes under OUT_DATA_DIR / font_name -- confirm.
    this_out_df = generate_and_save_paired_character_images(df, font_path, OUT_DATA_DIR / font_name, image_size=(32, 32), font_size=25)

    # These paths are written into the metadata files, so always emit "/" separators
    # (as_posix) instead of the separators of whichever OS executed the notebook.
    filename_s = this_out_df["Filename"].map(lambda val: (Path(font_name) / "Simplified" / val).as_posix())
    filename_t = this_out_df["Filename"].map(lambda val: (Path(font_name) / "Traditional" / val).as_posix())
    filename_gc_s = filename_s.map(lambda val: (remote_data_root / val).as_posix())
    filename_gc_t = filename_t.map(lambda val: (remote_data_root / val).as_posix())

    # assign() builds a new frame, so the object returned by the helper is not mutated.
    this_out_df = this_out_df.assign(
        **{
            "Filename (S)": filename_s,
            "Filename (T)": filename_t,
            "Filename GC (S)": filename_gc_s,
            "Filename GC (T)": filename_gc_t,
        }
    )

    # Keep only the non-Unihan ("k*"-less) columns, in alphabetical order.
    this_out_df = this_out_df[sorted(c for c in this_out_df.columns if not c.startswith("k"))]
    out_data_dfs.append(this_out_df)
out_df = pd.concat(out_data_dfs, ignore_index=True)
display(out_df)

  0%|          | 0/13093 [00:00<?, ?it/s]

100%|██████████| 13093/13093 [00:14<00:00, 929.58it/s] 


Key,Character,Character (S),Character (T),Filename,Filename (S),Filename (T),Filename GC (S),Filename GC (T),Unicode,Unicode (S),Unicode (T),Unicode Int,Unicode Int (S),Unicode Int (T)
0,㐷,㐷,傌,13367_0.png,NotoSansSC-Regular/Simplified/13367_0.png,NotoSansSC-Regular/Traditional/13367_0.png,/content/data/NotoSansSC-Regular/Simplified/13...,/content/data/NotoSansSC-Regular/Traditional/1...,U+3437,U+3437,U+508C,13367,13367,20620
1,㐹,㐹,㑶,13369_0.png,NotoSansSC-Regular/Simplified/13369_0.png,NotoSansSC-Regular/Traditional/13369_0.png,/content/data/NotoSansSC-Regular/Simplified/13...,/content/data/NotoSansSC-Regular/Traditional/1...,U+3439,U+3439,U+3476,13369,13369,13430
2,㐽,㐽,偑,13373_0.png,NotoSansSC-Regular/Simplified/13373_0.png,NotoSansSC-Regular/Traditional/13373_0.png,/content/data/NotoSansSC-Regular/Simplified/13...,/content/data/NotoSansSC-Regular/Traditional/1...,U+343D,U+343D,U+5051,13373,13373,20561
3,㑇,㑇,㑳,13383_0.png,NotoSansSC-Regular/Simplified/13383_0.png,NotoSansSC-Regular/Traditional/13383_0.png,/content/data/NotoSansSC-Regular/Simplified/13...,/content/data/NotoSansSC-Regular/Traditional/1...,U+3447,U+3447,U+3473,13383,13383,13427
4,㑈,㑈,倲,13384_0.png,NotoSansSC-Regular/Simplified/13384_0.png,NotoSansSC-Regular/Traditional/13384_0.png,/content/data/NotoSansSC-Regular/Simplified/13...,/content/data/NotoSansSC-Regular/Traditional/1...,U+3448,U+3448,U+5032,13384,13384,20530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5964,𬹼,𬹼,齘,183932_0.png,NotoSansSC-Regular/Simplified/183932_0.png,NotoSansSC-Regular/Traditional/183932_0.png,/content/data/NotoSansSC-Regular/Simplified/18...,/content/data/NotoSansSC-Regular/Traditional/1...,U+2CE7C,U+2CE7C,U+9F58,183932,183932,40792
5965,𬺈,𬺈,齮,183944_0.png,NotoSansSC-Regular/Simplified/183944_0.png,NotoSansSC-Regular/Traditional/183944_0.png,/content/data/NotoSansSC-Regular/Simplified/18...,/content/data/NotoSansSC-Regular/Traditional/1...,U+2CE88,U+2CE88,U+9F6E,183944,183944,40814
5966,𬺓,𬺓,齼,183955_0.png,NotoSansSC-Regular/Simplified/183955_0.png,NotoSansSC-Regular/Traditional/183955_0.png,/content/data/NotoSansSC-Regular/Simplified/18...,/content/data/NotoSansSC-Regular/Traditional/1...,U+2CE93,U+2CE93,U+9F7C,183955,183955,40828
5967,𰻝,𰻝,𰻞,200413_0.png,NotoSansSC-Regular/Simplified/200413_0.png,NotoSansSC-Regular/Traditional/200413_0.png,/content/data/NotoSansSC-Regular/Simplified/20...,/content/data/NotoSansSC-Regular/Traditional/2...,U+30EDD,U+30EDD,U+30EDE,200413,200413,200414


In [9]:
# Write every row of out_df as JSON Lines (one JSON object per row).
metadata_jsonl = out_df.to_json(orient="records", lines=True)
with (OUT_DATA_DIR / "metadata.jsonl").open("w") as metadata_file:
    metadata_file.write(metadata_jsonl)

In [10]:
# Smaller subset: the first 1000 rows of out_df in its current row order (no ranking is
# applied), written as JSON Lines.
metadata_top1000_jsonl = out_df.head(1000).to_json(orient="records", lines=True)
with (OUT_DATA_DIR / "metadata_top1000.jsonl").open("w") as metadata_file:
    metadata_file.write(metadata_top1000_jsonl)

In [None]:
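# Optional (disabled): also pickle `df`, the pair table that still carries every Unihan `k*` column.
# df.to_pickle(OUT_DATA_DIR / "full_metadata.pkl")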