In [2]:
import functools
import json
import os
import random
import sys
import urllib.request
from pathlib import Path

import pandas as pd
from PIL import Image, ImageDraw, ImageFont

# ─────────────────────── TOP-LEVEL KNOBS ──────────────────────────
# Master character list read by build(); it must provide the "Unicode",
# "Character" and "caption" columns that build() accesses.
CSV_FILE        = "multilingual_data.csv"   # the full master list
# Output directory: receives the PNGs, metadata.jsonl and char_dataset.csv.
OUT_DIR         = Path("data")              # where PNGs + metadata land
# Side length in pixels of the square canvas; the font size is IMAGE_SIZE - 28.
IMAGE_SIZE      = 128
# NOTE(review): forwarded to build() as the default for images_per_char, but
# build() never reads that value, so it currently has no effect on the output.
IMAGES_PER_CHAR = 5                         # default full-set duplication

# ─── MINI-DATASET SWITCH ──────────────────────────────────────────
# If MINI is False the script uses every row in multilingual_data.csv
# and writes one PNG per distinct code point, named "<Unicode>.png".
# If MINI is True it will:
#    • keep only the rows whose "Unicode" value is listed in MINI_CODES
#      (compared as strings, so the spelling must match the CSV exactly)
#    • duplicate each of those MINI_COPIES times, naming the files
#      "<Unicode>_<NNN>.png" with a zero-padded copy index
MINI         = False
MINI_CODES   = ["0041","0641","0915"]                   # hex Unicode codes for the test
# build()'s own default for mini_copies is 100; this value is passed
# explicitly at the call site.
MINI_COPIES  = 10

# ───────────────────── Font definitions & auto-download ───────────
# Script name → font file used to draw characters of that script.
FONTS = {
    "latin": "NotoSans-Regular.ttf",
    "arabic": "NotoNaskhArabic-Regular.ttf",
    "devanagari": "NotoSansDevanagari-Regular.ttf",
}

# Font file → download URL. Every font here sits in the same repository under
# "<family>/<family>-Regular.ttf", where <family> is the file name up to the
# first "-", so the table is derived from FONTS instead of being spelled out.
_NOTO_TTF_ROOT = "https://github.com/googlefonts/noto-fonts/raw/main/hinted/ttf"
FONT_URL = {
    font_file: f"{_NOTO_TTF_ROOT}/{font_file.split('-')[0]}/{font_file}"
    for font_file in FONTS.values()
}

def ensure_font(ttf):
    """Download the font file *ttf* if it is not already on disk.

    Args:
        ttf: Path (usually a bare file name) of the font. When the file is
            missing, this value is also the key looked up in ``FONT_URL``.

    Raises:
        KeyError: The file is missing and ``FONT_URL`` has no entry for it.
        urllib.error.URLError: The download failed.
    """
    target = Path(ttf)
    if target.exists():
        return
    try:
        url = FONT_URL[ttf]
    except KeyError:
        raise KeyError(f"No download URL registered for font {ttf!r}") from None

    print(f"Downloading {ttf}")
    # Download under a temporary name and rename only after success, so an
    # interrupted transfer never leaves a truncated file that the exists()
    # check above would accept as a valid font on the next run.
    partial = target.with_name(target.name + ".part")
    try:
        urllib.request.urlretrieve(url, partial)
        partial.replace(target)
    finally:
        if partial.exists():
            partial.unlink()

# Fetch every font listed in FONTS that is not already on disk. This runs as
# soon as the module / cell executes, i.e. before any glyph is rendered.
for ttf in FONTS.values():
    ensure_font(ttf)

# ───────────────────── internal helpers ───────────────────────────
def check_row(row):
    """Return True when the row's hex code point matches its character.

    Args:
        row: Any object exposing ``Unicode`` (a hexadecimal string such as
            ``"0041"``) and ``Character`` attributes, e.g. a pandas row or
            a named tuple.

    Returns:
        bool: True if ``Character`` is a single code point equal to
        ``int(Unicode, 16)``. Malformed rows (a missing cell, a
        non-hexadecimal code, or a ``Character`` that is not exactly one
        code point) yield False rather than raising, so the caller can
        report them as an ordinary mismatch.
    """
    code, glyph = row.Unicode, row.Character
    # Missing CSV cells arrive as float NaN even with dtype=str, and ord()
    # only accepts a single code point; neither can match.
    if not isinstance(code, str) or not isinstance(glyph, str) or len(glyph) != 1:
        return False
    try:
        return int(code, 16) == ord(glyph)
    except ValueError:  # code is not valid hexadecimal
        return False

# Unbounded cache is fine here: the key space is the handful of font files in
# FONTS combined with the sizes actually requested.
@functools.lru_cache(maxsize=None)
def _load_font(path, size):
    """Load the TrueType font at *path* at pixel *size*, once per pair."""
    return ImageFont.truetype(path, size)


def pick_font(ch):
    """Return the font used to draw the single character *ch*.

    The font is chosen by code point: U+0600–U+06FF uses the "arabic" entry
    of ``FONTS``, U+0900–U+097F the "devanagari" entry, and everything else
    the "latin" entry. The font size is ``IMAGE_SIZE - 28`` pixels.

    Args:
        ch: A one-character string (it is passed to ``ord``).

    Returns:
        The loaded font object. Repeated calls that resolve to the same file
        and size return the same cached object instead of re-reading the
        font file for every character.
    """
    cp = ord(ch)
    if 0x0600 <= cp <= 0x06FF:
        script = "arabic"
    elif 0x0900 <= cp <= 0x097F:  # this range already contains U+0950
        script = "devanagari"
    else:
        script = "latin"
    return _load_font(FONTS[script], IMAGE_SIZE - 28)

def render(ch, font):
    """Draw *ch* in black, centred on a white square grayscale image.

    Args:
        ch: The text to draw (a single character in this script).
        font: The font object to draw with, as returned by ``pick_font``.

    Returns:
        A new ``IMAGE_SIZE`` × ``IMAGE_SIZE`` image in mode "L".
    """
    img = Image.new("L", (IMAGE_SIZE, IMAGE_SIZE), 255)
    draw = ImageDraw.Draw(img)
    # textbbox returns (left, top, right, bottom) of the drawn ink relative to
    # the anchor point. The ink's size is right-left by bottom-top, and the
    # left/top offsets must be subtracted so the ink itself, rather than the
    # text origin, ends up centred on the canvas.
    left, top, right, bottom = draw.textbbox((0, 0), ch, font=font)
    x = (IMAGE_SIZE - (right - left)) // 2 - left
    y = (IMAGE_SIZE - (bottom - top)) // 2 - top
    draw.text((x, y), ch, font=font, fill=0)
    return img

# ───────────────────── main build routine ─────────────────────────
def build(csv_file=CSV_FILE,
          out_dir=OUT_DIR,
          images_per_char=IMAGES_PER_CHAR,
          mini=False, mini_codes=None, mini_copies=100):
    """Render every character in the CSV to a PNG and write dataset metadata.

    Args:
        csv_file: CSV to read. It must contain the columns "Unicode" (hex
            code point), "Character" and "caption".
        out_dir: Output directory (``str`` or ``Path``); created, including
            missing parents, if necessary.
        images_per_char: Accepted for backward compatibility.
            NOTE(review): this value is not used; the full build writes
            exactly one PNG per distinct "Unicode" value.
        mini: When True, build only the small test subset described below.
        mini_codes: Values of the "Unicode" column to keep when *mini* is
            True. Required in that mode.
        mini_copies: How many copies of each kept row to emit when *mini* is
            True; copies are named "<Unicode>_<NNN>.png".

    Outputs written to *out_dir*:
        * one PNG per distinct file name,
        * ``metadata.jsonl`` with the "file_name" and "caption" columns,
        * ``char_dataset.csv`` with the full table.

    Raises:
        KeyError: A required column is missing from the CSV.
        ValueError: A row's "Unicode" value does not match its "Character",
            or *mini* is True but *mini_codes* was not given.
    """
    if mini and mini_codes is None:
        raise ValueError("mini=True requires mini_codes")
    out_dir = Path(out_dir)  # callers may pass a plain string

    df = pd.read_csv(csv_file, dtype=str)

    # Fail before any PNG is written rather than at the metadata step.
    missing = [col for col in ("Unicode", "Character", "caption")
               if col not in df.columns]
    if missing:
        raise KeyError(f"{csv_file} is missing required column(s): {missing}")

    # Plain iteration (instead of DataFrame.apply) also copes with a CSV that
    # has a header but no rows.
    if not all(check_row(row) for row in df.itertuples(index=False)):
        raise ValueError("Unicode ↔ glyph mismatch in CSV")

    if mini:
        df = df[df["Unicode"].isin(mini_codes)].reset_index(drop=True)
        # Duplicate the rows, then number the copies of each code point.
        df = pd.concat([df] * mini_copies, ignore_index=True)
        df["dup_id"] = df.groupby("Unicode").cumcount()
        df["file_name"] = (df["Unicode"] + "_"
                           + df["dup_id"].astype(str).str.zfill(3) + ".png")
    else:
        df["file_name"] = df["Unicode"] + ".png"

    out_dir.mkdir(parents=True, exist_ok=True)

    # (1) generate PNGs (one per unique file_name)
    for row in df.drop_duplicates("file_name").itertuples(index=False):
        img = render(row.Character, pick_font(row.Character))
        img.save(out_dir / row.file_name)

    # (2) metadata
    df[["file_name", "caption"]].to_json(out_dir / "metadata.jsonl",
                                         orient="records", lines=True,
                                         force_ascii=False)
    df.to_csv(out_dir / "char_dataset.csv", index=False)
    print(f"✅  Wrote {len(df)} rows and {df.file_name.nunique()} PNGs → {out_dir}")

# ───────────────────── call it! ────────────────────────────────────
# Run the build with the mini-dataset knobs defined at the top of the file;
# csv_file, out_dir and images_per_char keep build()'s defaults.
build(
    mini=MINI,
    mini_codes=MINI_CODES,
    mini_copies=MINI_COPIES,
)

Downloading NotoSans-Regular.ttf
Downloading NotoNaskhArabic-Regular.ttf
Downloading NotoSansDevanagari-Regular.ttf
✅  Wrote 30 rows and 30 PNGs → data
