## Define parameters

In [5]:
# Hex Unicode code points to include (e.g. Latin 'A', Arabic, Devanagari).
codes = [
    "0041",
    "0641",
    "0915",
]

# Number of image entries generated for each code point.
copies = 10

# Directory that receives the generated dataset files.
out_dir = "data_tiny"

## Generate the mini dataset

In [6]:
import pandas as pd
from pathlib import Path

# --- load the full table (multilingual_data.csv must be in the current working directory) ---
# Expected columns: Unicode, Character, caption.
# dtype=str reads every column as text, so codes such as "0041" are not parsed as integers.
full = pd.read_csv("multilingual_data.csv", dtype=str)
sel = full[full["Unicode"].isin(codes)].reset_index(drop=True)

if sel.empty:
    raise SystemExit("None of the given codes found in multilingual_data.csv")

# Requested codes without a match would otherwise be dropped without notice.
missing = sorted(set(codes) - set(sel["Unicode"]))
if missing:
    print(f"WARNING: codes not found in multilingual_data.csv: {', '.join(missing)}")

# With no copies the frame built below has no columns, and selecting
# ["file_name", "caption"] for the JSONL export fails with an opaque KeyError.
if copies < 1:
    raise SystemExit("copies must be at least 1")

# --- expand every selected character into `copies` rows, one per image file ---
rows = []
for _, row in sel.iterrows():
    for i in range(copies):
        fname = f"{row.Unicode}_{i:03d}.png"
        rows.append({"file_name": fname, "caption": row.caption,
                     "Unicode": row.Unicode, "Character": row.Character})

tiny = pd.DataFrame(rows)

# --- export CSV + JSONL ---------------------------------------------------------
out_path = Path(out_dir).resolve()
# parents=True lets out_dir be a nested path whose parent directories do not exist yet.
out_path.mkdir(parents=True, exist_ok=True)

tiny.to_csv(out_path / "char_dataset.csv", index=False)
# force_ascii=False writes non-ASCII text as-is rather than as \uXXXX escapes.
tiny[["file_name", "caption"]].to_json(out_path / "metadata.jsonl",
                                       orient="records", lines=True,
                                       force_ascii=False)

print(f"✅  wrote {len(tiny)} rows to {out_path/'metadata.jsonl'}; "
      f"now call build_dataset.py --out-dir {out_path} --csv {out_path/'char_dataset.csv'} "
      f"--images-per-char 1 to actually render the PNGs")

✅  wrote 30 rows to /workspaces/multilingual_diffusion/data_tiny/metadata.jsonl; now call build_dataset.py --out-dir /workspaces/multilingual_diffusion/data_tiny --csv /workspaces/multilingual_diffusion/data_tiny/char_dataset.csv --images-per-char 1 to actually render the PNGs
