In [2]:
from pathlib import Path
from collections import defaultdict, Counter
from PIL import Image
import pandas as pd
import json, random, os

# --- Configure these two paths ---
# ROOT: dataset directory, expected to hold one sub-folder per class.
ROOT = Path("/home/noushath/NSResearch/ASLR/data/RGB ArSL dataset/").resolve()

# OUTDIR: where the split CSVs and classes.json are written; created if absent.
OUTDIR = Path("/home/noushath/NSResearch/ASLR/splits/")
OUTDIR.mkdir(parents=True, exist_ok=True)

# Seed for the per-class shuffle, so the splits are reproducible.
SEED = 42

# File suffixes (lower-case, with leading dot) accepted as images.
ALLOWED_EXT = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}

def is_image_ok(path: Path) -> bool:
    """Return True if PIL can open *path* and ``verify()`` raises nothing.

    Any exception raised while opening or verifying the file is treated as
    "not a usable image" and reported as False rather than propagated.
    """
    try:
        with Image.open(path) as img:
            img.verify()
    except Exception:
        # Deliberately broad: any failure here means the file is unusable.
        return False
    else:
        return True

def split_per_class(files, seed=42):
    """Shuffle one class's files and split them into train/val/test lists.

    The input is not modified: a copy is shuffled with ``random.Random(seed)``,
    so the same ``seed`` and the same input order always give the same split.

    Split sizes by number of files ``n``:
      * n >= 7: val and test each get ``max(1, round(0.15 * n))`` items,
        train gets the remainder.
      * 3 <= n <= 6: one item each for val and test, the rest for train.
      * n == 2: one train item, one test item, no val.
      * n <= 1: everything (zero or one item) goes to train.

    Args:
        files: Iterable of items belonging to a single class.
        seed: Seed for the shuffle.

    Returns:
        A ``(train, val, test)`` tuple of disjoint lists.
    """
    # Shuffle a copy so the caller's list keeps its order. The permutation
    # depends only on the seed and the length, so the resulting split is the
    # same one an in-place shuffle would have produced.
    shuffled = list(files)
    random.Random(seed).shuffle(shuffled)

    n = len(shuffled)
    if n >= 7:
        # For n >= 7 the two 15% shares always leave a positive train count,
        # so no extra guard on ``train`` is required.
        val = test = max(1, round(0.15 * n))
        train = n - val - test
    elif n >= 3:
        train, val, test = n - 2, 1, 1
    elif n == 2:
        train, val, test = 1, 0, 1
    else:
        # n is 0 or 1: slicing past the end simply yields what is available.
        train, val, test = 1, 0, 0

    val_end = train + val
    test_end = val_end + test
    return shuffled[:train], shuffled[train:val_end], shuffled[val_end:test_end]

# 1) Scan dataset: expect 1 subfolder per class beneath ROOT
# Each immediate sub-directory of ROOT is one class, named after the folder.
# Files are collected recursively; only readable images with an allowed
# suffix are kept, and unreadable ones are reported on stdout.
class_to_files = defaultdict(list)
for cls_dir in sorted(d for d in ROOT.iterdir() if d.is_dir()):
    cls = cls_dir.name
    for p in cls_dir.rglob("*"):
        # Check the suffix first so is_file() is only called on candidates.
        if p.suffix.lower() not in ALLOWED_EXT or not p.is_file():
            continue
        if not is_image_ok(p):
            print(f"Skipped corrupt image: {p}")
            continue
        class_to_files[cls].append(p)

# Only classes that ended up with at least one usable image appear here.
classes = sorted(class_to_files)
print(f"Found {len(classes)} classes under {ROOT}")

# 2) Split per class and collect rows
# Files are sorted before splitting so the seeded shuffle starts from the
# same order on every run, whatever order the directory scan returned.
train_rows, val_rows, test_rows = [], [], []
for cls in classes:
    files = sorted(class_to_files[cls])
    tr, va, te = split_per_class(files, seed=SEED)
    train_rows.extend({"image_path": str(p), "label": cls} for p in tr)
    val_rows.extend({"image_path": str(p), "label": cls} for p in va)
    test_rows.extend({"image_path": str(p), "label": cls} for p in te)
    # One summary line per class: total count and the size of each split.
    print(f"{cls:25s} total={len(files):4d} | train={len(tr):4d} val={len(va):4d} test={len(te):4d}")

# 3) Write CSVs and classes.json
# Column names are pinned so that a split with no rows (for example val when
# every class is tiny) is still written with a proper header line instead of
# as a column-less CSV. For non-empty splits the output is unchanged.
columns = ["image_path", "label"]
df_tr = pd.DataFrame(train_rows, columns=columns)
df_va = pd.DataFrame(val_rows, columns=columns)
df_te = pd.DataFrame(test_rows, columns=columns)
df_tr.to_csv(OUTDIR / "train.csv", index=False)
df_va.to_csv(OUTDIR / "val.csv",   index=False)
df_te.to_csv(OUTDIR / "test.csv",  index=False)

# Sorted class names; ensure_ascii=False keeps non-ASCII names readable.
with open(OUTDIR / "classes.json", "w", encoding="utf-8") as f:
    json.dump({"classes": classes}, f, ensure_ascii=False, indent=2)

print(f"\nWrote: {len(df_tr)} train, {len(df_va)} val, {len(df_te)} test → {OUTDIR}")


Found 31 classes under /home/noushath/NSResearch/ASLR/data/RGB ArSL dataset
Ain                       total= 244 | train= 170 val=  37 test=  37
Al                        total= 276 | train= 194 val=  41 test=  41
Alef                      total= 287 | train= 201 val=  43 test=  43
Beh                       total= 307 | train= 215 val=  46 test=  46
Dad                       total= 266 | train= 186 val=  40 test=  40
Dal                       total= 235 | train= 165 val=  35 test=  35
Feh                       total= 255 | train= 179 val=  38 test=  38
Ghain                     total= 230 | train= 162 val=  34 test=  34
Hah                       total= 246 | train= 172 val=  37 test=  37
Heh                       total= 253 | train= 177 val=  38 test=  38
Jeem                      total= 210 | train= 146 val=  32 test=  32
Kaf                       total= 264 | train= 184 val=  40 test=  40
Khah                      total= 250 | train= 174 val=  38 test=  38
Laa                       t