In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import json
from PIL import Image, UnidentifiedImageError
from sklearn.model_selection import train_test_split

# Dataset and output locations -- adjust DATA_ROOT to wherever the dataset lives.
DATA_ROOT = Path("./asl_alphabet_train")
ARTIFACTS = Path("./artifacts")
# parents=True lets ARTIFACTS be a nested path (e.g. "runs/exp1/artifacts")
# without a FileNotFoundError; exist_ok=True keeps re-running this cell harmless.
ARTIFACTS.mkdir(parents=True, exist_ok=True)

# Random seed handed to train_test_split so the train/val split is repeatable.
SEED = 429

In [10]:
# Build a (path, label) table: every immediate sub-directory of DATA_ROOT is
# treated as one class, and every "*.jpg" file directly inside it as one sample.
rows = []
for class_dir in sorted(DATA_ROOT.iterdir()):
    if not class_dir.is_dir():
        continue
    label = class_dir.name
    # Sort the matches: directory listing order is filesystem-dependent, and the
    # row order of df decides which images the positional train/val indices
    # computed further down refer to. Without a stable order the seeded split
    # is not reproducible across machines.
    for img_path in sorted(class_dir.glob("*.jpg")):
        rows.append((str(img_path.resolve()), label))

df = pd.DataFrame(rows, columns=["path", "label"])
print("Total images:", len(df))
print(f"{len(df['label'].unique())} Classes found:", sorted(df['label'].unique()))

Total images: 87000
29 Classes found: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'del', 'nothing', 'space']


In [12]:
# Whitelist of classes to keep: the letters A-Z plus 'space'.
keep = list(map(chr, range(ord('A'), ord('Z') + 1)))
keep.append('space')
# Classes that are explicitly excluded.
drop = {'del', 'nothing'}

# A row survives only if its label is absent from `drop` and present in `keep`.
label_col = df['label']
wanted = ~label_col.isin(drop) & label_col.isin(keep)
df = df[wanted].reset_index(drop=True)

print("Dropped classes:", sorted(drop))

kept_classes = sorted(df['label'].unique())
print(f"{len(kept_classes)} Classes kept:", kept_classes)

Dropped classes: ['del', 'nothing']
27 Classes kept: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'space']


In [None]:
# Check for corrupt images: keep only rows whose file PIL can open and verify.
# Failures are recorded instead of being silently discarded, so that dropped
# files are visible in the cell output and can be inspected via `bad_paths`.
# Note: OSError also covers missing/unreadable files (e.g. FileNotFoundError),
# so those rows are dropped here as well.
# NOTE(review): per the Pillow docs, verify() checks the file without decoding
# the image data, so some damaged files may still pass -- confirm whether a
# full decode is needed for this dataset.
good_idx = []
bad_paths = []  # (path, repr(exception)) for every rejected file
for i, p in enumerate(df['path']):
    try:
        with Image.open(p) as im:
            im.verify()
    except (UnidentifiedImageError, OSError) as err:
        bad_paths.append((p, repr(err)))
    else:
        good_idx.append(i)

df = df.iloc[good_idx].reset_index(drop=True)
print("Images after corrupt check:", len(df))
if bad_paths:
    # Show a few examples only; the full list stays available in `bad_paths`.
    print(f"Skipped {len(bad_paths)} unreadable image(s), e.g.:")
    for bad_path, reason in bad_paths[:5]:
        print("  ", bad_path, "->", reason)

Images after corrupt check: 81000


In [15]:
# Encode labels as numeric ids: each class in `keep` gets its position in
# sorted order, stored in a new 'y' column.
classes = sorted(keep)
class_to_idx = dict(zip(classes, range(len(classes))))
df['y'] = df['label'].map(class_to_idx)

# Save the class -> id mapping as JSON.
mapping_file = ARTIFACTS / "class_to_idx.json"
with open(mapping_file, "w") as fh:
    fh.write(json.dumps(class_to_idx, indent=2))

In [16]:
# Stratified 80/20 train/validation split over the row positions of df.
labels = df['y'].to_numpy()
indices = np.arange(labels.shape[0])

train_idx, val_idx = train_test_split(
    indices,
    test_size=0.2,
    stratify=labels,
    random_state=SEED,
)

# Persist both index arrays next to the other artifacts.
for filename, idx_array in (("train_idx.npy", train_idx), ("val_idx.npy", val_idx)):
    np.save(ARTIFACTS / filename, idx_array)

print(f"Train size: {len(train_idx)}, Val size: {len(val_idx)}")

Train size: 64800, Val size: 16200


In [17]:
# Select the two subsets by row position and renumber their rows from 0.
df_train = df.iloc[train_idx].reset_index(drop=True)
df_val = df.iloc[val_idx].reset_index(drop=True)

# Write the full filtered table and both subsets as CSV, without the index column.
manifests = {
    "manifest_filtered.csv": df,
    "train_manifest.csv": df_train,
    "val_manifest.csv": df_val,
}
for csv_name, frame in manifests.items():
    frame.to_csv(ARTIFACTS / csv_name, index=False)

print("Artifacts saved in:", ARTIFACTS.resolve())

Artifacts saved in: /Users/zainbharde/Documents/GitHub/Bharde_429_Final/artifacts
