# Data Preprocessing

In [None]:
from pathlib import Path

from google.colab import drive

# Attach Google Drive to the Colab filesystem before any Drive path is touched.
drive.mount('/content/drive')

# Folder in Drive that holds this project's files; created on first run.
PROJECT_ROOT = Path("/content/drive/MyDrive/Bharde_429_Final")
PROJECT_ROOT.mkdir(parents=True, exist_ok=True)

# Dataset archive (already in Drive from data prep / T-A).
ZIP_PATH = PROJECT_ROOT.joinpath("asl-alphabet.zip")

# Report the resolved locations and whether the archive is actually present.
print("Project root:", PROJECT_ROOT)
print("Zip exists:", ZIP_PATH.exists())

Mounted at /content/drive
Project root: /content/drive/MyDrive/Bharde_429_Final
Zip exists: True


In [None]:
import shutil

LOCAL_BASE  = Path("/content/asl_alphabet_unzipped")
LOCAL_TRAIN = LOCAL_BASE / "asl_alphabet_train"
LOCAL_TEST  = LOCAL_BASE / "asl_alphabet_test"
LOCAL_BASE.mkdir(parents=True, exist_ok=True)

if LOCAL_TRAIN.exists() and any(LOCAL_TRAIN.iterdir()):
    print("Using existing local unzipped data at:", LOCAL_TRAIN)
else:
    assert ZIP_PATH.exists(), "asl-alphabet.zip not found in PROJECT_ROOT."
    print("Unzipping from Drive zip -> /content ...")
    !unzip -q -o "{ZIP_PATH}" -d "{LOCAL_BASE}"
    wrapper = LOCAL_BASE / "asl_alphabet"
    if wrapper.exists():
        inner_train = wrapper / "asl_alphabet_train"
        inner_test  = wrapper / "asl_alphabet_test"
        if inner_train.exists():
            LOCAL_TRAIN.mkdir(exist_ok=True)
            for item in inner_train.iterdir():
                shutil.move(str(item), str(LOCAL_TRAIN))
        if inner_test.exists():
            LOCAL_TEST.mkdir(exist_ok=True)
            for item in inner_test.iterdir():
                shutil.move(str(item), str(LOCAL_TEST))
        shutil.rmtree(wrapper, ignore_errors=True)
    inner_train2 = LOCAL_TRAIN / "asl_alphabet_train"
    if inner_train2.exists():
        for item in inner_train2.iterdir():
            shutil.move(str(item), str(LOCAL_TRAIN))
        shutil.rmtree(inner_train2, ignore_errors=True)
    inner_test2 = LOCAL_TEST / "asl_alphabet_test"
    if inner_test2.exists():
        for item in inner_test2.iterdir():
            shutil.move(str(item), str(LOCAL_TEST))
        shutil.rmtree(inner_test2, ignore_errors=True)

print("Train root:", LOCAL_TRAIN)
print("Test root :", LOCAL_TEST)

Unzipping from Drive zip -> /content ...
Train root: /content/asl_alphabet_unzipped/asl_alphabet_train
Test root : /content/asl_alphabet_unzipped/asl_alphabet_test


In [None]:
# Folder that may still contain a duplicated "asl_alphabet_train" level.
# NOTE(review): this hard-coded path sits under an "asl-alphabet" wrapper, while
# the unzip cell builds LOCAL_TRAIN directly under LOCAL_BASE; confirm this
# location is still the one that needs flattening.
BASE = Path("/content/asl_alphabet_unzipped/asl-alphabet/asl_alphabet_train")
# The duplicated level, if the archive produced one.
INNER = BASE.joinpath("asl_alphabet_train")

if not INNER.is_dir():
    print("No nested folder to flatten at:", INNER)
else:
    # Lift every entry one level up, then drop the now-empty duplicate folder.
    for entry in INNER.iterdir():
        shutil.move(str(entry), str(BASE))
    shutil.rmtree(INNER)
    print("Flattened: moved contents up to", BASE)

No nested folder to flatten at: /content/asl_alphabet_unzipped/asl-alphabet/asl_alphabet_train/asl_alphabet_train


In [None]:
# Folder that may still contain a duplicated "asl_alphabet_test" level.
# NOTE(review): this hard-coded path sits under an "asl-alphabet" wrapper, while
# the unzip cell builds LOCAL_TEST directly under LOCAL_BASE; confirm this
# location is still the one that needs flattening.
BASE = Path("/content/asl_alphabet_unzipped/asl-alphabet/asl_alphabet_test")
# The duplicated level, if the archive produced one.
INNER = BASE.joinpath("asl_alphabet_test")

if not INNER.is_dir():
    print("No nested folder to flatten at:", INNER)
else:
    # Lift every entry one level up, then drop the now-empty duplicate folder.
    for entry in INNER.iterdir():
        shutil.move(str(entry), str(BASE))
    shutil.rmtree(INNER)
    print("Flattened: moved contents up to", BASE)

No nested folder to flatten at: /content/asl_alphabet_unzipped/asl-alphabet/asl_alphabet_test/asl_alphabet_test


# Augment Training Data

In [None]:
from torchvision import transforms

# Offline augmentation pipeline applied to a PIL image: a random rotation
# followed by a random colour jitter.
offline_augment_tfm = transforms.Compose([
    transforms.RandomRotation(degrees=15),  # random rotation within ±15°
    # Jitter brightness, contrast and saturation with strength 0.3 each;
    # no hue argument is passed.
    transforms.ColorJitter(
        brightness=0.3,
        contrast=0.3,
        saturation=0.3,
    ),
])

In [None]:
from pathlib import Path
from PIL import Image
import random

import torch

FRACTION_TO_AUGMENT = 0.30   # 30%
RANDOM_SEED = 42
IMAGE_SUFFIXES = {".jpg", ".jpeg", ".png"}

# Two independent RNGs are involved: `random` decides WHICH files are picked,
# while torchvision's random transforms draw their parameters from torch's
# global generator. Seed both so a fresh run reproduces the same dataset.
random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

train_root = LOCAL_TRAIN  # e.g. Path("/content/asl_alphabet_unzipped/asl_alphabet_train")

# Images are overwritten in place, so running this cell twice would augment
# already-augmented files. This marker records a completed pass; it lives
# inside train_root so it disappears when the data is deleted and re-unzipped.
AUG_DONE_MARKER = train_root / ".offline_augmentation_done"

class_dirs = sorted([d for d in train_root.iterdir() if d.is_dir()])
print("Class folders found:", [d.name for d in class_dirs])

if AUG_DONE_MARKER.exists():
    print(f"Augmentation was already applied to {train_root}; skipping. "
          "Delete that folder and re-run the unzip cell to start from clean data.")
else:
    for class_dir in class_dirs:
        # Sorted so the sampled subset does not depend on directory listing order.
        image_paths = sorted(
            p for p in class_dir.iterdir()
            if p.suffix.lower() in IMAGE_SUFFIXES
        )

        n_images = len(image_paths)
        n_to_aug = int(FRACTION_TO_AUGMENT * n_images)

        if n_to_aug == 0:
            # Also reached for very small classes, not only for empty ones.
            print(f"Skipping {class_dir.name} "
                  f"({n_images} images: too few to augment {FRACTION_TO_AUGMENT:.0%}).")
            continue

        images_to_aug = random.sample(image_paths, n_to_aug)

        print(f"Class {class_dir.name}: {n_images} total → augmenting {n_to_aug} images")

        for img_path in images_to_aug:
            # Decode fully and close the source file before writing to the same path.
            with Image.open(img_path) as src_img:
                rgb_img = src_img.convert("RGB")
            aug_img = offline_augment_tfm(rgb_img)
            aug_img.save(img_path)   # overwrite the original file

    AUG_DONE_MARKER.touch()

Class folders found: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'del', 'nothing', 'space']
Class A: 2999 total → augmenting 899 images
Class B: 3000 total → augmenting 900 images
Class C: 3000 total → augmenting 900 images
Class D: 3000 total → augmenting 900 images
Class E: 3000 total → augmenting 900 images
Class F: 3000 total → augmenting 900 images
Class G: 3000 total → augmenting 900 images
Class H: 3000 total → augmenting 900 images
Class I: 3000 total → augmenting 900 images
Class J: 3000 total → augmenting 900 images
Class K: 3000 total → augmenting 900 images
Class L: 3000 total → augmenting 900 images
Class M: 3000 total → augmenting 900 images
Class N: 3000 total → augmenting 900 images
Class O: 3000 total → augmenting 900 images
Class P: 3000 total → augmenting 900 images
Class Q: 3000 total → augmenting 900 images
Class R: 3000 total → augmenting 900 images
Class S: 3000 total → augment

In [None]:
import shutil

# Destination in Drive for the (in-place augmented) local training folder.
AUG_TRAIN_DRIVE = PROJECT_ROOT.joinpath("asl_alphabet_train_augmented")

# Start from an empty destination so files from an earlier copy never mix in.
drive_copy_present = AUG_TRAIN_DRIVE.exists()
if drive_copy_present:
    shutil.rmtree(AUG_TRAIN_DRIVE)

print("Copying augmented training data to Drive…")
shutil.copytree(src=LOCAL_TRAIN, dst=AUG_TRAIN_DRIVE)

print("Saved at:", AUG_TRAIN_DRIVE)

Copying augmented training data to Drive…
Saved at: /content/drive/MyDrive/Bharde_429_Final/asl_alphabet_train_augmented


In [None]:
# Sanity check: every class folder copied to Drive must hold exactly as many
# images as its local source folder.
from pathlib import Path

AUG_TRAIN_DRIVE = PROJECT_ROOT / "asl_alphabet_train_augmented"
IMAGE_SUFFIXES = {".jpg", ".jpeg", ".png"}


def count_images(folder):
    """Return how many entries directly inside `folder` have an image suffix."""
    return sum(1 for p in folder.iterdir() if p.suffix.lower() in IMAGE_SUFFIXES)


class_dirs = sorted([d for d in AUG_TRAIN_DRIVE.iterdir() if d.is_dir()])

print("Classes found:", [d.name for d in class_dirs])
print("-" * 50)

# Per-class image counts of the Drive copy, keyed by class folder name.
counts = {class_dir.name: count_images(class_dir) for class_dir in class_dirs}
for class_name, n_images in counts.items():
    print(f"{class_name}: {n_images} images")
total = sum(counts.values())

print("-" * 50)
print("TOTAL IMAGES:", total)

# Compare against the local folder the copy was made from; printing counts
# alone would not reveal a missing class or a partially copied folder.
source_counts = {
    d.name: count_images(d) for d in sorted(LOCAL_TRAIN.iterdir()) if d.is_dir()
}
mismatched = {
    name: (source_counts.get(name), counts.get(name))
    for name in sorted(set(source_counts) | set(counts))
    if source_counts.get(name) != counts.get(name)
}
assert not mismatched, (
    f"Drive copy differs from local data, as class: (local, drive): {mismatched}"
)
print("Drive copy matches the local training data.")

Classes found: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'del', 'nothing', 'space']
--------------------------------------------------
A: 2999 images
B: 3000 images
C: 3000 images
D: 3000 images
E: 3000 images
F: 3000 images
G: 3000 images
H: 3000 images
I: 3000 images
J: 3000 images
K: 3000 images
L: 3000 images
M: 3000 images
N: 3000 images
O: 3000 images
P: 3000 images
Q: 3000 images
R: 3000 images
S: 3000 images
T: 3000 images
U: 3000 images
V: 3000 images
W: 3000 images
X: 3000 images
Y: 3000 images
Z: 3000 images
del: 3000 images
nothing: 3000 images
space: 3000 images
--------------------------------------------------
TOTAL IMAGES: 86999


In [None]:
# List the top-level entries of the augmented training folder in Drive.
!ls "$AUG_TRAIN_DRIVE"

A  C  del  F  H  J  L  N	O  Q  S      T	V  X  Z
B  D  E    G  I  K  M  nothing	P  R  space  U	W  Y


# Zip the augmented training folder and save it to Drive

In [None]:
import shutil

# Archive is built on the Colab VM first, then copied to Drive as one file.
LOCAL_ZIP = "/content/asl_alphabet_train_augmented.zip"
ZIP_OUTPUT = PROJECT_ROOT / "asl_alphabet_train_augmented.zip"

# Zip from the LOCAL training folder (the same data that was copied to
# AUG_TRAIN_DRIVE). Zipping with root_dir on Drive reads every image back
# through the Drive mount; the recorded run of that version was interrupted.
shutil.make_archive(
    base_name=LOCAL_ZIP.replace(".zip", ""),  # make_archive appends ".zip" itself
    format="zip",
    root_dir=str(LOCAL_TRAIN),
)

print("Copying zip to Drive…")
shutil.copy(LOCAL_ZIP, ZIP_OUTPUT)
print("Saved at:", ZIP_OUTPUT)

KeyboardInterrupt: 

In [None]:
!ls -lh "$ZIP_OUTPUT"

-rw------- 1 root root 511M Dec  4 21:17 /content/drive/MyDrive/Bharde_429_Final/asl_alphabet_train_augmented.zip
