# Cell 1: Imports & Config

In [1]:
import os
import glob
import shutil
from PIL import Image
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Locations and split proportions used by every later cell.
RAW_DIR = "../data/Indonesian Spices Dataset"  # source dataset, one sub-folder per class
SPLIT_DIR = "../data_split"                    # destination root for the split copies
RATIOS = (0.7, 0.2, 0.1)                       # fractions in the order train, val, test

# File extensions accepted as images (compared against the lower-cased path).
valid_exts = (".jpg", ".jpeg", ".png", ".bmp", ".gif")

# Make sure the three subset folders exist before anything is copied into them.
for subset in ("train", "val", "test"):
    subset_dir = os.path.join(SPLIT_DIR, subset)
    os.makedirs(subset_dir, exist_ok=True)

print("Config siap. RAW_DIR =", RAW_DIR)


Config siap. RAW_DIR = ../data/Indonesian Spices Dataset


# Cell 2: Fungsi Bersihkan & Cek Gambar

In [2]:
def is_image_valid(path):
    """Return True when ``path`` looks like an image and PIL can verify it.

    Two checks are applied in order:

    1. The lower-cased path must end with one of the extensions in the
       module-level ``valid_exts`` tuple.
    2. ``Image.open(path).verify()`` must complete without raising.

    Args:
        path: Filesystem path of the candidate file.

    Returns:
        bool: True if both checks pass, False otherwise. Errors raised while
        opening or verifying the file are reported as False, not propagated.
    """
    if not path.lower().endswith(valid_exts):
        return False
    try:
        # Image.open leaves the underlying file open; the context manager
        # closes it deterministically so handles do not pile up while a whole
        # dataset is scanned (and so the file can be deleted right afterwards).
        with Image.open(path) as img:
            img.verify()
    except Exception:
        # Any open/verify failure means the file is unusable as an image.
        # Catching Exception rather than using a bare ``except`` lets
        # KeyboardInterrupt / SystemExit still stop a long-running scan.
        return False
    return True

def clean_folder(folder):
    """Delete non-image and corrupt files from every class sub-folder of ``folder``.

    ``folder`` is expected to contain one sub-directory per class. Each regular
    file directly inside a class sub-directory is checked with
    ``is_image_valid``; files that fail the check are deleted from disk.

    Entries that do not fit that layout are skipped and left untouched:
    plain files sitting directly in ``folder`` (previously these made
    ``os.listdir`` raise) and sub-directories nested inside a class folder
    (previously these were passed to ``os.remove``, which cannot delete
    directories).

    Args:
        folder: Path of the directory whose class sub-folders are cleaned.

    Returns:
        list[str]: Paths of the files that were deleted, in scan order.
    """
    removed = []
    for cls in os.listdir(folder):
        cls_dir = os.path.join(folder, cls)
        if not os.path.isdir(cls_dir):
            # Same guard the split cell uses: only directories are classes.
            continue
        for fname in os.listdir(cls_dir):
            path = os.path.join(cls_dir, fname)
            if not os.path.isfile(path):
                # Nested directories are not candidates for os.remove.
                continue
            if not is_image_valid(path):
                os.remove(path)
                removed.append(path)
    return removed

# Clean RAW_DIR in place before splitting, then report how many files were deleted.
bad_raw = clean_folder(RAW_DIR)
bad_raw_count = len(bad_raw)
print(f"Dihapus dari RAW (non-gambar/korup): {bad_raw_count} file")


Dihapus dari RAW (non-gambar/korup): 0 file


# Cell 3: Split Data

In [3]:
# Split every class folder of RAW_DIR into train/val/test and copy the files
# into SPLIT_DIR/<subset>/<class>.
for cls in os.listdir(RAW_DIR):
    src = os.path.join(RAW_DIR, cls)
    if not os.path.isdir(src):
        continue  # only directories are treated as classes

    # Sorted: glob returns matches in arbitrary (filesystem-dependent) order,
    # so without sorting the fixed random_state would not give the same split
    # on every run/machine, and a re-run could copy an image into a different
    # subset than before, leaking it across train/val/test.
    # Regular files only: shutil.copy cannot copy a nested directory.
    files = sorted(
        f for f in glob.glob(os.path.join(src, "*")) if os.path.isfile(f)
    )

    # First cut separates train from the rest; second cut divides the rest
    # into val and test in the proportion RATIOS[1] : RATIOS[2].
    train_f, temp = train_test_split(files, test_size=1-RATIOS[0], random_state=42)
    val_f,  test_f = train_test_split(temp, test_size=RATIOS[2]/(RATIOS[1]+RATIOS[2]), random_state=42)

    # NOTE(review): files left in SPLIT_DIR by an earlier run are not removed
    # here; clear SPLIT_DIR manually if the raw dataset or the ratios change.
    for subset, flist in zip(["train","val","test"], [train_f, val_f, test_f]):
        dst_cls = os.path.join(SPLIT_DIR, subset, cls)
        os.makedirs(dst_cls, exist_ok=True)
        for f in flist:
            shutil.copy(f, dst_cls)

print("📦 Data split selesai")


📦 Data split selesai


# Cell 4: Pembersihan & Validasi Setelah Split

In [5]:
# Re-run the cleaner on each subset of the split and collect every deleted path.
removed_after = []
for subset in ("train", "val", "test"):
    folder = os.path.join(SPLIT_DIR, subset)
    removed_after.extend(clean_folder(folder))

removed_after_count = len(removed_after)
print(f"Dihapus setelah split (non-gambar/korup): {removed_after_count} file")


Dihapus setelah split (non-gambar/korup): 0 file


# Cell 5: Ringkasan Hasil Split

In [6]:
# Report how many files each subset holds, counting recursively under its folder.
for subset in ("train", "val", "test"):
    subset_root = os.path.join(SPLIT_DIR, subset)
    total = 0
    for _dirpath, _dirnames, filenames in os.walk(subset_root):
        total += len(filenames)
    print(f" • {subset}: {total} gambar")

print("✅ Preprocessing lengkap: split + cleaning + validasi")


 • train: 4526 gambar
 • val: 1302 gambar
 • test: 682 gambar
✅ Preprocessing lengkap: split + cleaning + validasi
