In [3]:
import os
import numpy as np
import cv2
from tqdm import tqdm  # Import tqdm for progress bars
from sklearn.model_selection import train_test_split

# Root directory of the dataset; load_data reads one sub-folder per class label from it
DATA_DIR = 'data'

# Preprocessing parameters
IMG_SIZE = 28  # Target side length in pixels; images are resized to 28x28 (like MNIST)
NUM_CLASSES = 10  # Number of label folders load_data iterates over (named "0" .. "9")

def load_data(data_dir):
    """Load every class folder under *data_dir* into normalized arrays.

    The dataset is expected to contain one sub-folder per class, named
    ``"0"`` .. ``str(NUM_CLASSES - 1)``. Each entry is read as a grayscale
    image and resized to ``IMG_SIZE x IMG_SIZE``.

    Entries that OpenCV cannot decode (non-image files, sub-directories,
    corrupt images) are skipped and reported instead of crashing the whole
    load inside ``cv2.resize``.

    Args:
        data_dir: Path of the dataset root directory.

    Returns:
        A ``(images, labels)`` tuple of NumPy arrays. ``images`` holds the
        resized pixel data divided by 255.0; ``labels`` holds the integer
        class of each image, in the same order.
    """
    images, labels = [], []
    skipped = []
    print("Loading images...")
    for label in tqdm(range(NUM_CLASSES), desc="Processing folders", unit="folder"):
        folder_path = os.path.join(data_dir, str(label))
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order (and therefore any seeded split) reproducible.
        file_names = sorted(os.listdir(folder_path))
        for file_name in tqdm(file_names, desc=f"Loading {label}", unit="file", leave=False):
            img_path = os.path.join(folder_path, file_name)
            img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)  # Load as grayscale
            if img is None:
                # imread signals failure by returning None rather than raising.
                skipped.append(img_path)
                continue
            images.append(cv2.resize(img, (IMG_SIZE, IMG_SIZE)))
            labels.append(label)
    if skipped:
        print(f"Skipped {len(skipped)} unreadable file(s), first: {skipped[0]}")
    images = np.array(images) / 255.0  # Normalize to [0, 1]
    labels = np.array(labels)
    return images, labels

# Load and split data
if __name__ == "__main__":
    print("Starting data preprocessing...")
    images, labels = load_data(DATA_DIR)

    print("Splitting data into train, validation, and test sets...")
    # Stage 1: hold out 20% of all samples as the test set.
    X_temp, X_test, y_temp, y_test = train_test_split(
        images,
        labels,
        test_size=0.2,
        random_state=42,
    )

    # Stage 2: carve the validation set out of the remaining 80%.
    # 0.25 of 0.8 is 0.2, so validation is 20% of the full dataset
    # and training keeps the other 60%.
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp,
        y_temp,
        test_size=0.25,
        random_state=42,
    )

    # Write each split to its own .npy file in the working directory.
    print("Saving processed data...")
    outputs = [
        ('X_train.npy', X_train),
        ('X_val.npy', X_val),
        ('X_test.npy', X_test),
        ('y_train.npy', y_train),
        ('y_val.npy', y_val),
        ('y_test.npy', y_test),
    ]
    with tqdm(total=len(outputs), desc="Saving files", unit="file") as pbar:
        for out_name, out_array in outputs:
            np.save(out_name, out_array)
            pbar.update(1)

    # Summary of the resulting split sizes.
    print("\nData successfully split into train, validation, and test sets!")
    print(f"Training set: {len(X_train)} samples")
    print(f"Validation set: {len(X_val)} samples")
    print(f"Test set: {len(X_test)} samples")


Starting data preprocessing...
Loading images...


Processing folders: 100%|██████████| 10/10 [05:19<00:00, 31.93s/folder]


Splitting data into train, validation, and test sets...
Saving processed data...


Saving files: 100%|██████████| 6/6 [00:00<00:00, 18.01file/s]


Data successfully split into train, validation, and test sets!
Training set: 12933 samples
Validation set: 4311 samples
Test set: 4311 samples



