In [None]:
import os
import pandas as pd
from translate import translate  
import tensorflow as tf
from sklearn.model_selection import train_test_split
from PIL import Image

# 1- Build DataFrame with translated labels

dataset_path = r"/Users/ia_dev/Desktop/Work/archive/raw-img"
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

# Some class names may only appear in `translate` in the opposite direction
# (translated name -> folder name). Inverting the mapping lets those folders
# still receive a translated label instead of silently keeping the raw name.
# NOTE(review): assumes `translate` is a plain dict of str -> str - confirm.
reverse_translate = {value: key for key, value in translate.items()}

data = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and therefore the seeded train/test split built from it) reproducible.
for folder in sorted(os.listdir(dataset_path)):
    folder_path = os.path.join(dataset_path, folder)

    # Skip stray files that sit next to the category folders
    if not os.path.isdir(folder_path):
        continue

    # Prefer the direct translation, then the inverted one, then the folder name
    if folder in translate:
        label = translate[folder]
    elif folder in reverse_translate:
        label = reverse_translate[folder]
    else:
        label = folder
        print("No translation found for folder:", folder)

    # Keep only image files; each row stores the full path and its label
    for file in sorted(os.listdir(folder_path)):
        if file.lower().endswith(IMAGE_EXTENSIONS):
            data.append([os.path.join(folder_path, file), label])

# Create a Pandas DataFrame to manage metadata
df = pd.DataFrame(data, columns=["image_path", "label"])

# Display the first few rows and distribution stats
print(df.head())
print("Total images:", len(df))
print(df['label'].value_counts())

# 2- Train / Test Split (90/10)

# Hold out 10% of the rows for testing. The split is stratified on the label
# column and uses a fixed seed, so the same input rows give the same split.
split_options = {
    "test_size": 0.1,
    "stratify": df['label'],
    "random_state": 42,
}
train_df, test_df = train_test_split(df, **split_options)

print(f"Train size: {len(train_df)}")
print(f"Test size: {len(test_df)}")

# 3- Function to convert DataFrame to tf.data.Dataset

IMG_SIZE = 224
BATCH_SIZE = 32


def df_to_dataset(df, shuffle=True, batch_size=32, class_names=None):
    """Build a batched ``tf.data.Dataset`` of (image, label) pairs from a DataFrame.

    Args:
        df: DataFrame with an ``image_path`` column (file paths) and a
            ``label`` column (class names).
        shuffle: When true, the (path, label) pairs are shuffled before the
            images are loaded.
        batch_size: Number of examples per batch.
        class_names: Optional ordered sequence of class names. When given, a
            label's integer code is its position in this sequence, so several
            DataFrames (e.g. train and test) share one encoding even if a
            class is missing from one of them. When omitted, codes are derived
            from the labels present in ``df`` alone, as before.

    Returns:
        A dataset yielding batches of images resized to IMG_SIZE x IMG_SIZE
        with 3 channels and values scaled to [0, 1], with integer labels.

    Raises:
        ValueError: If ``class_names`` is given and ``df`` contains a label
            that is not listed in it.
    """
    paths = df['image_path'].values

    if class_names is None:
        # string -> integer, based only on the labels found in this DataFrame
        labels = df['label'].astype('category').cat.codes.values
    else:
        labels = pd.Categorical(df['label'], categories=list(class_names)).codes
        # pandas encodes values outside `categories` as -1
        if (labels < 0).any():
            raise ValueError("df contains labels that are not in class_names")

    ds = tf.data.Dataset.from_tensor_slices((paths, labels))

    # Shuffle BEFORE decoding: the buffer then holds short path strings rather
    # than decoded float images, so a buffer covering the whole DataFrame stays
    # small in memory.
    if shuffle:
        ds = ds.shuffle(buffer_size=len(df))

    def process_path(path, label):
        """Load one image file and return it as a scaled float tensor."""
        # Read the raw file from disk
        img = tf.io.read_file(path)
        # NOTE(review): decode_jpeg is applied to every path, whatever its
        # extension - confirm it accepts all formats passed in, or switch to
        # tf.io.decode_image(..., expand_animations=False).
        img = tf.image.decode_jpeg(img, channels=3)
        # Resize image to the input size required by the model
        img = tf.image.resize(img, [IMG_SIZE, IMG_SIZE])
        # Scale pixel values from [0, 255] to [0, 1]
        img = img / 255.0
        return img, label

    # Load and preprocess images in parallel; tf.data picks the parallelism
    ds = ds.map(process_path, num_parallel_calls=tf.data.AUTOTUNE)

    # Group examples into batches
    ds = ds.batch(batch_size)
    # Prepare upcoming batches in the background while the current one is consumed
    ds = ds.prefetch(buffer_size=tf.data.AUTOTUNE)

    return ds

# 4- Create TensorFlow Datasets

# Create the actual dataset objects ready for training
train_dataset = df_to_dataset(train_df, batch_size=BATCH_SIZE)
# Do not shuffle test data.
test_dataset = df_to_dataset(test_df, shuffle=False, batch_size=BATCH_SIZE)

# 5- Quick verification

# Take 1 batch from the training set to verify shapes and values.
# Everything that needs the batch lives inside the loop, so an empty dataset
# simply skips the check instead of failing on an undefined variable.
for images, labels in train_dataset.take(1):
    # Convert to NumPy once and reuse it for the statistics and the preview
    batch_pixels = images.numpy()

    print("Image batch shape:", images.shape)
    print("Label batch shape:", labels.shape)
    print("Example labels:", labels[:10].numpy())
    print("Min pixel:", batch_pixels.min())
    print("Max pixel:", batch_pixels.max())

    # Visualize the first image. Round before casting: a plain uint8 cast
    # truncates, turning e.g. 199.99998 into 199.
    img = batch_pixels[0]
    Image.fromarray((img * 255).round().astype("uint8")).show()


                                          image_path    label
0  /Users/ia_dev/Desktop/Work/archive/raw-img/gal...  chicken
1  /Users/ia_dev/Desktop/Work/archive/raw-img/gal...  chicken
2  /Users/ia_dev/Desktop/Work/archive/raw-img/gal...  chicken
3  /Users/ia_dev/Desktop/Work/archive/raw-img/gal...  chicken
4  /Users/ia_dev/Desktop/Work/archive/raw-img/gal...  chicken
Total images: 26179
label
dog          4863
ragno        4821
chicken      3098
horse        2623
butterfly    2112
cow          1866
squirrel     1862
sheep        1820
cat          1668
elephant     1446
Name: count, dtype: int64
Train size: 23561
Test size: 2618


2026-01-28 14:33:04.549774: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4 Max
2026-01-28 14:33:04.549799: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 36.00 GB
2026-01-28 14:33:04.549804: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 14.04 GB
2026-01-28 14:33:04.549822: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2026-01-28 14:33:04.549830: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Image batch shape: (32, 224, 224, 3)
Label batch shape: (32,)
Example labels: [4 0 7 4 2 2 4 7 7 4]
Min pixel: 0.0
Max pixel: 1.0


2026-01-28 14:33:09.901911: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
