- Preprocess the dataset in a memory-efficient way and save the preprocessed data to the `preprocessed_data` directory.

- The following code preprocesses the data batch-wise and saves the batches of each split (train, val, test) in that split's own directory.

#### The preprocessing was done with the code below:

In [None]:
import tensorflow as tf
import numpy as np
import os

# Source folders of the raw images, one per split (train / val / test)
train_dir, val_dir, test_dir = (
    "D:/BirdFeatherClassification/dataset/split_data/train",
    "D:/BirdFeatherClassification/dataset/split_data/val",
    "D:/BirdFeatherClassification/dataset/split_data/test",
)

# Preprocessing settings: target (height, width) used when resizing, and batch size
IMG_SIZE, BATCH_SIZE = (128, 128), 16

# Function to load and preprocess images
def load_image(image_path, label):
    """Load one image file, resize it to IMG_SIZE and scale it to [0, 1].

    The file content is decoded with ``tf.io.decode_image``, which detects
    JPEG, PNG, BMP and GIF data by itself; with ``expand_animations=False``
    only the first frame of a GIF is kept, so the result is always a single
    3-channel image.

    Args:
        image_path: Scalar string tensor holding the path of the image file.
        label: Class index paired with the image; returned unchanged when
            the image decodes successfully.

    Returns:
        A tuple ``(image, label)``. ``image`` is a float32 tensor of shape
        ``IMG_SIZE + (3,)``. If the file cannot be decoded, an all-zero
        image and the label ``-1`` are returned so the caller can filter
        the sample out.
    """
    raw_bytes = tf.io.read_file(image_path)

    # NOTE: catching the decode error relies on eager execution; this
    # function is expected to be run through tf.py_function.
    try:
        img = tf.io.decode_image(raw_bytes, channels=3, expand_animations=False)
    except tf.errors.InvalidArgumentError:
        # Unsupported format or corrupted data: report it and hand back a
        # placeholder instead of aborting the whole pipeline.
        print(f"Skipping unsupported or corrupted image: {image_path.numpy().decode()}")
        return tf.zeros(IMG_SIZE + (3,)), -1  # Dummy image & invalid label

    img = tf.image.resize(img, IMG_SIZE)  # Resize image (result is float32)
    img = img / 255.0  # Normalize pixel values to [0, 1]
    return img, label

# Function to create a dataset from a directory
def create_dataset(directory):
    """Create a batched TensorFlow dataset from a directory of class folders.

    Every sub-directory of ``directory`` is treated as one class. Its label
    is the index of the sub-directory name in sorted order, so the mapping
    is the same for any directory that contains the same class folders.

    Args:
        directory: Path of a folder whose sub-directories each hold the
            image files of one class.

    Returns:
        A ``tf.data.Dataset`` yielding ``(images, labels)`` batches of up to
        ``BATCH_SIZE`` elements; each element is produced by ``load_image``
        (float32 image, int32 label).
    """
    # Only directories are classes. Enumerating stray files (e.g. a
    # desktop.ini next to the class folders) would shift the label indices.
    class_names = sorted(
        entry for entry in os.listdir(directory)
        if os.path.isdir(os.path.join(directory, entry))
    )
    image_paths = []
    labels = []

    for class_idx, class_name in enumerate(class_names):
        class_path = os.path.join(directory, class_name)
        # Sorted so the sample order does not depend on the file system.
        for img_file in sorted(os.listdir(class_path)):
            img_path = os.path.join(class_path, img_file)
            if not os.path.isfile(img_path):
                continue  # nested folders cannot be read as an image
            image_paths.append(img_path)
            labels.append(class_idx)

    # Explicit dtypes keep the element types correct even when no file was
    # found (an empty Python list would otherwise become a float32 tensor).
    path_ds = tf.data.Dataset.from_tensor_slices(
        tf.constant(image_paths, dtype=tf.string))
    label_ds = tf.data.Dataset.from_tensor_slices(
        tf.constant(labels, dtype=tf.int32))
    dataset = tf.data.Dataset.zip((path_ds, label_ds))

    # Apply image processing function; py_function runs load_image eagerly
    dataset = dataset.map(lambda path, lbl: tf.py_function(
        func=load_image, inp=[path, lbl], Tout=(tf.float32, tf.int32)), num_parallel_calls=tf.data.AUTOTUNE)

    dataset = dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)  # Batch processing
    return dataset

# Build one batched dataset per split, in train / val / test order
train_dataset, val_dataset, test_dataset = (
    create_dataset(split_dir) for split_dir in (train_dir, val_dir, test_dir)
)

# Output folders for the preprocessed batches, one per split
train_save_path, val_save_path, test_save_path = (
    "D:/BirdFeatherClassification/dataset/split_data/preprocessed_data/train",
    "D:/BirdFeatherClassification/dataset/split_data/preprocessed_data/val",
    "D:/BirdFeatherClassification/dataset/split_data/preprocessed_data/test",
)

# Function to save dataset in batches
def save_batches(dataset, save_path, dataset_type):
    """Write each batch of ``dataset`` to ``save_path`` as a compressed .npz file.

    Samples whose label is -1 are removed from a batch before it is written,
    and a batch with nothing left is not written at all. Files are named
    ``{dataset_type}_batch_{k}.npz`` where ``k`` counts the written batches
    from 0; each file stores the arrays ``images`` and ``labels``.

    Args:
        dataset: Iterable of ``(batch_images, batch_labels)`` tensor pairs.
        save_path: Output folder; created if it does not exist.
        dataset_type: Prefix used in the file names (e.g. "train").
    """
    os.makedirs(save_path, exist_ok=True)

    written = 0
    for images, labels in dataset:
        # Positions of the samples that carry a real label
        keep = tf.where(labels != -1)[:, 0]
        kept_images = tf.gather(images, keep)
        if not len(kept_images) > 0:
            continue  # nothing valid in this batch, so no file and no index used

        kept_labels = tf.gather(labels, keep)
        target_file = os.path.join(save_path, f"{dataset_type}_batch_{written}.npz")
        np.savez_compressed(
            target_file,
            images=kept_images.numpy(),
            labels=kept_labels.numpy(),
        )
        written += 1

# Write the train, validation and test batches to their output folders, in that order
for _split_dataset, _split_save_path, _split_name in (
    (train_dataset, train_save_path, "train"),
    (val_dataset, val_save_path, "val"),
    (test_dataset, test_save_path, "test"),
):
    save_batches(_split_dataset, _split_save_path, _split_name)

print("Datasets successfully preprocessed and saved.")


In [None]:
import numpy as np
import os
import matplotlib.pyplot as plt

# Folders holding the saved .npz batches, one per split
train_dir, val_dir, test_dir = (
    "D:/BirdFeatherClassification/dataset/split_data/preprocessed_data/train",
    "D:/BirdFeatherClassification/dataset/split_data/preprocessed_data/val",
    "D:/BirdFeatherClassification/dataset/split_data/preprocessed_data/test",
)

# Function to load and display a batch of images from the preprocessed data
def visualize_batch(batch_file_path):
    """Show up to the first 5 images of a saved batch with their labels.

    Args:
        batch_file_path: Path of an .npz file containing the arrays
            ``images`` and ``labels``.
    """
    # np.load returns an NpzFile that keeps the archive open; the context
    # manager closes it once both arrays have been read into memory.
    with np.load(batch_file_path) as data:
        images = data['images']
        labels = data['labels']

    # Display the first 5 images in the batch (you can change the number if needed)
    num_images = min(5, len(images))
    if num_images == 0:
        # Avoid opening a blank figure for a batch without images
        print(f"No images to display in {batch_file_path}")
        return

    plt.figure(figsize=(10, 10))
    for i in range(num_images):
        plt.subplot(1, num_images, i + 1)  # one row, one column per image
        plt.imshow(images[i])
        plt.title(f"Label: {labels[i]}")
        plt.axis('off')
    plt.show()

# Function to load a specific batch file (e.g., the first batch in the train set)
def load_sample_batch(directory):
    """Visualize the first .npz batch file found in ``directory``.

    The .npz file names are sorted, so the same file is picked on every run
    regardless of the order in which the file system lists them. If the
    directory does not exist or holds no .npz file, a message is printed
    and nothing is shown.

    Args:
        directory: Folder expected to contain saved batch files.
    """
    # A missing folder is reported the same way as an empty one
    if not os.path.isdir(directory):
        print("No batch files found in this directory!")
        return

    # os.listdir gives no ordering guarantee; sort to get a stable choice
    batch_files = sorted(f for f in os.listdir(directory) if f.endswith('.npz'))
    if not batch_files:
        print("No batch files found in this directory!")
        return

    sample_batch_file = os.path.join(directory, batch_files[0])  # first file in sorted order
    visualize_batch(sample_batch_file)

# Show one sample batch from each of the train, val and test folders, in that order
for _banner, _split_dir in (
    ("Visualizing a sample from train data:", train_dir),
    ("Visualizing a sample from val data:", val_dir),
    ("Visualizing a sample from test data:", test_dir),
):
    print(_banner)
    load_sample_batch(_split_dir)
