In [1]:
import os
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img
from tensorflow.keras.preprocessing import image

# Root folders: originals are read from 'backup', generated data goes to 'data'
backup_path = 'backup'
data_path = 'data'

# Build data/<split>/<class> for every split and class combination
for _split in ('train', 'validation', 'test'):
    for _label in ('spam', 'non_spam'):
        os.makedirs(os.path.join(data_path, _split, _label), exist_ok=True)

# Augmentation pipeline: pixel rescaling plus a set of random geometric transforms
train_datagen = ImageDataGenerator(
    # Pixel handling
    rescale=1.0 / 255,        # Multiply every pixel value by 1/255
    fill_mode='nearest',      # Strategy for filling pixels exposed by a transform
    # Random geometric transforms
    horizontal_flip=True,     # Randomly mirror left/right
    rotation_range=30,        # Random rotation range
    shear_range=0.2,          # Random shear intensity
    zoom_range=0.2,           # Random zoom range
    width_shift_range=0.2,    # Random horizontal shift range
    height_shift_range=0.2,   # Random vertical shift range
)

# Function to augment and save images from the backup folder to the data folder
def augment_and_save_images(src_folder, dest_folder, num_augmented_images=100):
    """Write augmented variants of every image in ``src_folder`` to ``dest_folder``.

    Each ``.jpg`` / ``.jpeg`` / ``.png`` file (extension matched
    case-insensitively) is loaded, given a batch dimension and fed through the
    module-level ``train_datagen``. The generator saves one JPEG per batch it
    produces, so ``num_augmented_images`` files are written per source image.

    Args:
        src_folder: Directory containing the original images.
        dest_folder: Directory that receives the augmented images; it is
            created if it does not exist yet.
        num_augmented_images: Number of augmented copies to save per source
            image. Zero or a negative value saves nothing.
    """
    image_extensions = ('.jpg', '.jpeg', '.png')

    # The generator writes straight into dest_folder, so make sure it exists.
    os.makedirs(dest_folder, exist_ok=True)

    # Sorted so source files are processed in a stable order on every run.
    for filename in sorted(os.listdir(src_folder)):
        if not filename.lower().endswith(image_extensions):
            continue

        img_path = os.path.join(src_folder, filename)
        img = load_img(img_path)       # Load image
        x = img_to_array(img)          # Convert image to array
        x = np.expand_dims(x, axis=0)  # Add batch dimension

        # Use a per-source prefix. Keras names each saved file from the prefix
        # plus the sample index and a random suffix (behaviour of the Keras
        # iterator, not visible in this file), so one prefix shared by every
        # source image lets later files silently overwrite earlier ones.
        source_stem = os.path.splitext(filename)[0]
        flow = train_datagen.flow(
            x,
            batch_size=1,
            save_to_dir=dest_folder,
            save_prefix=f'aug_{source_stem}',
            save_format='jpg',
        )

        # Each next() produces one batch and, as a side effect, saves one file.
        # Checking the counter before generating means 0 really saves nothing.
        saved = 0
        while saved < num_augmented_images:
            next(flow)
            saved += 1

# Run the augmentation once per class: backup/<class> -> data/train/<class>
for _label in ('spam', 'non_spam'):
    augment_and_save_images(
        os.path.join(backup_path, _label),
        os.path.join(data_path, 'train', _label),
    )

# Build the validation and test sets by moving a portion of the augmented training images into those folders.
# By default 10% of the images go to validation and another 10% go to test (adjust via val_ratio / test_ratio).
import shutil

def move_files_to_validation_test(src_folder, val_folder, test_folder, val_ratio=0.1, test_ratio=0.1, seed=42):
    """Move a random share of the ``.jpg`` files in ``src_folder`` into two other folders.

    The ``.jpg`` files are listed, sorted, and shuffled with a seeded random
    generator so the split does not depend on the order in which the
    filesystem happens to list them. The first ``val_ratio`` share is moved to
    ``val_folder`` and the next ``test_ratio`` share to ``test_folder``; the
    rest stay in ``src_folder``. Counts are truncated with ``int()``.

    Note: the split is done per file. Files derived from the same original
    image can therefore end up in different splits. Calling this again on the
    same folder moves a further share of whatever is left.

    Args:
        src_folder: Directory whose ``.jpg`` files are split.
        val_folder: Destination for the validation share; created if missing.
        test_folder: Destination for the test share; created if missing.
        val_ratio: Fraction of the files moved to ``val_folder``.
        test_ratio: Fraction of the files moved to ``test_folder``.
        seed: Seed for the shuffle. The same seed and the same set of file
            names give the same split; ``None`` gives a different split on
            every call.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more than 1.
    """
    # Local import so this function does not rely on a new file-level import.
    import random

    # Small tolerance so float sums such as 0.7 + 0.3 are not rejected.
    if val_ratio < 0 or test_ratio < 0 or val_ratio + test_ratio > 1 + 1e-9:
        raise ValueError(
            "val_ratio and test_ratio must be non-negative and sum to at most 1"
        )

    # shutil.move does not create missing parent directories.
    os.makedirs(val_folder, exist_ok=True)
    os.makedirs(test_folder, exist_ok=True)

    # Sort first: os.listdir order is arbitrary, and a seeded shuffle is only
    # reproducible when it starts from a fixed order.
    augmented_images = sorted(f for f in os.listdir(src_folder) if f.endswith('.jpg'))
    random.Random(seed).shuffle(augmented_images)
    total_images = len(augmented_images)

    # Calculate number of images for validation and test
    val_count = int(total_images * val_ratio)
    test_count = int(total_images * test_ratio)

    # Move the validation images, then the (disjoint) test images
    for img_name in augmented_images[:val_count]:
        shutil.move(os.path.join(src_folder, img_name), os.path.join(val_folder, img_name))

    for img_name in augmented_images[val_count:val_count + test_count]:
        shutil.move(os.path.join(src_folder, img_name), os.path.join(test_folder, img_name))

# Carve the validation and test subsets out of each class's training folder
for _label in ('spam', 'non_spam'):
    _train_dir = os.path.join(data_path, 'train', _label)
    _val_dir = os.path.join(data_path, 'validation', _label)
    _test_dir = os.path.join(data_path, 'test', _label)
    move_files_to_validation_test(_train_dir, _val_dir, _test_dir)

print("Augmentation and data split complete!")


KeyboardInterrupt: 