In [1]:
import os
import shutil
from sklearn.model_selection import train_test_split
import hashlib

In [2]:
def create_directory_structure(base_dir):
    """
    Ensure the 'train', 'val' and 'test' sub-directories exist under a base directory.

    Missing intermediate directories (including ``base_dir`` itself) are
    created as needed; directories that already exist are left untouched.

    Args:
        base_dir (str): Directory under which the three split directories are created.
    """
    split_dirs = (os.path.join(base_dir, split_name) for split_name in ('train', 'val', 'test'))
    for split_dir in split_dirs:
        os.makedirs(split_dir, exist_ok=True)

In [3]:
def generate_unique_filename(file_path, target_dir):
    """
    Build a collision-resistant filename derived from the original path.

    The result is ``<stem>_<md5 hex digest of file_path><extension>``, so two
    files that share a basename but live at different paths get different
    names, and the same path always maps to the same name.

    Args:
        file_path (str): The original file path; the full string is hashed.
        target_dir (str): The target directory where the file will be copied.
            Accepted for interface compatibility; it is not read here.

    Returns:
        str: The generated filename (no directory component).
    """
    base_name = os.path.basename(file_path)
    stem, extension = os.path.splitext(base_name)
    # Hash the whole path, not just the basename, so same-named files in
    # different directories stay distinct.
    path_digest = hashlib.md5(file_path.encode()).hexdigest()
    return stem + "_" + path_digest + extension

In [4]:
def copy_files(file_list, source_base_dir, target_dir):
    """
    Copy each listed file into the target directory under a generated name.

    The destination name comes from ``generate_unique_filename``; the copy is
    done with ``shutil.copy2``.

    Args:
        file_list (list): File paths relative to ``source_base_dir``.
        source_base_dir (str): Directory the relative paths are resolved against.
        target_dir (str): Directory that receives the copies.
    """
    for relative_path in file_list:
        origin = os.path.join(source_base_dir, relative_path)
        new_name = generate_unique_filename(relative_path, target_dir)
        shutil.copy2(origin, os.path.join(target_dir, new_name))

In [5]:
# Define the base directory and user/letter directories
source_base_dir = 'dataset5_preprocessed_50x50/'  # Original dataset directory
target_base_dir = 'dataset5_50x50_split/'  # New directory for split data

# Create the directory structure for train, val, test
# NOTE(review): files already present in the target directories are not
# removed here, so re-running with a different seed or ratios would presumably
# leave copies from the earlier run mixed into the new splits -- confirm the
# target directory is empty before re-running.
create_directory_structure(target_base_dir)

# Users and letters in the dataset
users = ['A', 'B', 'C', 'D', 'E']
# Lowercase 'a'..'z' with 'j' and 'z' excluded.
letters = [chr(i) for i in range(ord('a'), ord('z') + 1) if chr(i) not in ['j', 'z']]

# Split ratio
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# Set a random seed for reproducibility
random_seed = 231

# Collect all file paths (relative to source_base_dir, as <user>/<letter>/<file>)
file_paths = []
for user in users:
    for letter in letters:
        user_letter_dir = os.path.join(user, letter)
        full_path = os.path.join(source_base_dir, user_letter_dir)
        # os.listdir returns entries in arbitrary order; sort them so the
        # seeded shuffle below sees the same input order on every run and
        # machine, which is what makes the split reproducible.
        files = sorted(os.listdir(full_path))
        file_paths.extend([os.path.join(user_letter_dir, f) for f in files])

# Shuffle and split the data
# First split: hold out (1 - train_ratio) of all files for val + test.
train_paths, test_paths = train_test_split(file_paths, test_size=(1 - train_ratio), random_state=random_seed)
# Second split: divide the held-out part between val and test in proportion
# to val_ratio : test_ratio.
val_paths, test_paths = train_test_split(test_paths, test_size=(test_ratio / (val_ratio + test_ratio)), random_state=random_seed)

# Copy the files to the respective directories
copy_files(train_paths, source_base_dir, os.path.join(target_base_dir, 'train'))
copy_files(val_paths, source_base_dir, os.path.join(target_base_dir, 'val'))
copy_files(test_paths, source_base_dir, os.path.join(target_base_dir, 'test'))

# Print the sizes of each split
print(f"Number of training images: {len(train_paths)}")
print(f"Number of validation images: {len(val_paths)}")
print(f"Number of test images: {len(test_paths)}")

print("Data has been split and copied to the train, val, and test directories.")

Number of training images: 46041
Number of validation images: 9866
Number of test images: 9867
Data has been split and copied to the train, val, and test directories.
