In [1]:
import os
import shutil
from sklearn.model_selection import train_test_split
import hashlib

In [2]:
def create_directory_structure(base_dir):
    """Ensure the ``train``, ``val`` and ``test`` folders exist under ``base_dir``.

    Missing parent directories are created as well; folders that already
    exist are left as they are.
    """
    split_names = ('train', 'val', 'test')
    split_dirs = [os.path.join(base_dir, name) for name in split_names]
    for split_dir in split_dirs:
        os.makedirs(split_dir, exist_ok=True)

In [3]:
def generate_unique_filename(file_path, target_dir):
    """Return the base name of ``file_path`` tagged with an MD5 digest of the path.

    The result has the form ``<stem>_<md5 hex of file_path><ext>``. The digest
    is computed over the whole ``file_path`` string, so identical base names
    located under different directories produce different results.

    ``target_dir`` is accepted but not used by this function.
    """
    stem, extension = os.path.splitext(os.path.basename(file_path))
    # Hash the full path string, not just the base name.
    path_digest = hashlib.md5(file_path.encode()).hexdigest()
    return stem + "_" + path_digest + extension

In [4]:
# The first path component (the letter folder) is prefixed to each copied file name.
def copy_files(file_list, source_base_dir, target_dir):
    """Copy every file in ``file_list`` into ``target_dir`` under a flat, unique name.

    Args:
        file_list: Paths relative to ``source_base_dir``. The first path
            component is used as a prefix for the copied file's name
            (in this notebook that component is the letter folder).
        source_base_dir: Directory the relative paths are resolved against.
        target_dir: Directory that receives the copies; it is not created here.

    Each copy is named ``<first component>_<generate_unique_filename(...)>``.
    """
    for file_path in file_list:
        source_path = os.path.join(source_base_dir, file_path)
        # Relative paths are built with os.path.join, so the separator is
        # platform dependent. Normalise to "/" before splitting; otherwise, on
        # a platform whose separator is not "/", the "letter" would be the
        # whole path and the target name would contain a separator.
        normalized = file_path.replace(os.sep, "/")
        if os.altsep:
            normalized = normalized.replace(os.altsep, "/")
        letter = normalized.split("/")[0]
        unique_filename = letter + "_" + generate_unique_filename(file_path, target_dir)
        target_path = os.path.join(target_dir, unique_filename)
        shutil.copy2(source_path, target_path)

In [5]:
# Split the per-letter image folders under source_base_dir into train/val/test
# folders under target_base_dir, copying each file under a flat, unique name.
source_base_dir = 'newdata_preprocessed_50x50/'
target_base_dir = 'new_data_50x50_split/'

create_directory_structure(target_base_dir)

# Letters in the dataset: A-Z with 'J' and 'Z' excluded.
letters = [chr(i) for i in range(ord('A'), ord('Z') + 1) if chr(i) not in ['J', 'Z']]

# Split ratio
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, "split ratios must sum to 1"

# Set a random seed for reproducibility
random_seed = 231

# Collect all file paths (relative to source_base_dir, as "<letter>/<file>").
file_paths = []
for letter in letters:
    full_path = os.path.join(source_base_dir, letter)
    # os.listdir returns entries in arbitrary order. Sort them so the seeded
    # shuffle below sees the same input order on every run and machine;
    # otherwise the same seed can assign a file to a different split.
    # Non-file entries (e.g. sub-directories) are skipped because they
    # cannot be copied as images.
    files = sorted(
        f for f in os.listdir(full_path)
        if os.path.isfile(os.path.join(full_path, f))
    )
    file_paths.extend([os.path.join(letter, f) for f in files])

# Shuffle and split the data.
# Use val_ratio + test_ratio for the hold-out fraction instead of
# 1 - train_ratio, which evaluates to 0.30000000000000004 in floating point.
holdout_ratio = val_ratio + test_ratio
train_paths, test_paths = train_test_split(
    file_paths, test_size=holdout_ratio, random_state=random_seed
)
# Second split divides the hold-out set into validation and test parts.
val_paths, test_paths = train_test_split(
    test_paths, test_size=(test_ratio / holdout_ratio), random_state=random_seed
)

# Copy the files to directories
copy_files(train_paths, source_base_dir, os.path.join(target_base_dir, 'train'))
copy_files(val_paths, source_base_dir, os.path.join(target_base_dir, 'val'))
copy_files(test_paths, source_base_dir, os.path.join(target_base_dir, 'test'))

# Print the sizes of each split
print(f"Number of training images: {len(train_paths)}")
print(f"Number of validation images: {len(val_paths)}")
print(f"Number of test images: {len(test_paths)}")

print("Data has been split and copied to the train, val, and test directories.")

Number of training images: 133856
Number of validation images: 28684
Number of test images: 28684
Data has been split and copied to the train, val, and test directories.
