In [None]:
import random
import os
import shutil

# Source directory containing the images. Each category is read from a
# sub-folder of this directory that is named after the category.
source_directory = "C:\\Users\\artso\\OneDrive\\바탕 화면\\OneDrive - Syracuse University\\Classes\\IST 691 We 9\\Final Project\\testing_demo\\Images\\"

# Base destination directory. The "Training", "Test" and "Validation" folders
# (each with one sub-folder per category) are created beneath it.
destination_base_directory = "C:\\Users\\artso\\OneDrive\\바탕 화면\\OneDrive - Syracuse University\\Classes\\IST 691 We 9\\Final Project\\testing_demo\\"

# Function to filter existing files
def filter_existing_files(category, file_count):
    """Return the image numbers that are actually present for *category*.

    Probes ``<source_directory>/<category>/<n>.jpg`` for every ``n`` from 1 to
    ``file_count`` inclusive.

    Args:
        category: Name of the category sub-folder inside ``source_directory``.
        file_count: Highest image number to probe.

    Returns:
        A list of the numbers whose ``.jpg`` file exists, in ascending order.
    """
    return [
        number
        for number in range(1, file_count + 1)
        if os.path.exists(os.path.join(source_directory, category, f"{number}.jpg"))
    ]

# Function to split the data into training, test, and validation sets
def split_data(file_indices, train_ratio=0.7, test_ratio=0.15, val_ratio=0.15):
    """Randomly partition file indices into training, test and validation sets.

    Duplicate indices are collapsed first, so the three returned sets are
    always pairwise disjoint and together contain every distinct index.

    Args:
        file_indices: Iterable of hashable file identifiers (e.g. image numbers).
        train_ratio: Fraction of the distinct indices assigned to training.
        test_ratio: Fraction of the distinct indices assigned to test.
        val_ratio: Accepted for backward compatibility but not used in the
            computation; the validation set always receives everything that is
            left once the training and test sets have been drawn.

    Returns:
        A ``(training_set, test_set, validation_set)`` tuple of sets.

    Raises:
        ValueError: If the ratios produce a negative count, or ask for more
            training + test items than there are distinct indices.
    """
    # Collapse duplicates (keeping first-seen order) so that the counts below
    # describe the same population the sets are built from.
    unique_indices = list(dict.fromkeys(file_indices))
    file_count = len(unique_indices)

    # int() truncates, so any rounding remainder ends up in the validation set.
    train_count = int(file_count * train_ratio)
    test_count = int(file_count * test_ratio)
    val_count = file_count - train_count - test_count

    if train_count < 0 or test_count < 0 or val_count < 0:
        raise ValueError(
            "train_ratio and test_ratio must be non-negative and must not "
            "request more items than are available"
        )

    # A single shuffle followed by slicing yields exact, non-overlapping splits
    # without any after-the-fact size correction.
    shuffled = random.sample(unique_indices, file_count)
    test_end = train_count + test_count
    training_set = set(shuffled[:train_count])
    test_set = set(shuffled[train_count:test_end])
    validation_set = set(shuffled[test_end:])

    # Debugging: Print counts to verify
    print(f"Expected - Training: {train_count}, Test: {test_count}, Validation: {val_count}")
    print(f"Actual - Training: {len(training_set)}, Test: {len(test_set)}, Validation: {len(validation_set)}")

    return training_set, test_set, validation_set

# Categories with their respective file counts.
# Each value is passed to filter_existing_files() as the highest image number
# to probe (1.jpg .. <count>.jpg); numbers whose file is missing are dropped
# there, so the number of images actually used can be lower than the value
# listed here.
categories_file_counts = {
    "Agony": 8912,
    "Happy": 9000,
    "Neutral": 9000,
    "Sad": 9000,
    "Scared": 9216
}

# Ensure destination directories exist
# For every category, create its sub-folder under each of the three split
# folders; exist_ok=True makes this safe to re-run.
for category in categories_file_counts:
    for split_name in ("Training", "Test", "Validation"):
        split_category_dir = os.path.join(destination_base_directory, split_name, category)
        os.makedirs(split_category_dir, exist_ok=True)

# Function to copy files with existence check
def copy_files(file_set, src_folder, dst_folder, category):
    """Copy the ``<n>.jpg`` files named by *file_set* into a split folder.

    Each file is read from ``<source_directory>/<src_folder>/<n>.jpg`` and
    written to ``<destination_base_directory>/<dst_folder>/<category>/<n>.jpg``.
    Files missing from the source are reported on stdout and skipped.

    Args:
        file_set: Iterable of image numbers to copy.
        src_folder: Sub-folder of ``source_directory`` to read from.
        dst_folder: Split folder name under ``destination_base_directory``.
        category: Category sub-folder inside the split folder.

    Returns:
        A list of the file names that were copied, in iteration order.
    """
    copied_names = []
    for number in file_set:
        file_name = f"{number}.jpg"
        src_path = os.path.join(source_directory, src_folder, file_name)
        dst_path = os.path.join(destination_base_directory, dst_folder, category, file_name)

        # Guard clause: report and move on when the source image is absent.
        if not os.path.exists(src_path):
            print(f"File {src_path} does not exist and will be skipped.")
            continue

        shutil.copyfile(src_path, dst_path)
        copied_names.append(file_name)

    print(f"Total files copied to {dst_folder}/{category}: {len(copied_names)}")
    return copied_names

# Driver: for each category, find which numbered images exist, split them
# randomly into training/test/validation sets, and copy each set into the
# matching destination folder. Progress is reported on stdout throughout.
for category, file_count in categories_file_counts.items():
    # Step 1: keep only the image numbers whose .jpg file is actually present.
    print(f"\nFiltering existing files for category: {category}")
    existing_files = filter_existing_files(category, file_count)
    print(f"Total existing files for {category}: {len(existing_files)}")

    # Step 2: random split (default ratios of split_data are used).
    print(f"\nSplitting data for category: {category}")
    training_set, test_set, validation_set = split_data(existing_files)

    print(f"Training set size for {category}: {len(training_set)}")
    print(f"Test set size for {category}: {len(test_set)}")
    print(f"Validation set size for {category}: {len(validation_set)}")

    # Step 3: copy each split; the category name doubles as the source
    # sub-folder and as the sub-folder inside each split directory.
    print(f"\nCopying files for category: {category}")

    print("\nCopying to Training set:")
    training_files = copy_files(training_set, category, "Training", category)
    print(f"Files copied to Training/{category}: {training_files[:10]} ...")  # Only the first 10 names are shown, for brevity

    print("\nCopying to Test set:")
    test_files = copy_files(test_set, category, "Test", category)
    print(f"Files copied to Test/{category}: {test_files[:10]} ...")  # Only the first 10 names are shown, for brevity

    print("\nCopying to Validation set:")
    validation_files = copy_files(validation_set, category, "Validation", category)
    print(f"Files copied to Validation/{category}: {validation_files[:10]} ...")  # Only the first 10 names are shown, for brevity

# Printed once, after every category has been processed.
print("\nFiles successfully copied to the respective directories.")



Filtering existing files for category: Agony
Total existing files for Agony: 8843

Splitting data for category: Agony
Expected - Training: 6190, Test: 1326, Validation: 1327
Actual - Training: 6190, Test: 1326, Validation: 1327
Training set size for Agony: 6190
Test set size for Agony: 1326
Validation set size for Agony: 1327

Copying files for category: Agony

Copying to Training set:
Total files copied to Training/Agony: 6190
Files copied to Training/Agony: ['1.jpg', '2.jpg', '4.jpg', '6.jpg', '8.jpg', '9.jpg', '11.jpg', '12.jpg', '13.jpg', '14.jpg'] ...

Copying to Test set:
Total files copied to Test/Agony: 1326
Files copied to Test/Agony: ['8195.jpg', '8200.jpg', '8202.jpg', '10.jpg', '8204.jpg', '8205.jpg', '15.jpg', '22.jpg', '26.jpg', '28.jpg'] ...

Copying to Validation set:
Total files copied to Validation/Agony: 1327
Files copied to Validation/Agony: ['3.jpg', '5.jpg', '7.jpg', '21.jpg', '25.jpg', '8217.jpg', '27.jpg', '8220.jpg', '8221.jpg', '8222.jpg'] ...

Filtering exis