# Splitting Dataset into Training, Validation and Testing Directories

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
import os
import shutil
from PIL import Image

In [2]:
# Source folder holding one sub-folder per class (raw, unsplit images).
dataset_dir = (
    'E:\\Jupyter Directory\\Deep Learning\\Image Classification Project'
    '\\Image Data Classification\\PetImages'
)
# Destination root under which the train/validation/test folders are built.
base_dir = (
    'E:\\Jupyter Directory\\Deep Learning\\Image Classification Project'
    '\\Image Data Classification\\split_dataset'
)

In [3]:
# Root folder of each subset, placed side by side under base_dir.
train_dir, validation_dir, test_dir = (
    os.path.join(base_dir, subset) for subset in ('train', 'validation', 'test')
)

In [4]:
# Create the split root up front; exist_ok makes re-running this cell harmless.
os.makedirs(name=base_dir, exist_ok=True)

In [8]:
# Build <subset>/<category> for every subset/class pair; existing folders are kept.
for category in ('Cat', 'Dog'):
    for subset_root in (train_dir, validation_dir, test_dir):
        os.makedirs(os.path.join(subset_root, category), exist_ok=True)

In [14]:
def split_data(category, split_ratio=(0.8,0.1,0.1), random_state=None):
    """Copy one category's images into the train/validation/test folders.

    The regular files found in ``dataset_dir/<category>`` are shuffled and
    partitioned according to ``split_ratio``. Each file is then copied (the
    source is left in place) into ``train_dir``, ``validation_dir`` or
    ``test_dir``, inside a sub-folder named after the category. This function
    does not create those destination folders.

    Args:
        category: Name of the class sub-folder, e.g. ``'Cat'``.
        split_ratio: ``(train, validation, test)`` fractions. The train
            fraction must be positive, the other two non-negative, and the
            three must sum to 1.
        random_state: Optional seed forwarded to ``train_test_split`` so the
            partition can be reproduced. ``None`` leaves the shuffle unseeded.

    Raises:
        ValueError: If ``split_ratio`` is not a valid triple of fractions.
    """
    train_frac, val_frac, test_frac = split_ratio
    if (
        train_frac <= 0
        or val_frac < 0
        or test_frac < 0
        or abs(train_frac + val_frac + test_frac - 1.0) > 1e-6
    ):
        raise ValueError(
            f"split_ratio must be (train, validation, test) fractions summing to 1, got {split_ratio!r}"
        )

    src_dir = os.path.join(dataset_dir, category)
    # Keep regular files only: shutil.copy cannot copy a sub-directory.
    images = [
        name for name in os.listdir(src_dir)
        if os.path.isfile(os.path.join(src_dir, name))
    ]
    if not images:
        # Nothing to distribute; train_test_split would reject an empty list.
        return

    # First cut: train versus everything held out (validation + test).
    holdout_frac = val_frac + test_frac
    if holdout_frac == 0:
        train_images, holdout_images = images, []
    else:
        train_images, holdout_images = train_test_split(
            images, test_size=holdout_frac, random_state=random_state
        )

    # Second cut: divide the held-out part between validation and test.
    # test_size is relative to the held-out part, hence the division.
    if test_frac == 0:
        val_images, test_images = holdout_images, []
    elif val_frac == 0:
        val_images, test_images = [], holdout_images
    else:
        val_images, test_images = train_test_split(
            holdout_images,
            test_size=test_frac / holdout_frac,
            random_state=random_state,
        )

    destinations = (
        (train_dir, train_images),
        (validation_dir, val_images),
        (test_dir, test_images),
    )
    for dest_root, names in destinations:
        dest_dir = os.path.join(dest_root, category)
        for name in names:
            shutil.copy(os.path.join(src_dir, name), os.path.join(dest_dir, name))

In [15]:
# Split each class folder using the default 80/10/10 ratio.
for label in ('Cat', 'Dog'):
    split_data(label)

# Removing Corrupt Images

In [19]:
# Root of the already-split dataset that the cleanup below walks.
img_dir = (
    'E:\\Jupyter Directory\\Deep Learning\\Image Classification Project'
    '\\Image Data Classification\\split_dataset'
)

def is_image_file(filepath):
    """Return True when Pillow can open and verify *filepath*, else False.

    On failure the offending path is printed before False is returned.
    """
    try:
        with Image.open(filepath) as candidate:
            candidate.verify()
    except (OSError, SyntaxError):
        # IOError is an alias of OSError in Python 3, so this catches the same set.
        print(f"Invalid image file: {filepath}")
        return False
    else:
        return True

def clean_directory(directory):
    """Delete files that fail ``is_image_file`` and remove emptied sub-folders.

    The tree under *directory* is walked bottom-up, so a folder's contents are
    processed before the folder itself is checked for emptiness. Permission
    errors during deletion are printed and skipped.
    """
    for current, subdirs, filenames in os.walk(directory, topdown=False):
        for filename in filenames:
            target = os.path.join(current, filename)
            if is_image_file(target):
                continue
            try:
                os.remove(target)
            except PermissionError as e:
                print(f"Permission error while removing file: {e}")
        for subdir in subdirs:
            subdir_path = os.path.join(current, subdir)
            if os.listdir(subdir_path):
                # Still holds something; leave it in place.
                continue
            try:
                os.rmdir(subdir_path)
            except PermissionError as e:
                print(f"Permission error while removing directory: {e}")
# Run the cleanup over the whole split dataset tree.
clean_directory(directory=img_dir)
