## Initial stage

In this section we look at the original state of the dataset. As the counts below show, the current folders are not balanced. Ideally we would like the training / testing / validation sets to be balanced at 80 / 10 / 10.

In [27]:
import os
import random
import re
from sklearn.model_selection import train_test_split
import shutil
import shutil

In [28]:
def count_images(data_dir="data", subdirs=["test", "train", "val"], classes=["NORMAL", "PNEUMONIA"]):
    """
    Print how many entries each class folder contains, grouped by split.

    For every ``subdir`` in ``subdirs`` and every ``cls`` in ``classes`` the
    folder ``data_dir/subdir/cls`` is listed and the number of entries is
    printed. A folder that does not exist is reported and skipped.
    Nothing is returned; the report goes to standard output.
    """
    for split_name in subdirs:
        print(f"\nDirectory: {split_name}")
        for label in classes:
            folder = os.path.join(data_dir, split_name, label)
            if os.path.exists(folder):
                # Every directory entry is counted, whatever its type.
                entry_total = len(os.listdir(folder))
                print(f"\tClass: {label} -- Number of images: {entry_total}")
            else:
                print(f"Directory {folder} not found")

    print("Image counting complete.")


In [29]:
# Report the per-class counts for the test, train and val folders under "data"
# (the function's default arguments).
count_images()


Directory: test
	Class: NORMAL -- Number of images: 234
	Class: PNEUMONIA -- Number of images: 390

Directory: train
	Class: NORMAL -- Number of images: 1341
	Class: PNEUMONIA -- Number of images: 3875

Directory: val
	Class: NORMAL -- Number of images: 8
	Class: PNEUMONIA -- Number of images: 8
Image counting complete.


## Creating one folder with all the information

I will create one new folder that contains all the images.

In [30]:
def copy_images_to_data2(data_dir="data", data2_dir="data2"):
    """
    Copy every image from the split/class folders of data_dir into data2_dir.

    The source layout is ``data_dir/<split>/<class>/<image>`` with the splits
    ``test``, ``train`` and ``val`` and the classes ``NORMAL`` and
    ``PNEUMONIA``. All images end up directly inside ``data2_dir`` (flat, no
    sub-folders).

    Parameters
    ----------
    data_dir : str
        Root folder of the original dataset.
    data2_dir : str
        Destination folder. It is created, including missing parent folders,
        when it does not exist yet.

    Notes
    -----
    * A split/class folder that does not exist is reported and skipped
      instead of aborting the whole copy.
    * Only regular files are copied; nested folders are ignored.
    * If a file name was already written during this call (same name in two
      source folders), the later file is stored as
      ``<stem>_<split>_<class><ext>`` so that it does not silently overwrite
      the earlier copy. The start of the file name is left untouched.
    * Files that were already in ``data2_dir`` before the call are
      overwritten by a copy with the same name.
    """
    # makedirs (unlike mkdir) also creates missing parents and tolerates an
    # already existing folder.
    os.makedirs(data2_dir, exist_ok=True)

    subdirs = ["test", "train", "val"]
    classes = ["NORMAL", "PNEUMONIA"]

    used_names = set()  # destination names written during this call
    renamed = 0

    for subdir in subdirs:
        for cls in classes:
            class_dir = os.path.join(data_dir, subdir, cls)
            if not os.path.isdir(class_dir):
                print(f"Directory {class_dir} not found")
                continue

            # Sorted so that the outcome does not depend on listing order.
            for img in sorted(os.listdir(class_dir)):
                src_path = os.path.join(class_dir, img)
                if not os.path.isfile(src_path):
                    continue

                dst_name = img
                if dst_name in used_names:
                    # Same name seen in an earlier folder: keep both files.
                    stem, ext = os.path.splitext(img)
                    dst_name = f"{stem}_{subdir}_{cls}{ext}"
                    suffix = 1
                    while dst_name in used_names:
                        dst_name = f"{stem}_{subdir}_{cls}_{suffix}{ext}"
                        suffix += 1
                    renamed += 1

                used_names.add(dst_name)
                shutil.copy(src_path, os.path.join(data2_dir, dst_name))

    if renamed:
        print(f"{renamed} image(s) with a duplicate file name were copied under a new name.")
    print("All images have been copied to data2 directory.")

In [31]:
# Gather the images of every split/class folder under "data" into the single folder "data2".
copy_images_to_data2(data_dir="data", data2_dir="data2")

All images have been copied to data2 directory.


Now I assign each image to either the normal or the pneumonia class. The code below shows that the images of normal patients have file names starting with "IM-0" or "NORM", while the images of pneumonia patients have file names starting with "person".

In [32]:
data2_dir = "data2"

# Retrieve the image filenames in the data2 folder. Only regular files are
# kept: once data2 also holds sub-folders, their names must not be counted
# as image prefixes.
image_filenames = [
    entry
    for entry in os.listdir(data2_dir)
    if os.path.isfile(os.path.join(data2_dir, entry))
]

# Count how often each 4-character filename prefix occurs
# (keys appear in the order in which a prefix is first seen).
name_counts = {}
for filename in image_filenames:
    name = filename[:4]
    name_counts[name] = name_counts.get(name, 0) + 1

# Print the count of each distinct prefix
for name, count in name_counts.items():
    print(f"{name}: {count}")

NORM: 917
pers: 4273
IM-0: 666


In [33]:
data2_dir = "./data2"
normal_dir = os.path.join(data2_dir, "NORMAL")
pneumonia_dir = os.path.join(data2_dir, "PNEUMONIA")

# Make sure both class folders exist before anything is moved into them.
for class_folder in (normal_dir, pneumonia_dir):
    os.makedirs(class_folder, exist_ok=True)

# Filename prefixes and the class folder they are moved to. The first matching
# row wins; names matching no row stay where they are.
prefix_targets = (
    (("IM-0", "NORM"), normal_dir),
    (("person",), pneumonia_dir),
)

# Snapshot of the entries currently in data2 (includes the class folders).
image_filenames = os.listdir(data2_dir)

for filename in image_filenames:
    src_path = os.path.join(data2_dir, filename)
    for prefixes, target_dir in prefix_targets:
        if filename.startswith(prefixes):
            dst_path = os.path.join(target_dir, filename)
            # Folders (e.g. "NORMAL" itself, which matches "NORM") are never moved.
            if not os.path.isdir(src_path):
                shutil.move(src_path, dst_path)
            break


In [39]:
from sklearn.model_selection import train_test_split
import random
import os
import shutil

def split_images(data_dir, classes, train_size, test_size, val_size):
    """
    Split the images of every class folder into train / test / val folders.

    For each ``cls`` in ``classes`` the regular files in ``data_dir/cls`` are
    shuffled with a fixed seed and moved to ``data_dir/train/cls``,
    ``data_dir/test/cls`` and ``data_dir/val/cls`` (created when missing).

    Parameters
    ----------
    data_dir : str
        Folder that contains one sub-folder per class.
    classes : iterable of str
        Names of the class sub-folders to split.
    train_size, test_size, val_size : float
        Fraction of each class assigned to the respective split. Each must be
        non-negative and together they must not exceed 1. When they sum to
        less than 1, the remaining files are left in the class folder.

    Raises
    ------
    ValueError
        If a fraction is negative or the fractions sum to more than 1.
    FileNotFoundError
        If a class folder does not exist (raised by ``os.listdir``).

    Notes
    -----
    * Sub-folders inside a class folder are ignored, and an empty class
      folder simply yields empty splits, so the function can be re-run.
    * File names are sorted before the seeded shuffle because the order
      returned by ``os.listdir`` is arbitrary; without sorting the same seed
      could produce different splits on different machines.
    * Split sizes come from rounded cumulative cut points, so each split is
      within one file of its exact fraction and no file is assigned twice.
    """
    fractions = (train_size, test_size, val_size)
    if min(fractions) < 0 or sum(fractions) > 1 + 1e-9:
        raise ValueError(
            "train_size, test_size and val_size must be non-negative and sum to at most 1, "
            f"got {train_size}, {test_size}, {val_size}"
        )

    for cls in classes:
        class_dir = os.path.join(data_dir, cls)
        image_filenames = sorted(
            name
            for name in os.listdir(class_dir)
            if os.path.isfile(os.path.join(class_dir, name))
        )
        # A private, seeded generator keeps the split reproducible without
        # touching the global random state.
        random.Random(42).shuffle(image_filenames)

        # Cumulative cut points into the shuffled list.
        n_images = len(image_filenames)
        train_end = min(n_images, round(n_images * train_size))
        test_end = min(n_images, round(n_images * (train_size + test_size)))
        val_end = min(n_images, round(n_images * (train_size + test_size + val_size)))

        split_filenames = {
            "train": image_filenames[:train_end],
            "test": image_filenames[train_end:test_end],
            "val": image_filenames[test_end:val_end],
        }

        for split_name, filenames in split_filenames.items():
            split_dir = os.path.join(data_dir, split_name, cls)
            os.makedirs(split_dir, exist_ok=True)
            for filename in filenames:
                shutil.move(
                    os.path.join(class_dir, filename),
                    os.path.join(split_dir, filename),
                )


In [40]:
# Folder whose NORMAL / PNEUMONIA sub-folders hold the images to split.
data_dir = "data2"
classes = ["NORMAL", "PNEUMONIA"]
# Target fractions for the train / test / val split (80 / 10 / 10).
train_size = 0.8
test_size = 0.1
val_size = 0.1

# Moves the images into data2/train, data2/test and data2/val (one sub-folder per class).
split_images(data_dir, classes, train_size, test_size, val_size)

In [52]:
def reorganize_folders(data_dir):
    """
    Move images from ``data_dir/<class>/<split>`` to ``data_dir/<split>/<class>``.

    The splits are ``test``, ``train`` and ``val``; the classes are ``NORMAL``
    and ``PNEUMONIA``. Every destination folder is created when missing.

    Parameters
    ----------
    data_dir : str
        Root folder that holds the class-first layout and receives the
        split-first layout.

    Notes
    -----
    A source folder that does not exist is skipped. When the data is already
    stored split-first, the call therefore only ensures the destination
    folders exist instead of failing with ``FileNotFoundError``.
    """
    folders = ['test', 'train', 'val']
    subfolders = ['NORMAL', 'PNEUMONIA']

    for folder in folders:
        for subfolder in subfolders:
            # makedirs also creates data_dir/<split> itself when needed.
            subfolder_dir = os.path.join(data_dir, folder, subfolder)
            os.makedirs(subfolder_dir, exist_ok=True)

            source_dir = os.path.join(data_dir, subfolder, folder)
            if not os.path.isdir(source_dir):
                # Nothing stored class-first for this split/class pair.
                continue

            for image_file in os.listdir(source_dir):
                source_path = os.path.join(source_dir, image_file)
                destination_path = os.path.join(subfolder_dir, image_file)
                shutil.move(source_path, destination_path)


In [53]:
# Move any images stored as data2/<class>/<split> into data2/<split>/<class>.
data_dir = "data2"
reorganize_folders(data_dir)