In [None]:
import pandas as pd
import os
import shutil

# Paths to data in Kaggle
csv_path = '/kaggle/input/data/Data_Entry_2017.csv'  # CSV file path
specific_image_folder = '/kaggle/input/data/images_001/images'  # Folder to process (change if needed)
cardiomegaly_folder = '/kaggle/working/cardiomegaly_images/'  # Folder for Cardiomegaly images
not_cardiomegaly_folder = '/kaggle/working/not_cardiomegaly_images/'  # Folder for Not Cardiomegaly images


def separate_images_by_label(data, available_images, source_folder,
                             positive_folder, negative_folder,
                             label='Cardiomegaly', label_separator='|'):
    """Copy images into a positive or negative folder based on their labels.

    Only rows of ``data`` whose 'Image Index' is in ``available_images`` are
    processed. An image goes to ``positive_folder`` when ``label`` is one of
    the entries of its 'Finding Labels' value, otherwise to
    ``negative_folder``. Both destination folders are created if missing.

    Args:
        data: DataFrame with 'Image Index' and 'Finding Labels' columns.
        available_images: Collection of file names present in
            ``source_folder``.
        source_folder: Folder the images are copied from.
        positive_folder: Destination for images carrying ``label``.
        negative_folder: Destination for all other labelled images.
        label: Finding to look for.
        label_separator: Delimiter between findings in 'Finding Labels'
            (assumed to be '|' -- confirm against the CSV).

    Returns:
        Tuple ``(positive_count, negative_count)`` of images copied.
    """
    os.makedirs(positive_folder, exist_ok=True)
    os.makedirs(negative_folder, exist_ok=True)

    # Filter once up front instead of testing (and printing) row by row.
    in_folder = data[data['Image Index'].isin(available_images)]
    rows_outside_folder = len(data) - len(in_folder)

    positive_count = 0
    negative_count = 0
    unlabeled_count = 0

    for image_file, labels in zip(in_folder['Image Index'],
                                  in_folder['Finding Labels']):
        # A missing label cell is not a string; it cannot be classified, so
        # skip it rather than crash or silently file it as negative.
        if not isinstance(labels, str):
            unlabeled_count += 1
            continue

        # Compare whole findings, not substrings of the raw label string.
        findings = [item.strip() for item in labels.split(label_separator)]
        is_positive = label in findings
        target_folder = positive_folder if is_positive else negative_folder

        try:
            shutil.copy(os.path.join(source_folder, image_file),
                        os.path.join(target_folder, image_file))
        except FileNotFoundError:
            # The file may have vanished since the folder was listed;
            # continue without crashing.
            print(f"Image {image_file} not found in {source_folder}. Skipping.")
            continue

        if is_positive:
            positive_count += 1
        else:
            negative_count += 1

    # One aggregated line each instead of one line per skipped row.
    if rows_outside_folder:
        print(f"Skipped {rows_outside_folder} CSV rows whose images are not in {source_folder}.")
    if unlabeled_count:
        print(f"Skipped {unlabeled_count} images with missing labels.")

    return positive_count, negative_count


# Notebook cells and direct script runs both execute with
# __name__ == "__main__", so the names assigned below stay available as
# globals there; the guard only stops the copy from running on import.
if __name__ == "__main__":
    # Load the CSV file
    data = pd.read_csv(csv_path)

    # Get a list of all images in the specific folder
    image_files_in_folder = os.listdir(specific_image_folder)
    image_files_in_folder_set = set(image_files_in_folder)  # Use a set for faster lookup

    # Separate the images and collect the counters
    cardiomegaly_count, non_cardiomegaly_count = separate_images_by_label(
        data,
        image_files_in_folder_set,
        specific_image_folder,
        cardiomegaly_folder,
        not_cardiomegaly_folder,
    )

    # Print summary
    print(f"Separated {cardiomegaly_count} Cardiomegaly images and {non_cardiomegaly_count} non-Cardiomegaly images.")
