In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import os
import shutil
from glob import glob

# Source root
source_root = '/kaggle/input/chest-xray-pneumonia/chest_xray'
# /kaggle/input/chest-xray-pneumonia
# Target directory
target_dir = '/kaggle/working/Data_Med_vs_NonMed'
os.makedirs(target_dir, exist_ok=True)

# CSV entries
data_entries = []

# Counter for image naming
counter = 1

# Recursively go through train, test, val
for split in ['train', 'test', 'val']:
    split_path = os.path.join(source_root, split)
    
    # Go through categories: NORMAL, PNEUMONIA
    for category in os.listdir(split_path):
        category_path = os.path.join(split_path, category)
        
        if not os.path.isdir(category_path):
            continue
        
        for img_path in glob(os.path.join(category_path, '*.jpeg')):
            new_name = f'IMG_{counter}.jpg'
            new_path = os.path.join(target_dir, new_name)
            
            # Copy and rename
            shutil.copy(img_path, new_path)
            
            # Add to CSV data
            data_entries.append([new_name, 1])  # 1 = Medical
            
            counter += 1

        print(f"✅ Processed {counter - 1} medical images")

# Save CSV
df = pd.DataFrame(data_entries, columns=['image_name', 'label'])
df.to_csv('/kaggle/working/labels.csv', index=False)

print(f"✅ Processed {counter - 1} medical images")


✅ Processed 3875 medical images
✅ Processed 5216 medical images
✅ Processed 5606 medical images
✅ Processed 5840 medical images
✅ Processed 5848 medical images
✅ Processed 5856 medical images
✅ Processed 5856 medical images


In [5]:
from pathlib import Path
from PIL import Image
import pydicom
import tifffile as tiff


# Paths
output_dir = Path("/kaggle/working/Data_Med_vs_NonMed")
output_dir.mkdir(parents=True, exist_ok=True)

existing_csv_path = Path("/kaggle/working/labels.csv")
temp_csv_path = Path("/kaggle/working/temp_labels.csv")

# Load existing labels and get counter
if existing_csv_path.exists():
    df_existing = pd.read_csv(existing_csv_path)
    counter = int(df_existing["image_name"].str.extract(r'IMG_(\d+)')[0].astype(int).max()) + 1
    print("Current Images:" ,counter)
else:
    df_existing = pd.DataFrame(columns=["image_name", "label"])
    counter = 1
    print("labels.csv Not Found!!")
data_entries = []
# Prepare temp DataFrame
# df_temp = pd.DataFrame(columns=["image_name", "label"])

# Process TIFF and DICOM
dicom_dir = Path("/kaggle/input/siim-medical-images/dicom_dir")
tiff_dir = Path("/kaggle/input/siim-medical-images/tiff_images")

# Function to save image
def save_image(img_array, filename):
    img_pil = Image.fromarray(img_array)
    img_pil.convert("RGB").save(output_dir / filename, "JPEG")

# Process DICOM
for path in dicom_dir.glob("*.dcm"):
    try:
        dcm = pydicom.dcmread(path)
        image_array = dcm.pixel_array
        image_array = ((image_array - image_array.min()) / (image_array.ptp()) * 255).astype(np.uint8)  # normalize
        filename = f"IMG_{counter}.jpg"
        save_image(image_array, filename)
        # df_temp = pd.concat([df_temp, pd.DataFrame([{"image_name": filename, "label": 1}])], ignore_index=True)
        data_entries.append([filename, 1])
        counter += 1
    except Exception as e:
        print(f"Error processing DICOM {path.name}: {e}")

# Process TIFF
for path in tiff_dir.glob("*.tif"):
    try:
        image_array = tiff.imread(str(path))
        # Normalize to 8-bit
        image_array = ((image_array - image_array.min()) / image_array.ptp() * 255).astype(np.uint8)
        if image_array.ndim == 2:  # grayscale
            img_pil = Image.fromarray(image_array).convert("RGB")
        else:
            img_pil = Image.fromarray(image_array)
        filename = f"IMG_{counter}.jpg"
        img_pil.save(output_dir / filename, "JPEG")
        data_entries.append([filename, 1])

        # df_temp = pd.concat([df_temp, pd.DataFrame([{"image": filename, "label": 1}])], ignore_index=True)
        counter += 1
    except Exception as e:
        print(f"Error processing TIFF {path.name}: {e}")

# Merge and save final CSV
df_temp = pd.DataFrame(data_entries, columns=['image_name', 'label'])
df_final = pd.concat([df_existing, df_temp], ignore_index=True)
df_final.to_csv(existing_csv_path, index=False)

print(f"✅ Processed {len(df_temp)} medical images and updated labels.csv.")
print(f"✅ Processed {counter - 1} medical images")


Current Images: 5857
✅ Processed 200 medical images and updated labels.csv.
✅ Processed 6056 medical images


In [24]:
# data_entries = []


In [6]:
# df_temp = pd.DataFrame(data_entries, columns=['image_name', 'label'])
# df_final = pd.concat([df_existing, df_temp], ignore_index=True)
# df_final.to_csv(existing_csv_path, index=False)

# print(f"✅ Processed {len(df_temp)} medical images and updated labels.csv.")

In [7]:

# Paths
source_dir = Path("/kaggle/input/cervical-cancer-largest-dataset-sipakmed")
output_dir = Path("/kaggle/working/Data_Med_vs_NonMed")
output_dir.mkdir(parents=True, exist_ok=True)

labels_path = Path("/kaggle/working/labels.csv")

# Load or initialize labels.csv
if labels_path.exists():
    df = pd.read_csv(labels_path)
    if 'image_name' in df.columns:
        counter = int(df["image_name"].str.extract(r'IMG_(\d+)')[0].astype(float).max()) + 1
        print("Current Images:" ,counter)
    else:
        df = pd.DataFrame(columns=["image_name", "label"])
        counter = 1
        print("labels.csv Not Found!!")

else:
    df = pd.DataFrame(columns=["image_name", "label"])
    counter = 1
    print("labels.csv Not Found!!")


# Temporary label array
temp_labels = []

# Traverse all subfolders
for category in source_dir.iterdir():
    if category.is_dir():
        for root, _, files in os.walk(category):
            for file in files:
                if file.lower().endswith(".bmp"):
                    filepath = Path(root) / file
                    try:
                        img = Image.open(filepath).convert("RGB")
                        filename = f"IMG_{counter}.jpeg"
                        img.save(output_dir / filename, "JPEG")
                        temp_labels.append([filename, 1])
                        counter += 1
                    except Exception as e:
                        print(f"Error processing {file}: {e}")

# Create temporary DataFrame
df_temp = pd.DataFrame(temp_labels, columns=["image_name", "label"])

# Merge into main labels.csv and save
df = pd.concat([df, df_temp], ignore_index=True)
df.to_csv(labels_path, index=False)

print(f"✅ Processed {len(temp_labels)} images from cervical cancer dataset")
print(f"✅ Processed {counter - 1} medical images")


Current Images: 6057
✅ Processed 5015 images from cervical cancer dataset
✅ Processed 11071 medical images


In [20]:
# print(os.listdir("/kaggle/input/siim-medical-images"))


In [8]:

# Set paths
source_dir = Path("/kaggle/input/natural-images/natural_images")
output_dir = Path("/kaggle/working/Data_Med_vs_NonMed")
labels_path = Path("/kaggle/working/labels.csv")

# Ensure output directory exists
output_dir.mkdir(parents=True, exist_ok=True)

# Load or initialize labels.csv
if labels_path.exists():
    df = pd.read_csv(labels_path)
    if 'image_name' in df.columns:
        counter = int(df["image_name"].str.extract(r'IMG_(\d+)')[0].astype(float).max()) + 1
        print("Current Images:" ,counter)
    else:
        df = pd.DataFrame(columns=["image_name", "label"])
        counter = 1
        print("labels.csv Not Found!!")
else:
    df = pd.DataFrame(columns=["image_name", "label"])
    counter = 1

# Temporary label list
temp_labels = []

# Iterate over
for category_folder in source_dir.iterdir():
    if category_folder.is_dir():
        for file in os.listdir(category_folder):
            file_path = category_folder / file
            if file_path.suffix.lower() in [".jpg", ".jpeg", ".png", ".bmp"]:
                try:
                    img = Image.open(file_path).convert("RGB")
                    filename = f"IMG_{counter}.jpeg"
                    img.save(output_dir / filename, "JPEG")
                    temp_labels.append([filename, 0])  # 0 = Non-medical
                    counter += 1
                except Exception as e:
                    print(f"❌ Error processing {file_path.name}: {e}")

        print(f"✅ Processing {len(temp_labels)} non-medical images")

# Create temporary DataFrame and merge
df_temp = pd.DataFrame(temp_labels, columns=["image_name", "label"])
df = pd.concat([df, df_temp], ignore_index=True)
df.to_csv(labels_path, index=False)

print(f"✅ Processed {len(temp_labels)} non-medical images and updated labels.csv")
print(f"✅ Processed {counter - 1} images")


Current Images: 11072
✅ Processing 788 non-medical images
✅ Processing 1515 non-medical images
✅ Processing 2358 non-medical images
✅ Processing 3060 non-medical images
✅ Processing 4060 non-medical images
✅ Processing 5028 non-medical images
✅ Processing 5913 non-medical images
✅ Processing 6899 non-medical images
✅ Processed 6899 non-medical images and updated labels.csv
✅ Processed 17970 medical images


In [9]:

# Paths
src_base_dir = Path("/kaggle/input/breast-histopathology-images")
dst_dir = Path("/kaggle/working/Data_Med_vs_NonMed")
dst_dir.mkdir(parents=True, exist_ok=True)
labels_path = Path("/kaggle/working/labels.csv")

# Load or initialize the label file
if labels_path.exists():
    df_labels = pd.read_csv(labels_path)
    counter = int(df_labels["image_name"].str.extract(r'IMG_(\d+)')[0].astype(int).max()) + 1
    print("Current Images:" ,counter)
else:
    df_labels = pd.DataFrame(columns=["image_name", "label"])
    counter = 1
    print("labels.csv Not Found!!")

temp_labels = []
processed_images = 0
max_images = 5000

# Traverse through folders
for patient_folder in sorted(src_base_dir.iterdir()):
    if not patient_folder.is_dir():
        continue
    for label_folder in ['0', '1']:
        class_dir = patient_folder / label_folder
        if not class_dir.exists():
            continue
        for img_path in class_dir.glob("*.png"):
            try:
                with Image.open(img_path) as img:
                    img = img.convert("RGB")  # Ensure 3 channels
                    new_filename = f"IMG_{counter}.jpeg"
                    img.save(dst_dir / new_filename, "JPEG")
                    temp_labels.append([new_filename, 1])  # Label 1 for medical
                    counter += 1
                    processed_images += 1
                if processed_images >= max_images:
                    break
            except Exception as e:
                print(f"Error processing {img_path.name}: {e}")
        if processed_images >= max_images:
            break

# Merge with existing labels
df_temp = pd.DataFrame(temp_labels, columns=["image_name", "label"])
df_final = pd.concat([df_labels, df_temp], ignore_index=True)

# Save updated labels
df_final.to_csv(labels_path, index=False)

print(f"✅ Processed {processed_images} images from breast histopathology dataset.")
print(f"✅ Processed {counter - 1}  images")


Current Images: 17971
✅ Processed 5271 images from breast histopathology dataset.
✅ Processed 23241  images


In [27]:

# Paths
input_folder = '/kaggle/input/fashion-product-images-small/images'
output_folder = '/kaggle/working/Data_Med_vs_NonMed'
labels_path = '/kaggle/working/labels.csv'

# Create output dir if not exists
os.makedirs(output_folder, exist_ok=True)

# Load existing labels.csv
if os.path.exists(labels_path):
    df_existing = pd.read_csv(labels_path)
    counter = int(df_existing["image_name"].str.extract(r'IMG_(\d+)')[0].astype(int).max()) + 1
    print("Current Images:" ,counter)
else:
    df_existing = pd.DataFrame(columns=["image_name", "label"])
    counter = 1
    print("labels.csv Not Found!!")

# Temporary label list
temp_data = []

# Process fashion images
image_files = [f for f in os.listdir(input_folder) if f.lower().endswith('.jpg')]

for img_file in tqdm(image_files, desc="Processing Fashion Images"):
    img_path = os.path.join(input_folder, img_file)
    
    try:
        with Image.open(img_path) as img:
            img = img.convert("RGB")
            new_name = f'IMG_{counter}.jpg'
            img.save(os.path.join(output_folder, new_name))
            temp_data.append([new_name, 0])  # Non-medical
            counter += 1
    except Exception as e:
        print(f"Error processing {img_file}: {e}")

# Merge and save updated CSV
df_temp = pd.DataFrame(temp_data, columns=["image_name", "label"])
df_final = pd.concat([df_existing, df_temp], ignore_index=True)
df_final.to_csv(labels_path, index=False)

print(f"✅ Processed {len(temp_data)} fashion images. Updated labels.csv.")


Current Images: 74349


Processing Fashion Images:  48%|████▊     | 21147/44441 [02:35<02:50, 136.24it/s]


KeyboardInterrupt: 

In [28]:
# Merge and save updated CSV
df_temp = pd.DataFrame(temp_data, columns=["image_name", "label"])
df_final = pd.concat([df_existing, df_temp], ignore_index=True)
df_final.to_csv(labels_path, index=False)

print(f"✅ Processed {len(temp_data)} fashion images. Updated labels.csv.")
print(f"✅ Processed {counter - 1} medical images")


✅ Processed 21147 fashion images. Updated labels.csv.
✅ Processed 95495 medical images


In [39]:
counter

33956

In [10]:
import shutil
import cv2
# Set paths
source_root = Path("/kaggle/input/agriculture-crop-images/crop_images")
dest_dir = Path("/kaggle/working/Data_Med_vs_NonMed")
labels_csv_path = Path("/kaggle/working/labels.csv")

# Make destination if not exists
dest_dir.mkdir(parents=True, exist_ok=True)

# Read existing labels or create new
if labels_csv_path.exists():
    df_labels = pd.read_csv(labels_csv_path)
    existing_counter = df_labels["image_name"].str.extract(r'IMG_(\d+)')[0].astype(int).max() + 1
    print("Current Images:" ,counter)
else:
    df_labels = pd.DataFrame(columns=["image_name", "label"])
    existing_counter = 1
    print("labels.csv Not Found!!")


temp_records = []
counter = existing_counter

# Allowed image extensions

Current Images: 23242


In [13]:
from tqdm import tqdm
valid_ext = [".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"]

# Loop through all folders except 'kag2'
for folder in source_root.iterdir():
    if folder.name.lower() == "kag2" or not folder.is_dir():
        continue
    for root, _, files in os.walk(folder):
        for file in tqdm(files, desc=f"Processing {folder.name}"):
            ext = Path(file).suffix.lower()
            if ext in valid_ext:
                try:
                    img_path = Path(root) / file
                    img = cv2.imread(str(img_path))
                    if img is None:
                        continue
                    new_name = f"IMG_{counter}.jpg"
                    cv2.imwrite(str(dest_dir / new_name), img)
                    temp_records.append([new_name, 0])
                    counter += 1
                except Exception as e:
                    print(f"⚠️ Error reading {img_path}: {e}")

# Merge new data
df_temp = pd.DataFrame(temp_records, columns=["image_name", "label"])
df_labels = pd.concat([df_labels, df_temp], ignore_index=True)
df_labels.to_csv(labels_csv_path, index=False)

print(f"✅ Done. {len(temp_records)} new non-medical images added. Total: {len(df_labels)}")
print(f"✅ Processed {counter - 1} medical images")


Processing jute: 100%|██████████| 40/40 [00:00<00:00, 93.99it/s]
Processing maize: 100%|██████████| 40/40 [00:00<00:00, 100.11it/s]
Processing wheat: 100%|██████████| 41/41 [00:00<00:00, 101.75it/s]
Processing sugarcane: 100%|██████████| 40/40 [00:00<00:00, 102.11it/s]
Processing rice: 100%|██████████| 40/40 [00:00<00:00, 99.87it/s] 


✅ Done. 201 new non-medical images added. Total: 23442
✅ Processed 23442 medical images


In [14]:

# Paths
source_dir = Path("/kaggle/input/brain-mri-images/GAN-Traning Images")
dest_dir = Path("/kaggle/working/Data_Med_vs_NonMed")
labels_csv_path = Path("/kaggle/working/labels.csv")

# Create destination if needed
dest_dir.mkdir(parents=True, exist_ok=True)

# Load or initialize labels
if labels_csv_path.exists():
    df_labels = pd.read_csv(labels_csv_path)
    existing_counter = df_labels["image_name"].str.extract(r'IMG_(\d+)')[0].astype(int).max() + 1
    print("Current Images:" ,counter)
else:
    df_labels = pd.DataFrame(columns=["image_name", "label"])
    existing_counter = 1
    print("labels.csv Not Found!!")

# Process images
temp_records = []
counter = existing_counter
valid_ext = [".jpg", ".jpeg", ".png", ".bmp"]

for file in tqdm(os.listdir(source_dir), desc="Processing Brain MRI Images"):
    ext = Path(file).suffix.lower()
    if ext in valid_ext:
        try:
            img_path = source_dir / file
            img = cv2.imread(str(img_path))
            if img is None:
                continue
            new_name = f"IMG_{counter}.jpg"
            cv2.imwrite(str(dest_dir / new_name), img)
            temp_records.append([new_name, 1])  # Label 1 for medical
            counter += 1
        except Exception as e:
            print(f"⚠️ Error reading {file}: {e}")

# Update labels.csv
df_temp = pd.DataFrame(temp_records, columns=["image_name", "label"])
df_labels = pd.concat([df_labels, df_temp], ignore_index=True)
df_labels.to_csv(labels_csv_path, index=False)

print(f"✅ Added {len(temp_records)} Brain MRI images. Total: {len(df_labels)}")
print(f"✅ Processed {counter - 1} medical images")


Current Images: 23443


Processing Brain MRI Images: 100%|██████████| 14715/14715 [03:21<00:00, 73.14it/s]


✅ Added 14715 Brain MRI images. Total: 38157
✅ Processed 38157 medical images


In [15]:

# Paths
source_root = Path("/kaggle/input/100-images-of-top-50-car-brands/imgs_zip/imgs")
dest_dir = Path("/kaggle/working/Data_Med_vs_NonMed")
labels_csv_path = Path("/kaggle/working/labels.csv")

# Create destination directory if it doesn't exist
dest_dir.mkdir(parents=True, exist_ok=True)

# Load existing labels or initialize
if labels_csv_path.exists():
    df_labels = pd.read_csv(labels_csv_path)
    counter = df_labels["image_name"].str.extract(r'IMG_(\d+)')[0].astype(int).max() + 1
else:
    df_labels = pd.DataFrame(columns=["image_name", "label"])
    counter = 1
    print("labels.csv Not Found!!")

# Temporary list for new entries
temp_records = []
valid_ext = [".jpg", ".jpeg", ".png", ".bmp"]

# Traverse through all brand folders
for brand_folder in source_root.iterdir():
    if brand_folder.is_dir():
        for file in tqdm(os.listdir(brand_folder), desc=f"Processing {brand_folder.name}"):
            ext = Path(file).suffix.lower()
            if ext in valid_ext:
                try:
                    img_path = brand_folder / file
                    img = cv2.imread(str(img_path))
                    if img is None:
                        continue
                    new_name = f"IMG_{counter}.jpg"
                    cv2.imwrite(str(dest_dir / new_name), img)
                    temp_records.append([new_name, 0])  # Label 0 = Non-medical
                    counter += 1
                except Exception as e:
                    print(f"⚠️ Error processing {file}: {e}")

# Merge and save
df_temp = pd.DataFrame(temp_records, columns=["image_name", "label"])
df_labels = pd.concat([df_labels, df_temp], ignore_index=True)
df_labels.to_csv(labels_csv_path, index=False)

print(f"✅ Processed {len(temp_records)} car brand images. Total dataset size: {len(df_labels)}")
print(f"✅ Processed {counter - 1} medical images")


Processing Nissan: 100%|██████████| 88/88 [00:00<00:00, 98.81it/s] 
Processing Ford: 100%|██████████| 89/89 [00:00<00:00, 101.45it/s]
Processing Hyundai: 100%|██████████| 90/90 [00:00<00:00, 98.75it/s] 
Processing Mercedes-Benz: 100%|██████████| 88/88 [00:00<00:00, 109.42it/s]
Processing Studebaker: 100%|██████████| 102/102 [00:01<00:00, 98.42it/s]
Processing Chrysler: 100%|██████████| 89/89 [00:00<00:00, 97.27it/s] 
Processing GMC: 100%|██████████| 94/94 [00:00<00:00, 101.40it/s]
Processing Lincoln: 100%|██████████| 102/102 [00:00<00:00, 106.39it/s]
Processing Alfa Romeo: 100%|██████████| 89/89 [00:00<00:00, 103.68it/s]
Processing Fiat: 100%|██████████| 89/89 [00:00<00:00, 105.88it/s]
Processing Aston Martin: 100%|██████████| 91/91 [00:00<00:00, 104.46it/s]
Processing Volkswagen: 100%|██████████| 94/94 [00:00<00:00, 98.54it/s] 
Processing Land Rover: 100%|██████████| 87/87 [00:00<00:00, 103.66it/s]
Processing Oldsmobile: 100%|██████████| 102/102 [00:01<00:00, 100.75it/s]
Processing Bu

✅ Processed 4597 car brand images. Total dataset size: 42754
✅ Processed 42754 medical images





In [17]:


# Directories
src_base_dir = Path("/kaggle/input/unet-lung-segmentation-dataset-siim-covid/segmented_data/segmented_data")
dst_dir = Path("/kaggle/working/Data_Med_vs_NonMed")
dst_dir.mkdir(exist_ok=True)

# Load existing labels
labels_csv_path = Path("/kaggle/working/labels.csv")
if labels_csv_path.exists():
    labels_df = pd.read_csv(labels_csv_path)
    counter = int(labels_df["image_name"].str.extract(r"IMG_(\d+)")[0].astype(int).max()) + 1
    print("Current Images:" ,counter)

else:
    labels_df = pd.DataFrame(columns=["image_name", "label"])
    counter = 1
    print("labels.csv Not Found!!")

temp_labels = []
processed_images = 0

# Process images from train and test folders
for folder in ['train', 'test']:
    folder_path = src_base_dir / folder
    for img_path in folder_path.glob("*.png"):
        try:
            with Image.open(img_path) as img:
                img = img.convert("RGB")
                new_filename = f"IMG_{counter}.jpeg"
                img.save(dst_dir / new_filename, "JPEG")
                temp_labels.append([new_filename, 1])  # 1 = medical
                counter += 1
                processed_images += 1
        except Exception as e:
            print(f"Error processing {img_path.name}: {e}")

# Merge with original labels and save
temp_df = pd.DataFrame(temp_labels, columns=["image_name", "label"])
final_df = pd.concat([labels_df, temp_df], ignore_index=True)
final_df.to_csv(labels_csv_path, index=False)

print(f"✅ Processed {processed_images} images from UNet lung segmentation dataset.Total dataset size: {len(final_df)}")
print(f"✅ Processed {counter - 1} images")


Current Images: 50352
✅ Processed 0 images from UNet lung segmentation dataset.Total dataset size: 50351
✅ Processed 50351 images


In [18]:
counter

50352

In [19]:

# Paths
src_base_dir = Path("/kaggle/input/benign-and-malignant-skin-lesion-dataset/Benign_Malignant_DataSet/MainData")
dst_dir = Path("/kaggle/working/Data_Med_vs_NonMed")
dst_dir.mkdir(exist_ok=True)

# Load existing labels
labels_csv_path = Path("/kaggle/working/labels.csv")
if labels_csv_path.exists():
    labels_df = pd.read_csv(labels_csv_path)
    counter = int(labels_df["image_name"].str.extract(r"IMG_(\d+)")[0].astype(int).max()) + 1
    print("Current Images:" ,counter)
else:
    labels_df = pd.DataFrame(columns=["image_name", "label"])
    counter = 1
    print("labels.csv Not Found!!")

temp_labels = []
processed_images = 0

# Go through train, test, validation
for split in ["train", "test", "validation"]:
    split_path = src_base_dir / split
    if not split_path.exists():
        continue
    for label_dir in ["benign", "malignant"]:
        class_path = split_path / label_dir
        if not class_path.exists():
            continue
        image_list = list(class_path.glob("*.jpg"))
        for img_path in tqdm(image_list, desc=f"{split}/{label_dir}", unit="img"):
            try:
                with Image.open(img_path) as img:
                    img = img.convert("RGB")
                    new_filename = f"IMG_{counter}.jpeg"
                    img.save(dst_dir / new_filename, "JPEG")
                    temp_labels.append([new_filename, 1])  # Medical label
                    counter += 1
                    processed_images += 1
            except Exception as e:
                print(f"Error processing {img_path.name}: {e}")

# Save updated labels
temp_df = pd.DataFrame(temp_labels, columns=["image_name", "label"])
final_df = pd.concat([labels_df, temp_df], ignore_index=True)
final_df.to_csv(labels_csv_path, index=False)

print(f"✅ Processed {processed_images} images from skin lesion dataset.Total dataset size: {len(final_df)}")
print(f"✅ Processed {counter - 1} images")


Current Images: 50352


train/benign: 100%|██████████| 1944/1944 [03:08<00:00, 10.32img/s]
train/malignant: 100%|██████████| 1659/1659 [01:19<00:00, 20.84img/s]
test/benign: 100%|██████████| 228/228 [00:17<00:00, 13.00img/s]
test/malignant: 100%|██████████| 202/202 [00:09<00:00, 20.47img/s]
validation/benign: 100%|██████████| 228/228 [00:14<00:00, 16.15img/s]
validation/malignant: 100%|██████████| 198/198 [00:08<00:00, 24.39img/s]


✅ Processed 4459 images from skin lesion dataset.Total dataset size: 54810
✅ Processed 54810 images


In [20]:
from pathlib import Path
from PIL import Image
import pandas as pd
from tqdm import tqdm

# Paths
src_base_dir = Path("/kaggle/input/skyview-an-aerial-landscape-dataset/Aerial_Landscapes")
dst_dir = Path("/kaggle/working/Data_Med_vs_NonMed")
dst_dir.mkdir(exist_ok=True)

# Load existing labels
labels_csv_path = Path("/kaggle/working/labels.csv")
if labels_csv_path.exists():
    labels_df = pd.read_csv(labels_csv_path)
    counter = int(labels_df["image_name"].str.extract(r"IMG_(\d+)")[0].astype(int).max()) + 1
    print("Current Images:" ,counter)
else:
    labels_df = pd.DataFrame(columns=["image_name", "label"])
    counter = 1
    print("labels.csv Not Found!!")

temp_labels = []
processed_images = 0

# Loop through each class folder
for class_folder in sorted(src_base_dir.iterdir()):
    if not class_folder.is_dir():
        continue
    images = list(class_folder.glob("*.jpg"))
    for img_path in tqdm(images, desc=f"Processing {class_folder.name}", unit="img"):
        try:
            with Image.open(img_path) as img:
                img = img.convert("RGB")
                new_filename = f"IMG_{counter}.jpeg"
                img.save(dst_dir / new_filename, "JPEG")
                temp_labels.append([new_filename, 0])  # Label 0 for non-medical
                counter += 1
                processed_images += 1
        except Exception as e:
            print(f"❌ Error processing {img_path.name}: {e}")

# Merge and save label CSV
temp_df = pd.DataFrame(temp_labels, columns=["image_name", "label"])
final_df = pd.concat([labels_df, temp_df], ignore_index=True)
final_df.to_csv(labels_csv_path, index=False)

print(f"✅ Extracted and labeled {processed_images} aerial images as non-medical.Total dataset size: {len(final_df)}")
print(f"✅ Processed {counter - 1} images")


Current Images: 54811


Processing Agriculture: 100%|██████████| 800/800 [00:04<00:00, 167.29img/s]
Processing Airport: 100%|██████████| 800/800 [00:05<00:00, 158.19img/s]
Processing Beach: 100%|██████████| 800/800 [00:04<00:00, 162.31img/s]
Processing City: 100%|██████████| 800/800 [00:05<00:00, 159.01img/s]
Processing Desert: 100%|██████████| 800/800 [00:04<00:00, 160.43img/s]
Processing Forest: 100%|██████████| 800/800 [00:05<00:00, 149.84img/s]
Processing Grassland: 100%|██████████| 800/800 [00:05<00:00, 157.07img/s]
Processing Highway: 100%|██████████| 800/800 [00:05<00:00, 153.94img/s]
Processing Lake: 100%|██████████| 800/800 [00:05<00:00, 154.70img/s]
Processing Mountain: 100%|██████████| 800/800 [00:05<00:00, 152.70img/s]
Processing Parking: 100%|██████████| 800/800 [00:05<00:00, 147.59img/s]
Processing Port: 100%|██████████| 800/800 [00:04<00:00, 163.16img/s]
Processing Railway: 100%|██████████| 800/800 [00:05<00:00, 158.50img/s]
Processing Residential: 100%|██████████| 800/800 [00:05<00:00, 153.31i

✅ Extracted and labeled 12000 aerial images as non-medical.Total dataset size: 66810
✅ Processed 66810 images





In [22]:


# Paths
base_paths = [
    Path("/kaggle/input/flower-color-images/flower_images/flower_images"),
    Path("/kaggle/input/flower-color-images/flowers/flowers")
]
dst_dir = Path("/kaggle/working/Data_Med_vs_NonMed")
dst_dir.mkdir(exist_ok=True)

# Load existing labels
labels_csv_path = Path("/kaggle/working/labels.csv")
if labels_csv_path.exists():
    labels_df = pd.read_csv(labels_csv_path)
    counter = int(labels_df["image_name"].str.extract(r"IMG_(\d+)")[0].astype(int).max()) + 1
else:
    labels_df = pd.DataFrame(columns=["image_name", "label"])
    counter = 1
    print("labels.csv Not Found!!")


temp_labels = []
processed_images = 0

# Loop through both flower image folders
for base_path in base_paths:
    images = list(base_path.glob("*.png"))
    for img_path in tqdm(images, desc=f"Processing {base_path.name}", unit="img"):
        try:
            with Image.open(img_path) as img:
                img = img.convert("RGB")
                new_filename = f"IMG_{counter}.jpeg"
                img.save(dst_dir / new_filename, "JPEG")
                temp_labels.append([new_filename, 0])  # Label 0 for non-medical
                counter += 1
                processed_images += 1
        except Exception as e:
            print(f"❌ Error processing {img_path.name}: {e}")

# Merge and save labels
temp_df = pd.DataFrame(temp_labels, columns=["image_name", "label"])
final_df = pd.concat([labels_df, temp_df], ignore_index=True)
final_df.to_csv(labels_csv_path, index=False)

print(f"✅ {processed_images} flower images extracted and labeled as non-medical.Total dataset size: {len(final_df)}")
print(f"✅ Processed {counter - 1} images")


Processing flower_images: 0img [00:00, ?img/s]
Processing flowers: 0img [00:00, ?img/s]

✅ 0 flower images extracted and labeled as non-medical.Total dataset size: 67623
✅ Processed 67623 images





In [23]:


base_dir = Path("/kaggle/input/wildlife-animals-images")
dst_dir = Path("/kaggle/working/Data_Med_vs_NonMed")
dst_dir.mkdir(exist_ok=True)

labels_csv_path = Path("/kaggle/working/labels.csv")
if labels_csv_path.exists():
    labels_df = pd.read_csv(labels_csv_path)
    counter = int(labels_df["image_name"].str.extract(r"IMG_(\d+)")[0].astype(int).max()) + 1
    print("Current Images:" ,counter)

else:
    labels_df = pd.DataFrame(columns=["image_name", "label"])
    counter = 1

temp_labels = []
processed_images = 0

# Loop through folders ending with '-resize-512'
for animal_folder in base_dir.glob("*-resize-512"):
    inner_folder = next(animal_folder.glob("resize-512"), None)
    if inner_folder and inner_folder.is_dir():
        for img_path in tqdm(inner_folder.glob("*.png"), desc=f"Processing {animal_folder.name}", unit="img"):
            try:
                with Image.open(img_path) as img:
                    img = img.convert("RGB")
                    new_filename = f"IMG_{counter}.jpeg"
                    img.save(dst_dir / new_filename, "JPEG")
                    temp_labels.append([new_filename, 0])  # Label 0 = non-medical
                    counter += 1
                    processed_images += 1
            except Exception as e:
                print(f"❌ Error processing {img_path.name}: {e}")

# Save merged labels
temp_df = pd.DataFrame(temp_labels, columns=["image_name", "label"])
final_df = pd.concat([labels_df, temp_df], ignore_index=True)
final_df.to_csv(labels_csv_path, index=False)

print(f"✅ {processed_images} flower images extracted and labeled as non-medical.Total dataset size: {len(final_df)}")
print(f"✅ Processed {counter - 1} images")


Current Images: 67624


Processing hyena-resize-512: 106img [00:02, 27.10img/s]

❌ Error processing 00000224_512resized.png: cannot identify image file '/kaggle/input/wildlife-animals-images/hyena-resize-512/resize-512/00000224_512resized.png'


Processing hyena-resize-512: 306img [00:07, 38.53img/s]
Processing cheetah-resize-512: 156img [00:01, 99.03img/s] 

❌ Error processing 00000244_512resized.png: cannot identify image file '/kaggle/input/wildlife-animals-images/cheetah-resize-512/resize-512/00000244_512resized.png'


Processing cheetah-resize-512: 343img [00:03, 101.31img/s]


✅ 647 flower images extracted and labeled as non-medical.Total dataset size: 68270
✅ Processed 68270 images


In [24]:
from pathlib import Path
from PIL import Image
import pandas as pd
from tqdm import tqdm

src_dir = Path("/kaggle/input/food41/images")
dst_dir = Path("/kaggle/working/Data_Med_vs_NonMed")
dst_dir.mkdir(exist_ok=True)

labels_csv_path = Path("/kaggle/working/labels.csv")
if labels_csv_path.exists():
    labels_df = pd.read_csv(labels_csv_path)
    counter = int(labels_df["image_name"].str.extract(r"IMG_(\d+)")[0].astype(int).max()) + 1
    print("Current Images:" ,counter)

else:
    labels_df = pd.DataFrame(columns=["image_name", "label"])
    counter = 1

temp_labels = []
processed_images = 0

# Iterate through each food category
for category_dir in sorted(src_dir.iterdir()):
    if not category_dir.is_dir():
        continue
    for img_path in tqdm(category_dir.glob("*.jpg"), desc=f"Processing {category_dir.name}", unit="img"):
        try:
            with Image.open(img_path) as img:
                img = img.convert("RGB")
                new_filename = f"IMG_{counter}.jpeg"
                img.save(dst_dir / new_filename, "JPEG")
                temp_labels.append([new_filename, 0])  # 0 for non-medical
                counter += 1
                processed_images += 1
        except Exception as e:
            print(f"❌ Error processing {img_path.name}: {e}")

# Merge and save final labels



Current Images: 68271


Processing apple_pie: 1000img [00:13, 71.63img/s]
Processing baby_back_ribs: 1000img [00:14, 68.74img/s]
Processing baklava: 1000img [00:14, 70.49img/s]
Processing beef_carpaccio: 1000img [00:14, 69.65img/s]
Processing beef_tartare: 1000img [00:13, 72.46img/s]
Processing beet_salad: 1000img [00:14, 69.60img/s]
Processing beignets: 78img [00:01, 71.04img/s]


KeyboardInterrupt: 

In [25]:
temp_df = pd.DataFrame(temp_labels, columns=["image_name", "label"])
final_df = pd.concat([labels_df, temp_df], ignore_index=True)
final_df.to_csv(labels_csv_path, index=False)

print(f"✅ Extracted {processed_images} food images and updated labels.Total dataset size: {len(final_df)}")
print(f"✅ Processed {counter - 1} images")

✅ Extracted 6078 food images and updated labels.Total dataset size: 74348
✅ Processed 74348 images


In [31]:


# Source and destination paths
src_dir = Path("/kaggle/input/x-ray-dataset-1/images_02/images")
dst_dir = Path("/kaggle/working/Data_Med_vs_NonMed")
dst_dir.mkdir(exist_ok=True)

# Load existing labels if present
labels_csv_path = Path("/kaggle/working/labels.csv")
if labels_csv_path.exists():
    labels_df = pd.read_csv(labels_csv_path)
    counter = int(labels_df["image_name"].str.extract(r"IMG_(\d+)")[0].astype(int).max()) + 1
else:
    labels_df = pd.DataFrame(columns=["image_name", "label"])
    counter = 1

temp_labels = []
processed_images = 0

# Extract all PNG images
for img_path in tqdm(src_dir.glob("*.png"), desc="Processing X-ray images", unit="img"):
    try:
        with Image.open(img_path) as img:
            img = img.convert("RGB")
            new_filename = f"IMG_{counter}.jpeg"
            img.save(dst_dir / new_filename, "JPEG")
            temp_labels.append([new_filename, 1])  # Label 1 for medical
            counter += 1
            processed_images += 1
    except Exception as e:
        print(f"❌ Error processing {img_path.name}: {e}")

# Save updated labels
temp_df = pd.DataFrame(temp_labels, columns=["image_name", "label"])
final_df = pd.concat([labels_df, temp_df], ignore_index=True)
final_df.to_csv(labels_csv_path, index=False)

print(f"✅ Extracted {processed_images} X-ray images and updated labels.")


Processing X-ray images: 10000img [06:24, 26.03img/s]


✅ Extracted 10000 X-ray images and updated labels.


In [32]:
print(len(final_df))

110494


In [33]:
shutil.make_archive('/kaggle/working/Data_Med_vs_NonMed2', 'zip', '/kaggle/working/Data_Med_vs_NonMed')

'/kaggle/working/Data_Med_vs_NonMed2.zip'

In [47]:
!ls -lh /kaggle/working/

total 3.4G
drwxr-xr-x 2 root root 2.0M Jul 29 18:35 Data_Med_vs_NonMed
-rw-r--r-- 1 root root 3.4G Jul 29 18:55 Data_Med_vs_NonMed.zip
-rw-r--r-- 1 root root 2.9K Jul 29 18:10 labels2.csv
-rw-r--r-- 1 root root 841K Jul 29 18:35 labels.csv
