In [None]:
import kagglehub

# Fetch the COVID-19 radiography dataset through kagglehub and keep the value it
# returns; later cells treat it as the local dataset directory.
src_path = kagglehub.dataset_download(
    "tawsifurrahman/covid19-radiography-database"
)
print(f"Original path: {src_path}")

In [None]:
import shutil
from pathlib import Path

# Paths
# Join with Path operators rather than string concatenation so separators are
# handled by pathlib.
source_root = Path(src_path) / "COVID-19_Radiography_Dataset"
target_root = Path("/data_for_split")

# Class folders
classes = ["COVID", "Normal", "Lung_Opacity", "Viral Pneumonia"]

# Create image-only dataset: copy <source_root>/<class>/images/* into
# <target_root>/<class>/, leaving every other sibling folder behind.
for cls in classes:
    source = source_root / cls / "images"
    if not source.is_dir():
        # Without this check a missing folder would silently produce an empty
        # class directory and the success message below would still be printed.
        raise FileNotFoundError(f"Expected image folder not found: {source}")

    target = target_root / cls
    target.mkdir(parents=True, exist_ok=True)

    for file in source.glob("*.*"):
        # "*.*" also matches directories whose name contains a dot; shutil.copy
        # cannot copy those, so only regular files are taken.
        if file.is_file():
            shutil.copy(file, target)

print("✅ Images copied successfully to", target_root)

In [None]:
!%pip install split-folders

import splitfolders

base_dir = "/data"

splitfolders.ratio(
   "/data_for_split",
    output=base_dir,
    seed=42,
    ratio=(.7, .2, .1),  # train, val, test
    group_prefix=None  # Only needed for paired data like images + masks
)

In [None]:
# NOTE(review): `dispaly_data_distribuation` is not defined in any cell visible
# here; it has to be defined earlier in the notebook for this call to run on a
# fresh kernel — confirm. Presumably it reports the per-class image counts of the
# named split folder (assumption, verify against its definition).
dispaly_data_distribuation("train")

In [None]:
import os
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from collections import Counter
import shutil

# Paths
train_dir = base_dir + "/train"
aug_dir = base_dir + "/train_augmented"

# Seed NumPy's global RNG so the np.random.choice sampling below is repeatable
# across runs (directory listings are sorted below for the same reason).
AUG_SEED = 42
np.random.seed(AUG_SEED)

# Create augmentation generator
augmenter = ImageDataGenerator(
    rotation_range=20,
    zoom_range=0.15,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.15,
    horizontal_flip=True,
    fill_mode="nearest"
)

# Step 1: Count images per class (one sub-directory of train_dir per class)
class_counts = {}
for cls in sorted(os.listdir(train_dir)):
    cls_path = os.path.join(train_dir, cls)
    if os.path.isdir(cls_path):
        class_counts[cls] = len(os.listdir(cls_path))

if not class_counts:
    # max() on an empty sequence would raise an unhelpful ValueError.
    raise FileNotFoundError(f"No class folders found in {train_dir}")

# Every class is topped up to the size of the largest one.
max_count = max(class_counts.values())
print(f"Target count per class: {max_count}")

# Step 2: Balance classes
for cls, count in class_counts.items():
    src = os.path.join(train_dir, cls)
    dst = os.path.join(aug_dir, cls)

    # Rebuild the class folder from scratch: it is a derived artifact, and
    # leftovers from a previous run would otherwise pile up on top of the newly
    # generated images and push the class above max_count.
    shutil.rmtree(dst, ignore_errors=True)
    os.makedirs(dst, exist_ok=True)

    img_files = sorted(os.listdir(src))

    # Copy originals
    for img_name in img_files:
        shutil.copy(os.path.join(src, img_name), os.path.join(dst, img_name))

    # How many augmented images needed
    needed = max_count - count
    if needed <= 0:
        continue  # Class already balanced

    if not img_files:
        # Nothing to augment from; np.random.choice would fail on an empty list.
        print(f"{cls}: no source images found, skipping augmentation")
        continue

    # Generate augmented images: exactly ONE augmentation per randomly drawn
    # source image. (Iterating over augmenter.flow(...) never terminates on its
    # own, so looping over it until the quota is met would derive every
    # augmented image from the first image drawn.)
    gen_count = 0
    while gen_count < needed:
        img_name = np.random.choice(img_files)
        x = img_to_array(load_img(os.path.join(src, img_name)))
        x = x.reshape((1,) + x.shape)  # add the leading batch axis flow() expects

        # The running counter in the prefix makes every output name unique.
        # With a constant prefix, single-image batches differ only by the short
        # random suffix Keras appends, so files can overwrite each other and the
        # class silently ends up smaller than max_count.
        next(augmenter.flow(
            x, batch_size=1,
            save_to_dir=dst,
            save_prefix=f"aug_{gen_count:06d}",
            save_format='jpeg'
        ))
        gen_count += 1

    print(f"{cls}: augmented {gen_count} images to reach {max_count}")

In [None]:
# NOTE(review): `dispaly_data_distribuation` is not defined in any cell visible
# here — confirm it is defined earlier in the notebook. Presumably this re-checks
# the per-class counts, now for the "train_augmented" folder (assumption).
dispaly_data_distribuation("train_augmented")