# COVID-19 Radiography Dataset Processing
---

## 📦 Imports and Setup

- `kagglehub`: For downloading datasets from Kaggle.
- `shutil`: Utilities for file operations like copying and removing files.
- `pathlib.Path`: For handling filesystem paths in a platform-independent way.
- `splitfolders`: To split datasets into train, validation, and test folders.
- `os`: Operating system utilities for directory and file management.
- `tensorflow.keras.preprocessing.image`: For image loading and augmentation.
- `utils.display_data_distribution`: Custom utility function to visualize data distribution.


In [None]:
import kagglehub
import shutil
from pathlib import Path
import splitfolders
import os
import shutil
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from utils import display_data_distribution


------


In [None]:
# Fetch the COVID-19 Radiography database through kagglehub and keep the
# local directory it reports, which the later cells read from.
KAGGLE_DATASET_HANDLE = "tawsifurrahman/covid19-radiography-database"
src_path = kagglehub.dataset_download(KAGGLE_DATASET_HANDLE)

# Show where the files ended up so the location can be checked by hand.
print("Original path:", src_path)

## 2. Configure Data Directory

In [17]:
DATA_DIR = ""  # Change this to your local path or Google Drive mount path if running in Colab !!!
               # If you keep DATA_DIR = "", the data is stored in the current working directory.

# The following cells build their paths as DATA_DIR + "/<folder>". With an
# empty string that yields "/data_for_split" and "/dataset", i.e. folders at
# the filesystem root rather than next to the notebook. Mapping "" to "."
# makes the documented default ("current directory") actually hold.
DATA_DIR = DATA_DIR or "."

# Persist the setting as a tiny Python module (presumably imported by the
# other notebooks/scripts of the project -- confirm against its consumers).
# repr() emits a valid string literal even when the path contains backslashes
# (Windows) or quotes; hand-written quoting would produce broken source.
with open('Shared_vars.py', 'w', encoding='utf-8') as f:
    f.write(f"DATA_DIR={DATA_DIR!r}\n")

---

## 3. Organize Dataset by Classes

In [None]:
# Where the downloaded dataset lives and where the flat, image-only copy goes.
source_root = Path(f"{src_path}/COVID-19_Radiography_Dataset")
target_root = Path(f"{DATA_DIR}/data_for_split")

# One sub-folder per class in the downloaded dataset.
classes = ["COVID", "Normal", "Lung_Opacity", "Viral Pneumonia"]

# Build <target_root>/<class>/ from <source_root>/<class>/images/, taking
# only the contents of each class's "images" directory.
for cls in classes:
    target = target_root / cls
    target.mkdir(parents=True, exist_ok=True)

    source = source_root / cls / "images"
    for image_file in source.glob("*.*"):
        shutil.copy(image_file, target)

print("✅ Images copied successfully to", target_root)

---

## 4. Split Data into Train/Val/Test Sets

In [None]:
# Input: the per-class image folders prepared above.
# Output: base_dir, which the following cells use as <base_dir>/train etc.
split_input_dir = DATA_DIR + "/data_for_split"
base_dir = DATA_DIR + "/dataset"

# 70 % train / 20 % validation / 10 % test, with a fixed seed so the split
# is reproducible between runs.
split_ratio = (.7, .2, .1)

splitfolders.ratio(
    split_input_dir,
    output=base_dir,
    seed=42,
    ratio=split_ratio,
    group_prefix=None,  # only needed for paired data such as images + masks
)

---

## 5. Visualize Training Set Class Distribution

In [None]:
# Training split produced by the previous step.
train_path = f"{base_dir}/train"

# Show how many images each class contributes to the training set.
display_data_distribution(train_path)

---

**Observation:** There is significant class imbalance in the training set. We will address this with data augmentation in the next step.

---

## 6. Apply Data Augmentation to Minority Classes

In [None]:
# Paths
# train_dir was used below without ever being defined (NameError); it is the
# training split created by the split step.
train_dir = base_dir + "/train"
aug_dir = base_dir + "/train_augmented"
classes_to_augment = ['COVID', 'Lung_Opacity', 'Viral Pneumonia']

# Number of augmented variants written for every image of a selected class.
AUGMENTATIONS_PER_IMAGE = 3

# Create augmentation generator
augmenter = ImageDataGenerator(
    rotation_range=20,
    zoom_range=0.15,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.15,
    horizontal_flip=True,
    fill_mode="nearest"
)

# Copy every original image into the new folder and, for the selected
# classes, add augmented variants next to it.
for cls in os.listdir(train_dir):
    src = os.path.join(train_dir, cls)
    if not os.path.isdir(src):
        # Skip stray files (e.g. OS metadata) sitting beside the class folders;
        # os.listdir(src) would fail on them.
        continue

    dst = os.path.join(aug_dir, cls)
    os.makedirs(dst, exist_ok=True)

    for img_name in os.listdir(src):
        img_path = os.path.join(src, img_name)
        shutil.copy(img_path, os.path.join(dst, img_name))

        # Augment only if class is in the selected list
        if cls not in classes_to_augment:
            continue

        # flow() expects a batch, so add a leading batch axis of size 1.
        x = img_to_array(load_img(img_path))
        x = x.reshape((1,) + x.shape)

        # NOTE: with a shared 'aug' prefix the saved files are told apart only
        # by the suffix Keras appends, so variants of different images could
        # overwrite each other; including the source name keeps them distinct.
        flow = augmenter.flow(
            x,
            batch_size=1,
            save_to_dir=dst,
            save_prefix=f"aug_{os.path.splitext(img_name)[0]}",
            save_format='jpeg',
        )

        # Each batch pulled from the iterator saves one augmented image.
        # The previous `if i >= 3: break` inside the loop body pulled four
        # batches before stopping; draw exactly the intended number instead.
        for _ in range(AUGMENTATIONS_PER_IMAGE):
            next(flow)

display_data_distribution(aug_dir)  # Display augmented data distribution