## 1. Download Dataset from Kaggle

In [None]:
import kagglehub

# Fetch the COVID-19 Radiography Database through kagglehub and keep the local
# download location in `src_path`, which the preprocessing cell reads.
# Note: Ensure Kaggle credentials are configured (kaggle.json)
KAGGLE_DATASET = "tawsifurrahman/covid19-radiography-database"
src_path = kagglehub.dataset_download(KAGGLE_DATASET)
print(f"Dataset downloaded to: {src_path}")

## 2. Configure Data Directory

In [None]:
# Root directory under which later cells create their working folders
# ("data_for_split" and "dataset").
# Set this to a custom path (e.g., Google Drive mount in Colab).
# Leave empty ("") to use the current working directory.
DATA_DIR = ""  # Example: "/content/drive/MyDrive/covid_data"

## 3. Preprocess and Organize Data

In [None]:
import shutil
from pathlib import Path
import os

# Define paths.
# `source_root` is the extracted Kaggle archive; `target_root` is a flat
# one-folder-per-class staging area that the split step consumes.
# Path("") / "x" == Path("x"), so an empty DATA_DIR means the working directory.
source_root = Path(src_path) / "COVID-19_Radiography_Dataset"
target_root = Path(DATA_DIR) / "data_for_split"

# Class folders to process
classes = ["COVID", "Normal", "Lung_Opacity", "Viral Pneumonia"]


def _copy_class_images(source_dir, target_dir):
    """Copy every file of ``source_dir`` into ``target_dir``, skipping files already present.

    A file counts as already present when the destination exists with the same
    size, so an interrupted earlier run is repaired instead of being trusted.

    Args:
        source_dir: Path of the folder holding one class's images.
        target_dir: Path of the destination folder (created if missing).

    Returns:
        Tuple ``(copied, skipped)`` with the number of files in each category.

    Raises:
        FileNotFoundError: if ``source_dir`` does not exist. Without this check
            ``glob`` would yield nothing and the cell would report success
            after copying zero files.
    """
    if not source_dir.is_dir():
        raise FileNotFoundError(f"Expected image folder not found: {source_dir}")
    target_dir.mkdir(parents=True, exist_ok=True)

    copied = skipped = 0
    for file in source_dir.glob("*.*"):
        if not file.is_file():
            continue  # "*.*" can also match directories whose name contains a dot
        destination = target_dir / file.name
        if destination.exists() and destination.stat().st_size == file.stat().st_size:
            skipped += 1
            continue
        shutil.copy(file, destination)
        copied += 1
    return copied, skipped


# Copy images to a unified directory (files that already exist are skipped)
try:
    for cls in classes:
        copied, skipped = _copy_class_images(source_root / cls / "images", target_root / cls)
        print(f"{cls}: {copied} copied, {skipped} already present")
    print(f"✅ Images copied to: {target_root}")
except Exception as e:
    print(f"❌ Error during copy: {e}")
    # Re-raise so "Run All" stops here instead of splitting a partial dataset.
    raise

## 4. Split Data into Train/Val/Test

In [None]:
!pip install split-folders -q

import splitfolders

# Define output directory
base_dir = Path(DATA_DIR) / "dataset" if DATA_DIR else Path("dataset")

# Split data into 70% train, 20% validation, 10% test
splitfolders.ratio(
    str(target_root),  # Input directory
    output=str(base_dir),
    seed=42,  # For reproducibility
    ratio=(0.7, 0.2, 0.1),
    group_prefix=None
)

print(f"✅ Data split into: {base_dir}")

## 5. Analyze Class Distribution (Before Augmentation)

In [None]:
# Training split directory created by the split step (one sub-folder per class).
train_path = base_dir / "train"
# NOTE(review): `display_data_distribution` is not defined in the visible part
# of this notebook. It must be defined or imported in an earlier cell, otherwise
# this call raises NameError on a fresh kernel — confirm.
display_data_distribution(train_path)  # Custom function to plot class counts

**Observation:** The training set shows significant class imbalance (e.g., fewer `COVID` images than `Normal`). We'll mitigate this in the next section by adding augmented copies of the minority-class images.

## 6. Apply Data Augmentation

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array

# Configure augmentation for minority classes only
aug_dir = base_dir / "train_augmented"
classes_to_augment = ['COVID', 'Lung_Opacity', 'Viral Pneumonia']  # Skip 'Normal'

# Number of augmented variants written per minority-class image.
AUGMENTATIONS_PER_IMAGE = 1
# File-name prefix that marks generated images (also used to purge old ones).
AUG_PREFIX = "aug"

# Augmentation settings (preserve medical relevance)
# NOTE(review): horizontal_flip mirrors left/right anatomy — confirm this is
# acceptable for the downstream task.
augmenter = ImageDataGenerator(
    rotation_range=15,  # Avoid extreme rotations
    zoom_range=0.1,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
    fill_mode="constant",  # Areas exposed by a transform are filled with `cval`
    cval=0.0  # 0.0 -> black borders
)

# Copy original images and augment minority classes
try:
    for cls in sorted(os.listdir(train_path)):
        src = train_path / cls
        if not src.is_dir():
            continue  # ignore stray files (e.g. .DS_Store) next to the class folders
        dst = aug_dir / cls
        os.makedirs(dst, exist_ok=True)

        # Remove augmented files left by a previous run so that re-executing
        # this cell does not keep inflating the class counts.
        for stale in dst.glob(f"{AUG_PREFIX}_*"):
            if stale.is_file():
                stale.unlink()

        for img_name in sorted(os.listdir(src)):
            if not (src / img_name).is_file():
                continue

            # Copy the original image unchanged
            shutil.copy(src / img_name, dst / img_name)

            # Augment only minority classes
            if cls not in classes_to_augment:
                continue

            img = load_img(src / img_name)
            pixels = img_to_array(img)
            x = pixels.reshape((1,) + pixels.shape)  # add the leading batch axis

            # Keras names saved files "<prefix>_<index>_<random 0..9999>.<fmt>".
            # With a constant prefix and a one-image batch (index always 0)
            # only 10,000 distinct names exist per folder, so augmented images
            # silently overwrite each other. Embedding the source stem and the
            # variant number in the prefix makes every output name unique.
            stem = os.path.splitext(img_name)[0]
            for variant in range(AUGMENTATIONS_PER_IMAGE):
                next(augmenter.flow(
                    x,
                    batch_size=1,
                    save_to_dir=dst,
                    save_prefix=f"{AUG_PREFIX}_{stem}_{variant}",
                    save_format='png',
                ))

    print(f"✅ Augmented data saved to: {aug_dir}")
except Exception as e:
    print(f"❌ Augmentation failed: {e}")
    # Re-raise so later cells do not run against a partially written directory.
    raise

## 7. Verify Augmented Distribution

In [None]:
# Re-plot the per-class counts for the augmented training directory.
# NOTE(review): adding a fixed number of variants per minority image narrows the
# gap but does not by itself guarantee equal class counts — check the plot.
display_data_distribution(aug_dir)  # Inspect class counts after augmentation