<a href="https://colab.research.google.com/github/cgyireh1/SickleClinix25/blob/main/notebooks/SickleClinix_Data_Augmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SickleClinix Data Augmentation

Total dataset size: 991 images

In [None]:
# Root folder of the original dataset on the mounted Google Drive.
# NOTE: `dir` shadows the builtin of the same name; the name is kept as-is
# because the folder-count cell below reads it.
dir = '/content/drive/MyDrive/CAPSTONE/sickle-dataset'
from pathlib import Path

# Count the .jpg files sitting exactly two folder levels below the root
# (<class>/<subfolder>/<image>.jpg). The glob pattern is case-sensitive.
num_images = sum(1 for _ in Path(dir).glob('*/*/*.jpg'))
num_images

991

In [None]:
# Report, for every subfolder of each class folder, how many .jpg files it
# contains (extension compared case-insensitively).
import os

for category in ('Positive', 'Negative'):
    category_path = os.path.join(dir, category)

    for entry in os.listdir(category_path):
        entry_path = os.path.join(category_path, entry)

        # Plain files that sit directly in the class folder are skipped.
        if not os.path.isdir(entry_path):
            continue

        jpg_names = [name for name in os.listdir(entry_path) if name.lower().endswith('.jpg')]
        print(f"{entry} ({category}): {len(jpg_names)} files")

Labelled (Positive): 422 files
Unlabelled (Positive): 422 files
Clear (Negative): 147 files


## **Data Augmentation**

Data augmentation is a technique for expanding and diversifying datasets, particularly in image processing. By applying various transformations to existing data, we can create new training examples that help improve model generalization, reduce overfitting, and enhance robustness.

### **Tools and Libraries for Image Data Augmentation**

**TensorFlow:** TensorFlow’s tf.image module provides functions for image transformations.

**Keras:** Keras offers the ImageDataGenerator class for real-time data augmentation.

**PyTorch:** PyTorch’s torchvision.transforms module includes a wide range of augmentation techniques.

**Albumentations:** A fast image augmentation library with a rich set of transformations.

**imgaug:** A flexible library for image augmentation with support for various augmentations. (GeeksforGeeks, 2025)



### **Notes on the original dataset:**
- Imbalanced data: 844 Positive images (422 Labelled + 422 Unlabelled) versus 147 Negative (Clear) images.

In [None]:
# import necessary libraries
import os
import cv2
import shutil
import numpy as np
from tqdm import tqdm
import tensorflow as tf
import albumentations as A
from datetime import datetime
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [None]:
%%capture
!pip install -q albumentations opencv-python

In [None]:
# Source dataset and the destination root for the (not yet sorted) augmented images.
base_dir = '/content/drive/MyDrive/CAPSTONE/sickle-dataset'
nosorting_dir = '/content/drive/MyDrive/CAPSTONE/sickle-data-augmented/Augmented_Without-Sorting'

# Augmentation settings
# Each pipeline applies its transforms in the listed order; every transform
# fires independently with its own probability `p`.

# Pipeline for the Positive images: flips, small rotations/shifts/scales,
# brightness/contrast changes and occasional blur/noise.
positive_aug = A.Compose([
    A.HorizontalFlip(p=0.5),
    A.Rotate(limit=20, p=0.7),
    A.RandomBrightnessContrast(p=0.5),
    A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=15, p=0.7),
    A.GaussianBlur(p=0.2),
    A.GaussNoise(p=0.2),
])

# Pipeline for the Negative images: higher probabilities and wider limits than
# the Positive pipeline, plus elastic and grid distortions.
# NOTE(review): presumably stronger because far more synthetic Negative
# images are requested from fewer originals — confirm with the author.
negative_aug = A.Compose([
    A.HorizontalFlip(p=0.7),
    A.Rotate(limit=30, p=0.8),
    A.RandomBrightnessContrast(p=0.6),
    A.GaussNoise(p=0.3),
    # The former `alpha_affine=20` argument was removed: the installed
    # Albumentations release does not accept it for ElasticTransform and only
    # warned about it while ignoring the value.
    A.ElasticTransform(alpha=1, sigma=50, p=0.3),
    A.GridDistortion(p=0.3),
    A.GaussianBlur(p=0.3),
    A.ShiftScaleRotate(shift_limit=0.2, scale_limit=0.2, rotate_limit=25, p=0.8),
])


  original_init(self, **validated_kwargs)
  A.ElasticTransform(alpha=1, sigma=50, alpha_affine=20, p=0.3),


In [None]:
# Augmentation function to apply the augmentation
def augment_images(input_folder, output_folder, target_count, augmenter):
    """Generate augmented JPEGs until ``output_folder`` holds ``target_count``.

    The source images are cycled through repeatedly; each pass writes one
    augmented copy per readable image until enough files exist. Re-running
    the function resumes from whatever is already in ``output_folder``.

    Args:
        input_folder: Folder containing the source ``.jpg`` images.
        output_folder: Folder that receives the augmented images; it is
            created if it does not exist.
        target_count: Desired number of ``.jpg`` files in ``output_folder``.
        augmenter: Callable invoked as ``augmenter(image=<RGB array>)`` that
            returns a mapping whose ``'image'`` entry is the augmented image
            (for example an ``albumentations.Compose`` pipeline).

    Returns:
        None. Progress and the final file count are printed.
    """
    os.makedirs(output_folder, exist_ok=True)

    def _is_jpg(name):
        return name.lower().endswith('.jpg')

    images = [f for f in os.listdir(input_folder) if _is_jpg(f)]

    # Count only JPEGs so that stray entries in the output folder do not
    # reduce the number of images generated.
    current_count = sum(1 for f in os.listdir(output_folder) if _is_jpg(f))
    needed = target_count - current_count

    print(f"\n Augmenting {input_folder} → {output_folder}")
    print(f"Currently: {current_count}, Need: {needed}")

    if needed <= 0:
        print(" Already balanced.")
        return

    i = 0
    while i < needed:
        written_this_pass = 0

        for img_name in images:
            img_path = os.path.join(input_folder, img_name)
            image = cv2.imread(img_path)

            # cv2.imread returns None for unreadable/corrupt files.
            if image is None:
                continue

            # OpenCV loads BGR; the augmenter is fed RGB and the result is
            # converted back before saving.
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            augmented = augmenter(image=image)['image']
            aug_img = cv2.cvtColor(augmented, cv2.COLOR_RGB2BGR)

            # The running index is appended so two files written within the
            # same microsecond can never overwrite each other.
            timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")
            out_path = os.path.join(output_folder, f"aug_{timestamp}_{i:05d}.jpg")

            # Only count images that were actually written to disk.
            if not cv2.imwrite(out_path, aug_img):
                continue

            i += 1
            written_this_pass += 1
            if i >= needed:
                break

        # A full pass that produced nothing (no source images, or none that
        # could be read/written) would otherwise repeat forever.
        if written_this_pass == 0:
            print(" No usable source images; stopping early.")
            break

    final_count = sum(1 for f in os.listdir(output_folder) if _is_jpg(f))
    print(f" Final count: {final_count}")


# One augmentation job per source subfolder:
# (source folder, output folder, target number of files, augmentation pipeline).
pairs = [
    (
        f'{base_dir}/{category}/{subfolder}',
        f'{nosorting_dir}/{category}/{subfolder}_augmented',
        target,
        pipeline,
    )
    for category, subfolder, target, pipeline in (
        # Positive class
        ('Positive', 'Labelled', 500, positive_aug),
        ('Positive', 'Unlabelled', 500, positive_aug),
        # Negative class
        ('Negative', 'Clear', 1000, negative_aug),
    )
]

# Run the jobs in the order listed above.
for job in pairs:
    augment_images(*job)


 Augmenting /content/drive/MyDrive/CAPSTONE/sickle-dataset/Positive/Labelled → /content/drive/MyDrive/CAPSTONE/sickle-data-augmented/Augmented_Without-Sorting/Positive/Labelled_augmented
Currently: 0, Need: 500
 Final count: 500

 Augmenting /content/drive/MyDrive/CAPSTONE/sickle-dataset/Positive/Unlabelled → /content/drive/MyDrive/CAPSTONE/sickle-data-augmented/Augmented_Without-Sorting/Positive/Unlabelled_augmented
Currently: 0, Need: 500
 Final count: 500

 Augmenting /content/drive/MyDrive/CAPSTONE/sickle-dataset/Negative/Clear → /content/drive/MyDrive/CAPSTONE/sickle-data-augmented/Augmented_Without-Sorting/Negative/Clear_augmented
Currently: 722, Need: 278
 Final count: 1000


In [None]:
# Confirm the number of images generated: print the .jpg count of every
# subfolder under each class folder of the augmented output.
for category in ('Positive', 'Negative'):
    category_path = os.path.join(nosorting_dir, category)

    for entry in os.listdir(category_path):
        entry_path = os.path.join(category_path, entry)

        # Only directories are reported.
        if not os.path.isdir(entry_path):
            continue

        jpg_names = [name for name in os.listdir(entry_path) if name.lower().endswith('.jpg')]
        print(f"{entry} ({category}): {len(jpg_names)} files")

Labelled_augmented (Positive): 500 files
Unlabelled_augmented (Positive): 500 files
Clear_augmented (Negative): 1000 files


### Sorting data into Sickle and Normal folders for training

In [None]:
# Roots: original dataset, augmented-but-unsorted output, and the final
# sorted dataset. NOTE: `dir` shadows the builtin of the same name.
dir = '/content/drive/MyDrive/CAPSTONE/sickle-dataset'
nosort_dir = '/content/drive/MyDrive/CAPSTONE/sickle-data-augmented/Augmented_Without-Sorting'
aug_dir  = '/content/drive/MyDrive/CAPSTONE/sickle-data-augmented/Augmented-Sorted'

# Input folder paths: original images first, then their augmented counterparts.
positive_root = os.path.join(dir, 'Positive')
labelled = os.path.join(positive_root, 'Labelled')
unlabelled = os.path.join(positive_root, 'Unlabelled')
clear = os.path.join(dir, 'Negative', 'Clear')

positive_aug_root = os.path.join(nosort_dir, 'Positive')
labelled_aug = os.path.join(positive_aug_root, 'Labelled_augmented')
unlabelled_aug = os.path.join(positive_aug_root, 'Unlabelled_augmented')
clear_aug = os.path.join(nosort_dir, 'Negative', 'Clear_augmented')

# Output folder paths: one folder per class.
sickle, normal = (os.path.join(aug_dir, label) for label in ('Sickle', 'Normal'))

# Create output folders (no error if they already exist).
for class_dir in (sickle, normal):
    os.makedirs(class_dir, exist_ok=True)

In [None]:
# Copy (the sources are left in place) every file into its class folder.
# Each copy is renamed to "<source folder name>_<original filename>" so that
# same-named files from different source folders do not overwrite each other.
copy_plan = (
    # Positive images -> Sickle
    (sickle, (labelled_aug, unlabelled_aug, labelled, unlabelled)),
    # Negative images -> Normal
    (normal, (clear_aug, clear)),
)

for target_dir, source_dirs in copy_plan:
    for source_dir in source_dirs:
        prefix = os.path.basename(source_dir)
        for filename in os.listdir(source_dir):
            src = os.path.join(source_dir, filename)
            # Sub-directories are skipped; only regular files are copied.
            if os.path.isfile(src):
                shutil.copy(src, os.path.join(target_dir, f"{prefix}_{filename}"))

In [None]:
# Report how many entries ended up in each class folder.
print(" Total images in:")
for label, folder in (("Sickle", sickle), ("Normal", normal)):
    print(f"- {label}: {len(os.listdir(folder))}")

 Total images in:
- Sickle: 1844
- Normal: 1147


**References**

What is data augmentation? How does data augmentation work for images? (2025). GeeksforGeeks. [Link to Page](https://colab.research.google.com/drive/15wFJHq6CCNOV_B-XX4LN_dP64XEb6iBh#scrollTo=ZOlyJUowFtd4&line=5&uniqifier=1)

Dataset: Florence Tushabe, Sickle Cell Disease Dataset, Kaggle (2024). Available at: https://www.kaggle.com/datasets/florencetushabe/sickle-cell-disease-dataset

