# Data Augmentation

About the data:
The dataset consists of 2 folders: yes or no which contains 253 brain MRI images.
The folder yes contains 155 Brain MRI Images that are tumorous and the folder no contains 98 Brain MRI Images that are non-tumorous.
Since, the dataset is small. a technique called Data Augmentation is used to solve the data imbalance issue.

In [5]:
import tensorflow as tf
from keras.preprocessing.image import ImageDataGenerator
import cv2
import imutils
import matplotlib.pyplot as plt
from os import listdir

%matplotlib inline

In [12]:
def augment_data(file_path, n_generated_samples, save_dir):

    data_gen = ImageDataGenerator(rotation_range=10, 
                                  width_shift_range=0.1, 
                                  height_shift_range=0.1, 
                                  shear_range=0.1, 
                                  brightness_range=(0.3, 1.0),
                                  horizontal_flip=True, 
                                  vertical_flip=True, 
                                  fill_mode='nearest')
    
    for filename in listdir(file_path):
        #load the image
        image = cv2.imread(file_path + '\\' + filename)
        #reshape the image
        image = image.reshape((1,) + image.shape)
        #prefix the names of the generated samples
        save_prefix = 'aug_' + filename[:-4]
        #generate n_generated_samples images
        i = 0
        for batch in data_gen.flow(x = image,  batch_size = 1, save_to_dir = save_dir,
                                      save_prefix = save_prefix, save_format = 'jpg'):
            i += 1
            if i > n_generated_samples:
                break

In [13]:
augmented_data_path = 'augmented_data/'

#augment data for the examples with label 'yes' representing tumorous examples
augment_data(file_path = 'dataset/yes/', n_generated_samples = 6, save_dir = augmented_data_path + 'yes')
#augment data for the examples with label 'no' representing non-tumorous examples
augment_data(file_path = 'dataset/no/', n_generated_samples = 9, save_dir = augmented_data_path + 'no')


In [14]:
def data_summary(main_path):
    yes_path = main_path + 'yes'
    no_path = main_path + 'no'
    
    #number of files(images) that are in the folder 'yes' representing tumorous examples
    no_of_pos = len(listdir(yes_path))
    #number of files(images) that are in the folder 'no' representing non-tumorous examples
    no_of_neg = len(listdir(no_path))
    
    total = no_of_pos + no_of_neg
    
    pos_perc = (no_of_pos * 100.0)/total
    neg_perc = (no_of_neg * 100.0)/total
    
    print(f"Total number of examples: {total}")
    print(f"Percentage of positive examples: {pos_perc}%, Number of positive examples: {no_of_pos}")
    print(f"Percentage of negative examples: {neg_perc}%, Number of negative examples: {no_of_neg}")
    

In [15]:
data_summary(augmented_data_path)

Total number of examples: 2064
Percentage of positive examples: 52.616279069767444%, Number of positive examples: 1086
Percentage of negative examples: 47.383720930232556%, Number of negative examples: 978
