In [1]:
import os
import io 
from PIL import Image
import random
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras import layers, Sequential
from tensorflow.keras.utils import image_dataset_from_directory, load_img, img_to_array
from tensorflow.keras.preprocessing.image import ImageDataGenerator


In [51]:
print(len(sorted(os.listdir(os.path.join("../../data/food/train")))))
print(len(sorted(os.listdir(os.path.join("../../data//food/test")))))
print(len(sorted(os.listdir(os.path.join("../../data/food/val")))))

#took out bread, butter, flour, pasta, red_wine
all_ingredients = [
                    "apple",'asparagus', 'avocado'
                   , 'banana', 'beef', 'beetroot', 'blueberry', 'broccoli'
                   , 'cabbage', 'capsicum', 'carrot', 'cauliflower', 'celery', 'chicken'
                   , 'cod', 'coconut', 'corn', 'cranberry', 'cucumber'
                   , 'eggs', 'eggplant', 'garlic', 'ginger', 'grape'
                   , 'honey', 'kiwi', 'leek', 'lemon', 'lettuce', 'lime'
                   , "milk", 'mushroom', 'onion', 'orange'
                   , 'pineapple', 'pork', 'potato', 
                    'prawns', 'pumpkin'
                   , 'salmon', 'spinach', 'strawberry'
                   , 'tomato'
                   , 'white_wine'
                   , 'zucchini'
                    ]

print(len(all_ingredients))

46
46
46
45


In [38]:
#----Create image augmentation generator
datagen = ImageDataGenerator(rotation_range = 90,
                            horizontal_flip = True,
                            vertical_flip = True,
                            width_shift_range = 0.2,
                            height_shift_range = 0.2,
                            shear_range = 0.2,
                            zoom_range = 0.3,
                            brightness_range = (0.5,1))


In [43]:
def data_aug(parent_folder_path: str, dataset: str, ingred_name: str, total_num_images_required: dict, save_to_dir: str):    
    
    #----Get list of file names from sub_folder
    src_path = os.path.join(parent_folder_path, dataset, ingred_name)
    img_name_list = os.listdir(src_path)
    
    #----Remove hidden file(s) in img_name_list
    for img_name in img_name_list:
        if img_name[0] == ".":
            img_name_list.remove(img_name)
    
    #----Total number of images required in folder
    if dataset == "train":
        total_num_images_req = total_num_images_required["train"]
    elif dataset == "val":
        total_num_images_req = total_num_images_required["val"]
    elif dataset == "test":
        total_num_images_req = total_num_images_required["test"]
        
    #----number of images needed for augmenting
    if len(img_name_list)*2 > total_num_images_req:
        random.shuffle(img_name_list)
        new_img_name_list = img_name_list[:total_num_images_req-len(img_name_list)]
    else:
        new_img_name_list = img_name_list
    
    print(f"Number of original images in {ingred_name} {dataset} folder: {len(img_name_list)}")
    print(f"Total number of {ingred_name} {dataset} images required: {total_num_images_req}")                                                               
    print(f"Number of images used for {ingred_name} {dataset} augmentation: {len(new_img_name_list)}")
    
    #----lopping through each image file...
    for original_img_name in new_img_name_list:

        #----load the image
        original_img_file_path = os.path.join(parent_folder_path, dataset, ingred_name, original_img_name) 
        loading_img = load_img(original_img_file_path)

        #----convert image to 3D array
        img_3d = img_to_array(loading_img)

        #----convert 3d image to 4d (a list with one 3d image)
        img_list = np.expand_dims(img_3d, axis=0)
        
        
        #(can be replaced)----Create image iterator 
        image_iterator = datagen.flow(img_list, batch_size=1
                                  ,save_to_dir=save_to_dir, save_prefix=f'aug_{original_img_name[0:-4]}'
                                      , save_format='jpg')

        #(can be replaced)----Save image
        for i in range(total_num_images_req//len(img_name_list)):
            image_iterator.next()
        
    print(f"Total number of images now in {ingred_name} {dataset} folder: {len(os.listdir(src_path))}")
    print(f"========================")

            

In [49]:
############----SET INPUTS----################

#datasets: train, val, test
dataset_list = ["test", "train", "val"]
# dataset_list =["test"]

#Set total number of images needed for each class
total_num_images_required = {"train": 1000, "test": 200, "val": 200}


#----Set original image folder path
parent_folder_path = "../../data/food"


In [50]:
#----Calling function
for ingred_name in all_ingredients_test:   
    for dataset in dataset_list:
        save_to_dir = os.path.join(parent_folder_path, dataset, ingred_name)
        data_aug(parent_folder_path, dataset, ingred_name, total_num_images_required, save_to_dir)    


Number of original images in milk test folder: 32
Total number of milk test images required: 200
Number of images used for milk test augmentation: 32
Total number of images now in milk test folder: 224
Number of original images in milk train folder: 143
Total number of milk train images required: 1000
Number of images used for milk train augmentation: 143
Total number of images now in milk train folder: 1001
Number of original images in milk val folder: 30
Total number of milk val images required: 200
Number of images used for milk val augmentation: 30
Total number of images now in milk val folder: 210


In [33]:
#----EXTRA CODE FOR VIEWING THE AUGMENTED IMAGES, SEE LINES WITH "(can be replaced)"

# image_iterator = datagen.flow(img_list, batch_size=1)

# rows = 2
# columns = 2

# fig, axes = plt.subplots(rows, columns)
# fig.set_size_inches(10,10)

# for r in range(rows):
#     for c in range(columns):
#         image_batch = image_iterator.next()
#         image = image_batch[0].astype('uint8')
#         axes[r,c].imshow(image)

In [57]:
#----Making sure number of files in each folder is correct
for ingred_name in all_ingredients:
    for dataset in dataset_list:
        print(f"Total number of images in {ingred_name} {dataset} folder: \
        {len(os.listdir(os.path.join(parent_folder_path, dataset, ingred_name)))}")
    print("=============")

Total number of images in apple test folder:         201
Total number of images in apple train folder:         1000
Total number of images in apple val folder:         200
Total number of images in asparagus test folder:         208
Total number of images in asparagus train folder:         1008
Total number of images in asparagus val folder:         207
Total number of images in avocado test folder:         201
Total number of images in avocado train folder:         1000
Total number of images in avocado val folder:         200
Total number of images in banana test folder:         201
Total number of images in banana train folder:         1000
Total number of images in banana val folder:         200
Total number of images in beef test folder:         209
Total number of images in beef train folder:         1053
Total number of images in beef val folder:         225
Total number of images in beetroot test folder:         210
Total number of images in beetroot train folder:         1056
