# 01 - CNN preprocessing comparison

__Andrés Felipe García Albarracín <br>
May 7, 2021__

In [1]:
import tensorflow as tf
import os
from shutil import copyfile
import pandas as pd
import time
import math

## 1. Organize the data into training / validation folders

In [2]:
# Locations of the split dataset. The split only has to be generated when
# at least one of the two folders is still missing.

trainingFolder = "../Datasets/flowers/training"
validationFolder = "../Datasets/flowers/validation"
firstTime = not os.path.exists(trainingFolder) or not os.path.exists(validationFolder)

def safeCreateFolder(folderPath):
    """
    Creates a folder if it does not exist.

    Missing parent folders are created as well, and calling the function on
    an already existing folder is a no-op.

    Parameters
    ----------
    folderPath : str
        Path of the folder to create.

    Raises
    ------
    FileExistsError
        If folderPath already exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of a separate
    # os.path.exists() test followed by os.mkdir().
    os.makedirs(folderPath, exist_ok=True)

def splitFiles(sourcePath, trainingPath, validationPath, splitSize):
    """
    Function that splits the files from sourcePath in trainingPath and
    validationPath, according to the splitSize.

    Every sub-folder of sourcePath is mirrored inside trainingPath and
    validationPath. Within each sub-folder the files are taken in sorted
    name order: the first ``splitSize`` fraction is copied to the training
    folder and the remaining files to the validation folder. The source
    files are copied, not moved.

    Parameters
    ----------
    sourcePath : str
        Folder that contains one sub-folder per class.
    trainingPath : str
        Destination root for the training files.
    validationPath : str
        Destination root for the validation files.
    splitSize : float
        Fraction of each sub-folder's files that goes to training.
    """
    # Sorted listings make the split reproducible; os.listdir() returns
    # entries in arbitrary order.
    for folder in sorted(os.listdir(sourcePath)):
        sourceFolder = os.path.join(sourcePath, folder)
        if not os.path.isdir(sourceFolder):
            # Stray files next to the class folders are not classes
            continue

        trainingTarget = os.path.join(trainingPath, folder)
        validationTarget = os.path.join(validationPath, folder)
        safeCreateFolder(trainingTarget)
        safeCreateFolder(validationTarget)

        # Only regular files can be handed to copyfile
        fileList = [
            file for file in sorted(os.listdir(sourceFolder))
            if os.path.isfile(os.path.join(sourceFolder, file))
        ]
        trainingLimit = splitSize * len(fileList)
        for n, file in enumerate(fileList):
            targetFolder = trainingTarget if n < trainingLimit else validationTarget
            copyfile(
                os.path.join(sourceFolder, file),
                os.path.join(targetFolder, file)
            )

if firstTime:
    # Unzip files
    from zipfile import ZipFile

    with ZipFile("../Datasets/archive.zip", 'r') as zipObj:
        zipObj.extractall("../Datasets/flowers")

    # Create folders
    safeCreateFolder(trainingFolder)
    safeCreateFolder(validationFolder)

    # Call the functions
    splitRatio = 0.7
    splitFiles(
        sourcePath = "../Datasets/flowers/flowers",
        trainingPath = trainingFolder,
        validationPath = validationFolder,
        splitSize = splitRatio
    )

## 2. Define the model to use

In [3]:
def createAndCompile(numClases):
    """
    Build and compile the CNN shared by all the experiments.

    The network stacks two Conv2D + MaxPool2D stages, flattens the result
    and applies Dense layers of 256 and 64 units, each preceded by a
    Dropout(0.4). A final Dropout(0.4) feeds a softmax layer with
    numClases outputs. The model is compiled with the Adam optimizer,
    categorical cross-entropy loss and the accuracy metric.

    Parameters
    ----------
    numClases : int
        Number of units of the softmax output layer.

    Returns
    -------
    tf.keras.Sequential
        The compiled model.
    """
    layers = tf.keras.layers

    # Convolutional feature extractor
    stack = [
        layers.Conv2D(16, (3,3), activation = 'relu'),
        layers.MaxPool2D(2,2),
        layers.Conv2D(32, (3,3), activation = 'relu'),
        layers.MaxPool2D(2,2),
        layers.Flatten(),
    ]
    # Dense classifier head, with dropout before every dense layer
    stack += [
        layers.Dropout(0.4),
        layers.Dense(256, activation = 'relu'),
        layers.Dropout(0.4),
        layers.Dense(64, activation = 'relu'),
        layers.Dropout(0.4),
        layers.Dense(numClases, activation = 'softmax'),
    ]

    network = tf.keras.Sequential(stack)
    network.compile(
        optimizer = tf.keras.optimizers.Adam(),
        loss = tf.keras.losses.CategoricalCrossentropy(),
        metrics = ['accuracy']
    )
    return network


In [4]:
# One class per sub-folder of the training folder. Only directories are
# counted, so a stray file in the folder cannot inflate the size of the
# model's output layer.
numClases = sum(
    os.path.isdir(os.path.join(trainingFolder, entry))
    for entry in os.listdir(trainingFolder)
)

## 3. Strategies

In [5]:
# Empty table that collects one row of timings per experiment
resultColumns = ['Experiment', 'Training time', 'Evaluation time']
dfResults = pd.DataFrame(columns = resultColumns)

### 3.1 Image Data Generator

In [6]:
batch_size = 8

# Settings shared by the training and the validation pipelines:
# pixel values are multiplied by 1/255 and images are resized to 150x150.
generatorOptions = dict(rescale = 1./255)
flowOptions = dict(
    batch_size = batch_size,
    target_size = (150,150),
    class_mode = 'categorical'
)

trainDataGen = tf.keras.preprocessing.image.ImageDataGenerator(**generatorOptions)
train_generator = trainDataGen.flow_from_directory(trainingFolder, **flowOptions)

validationDataGen = tf.keras.preprocessing.image.ImageDataGenerator(**generatorOptions)
validation_generator = validationDataGen.flow_from_directory(validationFolder, **flowOptions)

Found 3028 images belonging to 5 classes.
Found 1295 images belonging to 5 classes.


In [7]:
# New, untrained model for the ImageDataGenerator experiment
model1 = createAndCompile(numClases = numClases)

In [8]:
# Elapsed times are measured with time.perf_counter(): it is monotonic and
# high resolution, whereas time.time() follows the system clock and can
# jump if that clock is adjusted during the run.

# Training time
start = time.perf_counter()
model1.fit(
    train_generator,
    epochs = 10,
    steps_per_epoch = math.ceil(train_generator.samples/batch_size)
)
trainTime = time.perf_counter() - start

# Evaluation time on the validation set
start = time.perf_counter()
model1.evaluate(
    validation_generator,
    steps = math.ceil(validation_generator.samples/batch_size)
)
evalTime = time.perf_counter() - start

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [9]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# so the new row is built as a one-row frame and concatenated instead.
experimentRow = pd.DataFrame(
    [{
        'Experiment': '1. ImageDataGenerator',
        'Training time': trainTime,
        'Evaluation time': evalTime
    }],
    columns = dfResults.columns
)
# Concatenating with an empty frame is skipped: recent pandas versions
# warn about it and it would not contribute any rows.
dfResults = (
    experimentRow
    if dfResults.empty
    else pd.concat([dfResults, experimentRow], ignore_index=True)
)

In [10]:
# Show the timing results collected so far
dfResults

Unnamed: 0,Experiment,Training time,Evaluation time
0,1. ImageDataGenerator,90.574544,4.466817
