# 01 - CNN preprocessing comparison

__Andrés Felipe García Albarracín <br>
May 7, 2021__

In [1]:
import tensorflow as tf
import os
from shutil import copyfile

## 1. Organize the data into training / validation folders

In [4]:
# Run this cell the first time the notebook is used: it decides whether the
# dataset still has to be extracted and split.

trainingFolder = "../Datasets/flowers/training"
validationFolder = "../Datasets/flowers/validation"

# Setup is required as soon as at least one of the two folders is missing
firstTime = (
    not os.path.exists(trainingFolder)
    or not os.path.exists(validationFolder)
)

def safeCreateFolder(folderPath):
    """
    Creates a folder if it does not exist

    Missing parent folders are created as well, and calling the function on
    a folder that already exists is a no-op.

    Args:
        folderPath: Path of the folder to create.

    Raises:
        FileExistsError: If folderPath exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + os.mkdir,
    # and makedirs (unlike mkdir) does not fail when a parent is missing
    os.makedirs(folderPath, exist_ok=True)

if firstTime:
    # Unzip files
    from zipfile import ZipFile

    with ZipFile("../Datasets/archive.zip", 'r') as zipObj:
        zipObj.extractall("../Datasets/flowers")

    # Create folders
    safeCreateFolder(trainingFolder)
    safeCreateFolder(validationFolder)

    # Split function
    def splitFiles(sourcePath, trainingPath, validationPath, splitSize):
        """
        Function that splits the files from sourcePath in trainingPath and
        validationPath, according to the splitSize

        Every sub-folder of sourcePath is mirrored in both trainingPath and
        validationPath. Within each sub-folder the files are taken in sorted
        name order: the files whose index is below splitSize * (number of
        files) are copied to the training side, the rest to the validation
        side.

        Args:
            sourcePath: Folder that contains one sub-folder per class.
            trainingPath: Existing folder that receives the training copies.
            validationPath: Existing folder that receives the validation
                copies.
            splitSize: Fraction of the files of each sub-folder that goes to
                the training side (e.g. 0.7).
        """
        # os.listdir returns entries in arbitrary order; sorting makes the
        # split reproducible between runs and machines
        for folder in sorted(os.listdir(sourcePath)):
            sourceFolder = os.path.join(sourcePath, folder)
            # Stray files next to the class folders would make the inner
            # os.listdir call fail, so they are skipped
            if not os.path.isdir(sourceFolder):
                continue

            trainingClassFolder = os.path.join(trainingPath, folder)
            validationClassFolder = os.path.join(validationPath, folder)
            safeCreateFolder(trainingClassFolder)
            safeCreateFolder(validationClassFolder)

            # copyfile only handles regular files, so nested folders are
            # left out of the split
            fileList = sorted(
                file for file in os.listdir(sourceFolder)
                if os.path.isfile(os.path.join(sourceFolder, file))
            )

            # Loop-invariant threshold between training and validation files
            trainingLimit = splitSize * len(fileList)
            for n, file in enumerate(fileList):
                if n < trainingLimit:
                    targetFolder = trainingClassFolder
                else:
                    targetFolder = validationClassFolder
                copyfile(
                    os.path.join(sourceFolder, file),
                    os.path.join(targetFolder, file)
                )

    # Call the functions
    splitRatio = 0.7
    splitFiles(
        sourcePath = "../Datasets/flowers/flowers",
        trainingPath = trainingFolder,
        validationPath = validationFolder,
        splitSize = splitRatio
    )

## 2. Define the model to use

In [5]:
def createAndCompile(numClases):
    """
    Builds and compiles a CNN classifier

    The network stacks two convolution + max-pooling stages (64 and 128
    filters of size 3x3, ReLU activation), flattens the result and feeds it
    to dense layers of 1024 and 64 units (ReLU) with a 0.2 dropout between
    them. The output layer has numClases units with softmax activation.

    The model is compiled with the Adam optimizer (default settings), the
    categorical cross-entropy loss and the accuracy metric.

    Args:
        numClases: Number of units of the softmax output layer.

    Returns:
        The compiled tf.keras.Sequential model.
    """
    layers = tf.keras.layers

    # Convolutional feature extractor
    featureLayers = [
        layers.Conv2D(filters = 64, kernel_size = (3, 3), activation = "relu"),
        layers.MaxPool2D(pool_size = 2, strides = 2),
        layers.Conv2D(filters = 128, kernel_size = (3, 3), activation = "relu"),
        layers.MaxPool2D(pool_size = 2, strides = 2),
    ]

    # Dense classification head
    headLayers = [
        layers.Flatten(),
        layers.Dense(units = 1024, activation = "relu"),
        layers.Dropout(rate = 0.2),
        layers.Dense(units = 64, activation = "relu"),
        layers.Dense(units = numClases, activation = "softmax"),
    ]

    network = tf.keras.Sequential(featureLayers + headLayers)

    adamOptimizer = tf.keras.optimizers.Adam()
    crossEntropy = tf.keras.losses.CategoricalCrossentropy()
    network.compile(
        optimizer = adamOptimizer,
        loss = crossEntropy,
        metrics = ["accuracy"]
    )
    return network


ModuleNotFoundError: No module named 'guppy'