# Introduction

This notebooks prepare our dataset before the training. 

# Imports

In [1]:
import numpy as np
import tensorflow as tf
import datetime; 
import pandas as pd
import matplotlib.pyplot as plt
import math

from shutil import copy2, rmtree
from tqdm import tqdm
from sys import stdout
from os import listdir, makedirs, remove
from os.path import isfile, join, isdir, exists, dirname
from tensorflow import keras
from numpy.random import seed
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Project modules below
from lib import util

In [2]:
# Load Inception V3.

InceptionV3        = keras.applications.inception_v3.InceptionV3
preprocess_input   = keras.applications.inception_v3.preprocess_input
image              = keras.preprocessing.image
Model              = keras.models.Model
Dense              = keras.layers.Dense
ImageDataGenerator = keras.preprocessing.image.ImageDataGenerator


In [3]:
inceptionV3Model = InceptionV3(weights='imagenet', include_top=False, pooling=  'avg')
# Uncomment to describe the inception v3 summary model
#print(inceptionV3Model.summary())

# Variables

In [4]:
SEED_APP = 123
tf.random.set_seed(SEED_APP)

BATCH_SIZE = 25
IMG_W = IMG_H = 299

MOMO_CLASSNAME    = "momo"
NO_MOMO_CLASSNAME = "no_momo"

MOUNT = "./"
DATASET_PATH = join(MOUNT, "dataset/")
RESULT_FOLDER_PATH = join(MOUNT,"result/")

DATESET_BASIC_PATH    = join(DATASET_PATH,'basic/')
DATESET_TRAINING_PATH = join(DATASET_PATH,'train/')
DATESET_TESTING_PATH  = join(DATASET_PATH,'test/')
DATESET_EVAL_PATH     = join(DATASET_PATH,'eval/')

print(DATESET_BASIC_PATH)

./dataset/basic/


# Functions

In [31]:
def getFolders(path):
    return [d for d in listdir(path) if isdir(join(path, d))]

def getFolderFiles(path: str):
    return [f for f in listdir(path) if isfile(join(path, f))]  


def predict(path: str) -> np.array:
    img = image.load_img(path, target_size=(299, 299))
    # Size  (299, 299, 3)
    imgArray = image.img_to_array(img) 
    
    # Size  (1, 299, 299, 3)
    expandedImgArray = np.expand_dims(imgArray, axis=0) 
    
    # Preproces to inceptionV3, normalize each pixel RGB value to an scale of zero to one
    processedImgArray = preprocess_input(expandedImgArray) 
    
    return inceptionV3Model.predict(processedImgArray)

def getTimestamp():
    return datetime.datetime.now().timestamp()
    
def getRandomExample(xClass:str):

    exampleFileList = getFolderFiles(DATESET_BASIC_PATH + xClass)
    
    rndIndex = np.random.randint(0,len(exampleFileList))
    filename = exampleFileList[rndIndex]
    return join(DATESET_BASIC_PATH,xClass,filename)

def getDatasetClasses():
    return getFolders(DATESET_BASIC_PATH)

def getOutputClasses():
    return [MOMO_CLASSNAME,NO_MOMO_CLASSNAME]
    
def createFolderIfNotExist(folderPath):
    if not exists(folderPath):
        makedirs(folderPath)

def deleteIfExist(filepath):
    if exists(filepath):
        rmtree(filepath)
        
def resetFolderIfExist(path : str):
    deleteIfExist(path)
    createFolderIfNotExist(path)

def saveInFileIfNotExist(filepath: str, content: str):
  
    # Create (or not) the result folder
    createFolderIfNotExist(dirname(filepath))
    
    with open(filepath, mode="a") as f:
        f.write(content + '\n')
        

# Observe a single example

In [6]:
np.random.seed(SEED_APP)

CLASSES = getDatasetClasses()
RANDOM_POSITIVE_EXAMPLE_PATH = getRandomExample("momo")
RANDOM_POSITIVE_EXAMPLE_FILE = predict(RANDOM_POSITIVE_EXAMPLE_PATH)
print("Momo class random file path" , RANDOM_POSITIVE_EXAMPLE_FILE)

predict(RANDOM_POSITIVE_EXAMPLE_PATH)


Momo class random file path [[0.19734913 0.45167    0.06192229 ... 0.13883413 0.17555334 0.6870728 ]]


array([[0.19734913, 0.45167   , 0.06192229, ..., 0.13883413, 0.17555334,
        0.6870728 ]], dtype=float32)

# Preparing InceptionV3 Model to adjust it to our problem: identify momo in. images

In [7]:
# Take the output of the model
x = inceptionV3Model.output

# Add a full-conected layer of 1024 neurons with relu activation to our model output
x = Dense(1024, activation='relu')(x)

# Add a output layer with only one neurone
momoOutput = Dense(1, activation='sigmoid')(x)

# Create the momo Model from our outputs
momoModel = Model(inputs=inceptionV3Model.input, outputs=momoOutput)

# Compile our model using adam and an optimizer for binari clasification
momoModel.compile(optimizer='adam', loss='binary_crossentropy',metrics=['accuracy'])

In [8]:
#print(momoModel.summary())

# Pre training

## Prepare our test/training folders

In [39]:
# Read each class of the dataset
for ds_class in getOutputClasses():
    # Create the folders in the train/test folders
    resetFolderIfExist( DATESET_TRAINING_PATH  +  ds_class  )
    resetFolderIfExist( DATESET_TESTING_PATH   +  ds_class  )
    resetFolderIfExist( DATESET_EVAL_PATH      +  ds_class  )


## Fill the train/test/eval folders

In [40]:
DATASET_CLASSES = getDatasetClasses()


# Proportions
TRAINING_PERCENTAGE = 0.7
TESTING_PERCENTAGE  = 0.15
EVAL_PERCENTAGE     = 0.15

ds_folders = getFolders(DATESET_BASIC_PATH)

for ds_folder in ds_folders:
    
    path      = DATESET_BASIC_PATH + ds_folder
    files     = np.array(getFolderFiles(path))

    m         = len(files)
    
    trainIdx  = math.ceil( m * TRAINING_PERCENTAGE )
    testIdx   = math.ceil( m * TESTING_PERCENTAGE  ) 
    evalIdx   = math.ceil( m * EVAL_PERCENTAGE     )     
    
    np.random.shuffle(files)
    
    isPositiveClass      = ds_folder == MOMO_CLASSNAME
    folderTo             = MOMO_CLASSNAME if isPositiveClass else NO_MOMO_CLASSNAME
    
    trainingClassPath    = DATESET_TRAINING_PATH + folderTo
    testClassPath        = DATESET_TESTING_PATH  + folderTo
    evalClassPath        = DATESET_EVAL_PATH     + folderTo
    
   
    currentIndex     = 0
    trainingImages   = files[ currentIndex : currentIndex + trainIdx ]
    
    currentIndex     = currentIndex + trainIdx 
    testImages       = files[ currentIndex : currentIndex + testIdx  ]
    
    currentIndex     = currentIndex + testIdx
    evalImages       = files[ currentIndex :          ]
    
    print(
        "From folder '" + ds_folder  + "'"
        + " take " 
        + str(len(trainingImages)) + " Training examples, "
        + str(len(testImages))     + " Testing examples, and "
        + str(len(evalImages))     + " Eval examples."        
    )
    
    print("Copying traning files from " + path + " to " + trainingClassPath)
    for imageName in tqdm(trainingImages, file=stdout):
        copy2(path+ "/"+  imageName ,trainingClassPath + "/"+  imageName)
        
    print("Copying testing files from " + path + " to " + testClassPath)
    for imageName in tqdm(testImages, file=stdout):
        copy2(path+ "/"+  imageName ,testClassPath + "/"+  imageName)
        
    print("Copying eval files from "    + path + " to " + evalClassPath)
    for imageName in tqdm(evalImages, file=stdout):
        copy2(path+ "/"+  imageName ,evalClassPath + "/"+  imageName)

    
    
    



From folder 'meme' take 42 Training examples, 9 Testing examples, and 9 Eval examples.
Copying traning files from ./dataset/basic/meme to ./dataset/train/no_momo
100%|██████████| 42/42 [00:00<00:00, 1699.92it/s]
Copying testing files from ./dataset/basic/meme to ./dataset/test/no_momo
100%|██████████| 9/9 [00:00<00:00, 1339.65it/s]
Copying eval files from ./dataset/basic/meme to ./dataset/eval/no_momo
100%|██████████| 9/9 [00:00<00:00, 1037.94it/s]
From folder 'person' take 62 Training examples, 14 Testing examples, and 12 Eval examples.
Copying traning files from ./dataset/basic/person to ./dataset/train/no_momo
100%|██████████| 62/62 [00:00<00:00, 1159.62it/s]
Copying testing files from ./dataset/basic/person to ./dataset/test/no_momo
100%|██████████| 14/14 [00:00<00:00, 1661.72it/s]
Copying eval files from ./dataset/basic/person to ./dataset/eval/no_momo
100%|██████████| 12/12 [00:00<00:00, 1784.87it/s]
From folder 'momo' take 80 Training examples, 17 Testing examples, and 16 Eval e


## Data augmentation

In [None]:
# Create a generator to pre process our dataset images
imageGenerator = ImageDataGenerator(
    rescale          = 1./255,       # Scale our data to our dataset scale
    horizontal_flip  = True, # Horizontal mirror
    vertical_flip    = False   # Disable vertical mirror
)

trainGenerator = imageGenerator.flow_from_directory(
        directory   = DATESET_BASIC_PATH + "momo",
        target_size =  (IMG_H, IMG_W),
        batch_size  =  BATCH_SIZE,
        class_mode  =  'binary',
        classes     =  getOutputClasses())

testGenerator= imageGenerator.flow_from_directory(
        directory    = DATESET_BASIC_PATH + "momo",
        target_size  = (IMG_H, IMG_W),
        batch_size   = BATCH_SIZE,
        class_mode   = 'binary',
        classes      = getOutputClasses())