# Introduction

This Notebook allow to create and visualize the dataset for this project.

Some functions was taked from "Visión por computadora" workshop by Mauricio Repetto & Waldemar López 

(https://drive.google.com/file/d/1neqSeyIqdpufL4EtY6jUirUvWGya0Mkp/view?usp=sharing)


# Imports

In [1]:
import numpy as np
import tensorflow as tf
import datetime; 
import pandas as pd
import matplotlib.pyplot as plt
import math

from shutil import copy2, rmtree, copytree
from tqdm import tqdm
from sys import stdout
from os import listdir, makedirs, remove
from os.path import isfile, join, isdir, exists, dirname, abspath
from inspect import getsourcefile
from tensorflow import keras
from numpy.random import seed
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from itertools import product

from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Downloading dataset from Github

In [2]:
![ ! -d "dataset" ] && echo "Cloning dataset project from github" && git clone https://github.com/alphonse92/momo-dataset.git dataset 
!cd dataset
!# Set the dataset branch
!git checkout master
!cd ..

Cloning dataset project from github
Cloning into 'dataset'...
remote: Enumerating objects: 20, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 647 (delta 0), reused 19 (delta 0), pack-reused 627[K
Receiving objects: 100% (647/647), 62.33 MiB | 5.09 MiB/s, done.
Resolving deltas: 100% (3/3), done.
Already on 'master'
Your branch is up to date with 'origin/master'.


In [3]:
# Load Inception V3.

InceptionV3             = keras.applications.inception_v3.InceptionV3
     
preprocess_input        = keras.applications.inception_v3.preprocess_input
image                   = keras.preprocessing.image
     
ImageDataGenerator      = keras.preprocessing.image.ImageDataGenerator
Callback                = keras.callbacks
     
Model                   = keras.models.Model
Sequential              = keras.models.Sequential
layers                  = keras.layers
     
Input                   = layers.Input
Dense                   = layers.Dense
Conv2D                  = layers.Conv2D
ZeroPadding2D           = layers.ZeroPadding2D
BatchNormalization      = layers.BatchNormalization
AveragePooling2D        = layers.AveragePooling2D
MaxPooling2D            = layers.MaxPooling2D
GlobalAveragePooling2D  = layers.GlobalAveragePooling2D




In [4]:
inceptionV3Model = InceptionV3(weights='imagenet', include_top=False, pooling=  'avg')

# Uncomment to describe the inception v3 summary model
#print(inceptionV3Model.summary())

# Notebook Configuration 


In [5]:
# Notebook Configuration
USE_INCEPTION_V3_MODEL = True
RESET_TRAINING_EXAMPLES = True
SAVE_WEIGHTS = True
SEED_APP = 9

# DATASET PROPORTIONS
TRAINING_PERCENTAGE = 0.7
TESTING_PERCENTAGE  = 0.15
EVAL_PERCENTAGE     = 0.15

# HYPERPARAMETERS
CRITERIA_THRESHOLD = 0.1
EPOCHS = 25
BATCH_SIZE = 15
IMG_W = IMG_H = 299


# MODEL CHECKPOINTS CONFIGURATION
SAVE_CHECKPOINTS = 5
SAVE_WEIGHTS_FREQUENCY = 'epoch'
SAVE_WEIGTHS_PERIOD = EPOCHS // SAVE_CHECKPOINTS

# COLAB SCOPE
GOOGLE_COLLAB = False                                                            # Default value. You should not modify this
GOOGLE_RESET_CONTENT_TREE = True                                                 # Reset the dataset content tree. It means remove and re copy the data from drive
MOUNT = "./"                                                                     # Base path of this project. You may not change this value   
G_MOUNT = "/content/drive"                                                       # Set where the drive folder will be mounted
G_PROJECT_PATH = G_MOUNT+ "/My Drive/Colab Notebooks/uruit-ml-momo-test/"        # Set the pathe where momo project is 

# If colab instance, then build the content tree
try:
  import google.colab
  from google.colab import drive
  drive.mount('/content/drive', force_remount=GOOGLE_RESET_CONTENT_TREE)
  GOOGLE_COLLAB = True
except:
  GOOGLE_COLLAB = False
  print(tf.test.gpu_device_name())
  




# Variables

In [6]:



tf.random.set_seed(SEED_APP)

MOMO_CLASSNAME    = "momo"
NO_MOMO_CLASSNAME = "no_momo"

DATASET_PATH = join(MOUNT, "dataset/")
RESULT_FOLDER_PATH = join(MOUNT,"result/")
RESULT_FOLDER_WEIGHTS_INCEPTION_V3_PATH = RESULT_FOLDER_PATH + "inception_v3/"
DATESET_BASIC_PATH    = join(DATASET_PATH,'basic/')
DATESET_TRAINING_PATH = join(DATASET_PATH,'train/')
DATESET_TESTING_PATH  = join(DATASET_PATH,'test/')
DATESET_EVAL_PATH     = join(DATASET_PATH,'eval/')


DEFAULT_WEIGHTS_FILE_PATH = RESULT_FOLDER_WEIGHTS_INCEPTION_V3_PATH + 'weights.h5'

print(DATESET_BASIC_PATH)

./dataset/basic/


## Functions

In [7]:
def getFolders(path):
    return [d for d in listdir(path) if isdir(join(path, d))]

def getFolderFiles(path: str):
    return [f for f in listdir(path) if isfile(join(path, f))]  


def predict(path: str) -> np.array:
    img = image.load_img(path, target_size=(299, 299))
    # Size  (299, 299, 3)
    imgArray = image.img_to_array(img) 
    
    # Size  (1, 299, 299, 3)
    expandedImgArray = np.expand_dims(imgArray, axis=0) 
    
    # Preproces to inceptionV3, normalize each pixel RGB value to an scale of zero to one
    processedImgArray = preprocess_input(expandedImgArray) 
    
    return inceptionV3Model.predict(processedImgArray)

def getTimestamp():
    return datetime.datetime.now().timestamp()
    
def getRandomExample(xClass:str):

    exampleFileList = getFolderFiles(DATESET_BASIC_PATH + xClass)
    
    rndIndex = np.random.randint(0,len(exampleFileList))
    filename = exampleFileList[rndIndex]
    return join(DATESET_BASIC_PATH,xClass,filename)

def getClasses():
    return getFolders(DATESET_BASIC_PATH)
    
def createFolderIfNotExist(folderPath):
    if not exists(folderPath):
        makedirs(folderPath)

def deleteIfExist(filepath):
    if exists(filepath):
        remove(filepath)

def saveInFileIfNotExist(filepath: str, content: str):
  
    # Create (or not) the result folder
    createFolderIfNotExist(dirname(filepath))
    
    with open(filepath, mode="a") as f:
        f.write(content + '\n')
        

# Observe a single example

In [8]:
np.random.seed(SEED_APP)

CLASSES = getClasses()
RANDOM_POSITIVE_EXAMPLE_PATH = getRandomExample("momo")
RANDOM_POSITIVE_EXAMPLE_FILE = predict(RANDOM_POSITIVE_EXAMPLE_PATH)
print("Momo class random file path" , RANDOM_POSITIVE_EXAMPLE_FILE)

predict(RANDOM_POSITIVE_EXAMPLE_PATH)


Momo class random file path [[0.7938269  0.08079947 0.2090581  ... 0.76392585 0.5125654  0.18211181]]


array([[0.7938269 , 0.08079947, 0.2090581 , ..., 0.76392585, 0.5125654 ,
        0.18211181]], dtype=float32)

# Create a CSV to visualize dataset data

In [10]:

CSV_PATH = RESULT_FOLDER_PATH + "basic-predictions/" + str(getTimestamp()) + "/result.csv"

# Write in the file the csv Hheaders
saveInFileIfNotExist(CSV_PATH, "class;image_name;predictions") 

for _class_ in CLASSES:
    classFolderPath = join(DATESET_BASIC_PATH, _class_)
    imagePaths = getFolderFiles(classFolderPath)
    
    print(f"Processing {_class_}...")
    
    for imgName in tqdm(imagePaths, file=stdout):
        imagePath = join(classFolderPath, imgName)
        # Inception V3 return an array of (1,2048)
        try:
            predictions = predict(imagePath)

            # Get a CSV row for the current class,image and prediction
            csvRow = f'"{_class_}";"{imgName}";"{",".join([ str(pred) for pred in predictions[0]])}"'
            saveInFileIfNotExist(CSV_PATH, csvRow)
        except Exception:
            print("Can't to load: " , imagePath, "Please check your dataset and remove it if it is required")
            


Processing meme...
100%|██████████| 60/60 [00:05<00:00, 10.92it/s]
Processing person...
100%|██████████| 88/88 [00:08<00:00, 10.98it/s]
Processing momo...
 14%|█▎        | 59/435 [00:05<00:37, 10.06it/s]Can't to load:  ./dataset/basic/momo/.DS_Store Please check your dataset and remove it if it is required
100%|██████████| 435/435 [00:51<00:00,  8.53it/s]


In [None]:
print("loading csv:" ,CSV_PATH)

# Create dataframe with pandas
df = pd.read_csv(CSV_PATH, sep=';')

# Check the head
df.head()



In [None]:
# Get the predictions as float
df['predictions_float'] = df['predictions'].apply(lambda x: np.array([float(str_dim) for str_dim in x.split(',')], dtype=np.float32))

# Save as np array the predictions per record
vectors = np.array(df['predictions_float'].tolist())

# Print the size (m,2048) where m is the length of our dataset. and 2048 is the predictions
# For each class in inception
print("vectors" , vectors.shape)

folderResults       = dirname(CSV_PATH)
filenameTsvLabels   = folderResults + "/" + "result_labels.tsv" 
filenameTsvEmbdings = folderResults + "/" + "result_embdings.tsv" 

deleteIfExist(filenameTsvLabels)
deleteIfExist(filenameTsvEmbdings)

with open(filenameTsvLabels,'w') as f:
    f.write("Index\tLabel\n")
    for index,(file, label) in enumerate(zip(df.image_name, df["class"])):
        f.write(f'{label} - {file}\t{label}\n')

        
with open(filenameTsvEmbdings,'w') as f:
    for dims in vectors:
        f.write('\t'.join([str(dim) for dim in dims])+'\n')

# Visualize the data using tensorflow embding projection

This tool allow to users to visualize him data. Use the last folder in result/basic-predictions/

## Steps

1. Click on Load buttom
2. In step 1 select the file `result_embdings.tsv`
3. In step 2 select the file `result_labels.tsv`
4. Click outside the modal

You should see something like this:

![screenshot using tf embding projector](./docs/tensorflowEmbdingProjector.png)

# PCA 2 dimension Visualization 

In [None]:
# Scaling the data before
vectors_std = StandardScaler().fit_transform(vectors)

In [None]:
vectors[0:10]

In [None]:
vectors_std[0:10]

In [None]:
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(vectors_std)
print(principalComponents.shape)
pca.explained_variance_ratio_

In [None]:
# Plotting using matplot lib

fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1) 

ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)

ax.set_title('2 component PCA', fontsize = 20)

targets = getClasses()
colors = ['r', 'g', 'b']
for target, color in zip(targets,colors):
    indicesToKeep = df['class'] == target    
    ax.scatter(principalComponents[indicesToKeep][:,0:1]
               , principalComponents[indicesToKeep][:,1:2]
               , c = color
               , s = 10)
ax.legend(targets)
ax.grid()