In [1]:
import pandas as pd
import os 
import numpy as np
import matplotlib.pyplot as plt
import shutil
from sklearn.model_selection import train_test_split
import keras
import tensorflow as tf
from keras.preprocessing.image import ImageDataGenerator
from keras.utils import image_dataset_from_directory

2023-05-25 19:37:54.254226: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-25 19:37:54.393413: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-25 19:37:54.984213: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-05-25 19:37:54.984288: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not l

# 1. UTILS

CREATE DATASETS AND PROPER FOLDER STRUCTURE


In [2]:

def create_datasets_and_directories(path_to_csv,path_to_output,cap,nb_img_to_keep,only_species=True,image_size=128,batch_size=32):
    """
    Build train / validation / test image folders from a CSV of cropped images
    and load each folder as a Keras image dataset.

    The CSV is expected to have a "Paths" column (image file path) and a
    "Labels" column (class name). Rows are filtered, split with stratification
    into 70% train / 15% validation / 15% test (random_state=42), copied into
    one sub-folder per label, and finally loaded with
    image_dataset_from_directory.

    WARNING : if path_to_output already exists it is deleted (shutil.rmtree)
    before the new folder structure is created.

    NOTE : images are copied with shutil.copy into <split>/<label>/, so two
    source images of the same label sharing a file name would overwrite each
    other in the output folder.

    args :

    path_to_csv : path to the csv file containing the dataset
                    (columns "Paths" and "Labels")
    path_to_output : path to the output folder where folders will be created
                     in this format :
                        output_folder
                            - train
                            - validation
                            - test
                            - train_dataset.csv
                            - validation_dataset.csv
                            - test_dataset.csv
    cap : only labels with strictly more than cap images are kept,
          if None no filtering on the number of images
    nb_img_to_keep : number of images to keep per class (the first ones in
                     csv order), if None keep all images
    only_species : if True, only keeps the images whose label contains a space
                   (i.e. the label has more than 1 word); rows with a missing
                   label are dropped in that case
                   if False, keeps all the images
    image_size : images are loaded as (image_size, image_size)
    batch_size : batch size of the returned dataset objects (default 32)

    Returns :
        train_dataset, val_dataset, test_dataset : objects returned by
        image_dataset_from_directory (created with shuffle=True and
        label_mode='categorical')
    """

    ###### FILTER THE DATASET ######

    df_dataset = pd.read_csv(path_to_csv)

    # Keep only the images labelled at species level (label has more than 1 word).
    # na=False: a missing label counts as "not a species" instead of breaking the mask.
    if only_species:
        df_dataset = df_dataset[df_dataset["Labels"].str.contains(" ", regex=False, na=False)]

    # Keep only the labels that have more than cap images
    if cap is not None:
        label_counts = df_dataset["Labels"].value_counts()
        kept_labels = label_counts[label_counts > cap].index
        df_dataset = df_dataset[df_dataset["Labels"].isin(kept_labels)]

        print("Number of species with more than {} images : {}".format(cap, len(kept_labels)))
        print("Number of images in the filtered dataset : {}".format(len(df_dataset)))

        print('-'*50)

        print(df_dataset['Labels'].value_counts())

    # Limit the number of images per label; with None every filtered row is kept
    if nb_img_to_keep is not None:
        dataset = df_dataset.groupby('Labels').head(nb_img_to_keep)
    else:
        dataset = df_dataset

    #### SPLIT THE DATASET #####

    X = dataset["Paths"]
    y = dataset["Labels"]

    # 70% train, then the remaining 30% is halved into test and validation
    X_train, X_test_val, y_train, y_test_val = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
    X_test, X_val, y_test, y_val = train_test_split(X_test_val, y_test_val, test_size=0.5, stratify=y_test_val, random_state=42)

    split_frames = {
        "train": pd.concat([X_train, y_train], axis=1),
        "validation": pd.concat([X_val, y_val], axis=1),
        "test": pd.concat([X_test, y_test], axis=1),
    }

    ###### CREATE THE FOLDER STRUCTURE, COPY THE IMAGES, WRITE THE CSV FILES ######

    if os.path.exists(path_to_output):
        shutil.rmtree(path_to_output)

    os.makedirs(path_to_output)

    for split_name, split_df in split_frames.items():
        split_dir = os.path.join(path_to_output, split_name)
        os.makedirs(split_dir)
        _copy_images_to_split(split_df, split_dir)
        split_df.to_csv(os.path.join(path_to_output, split_name + "_dataset.csv"), index=False)

    ##### MAKE THE DATASET OBJECTS #####

    def _load_split(split_name):
        return image_dataset_from_directory(
            os.path.join(path_to_output, split_name),
            shuffle=True,
            batch_size=batch_size,
            image_size=(image_size, image_size),
            labels='inferred',
            label_mode='categorical',
        )

    train_dataset = _load_split("train")
    test_dataset = _load_split("test")
    val_dataset = _load_split("validation")

    ##### PRINT INFO #####

    print('-'*50)
    print('-'*50)

    print("Number of species in the train dataset : {}".format(len(train_dataset.class_names)))
    print("Number of images in the train dataset : {}".format(len(train_dataset.file_paths)))

    print('-'*50)
    print("Number of species in the validation dataset : {}".format(len(val_dataset.class_names)))
    print("Number of images in the validation dataset : {}".format(len(val_dataset.file_paths)))

    print('-'*50)
    print("Number of species in the test dataset : {}".format(len(test_dataset.class_names)))
    print("Number of images in the test dataset : {}".format(len(test_dataset.file_paths)))

    print('-'*50)
    print('-'*50)

    return train_dataset, val_dataset, test_dataset


def _copy_images_to_split(split_df, split_dir):
    """Copy every image listed in split_df ("Paths", "Labels") into split_dir/<label>/."""
    for src_path, label in zip(split_df["Paths"], split_df["Labels"]):
        class_dir = os.path.join(split_dir, label)
        os.makedirs(class_dir, exist_ok=True)
        shutil.copy(src_path, class_dir)

PROCESS INPUTS



In [3]:

def color_preprocessing(x):
    """
    Standardise the three colour channels of a batch of images.

    args :
        x : 4-D array indexed as x[:, :, :, channel]; channels 0, 1, 2 are
            treated as R, G, B.

    returns :
        A new float32 array (the input is not modified) where each of the
        three channels has had its fixed mean subtracted and been divided by
        its fixed standard deviation.
    """
    standardized = x.astype('float32')

    # Per-channel statistics, RGB order, on a 0-255 pixel scale.
    channel_mean = (125.3, 123.0, 113.9)
    channel_std = (63.0, 62.1, 66.7)

    # TODO : switch to the ImageNet statistics
    #        mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225]
    #        (those are expressed on a 0-1 pixel scale).

    for channel, (mu, sigma) in enumerate(zip(channel_mean, channel_std)):
        plane = standardized[:, :, :, channel]
        standardized[:, :, :, channel] = (plane - mu) / sigma

    return standardized
     

from keras.utils import Sequence
import numpy as np
import cv2 as cv


In [4]:
class AbeillesSequence(Sequence):
    """
    Keras Sequence that reads images from disk one batch at a time.

    Each batch is built by reading the image files with OpenCV, converting
    them from BGR to RGB, padding/resizing them to image_size x image_size and
    standardising them with color_preprocessing. The access order is shuffled
    at construction and again at the end of every epoch.

    args :
        x_train : numpy array of image file paths
        y_train : numpy array of labels, one row per path (same order as x_train)
        batch_size : number of samples per batch
        class_names : list of class names (stored in self.classes)
        image_size : side length of the square images that are produced
        debug : if True, print mean / std / min / max of every batch that is
                served (off by default because it floods the output during
                training)
    """

    def __init__(self, x_train, y_train, batch_size, class_names, image_size, debug=False):
        self.x_train = x_train
        self.y_train = y_train
        self.classes = class_names
        self.batch_size = batch_size
        self.image_size = image_size
        self.debug = debug

        # Indices used to access the data; they are re-shuffled at every epoch
        # so that the composition of the batches changes during training.
        self.indices1 = np.arange(len(x_train))
        np.random.shuffle(self.indices1)

    def __len__(self):
        """Number of batches (gradient-descent steps) per epoch; the last batch may be smaller."""
        return int(np.ceil(self.x_train.shape[0] / float(self.batch_size)))

    def apply_augmentation(self, bx, by):
        """
        Load the images of one batch.

        Despite its name this method currently applies no random augmentation:
        it only reads, colour-converts and resizes the images.

        args :
            bx : array of image file paths
            by : labels of the batch, returned unchanged

        returns :
            (batch_x, by) where batch_x has shape
            (len(bx), image_size, image_size, 3) and holds RGB pixel values.

        raises :
            OSError if an image file cannot be read.
        """
        batch_x = np.zeros((bx.shape[0], self.image_size, self.image_size, 3))

        for i, path in enumerate(bx):
            img = cv.imread(path)
            # cv.imread does not raise on a missing/corrupt file, it returns None;
            # fail here with the offending path instead of deep inside cvtColor.
            if img is None:
                raise OSError("Could not read image file: {}".format(path))

            img = cv.cvtColor(img, cv.COLOR_BGR2RGB)
            batch_x[i] = self._resize_img_(img)

        return batch_x, by

    def _resize_img_(self, img):
        """
        Bring img to image_size x image_size, zero-padding where needed.

        Case 1 - height and width are both smaller than image_size : the image
        is padded with zeros at the bottom and on the right, without scaling.
        Case 2 - any other case : the shorter side is zero-padded (bottom or
        right) so that the image becomes square, then it is resized.
        """
        height, width = img.shape[0], img.shape[1]

        if height < self.image_size and width < self.image_size:
            return cv.copyMakeBorder(
                img, 0, self.image_size - height, 0, self.image_size - width, cv.BORDER_CONSTANT, value=0)

        if height < width:
            # pad at the bottom
            img = cv.copyMakeBorder(
                img, 0, width - height, 0, 0, cv.BORDER_CONSTANT, value=0)
        else:
            # pad on the right (no-op when the image is already square)
            img = cv.copyMakeBorder(
                img, 0, 0, 0, height - width, cv.BORDER_CONSTANT, value=0)

        return cv.resize(img, (self.image_size, self.image_size))

    def __getitem__(self, idx):
        """
        Return batch number idx as (batch_x, batch_y).

        batch_x is the float32 output of color_preprocessing, batch_y the
        matching rows of y_train.
        """
        # Positions (in x_train / y_train) of the samples of this batch
        batch_indices = self.indices1[idx * self.batch_size:(idx + 1) * self.batch_size]

        # Load the images from their file paths
        batch_x, batch_y = self.apply_augmentation(
            self.x_train[batch_indices], self.y_train[batch_indices])

        # Standardise the pixel values
        batch_x = color_preprocessing(batch_x)

        if self.debug:
            print('-'*50)
            print("mean : {}".format(np.mean(batch_x)), "std : {}".format(np.std(batch_x)),
                  "min : {}".format(np.min(batch_x)), "max : {}".format(np.max(batch_x)))

        return batch_x, batch_y

    def on_epoch_end(self):
        """Re-shuffle the access indices at the end of every epoch."""
        np.random.shuffle(self.indices1)


In [10]:
def process_datasets(train_dataset,test_dataset,val_dataset,img_size,batch_size):
    """
    Turn the three dataset objects returned by image_dataset_from_directory
    into objects that can be fed to a model.

    The datasets are built with shuffle=True, so the order in which they yield
    batches is not the order of their file_paths attribute. Labels that must
    line up with file paths (the two AbeillesSequence objects) are therefore
    derived from the paths themselves: the name of the folder containing each
    image is looked up in class_names. This assumes the
    <root>/<class_name>/<image> layout produced by
    create_datasets_and_directories.

    args :
        - train_dataset
        - test_dataset
        - val_dataset
        - img_size : side length of the (square) images
        - batch_size : batch size of the AbeillesSequence objects

    returns (in this order) :
        - ds_train : AbeillesSequence over the training file paths
        - x_val, y_val : np arrays, validation images (decoded by the dataset
          and standardised with color_preprocessing) and their labels
        - x_test, y_test : np arrays, same for the test set
        - ds_val : AbeillesSequence over the validation file paths

    raises :
        ValueError if an image is not stored directly inside a folder named
        after one of the classes.
    """

    #### TRAIN ####

    x_train = np.array(train_dataset.file_paths)
    y_train = _one_hot_labels_from_paths(train_dataset.file_paths, train_dataset.class_names)

    ds_train = AbeillesSequence(x_train, y_train, batch_size=batch_size, class_names=train_dataset.class_names,image_size=img_size)

    #### VAL ####

    # In-memory arrays: images and labels come from the same batches, so they match each other
    x_val, y_val = _dataset_to_arrays(val_dataset, img_size)

    # Path-based sequence: labels must follow the order of file_paths, not the iteration order
    val_paths = np.array(val_dataset.file_paths)
    y_val_for_paths = _one_hot_labels_from_paths(val_dataset.file_paths, val_dataset.class_names)

    ds_val = AbeillesSequence(val_paths, y_val_for_paths, batch_size=batch_size, class_names=val_dataset.class_names,image_size=img_size)

    #### TEST ####

    x_test, y_test = _dataset_to_arrays(test_dataset, img_size)

    return ds_train , x_val , y_val , x_test , y_test, ds_val


def _one_hot_labels_from_paths(file_paths, class_names):
    """Build one one-hot row per path, using the name of the folder that contains each file."""
    class_index = {name: position for position, name in enumerate(class_names)}
    labels = np.zeros((len(file_paths), len(class_names)))

    for row, path in enumerate(file_paths):
        folder = os.path.basename(os.path.dirname(path))
        if folder not in class_index:
            raise ValueError(
                "Cannot infer the class of '{}': folder '{}' is not one of the class names".format(path, folder))
        labels[row, class_index[folder]] = 1.0

    return labels


def _dataset_to_arrays(dataset, img_size):
    """Iterate once over dataset and return (standardised images, labels) as np arrays."""
    nb_images = len(dataset.file_paths)
    images = np.zeros((nb_images, img_size, img_size, 3))
    labels = np.zeros((nb_images, len(dataset.class_names)))

    start = 0
    for bx, by in dataset.as_numpy_iterator():
        stop = start + bx.shape[0]
        images[start:stop] = bx
        labels[start:stop] = by
        start = stop

    return color_preprocessing(images), labels


PLOT HISTORY

In [6]:
from matplotlib import pyplot as plt

def plot_history(history,path):
    """
    Plot the training curves of a fitted model and save the figure.

    Two side-by-side panels are drawn: accuracy (left) and loss (right), each
    with the train and validation curve.

    args :
        history : object whose .history dict contains the keys
                  'categorical_accuracy', 'val_categorical_accuracy',
                  'loss' and 'val_loss'
        path : file path the figure is saved to
    """
    fig, (ax_accuracy, ax_loss) = plt.subplots(1, 2, figsize=(20, 10))

    panels = (
        (ax_accuracy, 'categorical_accuracy', 'Accuracy'),
        (ax_loss, 'loss', 'Loss'),
    )

    for ax, metric, label in panels:
        ax.plot(history.history[metric])
        ax.plot(history.history['val_' + metric])
        ax.set_title(label)
        ax.set_ylabel(label)
        ax.set_xlabel('Epoch')
        ax.legend(['train', 'val'], loc='upper left')

    # Save before showing: once plt.show() has displayed the figure, a later
    # plt.savefig() call typically writes an empty canvas.
    fig.savefig(path)

    plt.show()


# 2. BENCHMARK MODELS


Simple VGG16 no callbacks 

In [16]:

##### PARAMETERS #####
# NOTE(review): absolute, machine-specific paths; adapt them before running elsewhere.
PATH_TO_DATASET ="/home/basile/Documents/projet_bees_detection_basile/bees_detection/src/datafiles/final_datafiles/dataset_yolo_cropped_with_cleaned_structure.csv"
OUTPUT_FOLDER = "/home/basile/Documents/projet_bees_detection_basile/data_bees_detection/benchmark_classification/2_Resnet_1000_imgs_2_species_256/"
IMG_SIZE = 256          # images are handled as IMG_SIZE x IMG_SIZE
CAP = 20000             # keep only the labels with more than CAP images
NB_IMG_TO_KEEP = 1000   # number of images kept per label
BATCH_SIZE = 32
BACTH_SIZE = BATCH_SIZE  # misspelled name kept as an alias in case other cells still use it
##### PARAMETERS #####

# Create datasets and directories (OUTPUT_FOLDER is wiped and rebuilt by this call)
train_dataset, val_dataset, test_dataset = create_datasets_and_directories(PATH_TO_DATASET, OUTPUT_FOLDER, cap=CAP, nb_img_to_keep=NB_IMG_TO_KEEP, image_size=IMG_SIZE, only_species=True)

NB_CLASSES = len(train_dataset.class_names)
CLASS_NAMES = train_dataset.class_names


# Process datasets
ds_train , x_val , y_val , x_test , y_test ,ds_val= process_datasets(train_dataset,test_dataset,val_dataset,IMG_SIZE,BATCH_SIZE)


Number of species with more than 20000 images : 2
Number of images in the filtered dataset : 69628
--------------------------------------------------
Apis mellifera       49093
Bombus terrestris    20535
Name: Labels, dtype: int64
Found 1400 files belonging to 2 classes.
Found 300 files belonging to 2 classes.
Found 300 files belonging to 2 classes.
--------------------------------------------------
--------------------------------------------------
Number of species in the train dataset : 2
Number of images in the train dataset : 1400
--------------------------------------------------
Number of species in the validation dataset : 2
Number of images in the validation dataset : 300
--------------------------------------------------
Number of species in the test dataset : 2
Number of images in the test dataset : 300
--------------------------------------------------
--------------------------------------------------


First we'll not use 

In [12]:
# Create model 

from keras.applications.vgg16 import VGG16
from keras.layers import Flatten, Dense
from keras.models import Model

# Convolutional base: VGG16 loaded with the 'imagenet' weights and without its
# top (classification) layers, for IMG_SIZE x IMG_SIZE x 3 inputs.
# NOTE(review): inputs are standardised by color_preprocessing in this notebook;
# confirm this is the intended preprocessing for the ImageNet-pretrained weights.
vgg_model = VGG16(weights='imagenet', include_top=False, input_shape=(IMG_SIZE, IMG_SIZE, 3))

# FREEZE LAYERS
# Every layer of the base is marked non-trainable; only the head added below is trained.
for layer in vgg_model.layers:
    layer.trainable = False

# Classification
# Head: flatten the base output, one hidden Dense layer of 512 units (ReLU),
# then a softmax layer with one unit per class.
x = vgg_model.output
x = Flatten()(x)
x = Dense(512, activation="relu")(x)
x = Dense(NB_CLASSES, activation="softmax")(x)

# Full model: from the VGG16 input to the softmax output
model = Model(inputs=vgg_model.input, outputs=x)

# The 'categorical_accuracy' metric name matches the history keys read by plot_history.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])
