In [1]:
import pandas as pd
import os 
import numpy as np
import matplotlib.pyplot as plt
import shutil
from sklearn.model_selection import train_test_split
import keras
import tensorflow as tf
from keras.preprocessing.image import ImageDataGenerator
from keras.utils import image_dataset_from_directory

2023-05-23 14:01:14.917385: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-23 14:01:15.053410: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-23 14:01:15.677211: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-05-23 14:01:15.677263: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not l

# CREATE DATASETS AND PROPER FOLDER STRUCTURE


In [2]:

def _copy_images_by_label(split_df, split_dir):
    """
    Copy every image listed in split_df into split_dir/<label>/.

    args :

    split_df : dataframe with a "Paths" column (source image files) and a "Labels" column
    split_dir : destination folder of the split; one sub-folder per label is created inside it
    """

    # Create each label folder once instead of testing for it on every row
    for label in split_df["Labels"].unique():
        os.makedirs(os.path.join(split_dir, label), exist_ok=True)

    # NOTE(review): shutil.copy keeps the source file name, so two source images
    # sharing a file name and a label would overwrite each other - confirm names are unique.
    for image_path, label in zip(split_df["Paths"], split_df["Labels"]):
        shutil.copy(image_path, os.path.join(split_dir, label))


def create_datasets_and_directories(path_to_csv,path_to_output,cap,nb_img_to_keep,only_species=True,image_size=128):

    """
    Given a dataset of cropped images, create the train, validation and test folders.
    Only keeps the classes with strictly more than cap images in the dataset, then keeps
    only the first nb_img_to_keep images of each class.

    Split : 80% of the kept images are sampled, the remaining 20% form the test set;
    20% of the sampled images are then held out for validation. The effective split is
    therefore roughly 64/16/20 (train/validation/test), not 80/10/10.

    WARNING : if path_to_output already exists it is deleted entirely before being recreated.

    args :

    path_to_csv : path to the csv file containing the dataset
                    # "Paths" , # "Labels" columns
    path_to_output : path to the output folder where folders will be created
                     in this format :
                        output_folder
                            - train
                            - validation
                            - test
                            - train_dataset.csv
                            - validation_dataset.csv
                            - test_dataset.csv
    cap : a class is kept only if it has strictly more than cap images, if None no cap
    nb_img_to_keep : number of images to keep per class, if None keep all images
    only_species : if True, only keeps the images labelled as species (i.e. the label contains a space)
                     if False, keeps all the images
    image_size : images are resized to (image_size, image_size) when the Dataset objects are built
    TODO (original author) : integrate the only_species = False

    Returns :
        train_dataset, validation_dataset, test_dataset : Dataset objects
        (batches of 32, shuffled, one-hot labels inferred from the folder names)
    """

    ###### FILTER THE DATASET ######

    df_dataset = pd.read_csv(path_to_csv)

    # Take only the images labelled as species (i.e. the label has more than 1 word).
    # na=False : a missing label counts as "not a species" instead of breaking the boolean mask.
    if only_species:
        df_dataset = df_dataset[df_dataset["Labels"].str.contains(" ", na=False)]

    # Keep only the species that have more than cap images
    if cap is not None :
        label_counts = df_dataset['Labels'].value_counts()
        kept_species = label_counts[label_counts > cap].index

        df_dataset = df_dataset[df_dataset["Labels"].isin(kept_species)]

        print("Number of species with more than {} images : {}".format(cap, len(kept_species)))
        print("Number of images in the filtered dataset : {}".format(len(df_dataset)))

        print('-'*50)

        print(df_dataset['Labels'].value_counts())

    # Keep at most nb_img_to_keep images per class; None means "keep everything"
    # (previously `dataset` was left undefined in that case, raising a NameError).
    if nb_img_to_keep is not None :
        dataset = df_dataset.groupby('Labels').head(nb_img_to_keep)
    else :
        dataset = df_dataset

    #### SPLITS THE DATASET #####

    train_df = dataset.sample(frac=0.8, random_state=0)
    test_df = dataset.drop(train_df.index)

    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=0)

    ###### CREATE THE FOLDER STRUCTURE, COPY THE IMAGES, WRITE THE CSV FILES ######

    # Start from a clean output folder so files from a previous run cannot leak into the splits
    if os.path.exists(path_to_output):
        shutil.rmtree(path_to_output)

    os.makedirs(path_to_output)

    splits = {"train": train_df, "validation": val_df, "test": test_df}

    for split_name, split_df in splits.items():
        split_dir = os.path.join(path_to_output, split_name)
        os.makedirs(split_dir)
        _copy_images_by_label(split_df, split_dir)
        split_df.to_csv(os.path.join(path_to_output, split_name + "_dataset.csv"), index=False)

    ##### MAKE THE DATASET OBJECTS #####

    def _load_split(split_name):
        # Same settings for the three splits : shuffled batches of 32 with one-hot labels
        return image_dataset_from_directory(
            os.path.join(path_to_output, split_name),
            shuffle=True,
            batch_size=32,
            image_size=(image_size, image_size),
            labels='inferred',
            label_mode='categorical',
        )

    train_dataset = _load_split("train")
    test_dataset = _load_split("test")
    val_dataset = _load_split("validation")

    return train_dataset, val_dataset, test_dataset

In [3]:
# CSV file listing the cropped images and their labels
PATH_TO_DATASET = "/workspaces/projet_bees_detection_basile/bees_detection/src/datafiles/final_datafiles/dataset_yolo_cropped_with_cleaned_structure.csv"

##### PARAMETERS #####
# Folder that receives the train / validation / test splits (deleted and recreated on every run)
OUTPUT_FOLDER = "/workspaces/projet_bees_detection_basile/data_bees_detection/benchmark_classification/23_05_VGG16"
IMG_SIZE = 224          # images are resized to IMG_SIZE x IMG_SIZE
CAP = 20000             # keep only the classes with more than CAP images
NB_IMG_TO_KEEP = 20     # number of images kept per remaining class
##### PARAMETERS #####

train_dataset, val_dataset, test_dataset = create_datasets_and_directories(
    PATH_TO_DATASET,
    OUTPUT_FOLDER,
    cap=CAP,
    nb_img_to_keep=NB_IMG_TO_KEEP,
    only_species=True,
    image_size=IMG_SIZE,
)

Number of species with more than 20000 images : 2
Number of images in the filtered dataset : 69628
--------------------------------------------------
Apis mellifera       49093
Bombus terrestris    20535
Name: Labels, dtype: int64
Found 25 files belonging to 2 classes.


2023-05-23 14:01:17.772316: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-23 14:01:17.786925: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-23 14:01:17.787094: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-23 14:01:17.788372: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuil

Found 8 files belonging to 2 classes.
Found 7 files belonging to 2 classes.


In [4]:
# Sanity check : pull one batch from the training dataset and look at its pixel statistics
test = train_dataset.as_numpy_iterator()
image, label = next(test)

# Named min_val / max_val so the builtins min() and max() are not overwritten
# in the notebook namespace for the cells that follow.
min_val, max_val, mean, std = np.min(image), np.max(image), np.mean(image), np.std(image)

print("min : {}, max : {}, mean : {}, std : {}".format(min_val, max_val, mean, std))

min : 0.0, max : 255.0, mean : 89.81210327148438, std : 71.43992614746094


In [5]:
# Class names as exposed by the training dataset, and how many there are
CLASS_NAMES = train_dataset.class_names
NB_CLASSES = len(CLASS_NAMES)

# PROCESS INPUTS



In [6]:

def color_preprocessing(x, mean=(125.3, 123.0, 113.9), std=(63.0, 62.1, 66.7)):
    """
    Standardise the colour channels of an image or a batch of images.

    For each of the first len(mean) channels : (channel - mean) / std.
    The input is never modified; a float32 copy is returned.

    args :

    x : array-like with the channels on the last axis,
        e.g. (batch, height, width, 3) or a single (height, width, 3) image
    mean : per-channel means, in the same channel order as x (RGB by default)
    std : per-channel standard deviations, same length as mean

    TODO (original author) : modify for imagenet mean = [0.485, 0.456, 0.406]
    and std = [0.229, 0.224, 0.225] - these can now be passed through mean / std.

    Returns :
        float32 numpy array with the same shape as x

    Raises :
        ValueError if mean and std differ in length, or if x has fewer channels than mean
    """
    x = np.asarray(x).astype('float32')

    mean = np.asarray(mean, dtype='float32')
    std = np.asarray(std, dtype='float32')

    if mean.ndim != 1 or mean.shape != std.shape:
        raise ValueError("mean and std must be 1-D sequences of the same length")

    nb_channels = mean.shape[0]
    if x.ndim == 0 or x.shape[-1] < nb_channels:
        raise ValueError(
            "x must have at least {} channels on its last axis, got shape {}".format(nb_channels, x.shape)
        )

    # Broadcasting over the last axis standardises all channels at once;
    # any extra channel beyond len(mean) is left untouched.
    x[..., :nb_channels] = (x[..., :nb_channels] - mean) / std
    return x
     

from keras.utils import Sequence
import numpy as np
import cv2 as cv

class AbeillesSequence(Sequence):
    """
    Keras Sequence that reads the images from disk one batch at a time.

    Every batch is read with OpenCV, converted from BGR to RGB and standardised
    with color_preprocessing. The access order is shuffled at construction and
    again at the end of every epoch.
    """

    def __init__(self, x_train, y_train, batch_size, augmentations, class_names, image_size=None):
        """
        args :

        x_train : numpy array of image file paths
        y_train : numpy array of labels, row i being the label of x_train[i]
        batch_size : number of samples per batch
        augmentations : stored in self.augment; not applied anywhere in this class
        class_names : stored in self.classes
        image_size : optional, int or (height, width). When given, every image is
                     resized to that size after reading so that images of different
                     sizes can be stacked into one batch. None (default) keeps the
                     images at their original size.
        """
        self.x_train = x_train
        self.y_train = y_train
        self.classes = class_names
        self.batch_size = batch_size
        self.augment = augmentations

        if image_size is None or isinstance(image_size, (tuple, list)):
            self.image_size = image_size
        else:
            self.image_size = (image_size, image_size)

        # The indices give access to the data and are re-shuffled at every epoch
        # so that the composition of the batches varies during training
        self.indices1 = np.arange(len(x_train))
        np.random.shuffle(self.indices1)

    def __len__(self):
        """Number of batches (gradient-descent steps) per epoch."""
        # Must use the instance data : the previous version read a global `x_train`
        # that does not exist at notebook level and raised a NameError.
        return int(np.ceil(len(self.x_train) / float(self.batch_size)))

    def _read_img(self, img_path):
        """Read one image from disk and return it as an RGB array."""
        img = cv.imread(img_path)

        # cv.imread signals an unreadable file by returning None rather than raising
        if img is None:
            raise FileNotFoundError("Could not read image : {}".format(img_path))

        img = cv.cvtColor(img, cv.COLOR_BGR2RGB)

        if self.image_size is not None:
            height, width = self.image_size
            # cv.resize expects the target size as (width, height)
            img = cv.resize(img, (width, height))

        return img

    def __getitem__(self, idx):
        """
        Build batch number idx (idx = 5 => the 5th batch of the current shuffled order).

        Returns :
            batch_x : standardised float32 images
            batch_y : the matching rows of y_train
        """
        # Select the samples of this batch
        batch_indices = self.indices1[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_paths = self.x_train[batch_indices]
        batch_y = self.y_train[batch_indices]

        # Read the images
        batch_x = np.array([self._read_img(file_name) for file_name in batch_paths])

        # Standardise the data
        batch_x = color_preprocessing(batch_x)

        return batch_x, batch_y

    def on_epoch_end(self):
        """Called at the end of an epoch : re-shuffle the access indices."""
        np.random.shuffle(self.indices1)
     

In [17]:
def _dataset_to_arrays(dataset):
  """
  Load a whole batched dataset in memory.

  args :
      - dataset : dataset yielding (images, one-hot labels) batches

  returns :
      - x : standardised float32 images (output of color_preprocessing)
      - y : float64 labels, in the same order as x
  """
  image_batches, label_batches = [], []

  # Images and labels are taken from the same batch, so they stay aligned
  # whatever the iteration order of the dataset is.
  for bx, by in dataset.as_numpy_iterator():
    image_batches.append(bx)
    label_batches.append(by)

  if not image_batches:
    raise ValueError("The dataset did not yield any batch.")

  x = color_preprocessing(np.concatenate(image_batches, axis=0))
  y = np.concatenate(label_batches, axis=0).astype('float64')

  return x, y


def process_datasets(train_dataset,test_dataset,val_dataset):
  """
  Given three BatchDatasets objects, process them to have inputable objects.

  The training set is not loaded in memory : only its file paths and labels are
  handed to an AbeillesSequence, which reads the images batch by batch.
  The validation and test sets are fully loaded and standardised.

  NOTE(review) : validation / test images have the size the datasets were built
  with, whereas AbeillesSequence reads the training files at their size on disk -
  confirm the source images already have the expected size.

  args : 
      - train_dataset
      - test_dataset
      - val_dataset
  
  returns :
      - ds_train : AbeillesSequence object
      - x_val,y_val : np arrays
      - x_test,y_test : np arrays
  """


  #### TRAIN ####

  class_names = list(train_dataset.class_names)
  class_to_index = {name: position for position, name in enumerate(class_names)}

  x_train = np.array(train_dataset.file_paths)
  y_train = np.zeros((len(train_dataset.file_paths), len(class_names)))

  # The label of each file is derived from its class folder rather than from
  # iterating the dataset : a dataset built with shuffle=True is not iterated in
  # the order of `file_paths`, which would pair paths with the wrong labels.
  # Assumes every file sits directly inside its class folder, which is the
  # layout written by create_datasets_and_directories.
  for row, file_path in enumerate(train_dataset.file_paths):
    label = os.path.basename(os.path.dirname(file_path))
    if label not in class_to_index:
      raise ValueError("Folder '{}' of file {} is not one of the classes {}".format(label, file_path, class_names))
    y_train[row, class_to_index[label]] = 1.0

  ds_train = AbeillesSequence(x_train, y_train, batch_size=32, augmentations=None, class_names=class_names)

  #### VAL ####

  x_val, y_val = _dataset_to_arrays(val_dataset)

  #### TEST ####

  x_test, y_test = _dataset_to_arrays(test_dataset)


  return ds_train , x_val , y_val , x_test , y_test



# CONSTRUCT MODEL

In [8]:

# ResNet50 convolutional base with ImageNet weights and without its classification head
base_model = tf.keras.applications.ResNet50(
    weights='imagenet',
    include_top=False,
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
)

# Freeze the base : only the layers added below are trained
base_model.trainable = False

# Classification head : flatten -> 256 -> 128 -> one softmax unit per class
flatten = tf.keras.layers.Flatten()(base_model.output)
dense_1 = tf.keras.layers.Dense(units=256, activation='relu')(flatten)
dense_2 = tf.keras.layers.Dense(units=128, activation='relu')(dense_1)
classification = tf.keras.layers.Dense(units=NB_CLASSES, activation='softmax')(dense_2)

model = tf.keras.Model(inputs=base_model.input, outputs=classification)