# CHANGE BDD STRUCTURE

In [36]:
import os 
import shutil
import pandas as pd
import numpy as np

In [107]:
# Root folder of the restructured image database; the cells below create one
# sub-folder per label inside it and copy the images there.
path_to_bdd = "/home/basile/Documents/projet_bees_detection_basile/data_bees_detection/whole_dataset_cropped_with_cleaned_structure"
# Input csv describing the cropped images; its "Paths" and "Real labels"
# columns are the ones read by the cells below.
path_to_csv = "/home/basile/Documents/projet_bees_detection_basile/bees_detection/src/datafiles/final_datafiles/dataset_yolo_cropped_with_real_labels.csv"

In [108]:
# Create all the folders for the new structure: one sub-folder per label
# found in the "Real labels" column of the input csv.

# makedirs(exist_ok=True) also creates missing parent folders and does not
# fail when the folder already exists (e.g. when this cell is re-run).
os.makedirs(path_to_bdd, exist_ok=True)

df = pd.read_csv(path_to_csv)
species = df["Real labels"].unique()

for specie in species:
    os.makedirs(os.path.join(path_to_bdd, specie), exist_ok=True)

In [109]:
# Copy every image into the folder named after its real label.
# The new file name is obtained by splitting the original path on "/" and
# joining the components from index 6 onwards with "_".

for image_path, real_label in zip(df["Paths"], df["Real labels"]):
    flat_name = "_".join(image_path.split("/")[6:])
    destination = os.path.join(path_to_bdd, real_label, flat_name)
    shutil.copy(image_path, destination)



In [110]:
# Save the new dataset in a csv file: one row per copied image, with its new
# path ("Paths") and the name of its species folder ("Labels").

# Collect plain (path, label) rows first and build the DataFrame once:
# calling pd.concat inside the loop re-copies the accumulated frame for
# every species.
rows = []

for specie in species:

    path = os.path.join(path_to_bdd, specie)

    # os.listdir returns entries in arbitrary order; sorting makes the csv
    # identical from one run to the next.
    rows.extend((os.path.join(path, image), specie) for image in sorted(os.listdir(path)))

df = pd.DataFrame(rows, columns=["Paths", "Labels"])

df.to_csv("/home/basile/Documents/projet_bees_detection_basile/bees_detection/src/datafiles/final_datafiles/dataset_yolo_cropped_with_cleaned_structure.csv", index=False)

# GEN TRAIN TEST VAL

To start with, we'll create a simple dataset:
- only images labelled at species level
- only species with more than a specified number of pictures
- no hierarchy between databases

1/ Filter dataset

In [111]:
def filter_dataset(only_species=True, min_images=None,hierarchy=False,df_dataset=None):
    """
    Filter the dataset according to specified criteria.

    Parameters
    ---------------
    only_species : if True, keep only the pictures labelled at species level,
            i.e. whose label contains a space (more than one word)
    min_images : if not None, keep only the labels that have strictly more
            than min_images images
    hierarchy : not used by this function (kept so existing calls still work)
    df_dataset : pandas dataframe to filter
            # Paths # Labels

    Returns
    --------------
    df_dataset : filtered dataset # Paths # Labels
    species : labels kept in the filtered dataset, most frequent first
            # Species # Number of images

    Raises
    --------------
    ValueError : if df_dataset is None

    """
    if df_dataset is None:
        raise ValueError("df_dataset must be a pandas DataFrame with a 'Labels' column")

    # Take only the images labelled as species (i.e. label has more than 1 word).
    # na=False treats missing labels as "not a species" instead of producing
    # NaN in the mask, which pandas refuses to index with.
    if only_species:
        df_dataset = df_dataset[df_dataset["Labels"].str.contains(" ", na=False)]

    # Number of images per remaining label (computed once).
    counts = df_dataset["Labels"].value_counts()

    if min_images is not None:
        # Keep only the labels with strictly more than min_images images
        counts = counts[counts > min_images]
        df_dataset = df_dataset[df_dataset["Labels"].isin(counts.index)]

    # The summary is always built, so the function also returns a valid
    # `species` frame when min_images is None.
    species = counts.rename_axis("Species").reset_index(name="Number of images")

    return df_dataset, species

# Load the restructured dataset, then keep only the species-level labels that
# have more than 100 images.
df_dataset, species = filter_dataset(
    df_dataset=pd.read_csv("/home/basile/Documents/projet_bees_detection_basile/bees_detection/src/datafiles/final_datafiles/dataset_yolo_cropped_with_cleaned_structure.csv"),
    only_species=True,
    min_images=100,
)

2/ Split the filtered dataset into train / test / val 

In [112]:
# For each species, 80% of the images are used for training, 10% for validation and 10% for testing

def split_dataset(df_dataset=None, df_dataset__species= None, path_to_output= None, random_state=None):
    """
    Split the dataset into train, valid and test sets, species by species,
    with 80% of the images for training, 10% for validation and the
    remaining images (about 10%) for testing.

    Parameters
    ----------
    df_dataset : pandas dataframe of all the pictures
                # Paths # Labels
    df_dataset__species : pandas dataframe with only the filtered species
                # Species # Number of images
    path_to_output : path to the output folder where the csv files will be saved
                (created, with its parents, if it does not exist)
    random_state : optional seed passed to the per-species shuffle, to make
                the split reproducible (None keeps the shuffle random)

    Returns
    -------
    df_train : pandas dataframe of the training set
                # Paths (the Labels column is dropped)
    df_valid : pandas dataframe of the validation set
                # Paths (the Labels column is dropped)
    df_test : pandas dataframe of the testing set
                # Paths (the Labels column is dropped)

    Csv files saved in the output folder :
     train.csv / valid.csv / test.csv : the three splits
     dataset.csv : the whole dataset (df_dataset)
     dataset_summary.csv : the species summary (df_dataset__species)

    """

    def _merge_without_labels(parts):
        # pd.concat refuses an empty list, so return an empty frame with the
        # expected columns when no species was given.
        if not parts:
            return pd.DataFrame(columns=[col for col in df_dataset.columns if col != 'Labels'])
        return pd.concat(parts, ignore_index=True).drop(columns=['Labels'])

    # Per-species pieces, concatenated once at the end
    train_parts, valid_parts, test_parts = [], [], []

    # Iterate over the species passed as argument (not over a global variable)
    for specie in df_dataset__species['Species']:

        df_img = df_dataset[df_dataset['Labels'] == specie]

        # shuffle the images of this species
        df_img = df_img.sample(frac=1, random_state=random_state).reset_index(drop=True)

        # sizes of the train and valid sets; the test set takes what is left,
        # so no image is lost to the rounding done by int()
        nb_img = len(df_img)
        nb_img_train = int(nb_img * 0.8)
        nb_img_valid = int(nb_img * 0.1)

        train_parts.append(df_img.iloc[:nb_img_train])
        valid_parts.append(df_img.iloc[nb_img_train:nb_img_train + nb_img_valid])
        test_parts.append(df_img.iloc[nb_img_train + nb_img_valid:])

    # Concatenate each set and drop the Labels column
    df_train = _merge_without_labels(train_parts)
    df_valid = _merge_without_labels(valid_parts)
    df_test = _merge_without_labels(test_parts)

    os.makedirs(path_to_output, exist_ok=True)

    # save the splits to csv
    df_train.to_csv(os.path.join(path_to_output, 'train.csv'), index=False)
    df_valid.to_csv(os.path.join(path_to_output, 'valid.csv'), index=False)
    df_test.to_csv(os.path.join(path_to_output, 'test.csv'), index=False)

    # save the whole dataset and the species summary to csv
    df_dataset.to_csv(os.path.join(path_to_output, 'dataset.csv'), index=False)
    df_dataset__species.to_csv(os.path.join(path_to_output, 'dataset_summary.csv'), index=False)

    return df_train, df_valid, df_test


# Split the filtered dataset and write the resulting csv files to the VGG16_1 inputs folder.
df_train, df_valid, df_test = split_dataset(df_dataset,species,"/home/basile/Documents/projet_bees_detection_basile/bees_detection/src/datafiles/classification/inputs/VGG16_1")

3/ Create a folder to store this split

In [113]:
def copy_split_to_folder(path_to_folder=None, df_train=None ,df_test=None,df_valid=None,species=None):
    """
    Copy the pictures of each split into a new folder tree:
        <path_to_folder>/<train|test|valid>/<species folder>/<image file>

    The last two components of each original path (parent folder name and
    file name) are kept under the split folder.

    Parameters
    ----------
    path_to_folder : root of the new folder tree (created, with its parents,
                if it does not exist)
    df_train, df_test, df_valid : pandas dataframes with a "Paths" column;
                a split left to None only gets its folders created
    species : optional pandas dataframe with a "Species" column; one
                sub-folder per species is created in every split folder,
                even if the split holds no image of that species
    """

    splits = [('train', df_train), ('test', df_test), ('valid', df_valid)]

    for name, df in splits:

        # makedirs(exist_ok=True) creates missing parents and tolerates re-runs
        split_dir = os.path.join(path_to_folder, name)
        os.makedirs(split_dir, exist_ok=True)

        if species is not None:
            for specie in species['Species']:
                os.makedirs(os.path.join(split_dir, specie), exist_ok=True)

        if df is None:
            continue

        for old_path in df['Paths']:

            # Keep "<parent folder>/<file name>", joined with the platform separator
            new_path = os.path.join(split_dir, *old_path.split(os.path.sep)[-2:])

            # Make sure the destination folder exists even when the image's
            # folder is not listed in `species`
            os.makedirs(os.path.dirname(new_path), exist_ok=True)
            shutil.copy(old_path, new_path)


# Materialise the train/test/valid split as folders of images under data_bees_detection/VGG16_1.
copy_split_to_folder(path_to_folder='/home/basile/Documents/projet_bees_detection_basile/data_bees_detection/VGG16_1',df_train=df_train,df_test=df_test,df_valid=df_valid,species=species)


4/ Convert it to a dataset

In [2]:
import keras
import tensorflow as tf
from keras.utils import image_dataset_from_directory

IMG_SIZE = 64


def _load_split(split_dir):
    """
    Build a dataset from one split folder (one sub-folder per class).

    Labels are inferred from the sub-folder names and one-hot encoded
    ('categorical'); images are resized to IMG_SIZE x IMG_SIZE and served in
    batches of 16, without shuffling.
    """
    # NOTE(review): shuffle=False is also applied to the training split;
    # confirm this is intended before training on it.
    return keras.utils.image_dataset_from_directory(
        directory=split_dir,
        labels='inferred',
        label_mode='categorical',
        shuffle=False,
        batch_size=16,
        image_size=(IMG_SIZE, IMG_SIZE),
    )


train_ds = _load_split('/home/basile/Documents/projet_bees_detection_basile/data_bees_detection/VGG16_1/train')
valid_ds = _load_split('/home/basile/Documents/projet_bees_detection_basile/data_bees_detection/VGG16_1/valid')
test_ds = _load_split('/home/basile/Documents/projet_bees_detection_basile/data_bees_detection/VGG16_1/test')


2023-05-03 13:53:06.596808: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-03 13:53:06.749724: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-03 13:53:07.556269: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-05-03 13:53:07.556426: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not l

Found 239652 files belonging to 178 classes.


2023-05-03 13:53:13.698083: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-03 13:53:13.712329: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-03 13:53:13.712516: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-03 13:53:13.713750: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuil

Found 29888 files belonging to 178 classes.
Found 30118 files belonging to 178 classes.


In [46]:
# Display the test dataset object (its element spec) as the cell output.
test_ds


<BatchDataset element_spec=(TensorSpec(shape=(None, 64, 64, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None, 178), dtype=tf.float32, name=None))>

In [6]:
# Print the shape of each tensor of the first training batch only
# (nothing is printed if the dataset is empty).
first_batch = next(iter(train_ds), ())
for tensor in first_batch:
    print(tensor.shape)

(16, 64, 64, 3)
(16, 178)


5/ Convert to x / y


In [None]:
def dataset_to_numpy(dataset):
    """
    Extract numpy arrays from a dataset.

    Returns
    -------
    x : array of the dataset's file paths (dataset.file_paths)
    y : array of shape (number of files, number of classes), filled batch by
        batch with the labels yielded by dataset.as_numpy_iterator()
    """
    paths = np.array(dataset.file_paths)
    labels = np.zeros((len(paths), len(dataset.class_names)))

    # Write each batch of labels right after the previous one
    offset = 0
    for batch_images, batch_labels in dataset.as_numpy_iterator():
        stop = offset + batch_images.shape[0]
        labels[offset:stop] = batch_labels
        offset = stop

    return paths, labels

# CREATE MODEL

1/ Compile model

In [27]:
from keras.applications.vgg16 import VGG16

# Convolutional base: VGG16 without its top (classification) layers
backbone = VGG16(input_shape=(IMG_SIZE, IMG_SIZE, 3), include_top=False)

# NB_CLASSES = species.shape[0]
NB_CLASSES = 178

# Classification head stacked on the base:
# flatten -> 1024-unit ReLU layer -> softmax over NB_CLASSES classes
features = keras.layers.Flatten()(backbone.output)
hidden = keras.layers.Dense(1024, activation='relu')(features)
predictions = keras.layers.Dense(NB_CLASSES, activation='softmax')(hidden)

# Single model going from the base's input to the class probabilities
model = keras.Model(inputs=backbone.input, outputs=predictions)

# Training parameters
EPOCHS = 1
BATCH_SIZE = 5
OPTI = keras.optimizers.Adam(learning_rate=0.001, decay=1e-6)
LOSS = keras.losses.CategoricalCrossentropy()
METRICS = [
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
]

# Compile the model with the parameters above
model.compile(loss=LOSS, optimizer=OPTI, metrics=METRICS)



In [9]:
# Print the layer-by-layer description of the model (output shapes and parameter counts).
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 64, 64, 3)]       0         
                                                                 
 block1_conv1 (Conv2D)       (None, 64, 64, 64)        1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 64, 64, 64)        36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 32, 32, 64)        0         
                                                                 
 block2_conv1 (Conv2D)       (None, 32, 32, 128)       73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 32, 32, 128)       147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 16, 16, 128)       0   

2/ Train the model

In [34]:
# Training callbacks, all driven by the validation loss.

# Save the weights only, and only when val_loss reaches a new minimum
# (save_freq=16 was tried and left disabled).
model_checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    '/home/basile/Documents/projet_bees_detection_basile/bees_detection/src/datafiles/classification/saved_weights/VGG16_1.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# Stop when val_loss has not improved by at least 0.01 for 10 epochs
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode="auto",
    min_delta=0.01,
    patience=10,
    verbose=1,
)

# Multiply the learning rate by 0.1 after 5 epochs without improvement,
# never going below 1e-5
reduce_lr_cb = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=5,
    min_lr=0.00001,
    verbose=1,
)


In [None]:
# train_ds and valid_ds are already batched (batch_size=16 was set when the
# datasets were built), so batch_size is not passed to fit(): the Keras
# documentation says not to specify it for dataset inputs, and BATCH_SIZE
# would not match the batches actually fed to the model.
history = model.fit(
    train_ds,
    validation_data=valid_ds,
    epochs=EPOCHS,
    callbacks=[model_checkpoint_cb, early_stopping_cb, reduce_lr_cb],
)

In [26]:
# List the names of the values recorded during training; `history` only
# exists once the model.fit cell has been run.
history.history.keys()

NameError: name 'history' is not defined

In [1]:
# Evaluate the model on the test dataset.

# Load the weights stored in the VGG16_1.h5 file
best_weights_path = '/home/basile/Documents/projet_bees_detection_basile/bees_detection/src/datafiles/classification/saved_weights/VGG16_1.h5'
model.load_weights(best_weights_path)

# Run the evaluation; its result is shown as the cell output
model.evaluate(test_ds)



NameError: name 'model' is not defined

In [None]:

import matplotlib.pyplot as plt
def plot_training_analysis(history, metric='loss'):
    """
    Plot the training and validation curves of one metric, epoch by epoch.

    history : object whose `.history` dict holds one list of values per
              metric, under the keys metric and 'val_' + metric
    metric : name of the metric to plot ('loss' by default)
    """
    train_values = history.history[metric]
    valid_values = history.history['val_' + metric]

    # One point per recorded epoch
    epoch_axis = range(len(train_values))

    # Training curve: dashed blue; validation curve: solid green
    plt.plot(epoch_axis, train_values, 'b', linestyle="--", label=f'Training {metric}')
    plt.plot(epoch_axis, valid_values, 'g', label=f'Validation {metric}')

    plt.title(f'Training and validation {metric}')
    plt.legend()
    plt.show()