# 1. Generate train test val 

To start with, we'll create a simple dataset for our VGG16:
- only images labelled at species level
- only species with more than 20,000 pictures
- no hierarchy between databases

In [56]:
import os 
import pandas as pd 
from matplotlib import pyplot as plt
import numpy as np

In [57]:
# CSV listing every cropped picture: one row per image, with its path ('Paths')
# and its label ('Real labels').
# NOTE(review): absolute, machine-specific path — presumably has to be adapted on another machine.
path_to_dataset = '/home/basile/Documents/projet_bees_detection_basile/bees_detection/src/datafiles/final_datafiles/dataset_yolo_cropped_with_real_labels.csv'

# Load the listing and preview its first rows
df_dataset = pd.read_csv(path_to_dataset)
df_dataset.head()

Unnamed: 0,Paths,Real labels
0,/home/basile/Documents/projet_bees_detection_b...,Hylaeus pictus
1,/home/basile/Documents/projet_bees_detection_b...,Hylaeus pictus
2,/home/basile/Documents/projet_bees_detection_b...,Hylaeus pictus
3,/home/basile/Documents/projet_bees_detection_b...,Hylaeus pictus
4,/home/basile/Documents/projet_bees_detection_b...,Hylaeus pictus


Select species that fit the conditions

In [58]:
# Minimum number of images (exclusive) a species needs in order to be kept
MIN_IMAGES_PER_SPECIES = 20000

# Take only the images labelled at species level (i.e. 'Real labels' has more than 1 word)
is_species_level = df_dataset['Real labels'].str.split(' ').str.len() > 1

# Count the images of each species once, then keep only the species that have
# more than MIN_IMAGES_PER_SPECIES images
species_counts = df_dataset.loc[is_species_level, 'Real labels'].value_counts()
species_counts = species_counts[species_counts > MIN_IMAGES_PER_SPECIES]

# Convert the series to a dataframe with the columns # Species # Number of images
# (naming the index and the values explicitly does not depend on the default
# column names produced by value_counts(), which differ between pandas versions)
df_dataset_species = (
    species_counts
    .rename_axis('Species')
    .reset_index(name='Number of images')
)

In [59]:
# Preview the species retained by the filter, with their number of images
df_dataset_species.head()

Unnamed: 0,Species,Number of images
0,Apis mellifera,49093
1,Bombus terrestris,20535


Create an integer index for each species, so that we can generate datasets with `paths_and_labels_to_dataset`

In [60]:
def associate_label_to_index(df_dataset): 
    """
    Associate an integer to each label

    The integers are assigned in order of first appearance of each label in
    the 'Real labels' column (0 for the first distinct label, 1 for the
    second, ...). The input dataframe is not modified: a new dataframe with
    the extra 'Id' column is returned.

    Parameters
    ----------
    df_dataset : pandas dataframe of all the pictures
                # Paths # Real labels
    
    Returns
    -------
    df_dataset : new pandas dataframe of all the pictures
                # Paths # Real labels # Id
    """

    # Get the unique labels (in order of first appearance)
    labels = df_dataset['Real labels'].unique()

    # Create a dictionary with the labels as keys and the integers as values
    label_to_index = {label: index for index, label in enumerate(labels)}

    # assign() returns a new dataframe, so the caller's dataframe is left
    # untouched (no hidden in-place mutation between notebook cells)
    return df_dataset.assign(Id=df_dataset['Real labels'].map(label_to_index))

# Add the integer 'Id' column to the whole dataset (the Ids are assigned over
# every label present in the file, not only over the selected species)
df_dataset = associate_label_to_index(df_dataset)

In [61]:
# Shuffle the dataset (fixed seed, so that re-running the notebook gives the same order)
df_dataset = df_dataset.sample(frac=1, random_state=42).reset_index(drop=True)

df_dataset.head()

Unnamed: 0,Paths,Real labels,Id
0,/home/basile/Documents/projet_bees_detection_b...,Andrena wilkella,45
1,/home/basile/Documents/projet_bees_detection_b...,Apis mellifera,97
2,/home/basile/Documents/projet_bees_detection_b...,Apis mellifera,97
3,/home/basile/Documents/projet_bees_detection_b...,Bombus pascuorum,240
4,/home/basile/Documents/projet_bees_detection_b...,Andrena fulva,187


Split the pictures between train / test / val

In [62]:
# For each specie 80% of the images are used for training 10% for validation and 10% for testing

def split_dataset(
        df_dataset, df_dataset__species,
        path_to_output= '/home/basile/Documents/projet_bees_detection_basile/bees_detection/src/datafiles/classification/inputs/VGG16_little',
        random_state=None
):
    """
    Split the dataset into train, valid and test set, 
    with 80% of the images for training, 10% for validation and the
    remaining images (about 10%) for testing. The split is done species by
    species, so every selected species is present in the three sets in the
    same proportions.

    Parameters
    ----------
    df_dataset : pandas dataframe of all the pictures
                # Paths # Real labels # Id
    df_dataset__species : pandas dataframe with only the filtered species
                # Species # Number of images
                Only the 'Species' column is read. (The double underscore in
                the parameter name is kept so that existing callers keep
                working.)
    path_to_output : path to the output folder where the csv files will be saved
                (created if it does not exist)
    random_state : optional seed forwarded to every DataFrame.sample call.
                None (default) gives a different shuffle at each call; an
                integer makes the split reproducible.

    Returns
    -------
    df_train : pandas dataframe of the training set
    df_valid : pandas dataframe of the validation set
    df_test : pandas dataframe of the testing set
                Each one has the columns of df_dataset without 'Real labels'
                (i.e. # Paths # Id, plus any other column of df_dataset).

    Csv files :
     whole dataset                (dataset.csv)
     train/test/val               (train.csv, valid.csv, test.csv)
     correspondance label/id      (label_id.csv)
    are saved in the output folder

    """

    # Per-species pieces of each set; concatenated once after the loop
    train_parts, valid_parts, test_parts = [], [], []

    # Use the species table given as argument (not a notebook global)
    for specie in df_dataset__species['Species']:

        df_img = df_dataset[df_dataset['Real labels'] == specie]

        # shuffle the images of this species
        df_img = df_img.sample(frac=1, random_state=random_state).reset_index(drop=True)

        # get the number of images for the train and valid sets;
        # the test set receives everything that is left, so no image is lost
        # to rounding
        nb_img = len(df_img)
        nb_img_train = int(nb_img * 0.8)
        nb_img_valid = int(nb_img * 0.1)

        train_parts.append(df_img.iloc[:nb_img_train])
        valid_parts.append(df_img.iloc[nb_img_train:nb_img_train + nb_img_valid])
        test_parts.append(df_img.iloc[nb_img_train + nb_img_valid:])

    def _assemble(parts):
        # Merge the per-species pieces, shuffle them and drop the text label.
        # With no species at all, fall back to an empty frame that still has
        # the columns of df_dataset so that the drop below cannot fail.
        merged = pd.concat(parts) if parts else df_dataset.iloc[0:0]
        merged = merged.sample(frac=1, random_state=random_state).reset_index(drop=True)
        return merged.drop(columns=['Real labels'])

    df_train = _assemble(train_parts)
    df_valid = _assemble(valid_parts)
    df_test = _assemble(test_parts)

    # make sure the output folder exists before writing into it
    os.makedirs(path_to_output, exist_ok=True)

    # save the dataframe to csv
    df_train.to_csv(os.path.join(path_to_output, 'train.csv'), index=False)
    df_valid.to_csv(os.path.join(path_to_output, 'valid.csv'), index=False)
    df_test.to_csv(os.path.join(path_to_output, 'test.csv'), index=False)

    # save the whole dataset to csv
    df_dataset.to_csv(os.path.join(path_to_output, 'dataset.csv'), index=False)

    # save the correspondance label/id to csv
    df_dataset[['Id', 'Real labels']].drop_duplicates().to_csv(os.path.join(path_to_output, 'label_id.csv'), index=False)

    return df_train, df_valid, df_test


# Build the train / validation / test splits and write the csv files to the default output folder
df_train, df_valid, df_test = split_dataset(df_dataset, df_dataset_species)

In [63]:
# def copy_pictures_from_csv(df, path_to_output):
#     """
#     param df: dataframe with the paths to the images 
#             like this # paths #labels
#             NB : paths is : whole_dataset_cropped/.../image.jpg
#     param path_to_output: path to the folder where the images will be copied 
#             e.g. VGG16/train
#     """
#     # copy pictures
#     for index, row in df.iterrows():
#         path_to_file = row['Paths']
#         path_to_file = path_to_file.replace('whole_dataset_cropped', path_to_output)
#         os.makedirs(os.path.dirname(path_to_file), exist_ok=True)
#         os.system('cp "{}" "{}"'.format(row['Paths'], path_to_file))

#     # write new csv

#     df['Paths'] = df['Paths'].str.replace('whole_dataset_cropped', path_to_output)
#     df.to_csv(os.path.join(path_to_output, 'dataset.csv'), index=False)


# copy_pictures_from_csv(df_train, 'VGG16/train')
# copy_pictures_from_csv(df_valid, 'VGG16/valid')
# copy_pictures_from_csv(df_test, 'VGG16/test')

# 2 Create datasets

We use `paths_and_labels_to_dataset` so as to save memory and keep the actual folder structure

(we've modified the Keras code so that this function can be imported and used):




In [64]:
import tensorflow as tf
import numpy as np
from keras.utils import paths_and_labels_to_dataset

def dataframe_to_dataset(df,batch_size, class_ids=None):
    """
    Converts a dataframe to a tensorflow dataset

    The values of the 'Id' column are first converted to contiguous class
    indices 0 .. num_classes-1. This matters because the Ids may have been
    assigned over more labels than the ones kept in `df` (e.g. Ids 97 and 240
    for a 2-class problem), whereas the 'categorical' label mode is given
    num_classes = number of classes and therefore needs indices below
    num_classes.

    Parameters
    ----------
    df: dataframe with the paths to the images 
            like this # Paths # Id
    batch size : batch size for the dataset
    class_ids : optional sequence of Id values; the position of an Id in this
            sequence is its class index. Pass the same sequence for the
            train / valid / test dataframes to guarantee a common encoding.
            Defaults to the sorted distinct Ids found in `df`.

    Returns
    -------
    dataset : tensorflow dataset

    Raises
    ------
    ValueError : if `df` contains an Id (or a missing Id) that is not listed
            in `class_ids`
    """

    if class_ids is None:
        # sorted, so that two dataframes holding the same set of Ids get the
        # same encoding whatever their row order
        class_ids = sorted(df['Id'].dropna().unique())
    else:
        class_ids = list(class_ids)

    # Id -> class index in [0, num_classes)
    id_to_class = {label_id: class_index for class_index, label_id in enumerate(class_ids)}
    class_indices = df['Id'].map(id_to_class)

    if class_indices.isna().any():
        unknown_ids = sorted(set(df.loc[class_indices.isna(), 'Id'].dropna()))
        raise ValueError(
            "'Id' column contains missing values or values absent from class_ids: {}".format(unknown_ids)
        )

    ds = paths_and_labels_to_dataset(
        image_paths = df['Paths'].values,
        image_size = (64,64),
        num_channels= 3,
        labels = class_indices.astype('int64').values,
        label_mode = 'categorical',
        num_classes = len(class_ids),
        interpolation = 'bilinear',
        crop_to_aspect_ratio=False)
    
    ds = ds.batch(batch_size)

    return ds

# Build one tensorflow dataset per split, each with a batch size of 1
train_ds, valid_ds, test_ds = (
    dataframe_to_dataset(split_df, 1)
    for split_df in (df_train, df_valid, df_test)
)


    

In [65]:

# train_ds = paths_and_labels_to_dataset(
#     image_paths = df_train['Paths'].values,
#     image_size = (64,64),
#     num_channels= 3,
#     labels = df_train['Id'].values,
#     label_mode = 'categorical',
#     num_classes = df_train['Id'].nunique(),
#     interpolation = 'bilinear',
#     crop_to_aspect_ratio=False)



# test_ds = paths_and_labels_to_dataset(
#     image_paths = df_test['Paths'].values,
#     image_size = (64,64),
#     num_channels= 3,
#     labels = df_test['Id'].values,
#     label_mode = 'categorical',
#     num_classes = df_test['Id'].nunique(),
#     interpolation = 'bilinear',
#     crop_to_aspect_ratio=False)

# valid_ds = paths_and_labels_to_dataset(
#     image_paths = df_valid['Paths'].values,
#     image_size = (64,64),
#     num_channels= 3,
#     labels = df_valid['Id'].values,
#     label_mode = 'categorical',
#     num_classes = df_valid['Id'].nunique(),
#     interpolation = 'bilinear',
#     crop_to_aspect_ratio=False)


# 3 Create the model

In [66]:
import tensorflow as tf
from keras.applications import VGG16
from keras.models import Model
from keras.layers import Dense, Flatten

# load model without classifier layers
# (convolutional base only, for 64x64 RGB inputs; no `weights` argument is
# given, so the library's default weights are used)
model = VGG16(include_top=False, input_shape=(64, 64, 3))

# Number of classes in the dataset
# (counted as the number of distinct Ids present in the training set)
num_classes = df_train['Id'].nunique()

# add new classifier layers
# flatten the output of the last layer of the base, then one hidden dense
# layer and a softmax layer with one unit per class
flat1 = Flatten()(model.layers[-1].output)
class1 = Dense(1024, activation='relu')(flat1)
output = Dense(num_classes, activation='softmax')(class1)

# define new model
# (`model` is rebound here: from now on it is the base plus the new classifier)
model = Model(inputs=model.inputs, outputs=output)

# summarize
model.summary()


Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 64, 64, 3)]       0         
                                                                 
 block1_conv1 (Conv2D)       (None, 64, 64, 64)        1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 64, 64, 64)        36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 32, 32, 64)        0         
                                                                 
 block2_conv1 (Conv2D)       (None, 32, 32, 128)       73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 32, 32, 128)       147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 16, 16, 128)       0   

In [67]:
# Set the optimizer, loss function and metrics
# - Adam with a learning rate of 1e-3
# - categorical cross-entropy, matching the one-hot ('categorical') labels of the datasets
# - precision and recall as monitored metrics

opti = tf.keras.optimizers.Adam(learning_rate=1e-3)
loss = tf.keras.losses.CategoricalCrossentropy()
metrics = [tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]

# Compile the model
model.compile(optimizer=opti, loss=loss, metrics=metrics)

In [68]:
# Fit the model
# (a single epoch over the training set, validating on the validation set)
# NOTE(review): the recorded run below reports `loss: nan` with precision and
# recall at 0 from the first displayed steps — to be investigated before
# trusting any metric (presumably input scaling / label encoding / learning
# rate; TODO confirm).
history = model.fit(train_ds, validation_data=valid_ds, epochs=1)

# Evaluate the model
# (on the held-out test set)
model.evaluate(test_ds)

# Plot the training and validation loss
# (with epochs=1 each curve presumably holds a single point — TODO confirm)
import matplotlib.pyplot as plt

plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='valid')
plt.legend()
plt.show()

 2267/55702 [>.............................] - ETA: 24:50 - loss: nan - precision_2: 0.0000e+00 - recall_2: 0.0000e+00



 3597/55702 [>.............................] - ETA: 24:11 - loss: nan - precision_2: 0.0000e+00 - recall_2: 0.0000e+00



 3851/55702 [=>............................] - ETA: 24:04 - loss: nan - precision_2: 0.0000e+00 - recall_2: 0.0000e+00



 8537/55702 [===>..........................] - ETA: 21:55 - loss: nan - precision_2: 0.0000e+00 - recall_2: 0.0000e+00





KeyboardInterrupt: 