# Import useful packages and libraries

In [0]:
import numpy as np
import pdb
import matplotlib.pyplot as plt
import pandas  as pd
from google.colab import drive

#Mount the drive and read the images

In [102]:
# Mount Google Drive and derive the dataset root from the mount point, so the
# path stays valid even when the working directory is not the mount's parent.
mount_point = '/content/gdrive'
drive.mount(mount_point)
root_path = mount_point + '/My Drive/prime_project/data/'  # your new root path

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Loading the dataset

We load the BeeImage Dataset from Kaggle.

In [103]:
# Read the metadata table and remember how many images it lists before any
# filtering is applied.
csv_path = root_path + "bee_data.csv"
df = pd.read_csv(csv_path)
n_tot = df['file'].count()  # number of non-null file names
print("Total images in the dataset: {}".format(n_tot))
df.head()

Total images in the dataset: 5172


Unnamed: 0,file,date,time,location,zip code,subspecies,health,pollen_carrying,caste
0,041_066.png,8/28/18,16:07,"Alvin, TX, USA",77511,-1,hive being robbed,False,worker
1,041_072.png,8/28/18,16:07,"Alvin, TX, USA",77511,-1,hive being robbed,False,worker
2,041_073.png,8/28/18,16:07,"Alvin, TX, USA",77511,-1,hive being robbed,False,worker
3,041_067.png,8/28/18,16:07,"Alvin, TX, USA",77511,-1,hive being robbed,False,worker
4,041_059.png,8/28/18,16:07,"Alvin, TX, USA",77511,-1,hive being robbed,False,worker


# Data pre-processing

For the purposes of the current work, we are only interested in associating each bee with the proper subspecies. For this reason, we are going to discard the images representing bees whose subspecies is unknown.

In [104]:
# Keep only the rows whose subspecies is known ('-1' is the unknown marker).
known_mask = df['subspecies'] != '-1'
df = df[known_mask]

# Report how much of the dataset survived the filter.
n_left = df['file'].count()
pct_lost = 100-n_left/n_tot*100
print("Images left: {}\nImages lost: {}%".format(n_left, pct_lost))

# Summarise the remaining label distribution.
subspecies = df["subspecies"]
print("Distinct values of subspecies = {}\n".format(subspecies.nunique()))
print("Counting the distinct values of subspecies:\n{}".format(subspecies.value_counts()))

Images left: 4744
Images lost: 8.275328692962105%
Distinct values of subspecies = 6

Counting the distinct values of subspecies:
Italian honey bee        3008
Russian honey bee         527
Carniolan honey bee       501
1 Mixed local stock 2     472
VSH Italian honey bee     199
Western honey bee          37
Name: subspecies, dtype: int64


We need to pre-process the data in order to have a dataset composed of square images of the same size. For this reason, we add black pixels as padding whenever the image is not square-shaped.

In [0]:
import cv2

def resize_image_padding(img, desired_size=64):
    """Resize an image to fit a square canvas, padding the remainder with black.

    The image is scaled so that its longer side equals ``desired_size`` while
    keeping the aspect ratio, then centred on a ``desired_size`` x
    ``desired_size`` canvas using constant black borders.

    Args:
        img: Image array whose first two dimensions are (height, width).
        desired_size: Side length, in pixels, of the square output.

    Returns:
        The resized and padded image, ``desired_size`` pixels on each side.

    Raises:
        ValueError: If ``img`` is None, has fewer than two dimensions, or has
            an empty height or width.
    """
    if img is None or len(img.shape) < 2 or min(img.shape[:2]) == 0:
        raise ValueError("resize_image_padding expects a non-empty image array")

    old_h, old_w = img.shape[:2]
    ratio = float(desired_size) / max(old_h, old_w)
    # Clamp to one pixel: for very elongated images the short side would
    # otherwise truncate to 0, which cv2.resize rejects.
    new_h = max(1, int(old_h * ratio))
    new_w = max(1, int(old_w * ratio))

    # cv2.resize takes the target size as (width, height).
    img = cv2.resize(img, (new_w, new_h))

    # Split the missing pixels between the two sides; when the amount is odd,
    # the extra pixel goes to the bottom / right border.
    delta_w = desired_size - new_w
    delta_h = desired_size - new_h
    top, bottom = delta_h // 2, delta_h - (delta_h // 2)
    left, right = delta_w // 2, delta_w - (delta_w // 2)

    color = [0, 0, 0]  # black padding
    new_img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    return new_img

In [0]:
import os

input_size = 64

# Trailing separator is kept so that `directory + name` still yields a valid path.
directory = os.path.join(root_path, "bee_imgs", "")
X = []
for row in df.file.values:
    img_path = os.path.join(directory, row)
    img = cv2.imread(img_path, cv2.IMREAD_COLOR)
    # cv2.imread returns None instead of raising when a file is missing or
    # unreadable; fail here with the offending path rather than further down.
    if img is None:
        raise FileNotFoundError("Could not read image: {}".format(img_path))
    X.append(resize_image_padding(img, input_size))
# Stack the equally sized images into a single array.
X = np.array(X)

We also need to encode the labels with integer values.

In [142]:
from sklearn.preprocessing import LabelEncoder

# Map every subspecies name to an integer class id; the fitted encoder is
# kept in `le` for later use.
subspecies_names = df.subspecies.values
le = LabelEncoder()
y = np.array(le.fit_transform(subspecies_names))
print(y)

[2 2 2 ... 5 5 5]


Now we can finally split the data for training and testing purposes.

In [0]:
from sklearn.model_selection import train_test_split
# Imported here because this cell runs before the Keras import cell further
# down; without it `to_categorical` is undefined on a fresh kernel.
from keras.utils.np_utils import to_categorical

X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=2020)

# One-hot encode the targets so they can be trained against a softmax output.
# The number of classes is fixed from the full label vector so that both
# splits get the same number of columns even if the highest class id happens
# to be absent from one of them.
n_classes = int(np.max(y)) + 1
Y_train = to_categorical(Y_train, num_classes=n_classes)
Y_test = to_categorical(Y_test, num_classes=n_classes)


In [144]:
# Show the shapes of the four arrays produced by the split.
shape_report = "X_train.shape = {}\nX_test.shape = {}\ny_train.shape = {}\ny_test.shape ={}"
print(shape_report.format(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape))

X_train.shape = (3795, 64, 64, 3)
X_test.shape = (949, 64, 64, 3)
y_train.shape = (3795, 6)
y_test.shape =(949, 6)


# Building the CNN

Define the network structure

In [145]:
import tensorflow as tf

# Reset TensorFlow's default graph before Keras is imported and any layer is
# created. NOTE(review): this is the TF 1.x-style API — confirm the installed
# TensorFlow version still exposes it under this name.
tf.reset_default_graph()
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten, Input
from keras.layers import Conv2D, MaxPooling2D
from keras import optimizers
from keras.utils.np_utils import to_categorical
# Print the Keras version in use for this run.
print(keras.__version__)

2.2.5


In [0]:
# --- Training hyper-parameters
learning_rate = 0.01
n_epochs = 5
batch_size = 64

# --- Convolution / pooling settings
nb_filters = 32       # number of convolutional filters to use
kernelSize = (3, 3)   # convolution kernel size
pool_size = (2, 2)    # size of the pooling area for max pooling

# --- Input geometry: square, 3-channel images of side input_size
img_rows = img_cols = input_size
nb_channels = 3
input_shape = (img_rows, img_cols, nb_channels)

# --- Number of classes
nb_classes = 6  # Y_train.shape[1]

# --- Size of the successive layers
n_h_0 = nb_channels
n_h_1 = n_h_2 = n_h_3 = nb_filters

In [0]:

model = Sequential()
filter_size = (3, 3)

# The kernel initialization strongly influences the performance of the
# algorithm: different initializers have a different impact.
first_layer_init = keras.initializers.RandomUniform(minval=-0.05, maxval=0.05, seed=None)
# Settings shared by every convolution of the stack.
conv_common = dict(activation='relu', padding="same")

# Block 1: two stacked convolutions followed by 2x2 max pooling.
model.add(Conv2D(filters=32,
                 kernel_size=(3, 3),
                 kernel_initializer=first_layer_init,
                 input_shape=input_shape,
                 **conv_common))
model.add(Conv2D(filters=nb_filters,
                 kernel_size=filter_size,
                 kernel_initializer="random_uniform",
                 name='conv_2',
                 **conv_common))
model.add(MaxPooling2D((2, 2)))

# Block 2: one convolution followed by 2x2 max pooling.
model.add(Conv2D(filters=nb_filters,
                 kernel_size=filter_size,
                 kernel_initializer="random_uniform",
                 name='conv_3',
                 **conv_common))
model.add(MaxPooling2D((2, 2), padding="same"))

# Classifier head: flatten the feature maps and emit one softmax score per class.
model.add(Flatten())
model.add(Dense(nb_classes, activation='softmax', name='dense_1'))

# --- END CODE HERE

Compile the model: define the loss and the optimizer.

In [0]:
# Adam optimizer with the configured learning rate; categorical cross-entropy
# loss, with accuracy reported as the metric.
adam = optimizers.Adam(lr=learning_rate)
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

Summary of the model, to get an idea of the architecture and of the weights that will be trained.

In [160]:
# The number of trainable parameters of a convolutional layer is the total
# number of weights of all its filters, plus one bias per filter.
# For the first layer: 3*3 (size of each filter) * 3 (depth of each filter)
# * 32 (number of filters in the first layer) + 32 (one bias per filter) = 896
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_3 (Conv2D)            (None, 64, 64, 32)        896       
_________________________________________________________________
conv_2 (Conv2D)              (None, 64, 64, 32)        9248      
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 32, 32, 32)        0         
_________________________________________________________________
conv_3 (Conv2D)              (None, 32, 32, 32)        9248      
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 16, 16, 32)        0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 8192)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 6)                

train the model

In [162]:
print(X_test.shape, Y_test.shape)
# `epochs` is the supported name of this argument; `nb_epoch` is the legacy
# spelling and is not accepted by newer Keras releases.
# NOTE(review): the held-out test split is also used as validation data here.
model.fit(X_train, Y_train,
          batch_size=batch_size,
          epochs=n_epochs,
          verbose=1,
          validation_data=(X_test, Y_test))

(949, 64, 64, 3) (949, 6)
Train on 3795 samples, validate on 949 samples
Epoch 1/5


  


InvalidArgumentError: ignored

evaluate the performances of the model on test data

In [0]:
# Evaluate silently on the test split; the first two entries of `score` are
# printed as the test score and the test accuracy.
score = model.evaluate(X_test, Y_test, verbose=False)
test_loss, test_acc = score[0], score[1]
print('Test score:', test_loss)
print('Test accuracy:', test_acc)

Let's have a look at what the network has learned.

plot the confusion matrix