Classification model training code used in BJ Spiesman, C Gratton, E Gratton, and H Hines. 2024. Deep learning for identifying bee species from images of wings and pinned specimens. PLOS ONE 

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import mixed_precision

import numpy as np
import os
import cv2
import matplotlib.pyplot as plt
import keras
import random
import scipy

from sklearn import svm, datasets
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

from keras.models import Sequential, Model
from tensorflow.keras.utils import array_to_img, img_to_array, load_img
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Dense, Activation, Dropout, Flatten, Conv2D, MaxPooling2D, BatchNormalization, ZeroPadding2D, AveragePooling2D
from keras import regularizers
from keras import backend as K

from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import time

In [None]:
# Install 'mixed_float16' as the global Keras dtype policy.
# Run this cell before the model is built so its layers pick the policy up.
policy = mixed_precision.Policy(name='mixed_float16')
mixed_precision.set_global_policy(policy)

# Training/validation/test generator setup

In [None]:
# Dataset configuration: pinned-specimen images.
# NOTE: the BeeWing configuration cell below assigns the same names, so whichever
# of the two configuration cells was executed last decides which dataset is used.
TRAIN_DATADIR = "<Path to train directory>"
VAL_DATADIR = "<Path to validation directory>"
TEST_DATADIR = "<Path to test directory>"

# Class (species) names, one per line.
CATEGORIES = [
    "Agapostemon_virescens",
    "Augochlorella_aurata",
    "Augochlorella_persimilis",
    "Augochloropsis_metallica",
    "Ceratina_strenua",
    "Halictus_ligatus",
    "Lasioglossum_albipenne",
    "Lasioglossum_anomalum",
    "Lasioglossum_cressonii",
    "Lasioglossum_disparile",
    "Lasioglossum_hitchensi",
    "Lasioglossum_leucocomum",
    "Lasioglossum_oceanicum",
    "Lasioglossum_paradmirandum",
    "Lasioglossum_pectorale",
    "Lasioglossum_pilosum",
    "Lasioglossum_pruinosum",
    "Lasioglossum_semicaeruleum",
    "Lasioglossum_trigeminum",
    "Lasioglossum_versatum",
]

IMG_SIZE = 480  # length and width of input images
batch_size = 6
num_classes = len(CATEGORIES)  # Number of classes (e.g., species)

In [None]:
# Dataset configuration: BeeWing (wing) images.
# NOTE: this cell assigns the same names as the pinned-specimen configuration
# cell above, so running it replaces that configuration.
TRAIN_DATADIR = "<Path to train directory>"
VAL_DATADIR = "<Path to validation directory>"
TEST_DATADIR = "<Path to test directory>"

# Class (species) names, one per line.
CATEGORIES = [
    "01_Agapostemon_sericeus",
    "02_Agapostemon_texanus",
    "03_Bombus_bimaculatus",
    "04_Bombus_griseocolis",
    "05_Bombus_impatiens",
    "06_Bombus_perplexus",
    "07_Bombus_sandersonii",
    "08_Bombus_vagans",
    "09_Ceratina_calcarata",
    "10_Lasioglossum_acuminatum",
    "11_Lasioglossum_coriaceum",
    "12_Lasioglossum_leucozonium",
    "13_Lasioglossum_MAWIspB",
    "14_Lasioglossum_nymphaerum",
    "15_Lasioglossum_pilosum",
    "16_Lasioglossum_rohweri",
    "17_Lasioglossum_zephyrum",
    "18_Lasioglossum_zonulum",
]

IMG_SIZE = 480  # length and width of input images
batch_size = 4
num_classes = len(CATEGORIES)  # Number of classes (e.g., species)

In [None]:
# Every split multiplies pixel values by 1/255; only the training split is augmented.
PIXEL_SCALE = 1./255

# Random augmentation applied to training images only.
train_augmentation = dict(
    rotation_range=100,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)

train_datagen = ImageDataGenerator(rescale=PIXEL_SCALE, **train_augmentation)

# Validation and test images are only rescaled.
val_datagen = ImageDataGenerator(rescale=PIXEL_SCALE)
test_datagen = ImageDataGenerator(rescale=PIXEL_SCALE)

In [None]:
# Options shared by the three directory iterators.
flow_options = dict(
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=batch_size,
    class_mode='categorical',
)

# Training batches keep the generator's default shuffling.
train_generator = train_datagen.flow_from_directory(TRAIN_DATADIR, **flow_options)

# Validation and test batches are read in a fixed order (shuffle=False) so that
# predictions line up with each generator's `.classes` array.
validation_generator = val_datagen.flow_from_directory(
    VAL_DATADIR, shuffle=False, **flow_options)

test_generator = test_datagen.flow_from_directory(
    TEST_DATADIR, shuffle=False, **flow_options)

In [None]:
#Set up class weights for imbalanced dataset
from collections import Counter

# The weights are passed to model.fit(class_weight=...) and therefore scale the
# *training* loss, so they must be derived from the training-set class counts
# (the previous version counted the validation set instead).
counter = Counter(train_generator.classes)

# Most frequent class gets weight 1.0; rarer classes get proportionally larger weights.
max_val = float(max(counter.values()))
class_weights = {class_id: max_val / num_images for class_id, num_images in counter.items()}
print(class_weights)

In [None]:
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, GlobalMaxPooling2D
from tensorflow.keras.applications import EfficientNetV2L

# ImageNet-pretrained EfficientNetV2-L backbone without its classification head;
# pooling='avg' makes the backbone output one feature vector per image.
# NOTE(review): the Keras docs say that with include_preprocessing=False the network
# expects inputs in [-1, 1], while the generators in this notebook rescale by 1/255.
# Confirm this input range is intended before changing either side.
base_model = EfficientNetV2L(
    include_top=False,
    pooling='avg',
    weights='imagenet',
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
    include_preprocessing=False,
)

# Classification head: one hidden layer followed by heavy dropout.
x = base_model.output
x = Dense(1024, activation='relu')(x)
x = Dropout(0.95)(x)

# The global policy is 'mixed_float16'; the softmax output is kept in float32 so the
# class probabilities and the loss are not computed in half precision.
# Weight shapes are unchanged, so previously saved weights still load.
predictions = Dense(num_classes, activation='softmax', dtype='float32')(x)

model = Model(inputs=base_model.input, outputs=predictions)
# summarize the model
model.summary()

In [None]:
# Training callbacks.

# Stop once val_loss has not improved for 10 epochs.
earlyStopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=10,
    verbose=0,
)

# Write the model to disk only when val_accuracy improves.
mcp_save = ModelCheckpoint(
    'Best_mdl_wts.hdf5',
    monitor='val_accuracy',
    mode='auto',
    save_best_only=True,
)

# Halve the learning rate when val_loss has not improved by at least 1e-4 for 5 epochs.
reduce_lr_loss = ReduceLROnPlateau(
    monitor='val_loss',
    mode='auto',
    factor=0.5,
    patience=5,
    min_delta=1e-4,
    verbose=1,
)

In [None]:
# Compile the model: SGD optimizer, categorical cross-entropy loss, accuracy metric.
import functools
import multiprocessing as mp

opt = tf.keras.optimizers.SGD(learning_rate=0.01)
model.compile(
    optimizer=opt,
    loss="categorical_crossentropy",
    metrics=['accuracy'],
)

In [None]:
# Train the model.
num_epochs = 100
num_train_images = train_generator.samples
# Integer division: only whole batches are counted per epoch.
steps = num_train_images // batch_size

# NOTE(review): `earlyStopping` is created in the callbacks cell but is not part of
# the callback list below - confirm whether leaving it out is intentional.
training_callbacks = [mcp_save, reduce_lr_loss]

history = model.fit(
    train_generator,
    epochs=num_epochs,
    steps_per_epoch=steps,
    validation_data=validation_generator,
    class_weight=class_weights,
    callbacks=training_callbacks,
    verbose=1,
    workers=20,
    use_multiprocessing=False,
)

In [None]:
#Plot loss and accuracy
# The second curve in each figure comes from the `val_*` history entries, i.e. the
# validation set passed to model.fit, so it is labelled 'Validation' (not 'Test').

# Plot training & validation accuracy values
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='lower right')
plt.savefig('acc', dpi=600)
plt.show()

# Plot training & validation loss values
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.savefig('val', dpi=600)
plt.show()

In [None]:
# Restore the checkpointed weights, then report loss and accuracy on the test set.
model.load_weights('Best_mdl_wts.hdf5')

score = model.evaluate(test_generator)
# evaluate() returns the loss first, followed by the compiled metrics.
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

In [None]:
# Save the model in its current state to an HDF5 file.
# It holds the best checkpoint only if the load_weights cell above has been run.
model.save(filepath='Saved_model.h5')

In [None]:
## TEST Confusion matrix
# Class probabilities for every test image; the test generator is not shuffled,
# so row i of y_pred corresponds to test_generator.classes[i].
y_pred = model.predict(test_generator)
y_pred_labels = np.argmax(y_pred, axis=1)

# Pass the full label set explicitly so the matrix always has one row/column per
# model output class, even when a class is absent from both the test labels and
# the predictions (otherwise the matrix shrinks and no longer matches CATEGORIES).
all_labels = np.arange(y_pred.shape[1])
cm = confusion_matrix(test_generator.classes, y_pred_labels, labels=all_labels)
print(cm)

In [None]:
# Confusion matrix rendered as an annotated heatmap.
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

np.set_printoptions(precision=6)

# Rows are true classes, columns are predicted classes.
# NOTE(review): this assumes CATEGORIES is listed in the same order as the class
# indices assigned by the generator - verify against test_generator.class_indices.
true_axis = pd.Index(CATEGORIES, name='True')
predicted_axis = pd.Index(CATEGORIES, name='Predicted')
df_cm = pd.DataFrame(cm, index=true_axis, columns=predicted_axis)

plt.figure(figsize=(16, 12))
sn.heatmap(df_cm, annot=True, fmt=".1f", cmap="Blues")

In [None]:
#Classification report
from sklearn.metrics import classification_report, accuracy_score

# `y_test` was never defined anywhere in the notebook (NameError). The true labels
# are the test generator's class indices, the same array used for the confusion matrix.
y_test = test_generator.classes

print('Accuracy Score:', accuracy_score(y_test, y_pred_labels))
print(classification_report(y_test, y_pred_labels, digits=4, target_names=CATEGORIES))