### Imports

In [1]:
# pip install Augmentor scikit-learn gputil

In [2]:
import os
import re
import csv
import time
import GPUtil
import random
import shutil
import Augmentor
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import MobileNetV2, MobileNetV3Small, MobileNetV3Large, VGG16, InceptionV3, ResNet50, ResNet101, ResNet152
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input as preprocess_input_mobilenet_v2
from tensorflow.keras.applications.mobilenet_v3 import preprocess_input as preprocess_input_mobilenet_v3
from keras.applications.vgg16 import preprocess_input as preprocess_input_vgg16
from keras.applications.inception_v3 import preprocess_input as preprocess_input_inceptionv3
from tensorflow.keras.applications.resnet50 import preprocess_input as preprocess_input_resnet50
from tensorflow.keras.applications.resnet import preprocess_input as preprocess_input_resnet101
from tensorflow.keras.applications.resnet_v2 import preprocess_input as preprocess_input_resnet152

### GPU Check

In [3]:
import tensorflow as tf
print("TensorFlow version:", tf.__version__)
print(tf.test.is_built_with_cuda())

TensorFlow version: 2.8.0
True


In [5]:
# Get GPU information
gpus = GPUtil.getGPUs()
for gpu in gpus:
    print(f"GPU {gpu.id}: {gpu.name}, GPU Load: {gpu.load * 100}%")

GPU 0: NVIDIA GeForce RTX 2080 Ti, GPU Load: 0.0%
GPU 1: NVIDIA GeForce RTX 2080 Ti, GPU Load: 0.0%


### 6:2:2 split of each folder -> create csv files -> check for duplicates -> check for leakage -> remove duplicates and leakage (if any) -> oversample train to 3000 using augmentations -> creating final csv files [all done only once, hence, commented.]

In [6]:
### Functions

### Split by 6:2:2, creating separate train, val and test folders

def split_data(input_folder, output_folder):
    # Create train, val, and test folders
    train_folder = os.path.join(output_folder, 'train')
    val_folder = os.path.join(output_folder, 'val')
    test_folder = os.path.join(output_folder, 'test')

    os.makedirs(train_folder, exist_ok=True)
    os.makedirs(val_folder, exist_ok=True)
    os.makedirs(test_folder, exist_ok=True)

    # Dictionary to store counts for each class
    class_counts = {}

    # Iterate through each class folder in the input folder
    for class_folder in os.listdir(input_folder):
        class_path = os.path.join(input_folder, class_folder)

        # Skip if it's not a directory
        if not os.path.isdir(class_path):
            continue

        # Create subfolders in train, val, and test
        train_class_folder = os.path.join(train_folder, class_folder)
        val_class_folder = os.path.join(val_folder, class_folder)
        test_class_folder = os.path.join(test_folder, class_folder)

        os.makedirs(train_class_folder, exist_ok=True)
        os.makedirs(val_class_folder, exist_ok=True)
        os.makedirs(test_class_folder, exist_ok=True)

        # Get the list of images in the class folder
        images = os.listdir(class_path)
        random.shuffle(images)

        # Calculate the number of images for each split
        total_images = len(images)
        train_split = int(0.6 * total_images)
        val_split = int(0.2 * total_images)

        # Copy images to train, val, and test folders
        for i, image in enumerate(images):
            src_path = os.path.join(class_path, image)
            
            if i < train_split:
                dst_path = os.path.join(train_class_folder, image)
            elif i < train_split + val_split:
                dst_path = os.path.join(val_class_folder, image)
            else:
                dst_path = os.path.join(test_class_folder, image)

            shutil.copy(src_path, dst_path)

        # Update class counts dictionary
        class_counts[class_folder] = {
            'total': total_images,
            'train': train_split,
            'val': val_split,
            'test': total_images - train_split - val_split
        }

    return class_counts

### -------------------------------------------------------------------------

### Create csv for a folder

def create_csv(input_folder, output_csv):
    # Open CSV file in write mode
    with open(output_csv, 'w', newline='') as csv_file:
        # Create CSV writer
        csv_writer = csv.writer(csv_file)
        
        # Write header
        csv_writer.writerow(['Image_Path', 'Label'])

        # Initialize count
        total_rows = 0

        # Iterate through each class folder in the input folder
        for label in os.listdir(input_folder):
            class_folder = os.path.join(input_folder, label)

            # Skip if it's not a directory
            if not os.path.isdir(class_folder):
                continue

            # Iterate through each image in the class folder
            for image in os.listdir(class_folder):
                # Get the image path
                image_path = os.path.join(class_folder, image)

                # Write the row to the CSV file
                csv_writer.writerow([image_path, label])

                # Increment the count
                total_rows += 1

    return total_rows

### -------------------------------------------------------------------------

### Oversample to n using augmentations

def oversample_aug(main_folder, output_folder, n=3000):
    # Iterate over subfolders
    for subfolder in os.listdir(main_folder):
        subfolder_path = os.path.join(main_folder, subfolder)

        # Check if it's a directory
        if os.path.isdir(subfolder_path):
            # Check if the directory contains images
            if any(f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tif')) for f in os.listdir(subfolder_path)):
                # Create an Augmentor pipeline for each subfolder
                pipeline = Augmentor.Pipeline(subfolder_path, output_directory=os.path.join(output_folder, subfolder))
                # Add your augmentations here
                pipeline.rotate(probability=1, max_left_rotation=5, max_right_rotation=5)
                pipeline.rotate(probability=0.7, max_left_rotation=10, max_right_rotation=10)
                pipeline.flip_left_right(probability=0.5)
                pipeline.flip_top_bottom(probability=0.5)

                # Execute the augmentation process
                pipeline.sample(n)  # Adjust the number of samples as needed
            else:
                print(f"No images found in {subfolder_path}. Skipping.")
            
### -------------------------------------------------------------------------

### Check for dupliates in a csv file, on the column Image_Path
            
def check_duplicates(file):
    df = pd.read_csv(f'/uoa/home/s04bs3/data/full/dasa/{file}.csv')
    duplicates_specific_column= df[df.duplicated(subset=['Image_Path'])]
    print(f"\nDuplicates in {file} based on Image_Path:")
    print(duplicates_specific_column)
    
### -------------------------------------------------------------------------

### Functions to deal with leakage

def extract_pattern(path):
    pattern = r'(\d{2}-\d{2}-\d{4}-\d{2}-\d{2}-\d{2}\.hvdfrm\d+\(\d+,\d+\)-Z\d+\.\d+)'
    match = re.search(pattern, path)
    return match.group() if match else None

def get_paths_with_pattern(df):
    return set(df['Image_Path'].apply(extract_pattern))

def get_full_path(pattern, df):
    # Find the row where Image_Path contains the pattern
    match_row = df[df['Image_Path'].apply(lambda x: pattern in x)]
    
    # Assuming there is only one match, get the full path
    full_path = match_row['Image_Path'].iloc[0] if not match_row.empty else None
    
    return full_path

def check_leakage(train_path, val_path, test_path):
    train_df = pd.read_csv(train_path)
    val_df = pd.read_csv(val_path)
    test_df = pd.read_csv(test_path)

    train_paths = get_paths_with_pattern(train_df)
    val_paths = get_paths_with_pattern(val_df)
    test_paths = get_paths_with_pattern(test_df)

    overlap_train_val = train_paths.intersection(val_paths)
    overlap_train_test = train_paths.intersection(test_paths)
    overlap_val_test = val_paths.intersection(test_paths)

    print("Overlap between train and val:", overlap_train_val)
    print("Overlap between train and test:", overlap_train_test)
    print("Overlap between val and test:", overlap_val_test)
    print("\n")

    return train_df, val_df, test_df, overlap_train_val, overlap_train_test, overlap_val_test

def remove_leakage(train_df, val_df, overlap_train_val, overlap_train_test):
    # Filter rows from train_df where Image_Path is in overlap_train_val or overlap_train_test
    train_df_no_overlap = train_df[~train_df['Image_Path'].apply(extract_pattern).isin(overlap_train_val.union(overlap_train_test))]

    # Filter rows from val_df where Image_Path is in overlap_train_val
    val_df_no_overlap = val_df[~val_df['Image_Path'].apply(extract_pattern).isin(overlap_train_val)]

    return train_df_no_overlap, val_df_no_overlap

def delete_images(image_paths):
    for path in image_paths:
        try:
            os.remove(path)
            print(f"Deleted: {path}")
        except Exception as e:
            print(f"Error deleting {path}: {e}")

In [8]:
# ### 6:2:2 split

# # Provide the input and output folder paths
# input_folder_path = "/uoa/home/s04bs3/data/full/ALL/"
# output_folder_path = "/uoa/home/s04bs3/data/full/dasa/"

# counts = split_data(input_folder_path, output_folder_path)

# # Display counts for each class
# for class_name, count_info in counts.items():
#     print(f"Class: {class_name}")
#     print(f"Total: {count_info['total']} | Train: {count_info['train']} | Val: {count_info['val']} | Test: {count_info['test']}")
#     print("---")

Class: PLANKTON_LARVAE
Total: 372 | Train: 223 | Val: 74 | Test: 75
---
Class: FLOCK
Total: 4555 | Train: 2733 | Val: 911 | Test: 911
---
Class: RADIOLARIAN
Total: 1204 | Train: 722 | Val: 240 | Test: 242
---
Class: APPENDICULARIAN
Total: 3131 | Train: 1878 | Val: 626 | Test: 627
---
Class: NAUPLII
Total: 1650 | Train: 990 | Val: 330 | Test: 330
---
Class: DINOFLAGELLATES_CERATIUM
Total: 190 | Train: 114 | Val: 38 | Test: 38
---
Class: FILAMENTOUS_ALGAL_COLONY
Total: 3562 | Train: 2137 | Val: 712 | Test: 713
---
Class: CHAETOCEROS_SUBTILIS
Total: 596 | Train: 357 | Val: 119 | Test: 120
---
Class: PHYTOPLANKTON_HELICAL
Total: 629 | Train: 377 | Val: 125 | Test: 127
---
Class: DIATOM
Total: 587 | Train: 352 | Val: 117 | Test: 118
---
Class: COPEPODS
Total: 2812 | Train: 1687 | Val: 562 | Test: 563
---
Class: DINOFLAGELLATES_NOCTILUCA
Total: 1023 | Train: 613 | Val: 204 | Test: 206
---


In [20]:
# ### create csv -- train.csv, val.csv, test.csv & check for duplicates

# for n in ['train', 'val', 'test']:
#     input_folder_path = f"/uoa/home/s04bs3/data/full/dasa/{n}/"
#     output_csv_path = f"/uoa/home/s04bs3/data/full/dasa/{n}.csv"
#     total_rows = create_csv(input_folder_path, output_csv_path)
#     print(f'\nTotal rows created: {total_rows}')
#     check_duplicates(n)


Total rows created: 12183

Duplicates in train based on Image_Path:
Empty DataFrame
Columns: [Image_Path, Label]
Index: []

Total rows created: 4058

Duplicates in val based on Image_Path:
Empty DataFrame
Columns: [Image_Path, Label]
Index: []

Total rows created: 4070

Duplicates in test based on Image_Path:
Empty DataFrame
Columns: [Image_Path, Label]
Index: []


In [21]:
# # Example usage:
# train_path = '/uoa/home/s04bs3/data/full/dasa/train.csv'
# val_path = '/uoa/home/s04bs3/data/full/dasa/val.csv'
# test_path = '/uoa/home/s04bs3/data/full/dasa/test.csv'

# train_df, val_df, test_df, overlap_train_val, overlap_train_test, overlap_val_test = check_leakage(train_path, val_path, test_path)
# train_df_no_overlap, val_df_no_overlap = remove_leakage(train_df, val_df, overlap_train_val, overlap_train_test)

# # Get full paths for overlapping images
# full_paths_train = [get_full_path(pattern, train_df) for pattern in overlap_train_val.union(overlap_train_test) if pattern is not None]
# full_paths_val = [get_full_path(pattern, val_df) for pattern in overlap_val_test if pattern is not None]


# # Delete images
# delete_images(full_paths_train)
# delete_images(full_paths_val)

Overlap between train and val: {'21-09-2022-10-12-08.hvdfrm064(462,218)-Z114.50', '21-09-2022-10-03-04.hvdfrm281(1929,1236)-Z39.50', '21-09-2022-08-59-32.hvdfrm101(1814,323)-Z65.00', '21-09-2022-10-07-36.hvdfrm270(267,96)-Z43.00', '21-09-2022-10-01-33.hvdfrm589(1027,443)-Z112.00', '21-09-2022-08-58-32.hvdfrm005(1361,1602)-Z107.00', None, '21-09-2022-09-07-36.hvdfrm029(1058,1535)-Z89.00', '21-09-2022-10-12-08.hvdfrm552(2018,12)-Z127.50'}
Overlap between train and test: {'21-09-2022-10-02-34.hvdfrm570(2035,678)-Z65.50', '21-09-2022-10-08-06.hvdfrm124(1383,1835)-Z55.00', '21-09-2022-10-12-08.hvdfrm593(255,1889)-Z90.50', '21-09-2022-10-02-03.hvdfrm422(1058,317)-Z70.50', '21-09-2022-10-08-06.hvdfrm357(916,32)-Z122.50', '21-09-2022-10-09-37.hvdfrm167(0,520)-Z129.00', None, '21-09-2022-10-04-34.hvdfrm339(1475,18)-Z185.00', '21-09-2022-09-08-06.hvdfrm407(1241,353)-Z42.50', '21-09-2022-10-02-03.hvdfrm060(2081,80)-Z50.50', '21-09-2022-10-01-03.hvdfrm577(1376,592)-Z104.50'}
Overlap between val an

In [22]:
# ### oversample using augmentations to n=3000

# main_folder = "/uoa/home/s04bs3/data/full/dasa/train"
# output_folder = "/uoa/home/s04bs3/data/full/dasa/train_o/"

# oversample_aug(main_folder, output_folder, 3000)

Initialised with 213 image(s) found.
Output directory set to /uoa/home/s04bs3/data/full/dasa/train_o/PLANKTON_LARVAE.

Processing <PIL.Image.Image image mode=RGB size=483x218 at 0x153A9C5701F0>: 100%


Initialised with 2732 image(s) found.
Output directory set to /uoa/home/s04bs3/data/full/dasa/train_o/FLOCK.

Processing <PIL.Image.Image image mode=RGB size=173x156 at 0x153A9C546D00>: 100%


Initialised with 712 image(s) found.
Output directory set to /uoa/home/s04bs3/data/full/dasa/train_o/RADIOLARIAN.

Processing <PIL.Image.Image image mode=RGB size=117x106 at 0x153A9C508280>: 100%


Initialised with 1876 image(s) found.
Output directory set to /uoa/home/s04bs3/data/full/dasa/train_o/APPENDICULARIAN.

Processing <PIL.Image.Image image mode=RGB size=101x106 at 0x153A9FDBE880>: 100%


Initialised with 989 image(s) found.
Output directory set to /uoa/home/s04bs3/data/full/dasa/train_o/NAUPLII.

Processing <PIL.Image.Image image mode=RGB size=149x186 at 0x153A9FDB7220>: 100%


Initialised with 113 image(s) found.
Output directory set to /uoa/home/s04bs3/data/full/dasa/train_o/DINOFLAGELLATES_CERATIUM.

Processing <PIL.Image.Image image mode=RGB size=154x118 at 0x153A9FD9D5B0>: 100%


Initialised with 2136 image(s) found.
Output directory set to /uoa/home/s04bs3/data/full/dasa/train_o/FILAMENTOUS_ALGAL_COLONY.

Processing <PIL.Image.Image image mode=RGB size=255x277 at 0x153A9FDB3550>: 100%


Initialised with 357 image(s) found.
Output directory set to /uoa/home/s04bs3/data/full/dasa/train_o/CHAETOCEROS_SUBTILIS.

Processing <PIL.Image.Image image mode=LA size=267x315 at 0x153AA05A9EE0>: 100%|


Initialised with 375 image(s) found.
Output directory set to /uoa/home/s04bs3/data/full/dasa/train_o/PHYTOPLANKTON_HELICAL.

Processing <PIL.Image.Image image mode=LA size=166x353 at 0x153AA0466F10>: 100%|


Initialised with 351 image(s) found.
Output directory set to /uoa/home/s04bs3/data/full/dasa/train_o/DIATOM.

Processing <PIL.Image.Image image mode=LA size=105x218 at 0x153AA04D7CD0>: 100%|


Initialised with 1687 image(s) found.
Output directory set to /uoa/home/s04bs3/data/full/dasa/train_o/COPEPODS.

Processing <PIL.Image.Image image mode=RGB size=175x120 at 0x153A9C5D68B0>: 100%


Initialised with 611 image(s) found.
Output directory set to /uoa/home/s04bs3/data/full/dasa/train_o/DINOFLAGELLATES_NOCTILUCA.

Processing <PIL.Image.Image image mode=LA size=156x216 at 0x153A9C5348E0>: 100%|


In [23]:
# ### final csv files

# for n in ['train_o','val','test']:
#     input_folder_path = f"/uoa/home/s04bs3/data/full/dasa/{n}/"
#     output_csv_path = f"/uoa/home/s04bs3/data/full/dasa/{n}.csv"
#     total_rows = create_csv(input_folder_path, output_csv_path)
#     print(f'Total rows created: {total_rows}\n\n')

Total rows created: 36000


Total rows created: 4058


Total rows created: 4070




### Functions to train, evaluate and save results

In [7]:
def train_model(model_name, train_data, val_data, epochs=10, batch_size=32):
    
    target_size=(224,224)

    if model_name.lower() in ["mobilenetv2", "mobilenetv2-0.5", "mobilenetv2-0.75"]:
        prep_fn = preprocess_input_mobilenet_v2
    elif model_name.lower() == "vgg16":
        prep_fn = preprocess_input_vgg16
    elif model_name.lower() == "inceptionv3":
        prep_fn = preprocess_input_inceptionv3
        target_size=(299,299)
    elif model_name.lower() == "ResNet50":
        prep_fn = preprocess_input_resnet50
    elif model_name.lower() == "ResNet101":
        prep_fn = preprocess_input_resnet101
    elif model_name.lower() == "ResNet152":
        prep_fn = preprocess_input_resnet152
    else:
        prep_fn = preprocess_input_mobilenet_v3
    
    # Set up data generators
    train_datagen = ImageDataGenerator(preprocessing_function=prep_fn)
    val_datagen = ImageDataGenerator(preprocessing_function=prep_fn)

    # Set up data generators
    train_generator = train_datagen.flow_from_dataframe(
        dataframe=train_data,
        x_col="Image_Path",
        y_col="Label",
        target_size=target_size,
        batch_size=batch_size,
        class_mode="categorical",
        shuffle=False
    )

    val_generator = val_datagen.flow_from_dataframe(
        dataframe=val_data,
        x_col="Image_Path",
        y_col="Label",
        target_size=target_size,
        batch_size=batch_size,
        class_mode="categorical",
        shuffle=False
    )

    # Build model based on model_name
    base_model = None

    if model_name == "MobileNetV2":
        base_model = MobileNetV2(input_shape=(224,224, 3), include_top=False, weights="imagenet")
    elif model_name == "MobileNetV2-0.75":
        base_model = MobileNetV2(input_shape=(224,224, 3), include_top=False, weights="imagenet", alpha=0.75)
    elif model_name == "MobileNetV2-0.5":
        base_model = MobileNetV2(input_shape=(224,224, 3), include_top=False, weights="imagenet", alpha=0.5)
    elif model_name == "MobileNetV3Small":
        base_model = MobileNetV3Small(input_shape=(224,224, 3), include_top=False, weights="imagenet")
    elif model_name == "MobileNetV3Small-0.75":
        base_model = MobileNetV3Small(input_shape=(224,224, 3), include_top=False, weights="imagenet", alpha=0.75)
    elif model_name == "MobileNetV3Small-Min":
        base_model = MobileNetV3Small(input_shape=(224,224, 3), include_top=False, weights="imagenet", minimalistic=True)
    elif model_name == "MobileNetV3Large":
        base_model = MobileNetV3Large(input_shape=(224,224, 3), include_top=False, weights="imagenet")
    elif model_name == "MobileNetV3Large-0.75":
        base_model = MobileNetV3Large(input_shape=(224,224, 3), include_top=False, weights="imagenet", alpha=0.75)
    elif model_name == "MobileNetV3Large-Min":
        base_model = MobileNetV3Large(input_shape=(224,224, 3), include_top=False, weights="imagenet", minimalistic=True)
    elif model_name == "VGG16":
        base_model = VGG16(input_shape=(224,224, 3), include_top=False, weights="imagenet")
    elif model_name == "InceptionV3":
        base_model = InceptionV3(input_shape=(299,299, 3), include_top=False, weights="imagenet")
    elif model_name == "ResNet50":
        base_model = ResNet50(input_shape=(224,224, 3), include_top=False, weights="imagenet")
    elif model_name == "ResNet101":
        base_model = ResNet101(input_shape=(224,224, 3), include_top=False, weights="imagenet")
    elif model_name == "ResNet152":
        base_model = ResNet152(input_shape=(224,224, 3), include_top=False, weights="imagenet")

    if base_model:
        base_model.trainable = False

        model = models.Sequential([
            base_model,
            layers.GlobalAveragePooling2D(),
            layers.Dropout(0.5),
            layers.Dense(128, activation='relu'),
            layers.Dense(32, activation='relu'),
            layers.Dense(12, activation='softmax')
        ])

        model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

        # Define callbacks (EarlyStopping)
        early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
        model_checkpoint = ModelCheckpoint(f"/uoa/home/s04bs3/data/full/dasa/models/{model_name}.h5", save_best_only=True)

        # Start the timer for training
        start_time = time.time()

        # Train the model
        history = model.fit(
            train_generator,
            epochs=epochs,
            validation_data=val_generator,
            callbacks=[early_stopping, model_checkpoint]
        )

        # Stop the timer for training
        end_time = time.time()

        # Calculate and print the training time
        training_time = round(end_time - start_time,2)
        print(f"Training Time: {training_time} seconds")

        return model, history

    else:
        print(f"Model {model_name} not recognized.")
        return None, None

##############################################################################################################

def evaluate_model(model_name, model_path, test_csv_path, target_size=(224, 224), batch_size=32):
    # Load the test data
    test_data = pd.read_csv(test_csv_path)

    # Load the saved model
    model = load_model(model_path)

    # Create an ImageDataGenerator for test data
    if model_name.lower() in ["mobilenetv2", "mobilenetv2-0.5", "mobilenetv2-0.75"]:
        prep_fn = preprocess_input_mobilenet_v2
    elif model_name.lower() == "vgg16":
        prep_fn = preprocess_input_vgg16
    elif model_name.lower() == "inceptionv3":
        prep_fn = preprocess_input_inceptionv3
        target_size = (299,299)
    elif model_name.lower() == "ResNet50":
        prep_fn = preprocess_input_resnet50
    elif model_name.lower() == "ResNet101":
        prep_fn = preprocess_input_resnet101
    elif model_name.lower() == "ResNet152":
        prep_fn = preprocess_input_resnet152
    else:
        prep_fn = preprocess_input_mobilenet_v3
    test_datagen = ImageDataGenerator(preprocessing_function=prep_fn)  # You may need to adjust this based on your training data preprocessing

    # Configure the test generator
    test_generator = test_datagen.flow_from_dataframe(
        dataframe=test_data,
        x_col="Image_Path",
        y_col="Label",
        target_size=target_size,
        batch_size=batch_size,
        class_mode="categorical",
        shuffle=False  # Important: set shuffle to False for reproducibility
    )

    # Start the timer for testing
    start_time = time.time()

    # Get predictions
    y_pred = model.predict(test_generator)

    # Stop the timer for testing
    end_time = time.time()

    # Calculate and print the testing time
    testing_time = round(end_time - start_time, 2)
    print(f"Testing Time: {testing_time} seconds")

    # Convert predictions to class labels
    y_pred_classes = np.argmax(y_pred, axis=1)

    # Convert true labels to class labels
    y_true = test_generator.classes

    # Calculate and print confusion matrix
    conf_mat = confusion_matrix(y_true, y_pred_classes)
    print("Confusion Matrix:")
    print(conf_mat)

    # Calculate and print classification report
    class_labels = list(test_generator.class_indices.keys())
    report = classification_report(y_true, y_pred_classes, target_names=class_labels)
    print("\nClassification Report:")
    print(report)

    save_results(model_name, conf_mat, report, testing_time)

##############################################################################################################

def save_results(model_name, conf_mat, report, testing_time):
    # Save all results in a single text file
    with open(f"/uoa/home/s04bs3/data/full/dasa/results/{model_name}_results.txt", 'w') as file:
        file.write("Confusion Matrix:\n")
        file.write(np.array_str(conf_mat))
        file.write("\n\nClassification Report:\n")
        file.write(report)
        file.write(f"\nTesting Time: {testing_time} seconds")

### Train & Test

In [11]:
model_name = ["MobileNetV2","MobileNetV2-0.75", "MobileNetV2-0.5",
              "MobileNetV3Small", "MobileNetV3Small-0.75", "MobileNetV3Small-Min",
              "MobileNetV3Large", "MobileNetV3Large-0.75", "MobileNetV3Large-Min",
              "VGG16","InceptionV3", "ResNet50", "ResNet101","ResNet152"]

train_data = pd.read_csv("/uoa/home/s04bs3/data/full/dasa/train_o.csv")
val_data = pd.read_csv("/uoa/home/s04bs3/data/full/dasa/val.csv")
test_csv_path = '/uoa/home/s04bs3/data/full/dasa/test.csv'

for n in range(len(model_name)):
    # Train the model
    model, history = train_model(model_name[n], train_data, val_data, epochs=30, batch_size=32)

    # Evaluate the model
    model_path = f'/uoa/home/s04bs3/data/full/dasa/models/{model_name[n]}.h5'    
    evaluate_model(model_name[n], model_path, test_csv_path)




Found 4065 validated image filenames belonging to 12 classes.
Testing Time: 14.66 seconds
Confusion Matrix:
[[496   2  47   0  57   5   0   3   3   3  11   0]
 [  0 111   2   0   0   0   0   1   0   0   4   1]
 [ 12   1 438   0   6   1   1  35  39   3  25   1]
 [  1   6   5  95   2   1   0   2   1   4   1   0]
 [  1   0   0   0  35   0   0   0   0   1   0   1]
 [  1   0   0   0   0 205   0   0   0   0   0   0]
 [  0   0  26   0   2   0 612  62   6   4   0   1]
 [  7   0  36   2   2   0   1 846   8   3   1   5]
 [  3   0  18   0   6   0   0  13 281   6   1   1]
 [  2   0   3   0   8   4   1   6   0 102   1   0]
 [  1   0   8   0   0   0   0   1   1   1  62   0]
 [  0   3   3   0   1   0   1   6   1   0  18 208]]

Classification Report:
                           precision    recall  f1-score   support

          APPENDICULARIAN       0.95      0.79      0.86       627
     CHAETOCEROS_SUBTILIS       0.90      0.93      0.92       119
                 COPEPODS       0.75      0.78      0



Found 4065 validated image filenames belonging to 12 classes.
Testing Time: 15.49 seconds
Confusion Matrix:
[[558   3  10   0  22   0   0   8  12   8   6   0]
 [  0 116   2   0   0   0   0   0   1   0   0   0]
 [ 29   2 432   1   1   0   4  20  60   3   9   1]
 [  5   5   0  98   0   0   0   0   7   3   0   0]
 [  1   0   0   0  35   0   0   0   2   0   0   0]
 [  0   0   0   0   0 206   0   0   0   0   0   0]
 [  0   2   5   0   0   0 671  13  18   3   0   1]
 [ 16   0  26   6   1   1   7 807  16  31   0   0]
 [  4   1  19   0   6   0   1   6 288   2   1   1]
 [  3   0   0   0   4   0   1   4   5 110   0   0]
 [  6   2   7   0   1   0   0   2   0   1  55   0]
 [  2  15   5   0  11   0   3   3   2   1   9 190]]

Classification Report:
                           precision    recall  f1-score   support

          APPENDICULARIAN       0.89      0.89      0.89       627
     CHAETOCEROS_SUBTILIS       0.79      0.97      0.88       119
                 COPEPODS       0.85      0.77      0



Found 4065 validated image filenames belonging to 12 classes.
Testing Time: 15.33 seconds
Confusion Matrix:
[[501   0  17   0  71   3   1  22   6   1   5   0]
 [  0 111   1   0   0   0   3   0   1   0   1   2]
 [ 17   1 404   0   1   0   3  79  40   2  13   2]
 [  3   0   3 106   1   0   0   1   1   1   2   0]
 [  0   0   0   0  37   0   0   0   1   0   0   0]
 [  1   0   0   0   0 205   0   0   0   0   0   0]
 [  0   0   1   0   1   0 692  16   1   2   0   0]
 [  3   0   3   3   1   0   9 883   8   1   0   0]
 [  3   1  17   0   5   0   3  22 276   2   0   0]
 [  1   0   1   0   5   1   2   9   1 105   0   2]
 [  4   0   2   0   1   0   1   7   0   0  58   1]
 [  0   2   2   0   5   0   2  14   4   0   9 203]]

Classification Report:
                           precision    recall  f1-score   support

          APPENDICULARIAN       0.94      0.80      0.86       627
     CHAETOCEROS_SUBTILIS       0.97      0.93      0.95       119
                 COPEPODS       0.90      0.72      0



Found 4065 validated image filenames belonging to 12 classes.
Testing Time: 14.63 seconds
Confusion Matrix:
[[583   0  11   5   6   0   0   5   5   2  10   0]
 [  0 115   0   0   0   0   0   0   0   0   3   1]
 [  9   0 482   0   1   0   0  15  34   0  21   0]
 [  2   0   0 116   0   0   0   0   0   0   0   0]
 [  2   0   1   0  35   0   0   0   0   0   0   0]
 [  0   0   0   0   0 206   0   0   0   0   0   0]
 [  0   0   2   2   0   0 687  18   2   2   0   0]
 [  3   0   7  20   0   0   1 876   3   0   1   0]
 [  1   0  34   0   0   0   0  19 274   0   1   0]
 [  2   0   0   0   2   1   2   3   0 117   0   0]
 [  0   1   6   0   0   0   0   0   0   0  67   0]
 [  0   2   2   1   1   0   2  11   3   1  15 203]]

Classification Report:
                           precision    recall  f1-score   support

          APPENDICULARIAN       0.97      0.93      0.95       627
     CHAETOCEROS_SUBTILIS       0.97      0.97      0.97       119
                 COPEPODS       0.88      0.86      0



Found 4065 validated image filenames belonging to 12 classes.
Testing Time: 14.53 seconds
Confusion Matrix:
[[549   1  18   2  24   0   1  17   3   9   1   2]
 [  0 116   0   1   0   0   0   0   0   0   1   1]
 [  7   0 476   4   3   2   5  23  34   3   3   2]
 [  0   0   1 111   0   0   0   4   1   1   0   0]
 [  1   0   0   0  36   0   0   0   0   1   0   0]
 [  0   0   0   0   0 206   0   0   0   0   0   0]
 [  0   0   1   0   0   0 698  11   0   3   0   0]
 [  1   0   5   8   0   0   8 884   5   0   0   0]
 [  0   0  32   2   0   0   1  22 265   4   1   2]
 [  0   0   1   0   2   3   3   4   0 114   0   0]
 [  2   0   5   0   1   0   0   2   1   1  62   0]
 [  0   1   1   1   0   0   1   5   1   1   9 221]]

Classification Report:
                           precision    recall  f1-score   support

          APPENDICULARIAN       0.98      0.88      0.93       627
     CHAETOCEROS_SUBTILIS       0.98      0.97      0.98       119
                 COPEPODS       0.88      0.85      0



Found 4065 validated image filenames belonging to 12 classes.
Testing Time: 13.93 seconds
Confusion Matrix:
[[532   0  16   6  17   9   0  13   3  13  10   8]
 [  0 114   1   1   0   0   0   0   0   0   0   3]
 [  5   3 450   0   4   4   2  32  32   2  15  13]
 [  1   0   0 117   0   0   0   0   0   0   0   0]
 [  1   0   0   0  37   0   0   0   0   0   0   0]
 [  0   0   0   0   0 206   0   0   0   0   0   0]
 [  0   0   0   1   1   0 674  30   0   3   0   4]
 [  4   0   5   9   0   0   0 879   4   1   0   9]
 [  0   0  19   0   3   0   0  27 275   2   0   3]
 [  2   0   1   1   1   1   3   4   1 111   0   2]
 [  1   2   7   0   0   0   0   0   0   0  60   4]
 [  0   0   4   0   1   0   1   5   1   0   9 220]]

Classification Report:
                           precision    recall  f1-score   support

          APPENDICULARIAN       0.97      0.85      0.91       627
     CHAETOCEROS_SUBTILIS       0.96      0.96      0.96       119
                 COPEPODS       0.89      0.80      0



Found 4065 validated image filenames belonging to 12 classes.
Testing Time: 15.21 seconds
Confusion Matrix:
[[591   2  18   1   4   1   2   1   3   2   1   1]
 [  0 119   0   0   0   0   0   0   0   0   0   0]
 [  5   2 508   0   1   0   0  16  25   0   5   0]
 [  1   0   2 113   0   0   0   2   0   0   0   0]
 [  1   0   0   0  36   0   0   0   1   0   0   0]
 [  0   0   0   0   0 206   0   0   0   0   0   0]
 [  0   0   5   0   0   0 678  28   0   2   0   0]
 [  6   0   7   3   1   0   1 891   2   0   0   0]
 [  0   0  36   0   0   0   0   9 284   0   0   0]
 [  3   0   2   1   0   3   2   1   0 115   0   0]
 [  2   0  10   0   0   0   0   1   1   1  59   0]
 [  0   0   5   0   0   0   1   4   6   0   9 216]]

Classification Report:
                           precision    recall  f1-score   support

          APPENDICULARIAN       0.97      0.94      0.96       627
     CHAETOCEROS_SUBTILIS       0.97      1.00      0.98       119
                 COPEPODS       0.86      0.90      0



Found 4065 validated image filenames belonging to 12 classes.
Testing Time: 14.56 seconds
Confusion Matrix:
[[578   3  13   0   6   0   2   8   0   5  10   2]
 [  0 119   0   0   0   0   0   0   0   0   0   0]
 [ 12   1 479   0   1   0   4  28  18   3  13   3]
 [  1   0   0 117   0   0   0   0   0   0   0   0]
 [  2   0   0   0  33   0   0   0   0   3   0   0]
 [  0   0   0   0   0 206   0   0   0   0   0   0]
 [  0   0   1   0   0   0 693  15   0   2   0   2]
 [  5   0   4   1   0   0   1 898   1   0   1   0]
 [  6   0  27   0   4   0   1  24 264   3   0   0]
 [  5   0   0   1   1   2   2   4   0 110   1   1]
 [  3   0   3   0   0   0   0   0   1   1  66   0]
 [  0   0   1   0   1   0   2   6   1   0  10 220]]

Classification Report:
                           precision    recall  f1-score   support

          APPENDICULARIAN       0.94      0.92      0.93       627
     CHAETOCEROS_SUBTILIS       0.97      1.00      0.98       119
                 COPEPODS       0.91      0.85      0



Found 4065 validated image filenames belonging to 12 classes.
Testing Time: 14.81 seconds
Confusion Matrix:
[[575   0  10   0  22   0   1   6   3   7   1   2]
 [  0 116   1   0   0   0   0   0   1   0   0   1]
 [  7   0 480   0   5   1   2  19  35   6   6   1]
 [  1   1   1 112   1   0   0   2   0   0   0   0]
 [  0   0   0   0  37   0   0   0   1   0   0   0]
 [  0   0   0   0   0 206   0   0   0   0   0   0]
 [  0   0   3   0   0   0 671  21   8   9   0   1]
 [  5   0   8   3   1   0   3 890   1   0   0   0]
 [  1   0  14   0   0   0   0   8 306   0   0   0]
 [  3   0   0   0   3   0   1   1   1 118   0   0]
 [  2   0   8   0   0   0   0   1   1   2  59   1]
 [  0   0   3   0   1   0   2   5   2   0   9 219]]

Classification Report:
                           precision    recall  f1-score   support

          APPENDICULARIAN       0.97      0.92      0.94       627
     CHAETOCEROS_SUBTILIS       0.99      0.97      0.98       119
                 COPEPODS       0.91      0.85      0



Found 4065 validated image filenames belonging to 12 classes.
Testing Time: 16.6 seconds
Confusion Matrix:
[[557   0  23   2  10   1   1  16   2   4  10   1]
 [  0 106   6   0   0   0   4   0   0   0   3   0]
 [ 13   1 479   0   3   2   8  24  25   1   5   1]
 [  0   1   3 103   1   0   0   7   0   1   2   0]
 [  3   0   0   0  32   0   0   1   1   1   0   0]
 [  0   0   0   0   0 206   0   0   0   0   0   0]
 [  2   0   3   0   2   0 675  26   3   2   0   0]
 [  6   0  18   4   0   0  28 851   2   2   0   0]
 [  7   0  43   1   3   0   9  32 229   1   3   1]
 [  4   0   2   1   2   2   7   5   1 102   0   1]
 [  1   0   9   0   0   0   1   2   1   0  60   0]
 [  0   4   9   0   0   0  15  13   3   0   8 189]]

Classification Report:
                           precision    recall  f1-score   support

          APPENDICULARIAN       0.94      0.89      0.91       627
     CHAETOCEROS_SUBTILIS       0.95      0.89      0.92       119
                 COPEPODS       0.81      0.85      0.



Found 4065 validated image filenames belonging to 12 classes.
Testing Time: 20.43 seconds
Confusion Matrix:
[[591   1  18   1   1   0   0   4   3   7   0   1]
 [  0 116   1   0   0   0   1   0   0   0   0   1]
 [ 16   0 516   0   0   0   0   8  17   1   1   3]
 [  1   1   1 106   0   0   0   2   0   6   1   0]
 [  6   0   1   0  30   0   0   0   0   0   0   1]
 [  2   0   0   0   0 204   0   0   0   0   0   0]
 [  1   0  12   0   0   0 678  21   0   1   0   0]
 [  8   0  17   1   0   0   1 880   1   2   1   0]
 [  9   0  51   0   3   0   2  23 237   2   0   2]
 [  7   0   3   1   1   1   1   1   0 110   1   1]
 [  4   0  11   0   0   0   0   1   1   1  53   3]
 [  0   0   6   0   0   0   0   2   0   0   8 225]]

Classification Report:
                           precision    recall  f1-score   support

          APPENDICULARIAN       0.92      0.94      0.93       627
     CHAETOCEROS_SUBTILIS       0.98      0.97      0.98       119
                 COPEPODS       0.81      0.92      0



Found 4065 validated image filenames belonging to 12 classes.
Testing Time: 15.44 seconds
Confusion Matrix:
[[589   0  12   1   1   1   1   6   2   3   9   2]
 [  0 115   0   1   0   0   0   0   0   0   0   3]
 [ 11   0 483   0   1   0   2  23  30   0  10   2]
 [  7   3   1  99   1   0   1   4   1   0   1   0]
 [  2   0   0   0  35   0   0   0   0   0   0   1]
 [  4   0   0   0   0 202   0   0   0   0   0   0]
 [  0   0   2   0   0   0 696  11   2   2   0   0]
 [  4   0   3   0   1   0   8 890   0   0   1   4]
 [  3   0  22   0   3   0   1  13 284   1   0   2]
 [  5   0   0   0   2   0   2   3   0 115   0   0]
 [  3   0   1   0   0   0   0   2   1   0  67   0]
 [  0   0   3   0   0   0   2   1   0   0  11 224]]

Classification Report:
                           precision    recall  f1-score   support

          APPENDICULARIAN       0.94      0.94      0.94       627
     CHAETOCEROS_SUBTILIS       0.97      0.97      0.97       119
                 COPEPODS       0.92      0.86      0



Found 4065 validated image filenames belonging to 12 classes.
Testing Time: 16.5 seconds
Confusion Matrix:
[[591   0  10   3   6   1   2   4   1   4   5   0]
 [  0 113   0   3   1   0   0   0   0   0   2   0]
 [ 12   0 478   0   0   0   3  22  37   0  10   0]
 [  1   0   0 114   0   0   0   1   0   1   1   0]
 [  2   0   0   0  33   0   1   0   2   0   0   0]
 [  0   0   0   0   0 205   0   1   0   0   0   0]
 [  0   0   2   0   0   0 706   4   0   1   0   0]
 [  3   0   5   8   0   0   9 880   4   0   2   0]
 [  4   0  29   0   1   0   5  19 268   2   1   0]
 [  1   0   1   2   2   6   1   1   0 113   0   0]
 [  1   0   1   1   0   0   2   0   0   0  69   0]
 [  0   2   1   0   0   0   6   4   3   0  11 214]]

Classification Report:
                           precision    recall  f1-score   support

          APPENDICULARIAN       0.96      0.94      0.95       627
     CHAETOCEROS_SUBTILIS       0.98      0.95      0.97       119
                 COPEPODS       0.91      0.85      0.



Found 4065 validated image filenames belonging to 12 classes.
Testing Time: 18.35 seconds
Confusion Matrix:
[[599   0   3   0  10   1   0   3   1   3   6   1]
 [  0 113   0   2   0   0   0   1   0   0   2   1]
 [ 29   0 450   0   2   0   0  15  46   0  17   3]
 [  2   0   1 108   2   0   0   1   0   0   4   0]
 [  0   0   0   0  38   0   0   0   0   0   0   0]
 [  1   0   0   0   0 205   0   0   0   0   0   0]
 [  0   0   2   0   1   0 697   5   3   5   0   0]
 [ 10   0   4   0   1   0   6 882   3   1   4   0]
 [  5   0  11   1   7   0   4  11 289   0   1   0]
 [  1   0   0   2   3   2   0   6   1 111   1   0]
 [  3   0   1   0   0   0   0   1   1   0  65   3]
 [  0   0   2   0   0   0   2   1   2   0  11 223]]

Classification Report:
                           precision    recall  f1-score   support

          APPENDICULARIAN       0.92      0.96      0.94       627
     CHAETOCEROS_SUBTILIS       1.00      0.95      0.97       119
                 COPEPODS       0.95      0.80      0

In [10]:
# Replace this with the path to your saved models directory
models_directory = '/uoa/home/s04bs3/data/full/dasa/models/'

# List of model names
model_name = ["MobileNetV2", "MobileNetV2-0.75", "MobileNetV2-0.5",
              "MobileNetV3Small", "MobileNetV3Small-0.75", "MobileNetV3Small-Min",
              "MobileNetV3Large", "MobileNetV3Large-0.75", "MobileNetV3Large-Min",
              "VGG16","InceptionV3", "ResNet50", "ResNet101","ResNet152"]

for model in model_name:
    # Load the model
    model_path = os.path.join(models_directory, f'{model}.h5')
    loaded_model = load_model(model_path)

    # Get the size of the model file
    model_size_bytes = os.path.getsize(model_path)

    # Convert size to MB
    model_size_MB = model_size_bytes / (1024 * 1024)

    # Get the number of parameters (in millions)
    num_params = loaded_model.count_params() / 1e6

    print(f"{model} - Model Size: {model_size_MB:.2f} MB | Number of Parameters: {num_params:.2f} million")


MobileNetV2 - Model Size: 10.85 MB | Number of Parameters: 2.43 million
MobileNetV2-0.75 - Model Size: 7.51 MB | Number of Parameters: 1.55 million
MobileNetV2-0.5 - Model Size: 4.93 MB | Number of Parameters: 0.87 million
MobileNetV3Small - Model Size: 4.78 MB | Number of Parameters: 1.02 million
MobileNetV3Small-0.75 - Model Size: 3.22 MB | Number of Parameters: 0.64 million
MobileNetV3Small-Min - Model Size: 2.80 MB | Number of Parameters: 0.52 million
MobileNetV3Large - Model Size: 13.25 MB | Number of Parameters: 3.12 million
MobileNetV3Large-0.75 - Model Size: 8.37 MB | Number of Parameters: 1.91 million
MobileNetV3Large-Min - Model Size: 7.24 MB | Number of Parameters: 1.57 million
VGG16 - Model Size: 57.01 MB | Number of Parameters: 14.78 million
InceptionV3 - Model Size: 86.73 MB | Number of Parameters: 22.07 million
ResNet50 - Model Size: 93.37 MB | Number of Parameters: 23.85 million
ResNet101 - Model Size: 166.41 MB | Number of Parameters: 42.92 million
ResNet152 - Model Si