## TF Lite Breast Cancer Detection Week 13: Bring It All Together
### Yinda Chen and Alice Tang

For our final assignment, we will be including all of the most important code to represent an end to end modeling project, including only the most crucial parts of our process. 

Let's get started, shall we?

To preface, the dataset can be found here: https://www.kaggle.com/datasets/awsaf49/cbis-ddsm-breast-cancer-image-dataset

### Loading all needed packages.

In [1]:
import os
import PIL
import cv2
import uuid
import shutil
import random
import glob as gb
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

from PIL import Image
from tqdm import tqdm  # Progress bar
from scipy.special import gamma

from keras.optimizers import *
from keras.regularizers import l1_l2
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input
from keras.layers import GlobalAveragePooling2D
from keras.callbacks import LearningRateScheduler
from keras.layers import Conv2D, MaxPool2D, BatchNormalization

from tensorflow.keras.metrics import *
from tensorflow.keras.callbacks import *
from tensorflow.keras.preprocessing.image import ImageDataGenerator

2024-12-07 13:54:59.157244: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-07 13:54:59.157284: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-07 13:54:59.157730: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-07 13:54:59.160713: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Loading in our data.

In [2]:
# Load the csv data

dicom_df = pd.read_csv("../csv/dicom_info.csv")
mass_train = pd.read_csv("../csv/mass_case_description_train_set.csv")
mass_test  = pd.read_csv("../csv/mass_case_description_test_set.csv")
calc_train = pd.read_csv("../csv/calc_case_description_train_set.csv")
calc_test  = pd.read_csv("../csv/calc_case_description_test_set.csv")

### Replacing our image file paths.

In [3]:
# Replace the path of images

def replace_path(sample, old_path, new_path):
    return sample.replace(old_path, new_path, regex=True)

cropped_images = dicom_df[dicom_df.SeriesDescription=="cropped images"].image_path
full_mammogram = dicom_df[dicom_df.SeriesDescription=="full mammogram images"].image_path
roi_mask = dicom_df[dicom_df.SeriesDescription=="ROI mask images"].image_path

correct_dir = "../jpeg"

full_mammogram = replace_path(full_mammogram, "CBIS-DDSM/jpeg", correct_dir)
cropped_images = replace_path(cropped_images, "CBIS-DDSM/jpeg", correct_dir)
roi_mask = replace_path(roi_mask, "CBIS-DDSM/jpeg", correct_dir)
print('\nROI Mask Images paths:')
print(roi_mask.iloc[0])
print('\nCropped Images paths:')
print(cropped_images.iloc[0])
print('\nFull mammo Images paths:')
print(full_mammogram.iloc[0])


ROI Mask Images paths:
../jpeg/1.3.6.1.4.1.9590.100.1.2.153339052913121382622526066491844156138/2-270.jpg

Cropped Images paths:
../jpeg/1.3.6.1.4.1.9590.100.1.2.129308726812851964007517874181459556304/1-172.jpg

Full mammo Images paths:
../jpeg/1.3.6.1.4.1.9590.100.1.2.248386742010678582309005372213277814849/1-249.jpg


In [4]:
# Get the file name list for every kind of images

def get_image_file_name(data, new_dict):

    for dicom in data:
        key = dicom.split('/')[2]
        new_dict[key] = dicom
    print(f"the length of dataset ==> {len(new_dict.keys())}")

In [5]:
cropped_images_dict = dict()
full_mammo_dict = dict()
roi_img_dict = dict()

get_image_file_name(full_mammogram, full_mammo_dict)
get_image_file_name(cropped_images, cropped_images_dict)
get_image_file_name(roi_mask, roi_img_dict)

the length of dataset ==> 2857
the length of dataset ==> 3567
the length of dataset ==> 3247


### Here, we must fix the image paths in our csv. 

In [6]:
# Fix the image path in the csv

def fix_image_path(data):
    """Correct dicom paths to correct image paths."""
    for indx, image in enumerate(data.values):

        img_name = image[11].split('/')[2]

        if img_name in full_mammo_dict:
            data.iloc[indx, 11] = full_mammo_dict[img_name]
        else:
            data.iloc[indx, 11] = None
        
        img_name = image[12].split('/')[2]
        if img_name in cropped_images_dict:
            data.iloc[indx, 12] = cropped_images_dict[img_name]
        else:
            data.iloc[indx, 11] = None

        img_name = image[13].split('/')[2]
        if img_name in roi_img_dict:
            data.iloc[indx, 13] = roi_img_dict[img_name]

        else:
            data.iloc[indx, 13] = None

Doing it for all of our csv dataset files!

In [None]:
fix_image_path(mass_train)
fix_image_path(mass_test)
fix_image_path(calc_train)
fix_image_path(calc_test)

### Now, we must rename the name of our columns in the csv file!

In [8]:
# rename the name of columns in csv
mass_train = mass_train.rename(columns={'left or right breast': 'left_or_right_breast',
                                        'image view': 'image_view',
                                        'abnormality id': 'abnormality_id',
                                        'abnormality type': 'abnormality_type',
                                        'mass shape': 'mass_shape',
                                        'mass margins': 'mass_margins',
                                        'image file path': 'image_file_path',
                                        'cropped image file path': 'cropped_image_file_path',
                                        'ROI mask file path': 'ROI_mask_file_path'})

mass_test = mass_test.rename(columns={'left or right breast': 'left_or_right_breast',
                                      'image view': 'image_view',
                                      'abnormality id': 'abnormality_id',
                                      'abnormality type': 'abnormality_type',
                                      'mass shape': 'mass_shape',
                                      'mass margins': 'mass_margins',
                                      'image file path': 'image_file_path',
                                      'cropped image file path': 'cropped_image_file_path',
                                      'ROI mask file path': 'ROI_mask_file_path'})

calc_train = calc_train.rename(columns={'left or right breast': 'left_or_right_breast',
                                        'image view': 'image_view',
                                        'abnormality id': 'abnormality_id',
                                        'abnormality type': 'abnormality_type',
                                        'mass shape': 'mass_shape',
                                        'mass margins': 'mass_margins',
                                        'image file path': 'image_file_path',
                                        'cropped image file path': 'cropped_image_file_path',
                                        'ROI mask file path': 'ROI_mask_file_path'})

calc_test = calc_test.rename(columns={'left or right breast': 'left_or_right_breast',
                                      'image view': 'image_view',
                                      'abnormality id': 'abnormality_id',
                                      'abnormality type': 'abnormality_type',
                                      'mass shape': 'mass_shape',
                                      'mass margins': 'mass_margins',
                                      'image file path': 'image_file_path',
                                      'cropped image file path': 'cropped_image_file_path',
                                      'ROI mask file path': 'ROI_mask_file_path'})

In [9]:
mass_train.pathology.unique()

array(['MALIGNANT', 'BENIGN', 'BENIGN_WITHOUT_CALLBACK'], dtype=object)

In [10]:
# Merge all the dataset into one for training

full_dataset = pd.concat([mass_train, mass_test, calc_train, calc_test], axis=0)

In [11]:
# Set the number of classification

class_mapper = {'MALIGNANT': 1, 'BENIGN': 0, 'BENIGN_WITHOUT_CALLBACK': 0} 

In [12]:
target_size = (224, 224, 3)

# Apply class mapper to pathology column
full_dataset['labels'] = full_dataset['pathology'].replace(class_mapper).infer_objects(copy=False)

full_images = np.array(full_dataset[full_dataset["image_file_path"].notna()]["image_file_path"].tolist())
full_labels = np.array(full_dataset[full_dataset["image_file_path"].notna()]["labels"].tolist())

  full_dataset['labels'] = full_dataset['pathology'].replace(class_mapper).infer_objects(copy=False)


In [13]:
len(full_images)

3284

### Let's examine the different counts of each label, shall we?

In [14]:
# If full_labels is a NumPy array, convert it to a Pandas series
full_labels_series = pd.Series(full_labels)

# Count the occurrences of each class
label_counts = full_labels_series.value_counts()

# Assuming 0 = benign and 1 = malignant
benign_count = label_counts.get(0, 0)
malignant_count = label_counts.get(1, 0)

print(f"Benign images: {benign_count}")
print(f"Malignant images: {malignant_count}")

Benign images: 1930
Malignant images: 1354


In [17]:
num_classes = len(full_dataset['labels'].unique())
class_names = ['Benign', 'Malignant']

# Check the distribution of labels
label_counts = full_dataset['labels'].value_counts()
print(label_counts)

labels
0    2111
1    1457
Name: count, dtype: int64


### Here, we must define our data augmentation function.

In [None]:
# Define a function for data augmentation
def augment_image(image):
    # Apply data augmentation using tf.image functions
    image = tf.image.random_flip_left_right(image)
#     image = tf.image.random_flip_up_down(image)
    image = tf.image.random_brightness(image, max_delta=0.3)
    image = tf.image.random_contrast(image, lower=0.8, upper=1.2)
    image = tf.image.random_saturation(image, lower=0.8, upper=1.2)
    return image

# Function to resize image to (224, 224, 3)
def resize_image(image_tensor):
    return tf.image.resize(image_tensor, [224, 224])

# Function to balance classes by augmenting images
def copy_images_with_unique_filenames(images, labels, source, destination, target_count=None):
    """
    Copy images from source to destination in subfolders '0' and '1',
    ensuring unique filenames and applying data augmentation and balancing.
    """
    benign_images = 0
    malignant_images = 0
    skipped_images = []

    # Create the destination subfolders '0' and '1'
    category_dest_dir_zero = os.path.join(destination, '0')
    os.makedirs(category_dest_dir_zero, exist_ok=True)

    category_dest_dir_one = os.path.join(destination, '1')
    os.makedirs(category_dest_dir_one, exist_ok=True)

    benign_images_list = []
    malignant_images_list = []

    for i, (image, label) in enumerate(zip(images, labels)):
#         img_name = data_frame.REFNUM[i]
#         abs_path = os.path.join(source, img_name + '.pgm')

        if os.path.exists(image):
            try:
                # Generate a unique filename
                filename = os.path.basename(image)
                unique_filename = f"{uuid.uuid4().hex}_{filename}"
        
                # Open the image using PIL
                with Image.open(image) as img:
                    # Convert the image to RGB mode (for saving as JPEG)
                    img = img.convert('RGB')
                    # Augment the image (convert it to a Tensor first)
                    img_tensor = tf.convert_to_tensor(img)
                    # Resize the image to (224, 224, 3)
                    resized_img_tensor = resize_image(img_tensor)
                    augmented_image_tensor = augment_image(resized_img_tensor)
                    # Convert Tensor back to PIL image for saving
                    augmented_image = tf.keras.preprocessing.image.array_to_img(augmented_image_tensor)

                    if label == 0:
                        benign_images_list.append(unique_filename)
                        dest_path = os.path.join(category_dest_dir_zero, unique_filename)
#                         augmented_image.save(dest_path, 'JPEG')
                        augmented_image.save(dest_path, 'JPEG')
                        benign_images += 1

                    elif label == 1:
                        malignant_images_list.append(unique_filename)
                        dest_path = os.path.join(category_dest_dir_one, unique_filename)
#                         augmented_image.save(dest_path, 'JPEG')
                        augmented_image.save(dest_path, 'JPEG')
                        malignant_images += 1
                        
#                 del img, img_tensor, resized_img_tensor, augmented_image_tensor, augmented_image
#                 gc.collect()
            except Exception as e:
                print(f"Error copying image {image}: {e}")
                skipped_images.append(image)
        else:
            print(f"Image not found: {image}")
            skipped_images.append(image)

    # If balancing is needed, duplicate/augment images from the smaller class
    benign_count = len(benign_images_list)
    malignant_count = len(malignant_images_list)

    if benign_count < malignant_count:
#         augment_and_save_images(benign_images_list, category_dest_dir_zero, target_count - benign_count)
        augment_and_save_images(benign_images_list, category_dest_dir_zero, malignant_count - benign_count)

    elif malignant_count < benign_count:
        augment_and_save_images(malignant_images_list, category_dest_dir_one, benign_count - malignant_count)

    augment_and_save_images(benign_images_list, category_dest_dir_zero, target_count)
    augment_and_save_images(malignant_images_list, category_dest_dir_one, target_count)

    print(f"\nCopying complete.")
    print(f"Benign images copied (label 0): {benign_images}")
    print(f"Benign count (label 0): {benign_count}")
    print(f"Malignant images copied (label 1): {malignant_images}")
    print(f"Malignant count (label 1): {malignant_count}")
    print(f"Total skipped images: {len(skipped_images)}")
    if skipped_images:
        print("Skipped images:")
        for img in skipped_images:
            print(img)
            

# Function to augment and save images to balance the dataset
def augment_and_save_images(images_list, destination_dir, num_augments):
    """
    Augment and save images to balance the dataset.
    """
    for i in range(num_augments):
        img_name = random.choice(images_list)
        abs_path = os.path.join(destination_dir, img_name)

        try:
            with Image.open(abs_path) as img:
                img = img.convert('RGB')
                # Augment the image
                img_tensor = tf.convert_to_tensor(img)
                # Resize the image
#                 resized_img_tensor = resize_image(img_tensor)
                augmented_image_tensor = augment_image(img_tensor)
                # Convert Tensor back to PIL image for saving
                augmented_image = tf.keras.preprocessing.image.array_to_img(augmented_image_tensor)
                # Remove the original extension from img_name 1-285.jpg --> 1-285
                img_name_without_ext = os.path.splitext(img_name)[0]
                # Save augmented image with a unique name
                augmented_image.save(os.path.join(destination_dir, img_name_without_ext + f'_aug{i}.jpg'), 'JPEG')
            
        except Exception as e:
            print(f"Error augmenting image {abs_path}: {e}")

In [None]:
source_dir = "../jpeg"
destination_dir = "../working/merged_images"

# target_count=0 meaning no Augmentation, There's just Data-Balance
target_count = (len(full_labels) * 3) - len(full_labels)
copy_images_with_unique_filenames(full_images, full_labels, source_dir, destination_dir, target_count)

We have created the dir for the benign images and malignant images in the past weeks.

In [2]:
# Check the number of images in each class folder after merging
zero_class_count = len(os.listdir("../working/merged_images/0"))
one_class_count  = len(os.listdir("../working/merged_images/1"))

print(f"Number of images in class 0: {zero_class_count}")
print(f"Number of images in class 1: {one_class_count}")

Number of images in class 0: 8498
Number of images in class 1: 8498


In [None]:
data_dir = '../working/merged_images'  # Update with the dataset path

# Create a dataset for the entire data to use for split
full_dataset = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir,
    labels='inferred',
    label_mode='categorical',
    image_size=(224, 224),
    seed=50,
    shuffle=True,
    batch_size=13
)
# Calculate the total number of samples
total_samples = tf.data.experimental.cardinality(full_dataset).numpy()

train_size = int(0.75 * total_samples)                 # 70% for training
val_size   = int(0.2 * total_samples)                # 20% for validation
test_size = total_samples - train_size - val_size     # 10% for testing

# Create train, validation, and test datasets
train_dataset       = full_dataset.take(train_size)
validation_dataset  = full_dataset.skip(train_size).take(val_size)
test_dataset        = full_dataset.skip(train_size + val_size)

train_dataset      = train_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
validation_dataset = validation_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
test_dataset       = test_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

# Print the number of samples in each dataset
print(f"Train samples:      {train_size}     batches(13) ==> {train_size*13}")
print(f"Validation samples: {val_size}       batches(13) ==> {val_size*13}")
print(f"Test samples:       {test_size}      batches(13) ==> {test_size*13}")

Found 16996 files belonging to 2 classes.


2024-11-09 19:20:54.707965: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-11-09 19:20:54.720918: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-11-09 19:20:54.720945: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-11-09 19:20:54.722586: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-11-09 19:20:54.722606: I external/local_xla/xla/stream_executor

Train samples:      981     batches(13) ==> 12753
Validation samples: 261       batches(13) ==> 3393
Test samples:       66      batches(13) ==> 858


### Here, we conduct our data improvements and enhancements.

In [11]:
class MammogramPreProcessor:
    def __init__(self, target_size=(224, 224)):
        self.target_size = target_size

    # Function 1
    @tf.function
    def remove_background_tf(self, image):
        """
        TensorFlow implementation for background removal.
        """
        # Convert to grayscale if it's a 3-channel image
        if tf.shape(image)[-1] == 3:
            image = tf.image.rgb_to_grayscale(image)
        
        # Create a binary mask
        threshold = tf.cast(5, dtype=tf.float32)
        binary_mask = tf.cast(image > threshold, tf.float32)
        
        # Apply the mask
        return image * binary_mask

    # Function 2
    @tf.function
    def apply_clahe_tf(self, image):
        """
        TensorFlow implementation for CLAHE enhancement.
        """
        # Normalize to the range 0-255
        image = tf.cast(image, tf.float32)
        image = (image - tf.reduce_min(image)) / (tf.reduce_max(image) - tf.reduce_min(image)) * 255
        return image

    # Function 3
    @tf.function
    def normalize_tf(self, image):
        """
        Normalize the image.
        """
        image = tf.cast(image, tf.float32)
        mean = tf.reduce_mean(image)
        std = tf.math.reduce_std(image)
        return (image - mean) / (std + 1e-7)


In [12]:
def create_preprocessing_pipeline(target_size=(224, 224)):
    """
    Create a complete preprocessing pipeline.
    """
    processor = MammogramPreProcessor(target_size)
    
    def preprocess_function(images, labels):
        # Process each image in the batch
        def process_single_image(image):
            # Remove background
            image = processor.remove_background_tf(image)
            
            # Apply CLAHE enhancement
            image = processor.apply_clahe_tf(image)
            
            # Normalize the image
            image = processor.normalize_tf(image)
            
            # Ensure correct size
            image = tf.image.resize(image, target_size)
            
            # Ensure the correct number of channels (if 3 channels are needed)
            image = tf.tile(image, [1, 1, 3])
            
            return image
        
        # Process the entire batch
        processed_images = tf.map_fn(process_single_image, images)
        return processed_images, labels

    return preprocess_function

In [None]:
def prepare_dataset(full_dataset, batch_size=13):
    """
    prepare for the dataset and preprocessing.
    """
    AUTOTUNE = tf.data.AUTOTUNE
    
    # create the preprocess pipeline
    preprocess_fn = create_preprocessing_pipeline(target_size=(224, 224))
    
    # apply the preprocess
    processed_dataset = full_dataset.map(preprocess_fn, num_parallel_calls=AUTOTUNE)
    
    # improve the performance
    processed_dataset = processed_dataset.cache()
    processed_dataset = processed_dataset.prefetch(buffer_size=AUTOTUNE)
    
    return processed_dataset

### Let's get to modeling with our final model we chose- EfficientNet.

In [None]:
# Modeling

from tensorflow.keras.applications import EfficientNetV2B0

def model(dropout, trainable_layers):
    base_model = EfficientNetV2B0(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

    # Freeze all layers initially
    for layer in base_model.layers:
        layer.trainable = False

    # Calculate the index to start unfreezing layers
    from_index = int(np.round((len(base_model.layers) - 1) * (1.0 - trainable_layers / 100.0)))

    # Unfreeze layers from the calculated index onwards
    for layer in base_model.layers[from_index:]:
        layer.trainable = True

    # Add custom layers on top (Upper-Layers)
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = BatchNormalization()(x)

    x = Dense(1024, activation='relu')(x)
    x = BatchNormalization()(x)
    
    x = Dropout(dropout)(x)
    predictions = Dense(2, activation='softmax')(x)

    model = Model(inputs=base_model.input, outputs=predictions)
    
    return model

In [20]:
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, 
                              min_lr=5e-6, verbose=1)

early_stopping = EarlyStopping(monitor='val_loss', patience=4, 
                               restore_best_weights=False, verbose=1)

# ModelCheckpoint callback to save the best model based on validation accuracy
checkpoint = ModelCheckpoint('best_model.h5', monitor='val_accuracy', 
                             mode='max', save_best_only=True, verbose=1)

model = model(0.1, 30)
model.compile(optimizer=Adam(learning_rate=1e-4),
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])

history = model.fit(
            train_dataset,
            validation_data=validation_dataset,
            batch_size=13,
            epochs=25,
            callbacks=[reduce_lr, early_stopping, checkpoint],
            verbose=1
        )

Epoch 1/25


2024-11-09 19:44:16.431008: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape inmodel_1/block2b_drop/dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer


Epoch 1: val_accuracy improved from -inf to 0.82582, saving model to best_model.h5


  saving_api.save_model(


Epoch 2/25
Epoch 2: val_accuracy improved from 0.82582 to 0.86325, saving model to best_model.h5
Epoch 3/25
Epoch 3: val_accuracy improved from 0.86325 to 0.91306, saving model to best_model.h5
Epoch 4/25
Epoch 4: val_accuracy improved from 0.91306 to 0.92986, saving model to best_model.h5
Epoch 5/25
Epoch 5: val_accuracy did not improve from 0.92986
Epoch 6/25
Epoch 6: val_accuracy improved from 0.92986 to 0.94931, saving model to best_model.h5
Epoch 7/25
Epoch 7: val_accuracy did not improve from 0.94931
Epoch 8/25
Epoch 8: ReduceLROnPlateau reducing learning rate to 4.999999873689376e-05.

Epoch 8: val_accuracy did not improve from 0.94931
Epoch 9/25
Epoch 9: val_accuracy improved from 0.94931 to 0.96522, saving model to best_model.h5
Epoch 10/25
Epoch 10: val_accuracy did not improve from 0.96522
Epoch 11/25
Epoch 11: val_accuracy improved from 0.96522 to 0.96729, saving model to best_model.h5
Epoch 12/25
Epoch 12: val_accuracy did not improve from 0.96729
Epoch 13/25
Epoch 13: Red

In [21]:
full_dataset = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir,
    labels='inferred',
    label_mode='categorical',
    # image_size=(224, 224),
    image_size=(224, 224),
    seed=50,
    shuffle=False,
    batch_size=13
)

processed_dataset = prepare_dataset(full_dataset)

total_samples = tf.data.experimental.cardinality(processed_dataset).numpy()

train_size = int(0.75 * total_samples)                 # 70% for training
val_size   = int(0.2 * total_samples)                # 20% for validation
test_size = total_samples - train_size - val_size     # 10% for testing

# Create train, validation, and test datasets
train_dataset       = full_dataset.take(train_size)
validation_dataset  = full_dataset.skip(train_size).take(val_size)
test_dataset        = full_dataset.skip(train_size + val_size)

Found 16996 files belonging to 2 classes.


### Here, we'll assess the precision, recall, and F-1 score which are part of our evaluation metrics.

In [None]:
# Test the Precision, Recall and F1 Score.

model.load_weights("best_model.h5")
test_loss, test_accuracy = model.evaluate(test_dataset, verbose=1)
print(f"Test Accuracy: {test_accuracy}")

from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

# Get the prediciton
y_pred = model.predict(test_dataset)
y_pred_classes = np.argmax(y_pred, axis=1)

# Get the true labels
y_true = np.concatenate([y for x, y in test_dataset], axis=0)
y_true_classes = np.argmax(y_true, axis=1)

# Calculate Precision, Recall and F1 Score
precision = precision_score(y_true_classes, y_pred_classes, average='weighted')
recall = recall_score(y_true_classes, y_pred_classes, average='weighted')
f1 = f1_score(y_true_classes, y_pred_classes, average='weighted')

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Test Accuracy: 0.9858823418617249
Precision: 1.0
Recall: 0.9858823529411764
F1 Score: 0.9928909952606635


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Lastly, this is our demonstration on creating predictions!

In [None]:
# Demo
# You can choose an image from the folder prediction_subset and get the prediction.

import random
from tensorflow.keras.utils import img_to_array, load_img
from tensorflow.keras.models import load_model

# Choose a image randomly from the dataset
folder_path = '../prediction_subset'
image_files = [f for f in os.listdir(folder_path) if f.endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif'))]

if image_files:
    img_path = os.path.join(folder_path, random.choice(image_files))
else:
    raise FileNotFoundError("No images found in the folder.")

# preprocess the image so that the model can use it
def preprocess_image(img_path):
    img = load_img(img_path, target_size=(224, 224))
    img_array = img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    return img_array

model = load_model("best_model.h5")

# Prepare the image to be fed into the model for prediction
processed_image = preprocess_image(img_path)
predictions = model.predict(processed_image)

print(predictions)