# Classification Performance Evaluation on Test Set

In [10]:
import os
import random
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing import image
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import classification_report, f1_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

data_dir = r'C:\Users\Anne\OneDrive - National University of Ireland, Galway\Documents\Data Analytics\PROJECT\Capstone2025_Anne\cbis_ddsm_dataset\merged_data'


# Create a dataset for the entire data to use for split
full_dataset = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir,
    labels='inferred',
    label_mode='categorical',
    # image_size=(224, 224),
    image_size=(224, 224),
    seed=50,
    shuffle=True,
    batch_size=13
)
# Calculate the total number of samples
total_samples = tf.data.experimental.cardinality(full_dataset).numpy()

train_size = int(0.8 * total_samples)                 # 70% for training
val_size   = int(0.15 * total_samples)                # 20% for validation
test_size = total_samples - train_size - val_size     # 10% for testing

# test set
test_dataset        = full_dataset.skip(train_size + val_size)

test_dataset       = test_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

# Print the number of samples in each dataset
print(f"Test samples:       {test_size}      batches(13) ==> {test_size*13}")

Found 17353 files belonging to 2 classes.
Test samples:       67      batches(13) ==> 871


## V1

In [None]:

# --- Load model ---
model = tf.keras.models.load_model("trained_model/ResNet50.keras")  

# --- Predictions ---
y_true = []
y_pred = []
y_prob = []  # for AUC

for images, labels in test_dataset:
    preds = model.predict(images, verbose=0)
    pred_classes = np.argmax(preds, axis=1)
    true_classes = np.argmax(labels.numpy(), axis=1)
    
    y_pred.extend(pred_classes)
    y_true.extend(true_classes)
    y_prob.extend(preds[:, 1])  # Probability for class 1 (Malignant)

# --- Calculate and print metrics ---
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
auc = roc_auc_score(y_true, y_prob)
cm = confusion_matrix(y_true, y_pred)

print("Classification Report on Test Set:\n")
print(classification_report(y_true, y_pred, target_names=["Benign", "Malignant"]))
print("Confusion Matrix:\n", cm)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC Score: {auc:.4f}")

# Optional: also evaluate with Keras built-in method
results = model.evaluate(test_dataset, verbose=1)
loss = results[0]  # first element is loss
print(f"Loss: {loss:.4f}")

Classification Report on Test Set:

              precision    recall  f1-score   support

      Benign       0.99      0.91      0.95       467
   Malignant       0.91      0.99      0.95       402

    accuracy                           0.95       869
   macro avg       0.95      0.95      0.95       869
weighted avg       0.95      0.95      0.95       869

Confusion Matrix:
 [[426  41]
 [  5 397]]
Accuracy: 0.9471
Precision: 0.9064
Recall: 0.9876
F1 Score: 0.9452
AUC Score: 0.9916
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 870ms/step - accuracy: 0.9388 - loss: 0.2345 - precision: 0.9388 - recall: 0.9388
Loss: 0.1858


## V2

In [None]:
# --- Load model ---
model = tf.keras.models.load_model("trained_model/ResNet50_v2.keras")  


# --- Predictions ---
y_true = []
y_pred = []
y_prob = []  # for AUC

for images, labels in test_dataset:
    preds = model.predict(images, verbose=0)
    pred_classes = np.argmax(preds, axis=1)
    true_classes = np.argmax(labels.numpy(), axis=1)
    
    y_pred.extend(pred_classes)
    y_true.extend(true_classes)
    y_prob.extend(preds[:, 1])  # Probability for class 1 (Malignant)

# --- Calculate and print metrics ---
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
auc = roc_auc_score(y_true, y_prob)
cm = confusion_matrix(y_true, y_pred)

print("Classification Report on Test Set:\n")
print(classification_report(y_true, y_pred, target_names=["Benign", "Malignant"]))
print("Confusion Matrix:\n", cm)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC Score: {auc:.4f}")

# Optional: also evaluate with Keras built-in method
results = model.evaluate(test_dataset, verbose=1)
loss = results[0]  # first element is loss
print(f"Loss: {loss:.4f}")

Classification Report on Test Set:

              precision    recall  f1-score   support

      Benign       0.94      0.96      0.95       474
   Malignant       0.95      0.92      0.93       395

    accuracy                           0.94       869
   macro avg       0.94      0.94      0.94       869
weighted avg       0.94      0.94      0.94       869

Confusion Matrix:
 [[453  21]
 [ 30 365]]
Accuracy: 0.9413
Precision: 0.9456
Recall: 0.9241
F1 Score: 0.9347
AUC Score: 0.9862
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 637ms/step - accuracy: 0.9369 - loss: 0.1832 - precision_v2: 0.9369 - recall_v2: 0.9369
Loss: 0.1912


## V3

In [None]:
# --- Load model ---
model = tf.keras.models.load_model("trained_model/ResNet50_v3.keras")  

# --- Predictions ---
y_true = []
y_pred = []
y_prob = []  # for AUC

for images, labels in test_dataset:
    preds = model.predict(images, verbose=0)
    pred_classes = np.argmax(preds, axis=1)
    true_classes = np.argmax(labels.numpy(), axis=1)
    
    y_pred.extend(pred_classes)
    y_true.extend(true_classes)
    y_prob.extend(preds[:, 1])  # Probability for class 1 (Malignant)

# --- Calculate and print metrics ---
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
auc = roc_auc_score(y_true, y_prob)
cm = confusion_matrix(y_true, y_pred)

print("Classification Report on Test Set:\n")
print(classification_report(y_true, y_pred, target_names=["Benign", "Malignant"]))
print("Confusion Matrix:\n", cm)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC Score: {auc:.4f}")

# Optional: also evaluate with Keras built-in method
results = model.evaluate(test_dataset, verbose=1)
loss = results[0]  # first element is loss
print(f"Loss: {loss:.4f}")

Classification Report on Test Set:

              precision    recall  f1-score   support

      Benign       0.97      0.88      0.92       473
   Malignant       0.87      0.97      0.92       396

    accuracy                           0.92       869
   macro avg       0.92      0.92      0.92       869
weighted avg       0.93      0.92      0.92       869

Confusion Matrix:
 [[416  57]
 [ 12 384]]
Accuracy: 0.9206
Precision: 0.8707
Recall: 0.9697
F1 Score: 0.9176
AUC Score: 0.9688
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 659ms/step - accuracy: 0.9214 - loss: 0.7063 - precision_v3: 0.9214 - recall_v3: 0.9214
Loss: 0.7424


## MobileNet

In [None]:
# --- Load model ---
model = tf.keras.models.load_model("trained_model/MobileNet.keras")  

# --- Predictions ---
y_true = []
y_pred = []
y_prob = []  # for AUC

for images, labels in test_dataset:
    preds = model.predict(images, verbose=0)
    pred_classes = np.argmax(preds, axis=1)
    true_classes = np.argmax(labels.numpy(), axis=1)
    
    y_pred.extend(pred_classes)
    y_true.extend(true_classes)
    y_prob.extend(preds[:, 1])  # Probability for class 1 (Malignant)
    
# --- Calculate and print metrics ---
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
auc = roc_auc_score(y_true, y_prob)
cm = confusion_matrix(y_true, y_pred)

print("Classification Report on Test Set:\n")
print(classification_report(y_true, y_pred, target_names=["Benign", "Malignant"]))
print("Confusion Matrix:\n", cm)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC Score: {auc:.4f}")

# Optional: also evaluate with Keras built-in method
results = model.evaluate(test_dataset, verbose=1)
loss = results[0]  # first element is loss
print(f"Loss: {loss:.4f}")

Classification Report on Test Set:

              precision    recall  f1-score   support

      Benign       0.98      0.80      0.88       472
   Malignant       0.80      0.98      0.88       397

    accuracy                           0.88       869
   macro avg       0.89      0.89      0.88       869
weighted avg       0.90      0.88      0.88       869

Confusion Matrix:
 [[377  95]
 [  9 388]]
Accuracy: 0.8803
Precision: 0.8033
Recall: 0.9773
F1 Score: 0.8818
AUC Score: 0.9705
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 183ms/step - accuracy: 0.8812 - loss: 0.5188 - precision: 0.8812 - recall: 0.8812
Loss: 0.4213


## vgg16

In [9]:
# --- Load model ---
model = tf.keras.models.load_model("trained_model/vgg16.keras")  

# --- Predictions ---
y_true = []
y_pred = []
y_prob = []  # for AUC

for images, labels in test_dataset:
    preds = model.predict(images, verbose=0)
    pred_classes = np.argmax(preds, axis=1)
    true_classes = np.argmax(labels.numpy(), axis=1)
    
    y_pred.extend(pred_classes)
    y_true.extend(true_classes)
    y_prob.extend(preds[:, 1])  # Probability for class 1 (Malignant)

# --- Calculate and print metrics ---
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
auc = roc_auc_score(y_true, y_prob)
cm = confusion_matrix(y_true, y_pred)

print("Classification Report on Test Set:\n")
print(classification_report(y_true, y_pred, target_names=["Benign", "Malignant"]))
print("Confusion Matrix:\n", cm)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC Score: {auc:.4f}")

# Optional: also evaluate with Keras built-in method
results = model.evaluate(test_dataset, verbose=1)
loss = results[0]  # first element is loss
print(f"Loss: {loss:.4f}")


Classification Report on Test Set:

              precision    recall  f1-score   support

      Benign       0.92      0.94      0.93       469
   Malignant       0.93      0.90      0.92       400

    accuracy                           0.92       869
   macro avg       0.92      0.92      0.92       869
weighted avg       0.92      0.92      0.92       869

Confusion Matrix:
 [[442  27]
 [ 39 361]]
Accuracy: 0.9241
Precision: 0.9304
Recall: 0.9025
F1 Score: 0.9162
AUC Score: 0.9739
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 2s/step - accuracy: 0.9094 - loss: 0.2604 - precision: 0.9094 - recall: 0.9094
Loss: 0.2291
