# Breast Cancer Detection With Ultrasound Images
# Tuner Version

Using a Convolutional Neural Network (CNN)

(with keras_tuner to find the best neural network topology)

## About the Dataset

This dataset consists of ultrasound images related to benign and malignant breast cancers.

The images have been augmented by rotation and sharpening to produce a sufficient number of images.

## Importing the Data

### Determine all the image file paths

In [None]:
import os

def _get_all_filepath_from_folder(folder_filepath: str) -> list[str]:
    """Return the paths of the regular files located directly in a folder.

    Args:
        folder_filepath: Folder to scan. The scan is not recursive.

    Returns:
        ``folder_filepath`` joined with each file name, sorted by file name so
        the result does not depend on the arbitrary order of ``os.listdir``.
        Entries that are not regular files (e.g. sub-directories) are skipped.

    Raises:
        OSError: Propagated from ``os.listdir`` (for example when the folder
            does not exist).
    """
    candidates = (
        os.path.join(folder_filepath, curr_filename)
        for curr_filename in sorted(os.listdir(folder_filepath))
    )
    # isfile (not exists): every listed entry "exists", but only regular files
    # can be loaded as images later on.
    return [full_path for full_path in candidates if os.path.isfile(full_path)]

# Dataset folders, one per (split, label) pair.
# The "val" folders on disk are used as the test split in this notebook.
train_benign = "../assets/ultrasound breast classification/train/benign"
train_malignant = "../assets/ultrasound breast classification/train/malignant"
test_benign = "../assets/ultrasound breast classification/val/benign"
test_malignant = "../assets/ultrasound breast classification/val/malignant"

# List the files of each folder, in the same order as the folders above.
(
    all_train_benign_filepath,
    all_train_malignant_filepath,
    all_test_benign_filepath,
    all_test_malignant_filepath,
) = (
    _get_all_filepath_from_folder(folder)
    for folder in (train_benign, train_malignant, test_benign, test_malignant)
)

# Report how many entries were found per folder.
print(f"all_train_benign_filepath    {len(all_train_benign_filepath)}")
print(f"all_train_malignant_filepath {len(all_train_malignant_filepath)}")
print(f"all_test_benign_filepath     {len(all_test_benign_filepath)}")
print(f"all_test_malignant_filepath  {len(all_test_malignant_filepath)}")

### Build train/test DataFrames

In [None]:
import pandas as pd

# Pair every file path with its label: 0 = benign, 1 = malignant.
columns_train_benign     = [(filepath, 0) for filepath in all_train_benign_filepath]
columns_train_malignant  = [(filepath, 1) for filepath in all_train_malignant_filepath]
columns_test_benign      = [(filepath, 0) for filepath in all_test_benign_filepath]
columns_test_malignant   = [(filepath, 1) for filepath in all_test_malignant_filepath]

# One list per split: all benign rows first, then all malignant rows.
columns_train_all = [*columns_train_benign, *columns_train_malignant]
columns_test_all = [*columns_test_benign, *columns_test_malignant]

# Turn the (filepath, label) rows into the train/test DataFrames.
frame_columns = ['filepath', 'is_malignant']
train_df = pd.DataFrame(columns_train_all, columns=frame_columns)
test_df = pd.DataFrame(columns_test_all, columns=frame_columns)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() would add a stray "None" line.
print('train_df.info()')
train_df.info()

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() would add a stray "None" line.
print('test_df.info()')
test_df.info()

In [None]:
# Preview of the training rows: a label line, then the DataFrame itself.
print('train_df', train_df, sep='\n')

In [None]:
# Preview of the test rows: a label line, then the DataFrame itself.
print('test_df', test_df, sep='\n')

Here we shuffle the DataFrames:
* Otherwise all the benign rows are at the start and all the malignant rows are at the end.
* We set the `random_state` parameter so the shuffle is deterministic from one run to the next.

In [None]:
# Sample 100% of the rows (frac=1) in a seeded random order, then rebuild the
# index so it runs from 0 again instead of keeping the pre-shuffle positions.
shuffled_train_rows = train_df.sample(frac=1, random_state=0)
train_df = shuffled_train_rows.reset_index(drop=True)

shuffled_test_rows = test_df.sample(frac=1, random_state=0)
test_df = shuffled_test_rows.reset_index(drop=True)

In [None]:
# Preview of the training rows: a label line, then the DataFrame itself.
print('train_df', train_df, sep='\n')

In [None]:
# Preview of the test rows: a label line, then the DataFrame itself.
print('test_df', test_df, sep='\n')

**Conclusion:** the data is now shuffled and can be safely used.

In [None]:
import tensorflow as tf

In [None]:
import numpy as np

# Side length, in pixels, that every image is resized to when it is loaded
# (passed as target_size to load_img); the model's Input layer uses the same
# value for its height and width.
img_square_size = 128

# Load every image once and key its pixel data by the file path it came from.
def _get_img_filepath_to_map(all_filepaths: list[str]) -> dict[str, np.ndarray]:
  """Load, resize and grey-scale each image, keyed by its file path.

  Each file is read as RGB, resized to img_square_size x img_square_size,
  converted to grey scale and turned into an array. A progress line is
  printed every 1000 images.

  Args:
    all_filepaths: Paths of the image files to load.

  Returns:
    A dict mapping each path to its array. Presumably each array has shape
    (img_square_size, img_square_size, 1), matching the model input.
    TODO confirm.
  """
  total = len(all_filepaths)
  target_shape = (img_square_size, img_square_size)
  pixels_by_path: dict[str, np.ndarray] = {}

  for index, image_path in enumerate(all_filepaths):

    # progress feedback, every 1000 images (never for the very first one)
    if index > 0 and index % 1000 == 0:
      print(f" -> loading images -> progress: {index}/{total} ({(index / total) * 100.0:.0f}%)")

    # read + resize -> https://keras.io/api/data_loading/image/
    rgb_image = tf.keras.utils.load_img(
      image_path,
      color_mode="rgb",
      target_size=target_shape,  # resized without preserving the aspect ratio
      interpolation="nearest",  # nearest neighbour: pixels are not blended, so no blur
      keep_aspect_ratio=False,
    )

    # drop the colour information, then convert to an array a model can consume
    grey_image = tf.image.rgb_to_grayscale(rgb_image)
    pixels_by_path[image_path] = tf.keras.utils.img_to_array(grey_image)

  return pixels_by_path

print("loading + mapping the image data against their filepath")

# Single lookup table for the whole dataset: file path -> loaded image data.
all_filepath_img_data_map: dict[str, np.ndarray] = {}

# (display name, list of file paths) for each of the four folders
to_load = [
  ('all_train_benign', all_train_benign_filepath),
  ('all_train_malignant', all_train_malignant_filepath),
  ('all_test_benign', all_test_benign_filepath),
  ('all_test_malignant', all_test_malignant_filepath)
]
for list_name, filepaths_to_load in to_load:
  print(f"starting list: '{list_name}'")

  freshly_loaded = _get_img_filepath_to_map(filepaths_to_load)
  print(f" ---> newly loaded images done: {len(freshly_loaded)}")

  # merge this folder's images into the global lookup table
  all_filepath_img_data_map.update(freshly_loaded)
  print(f" -----> total loaded images so far: {len(all_filepath_img_data_map)}")

print('All Done!')
print('all_filepath_img_data_map  ->', len(all_filepath_img_data_map))

In [None]:
# some debug (benign): draw the first nrows*ncols benign training images in a
# grid and save the figure to disk
import matplotlib.pyplot as plt

size = 1.5  # width/height of one grid cell, in figure units
nrows = 3
ncols = 8

fig, axes = plt.subplots(nrows, ncols, figsize=(ncols * size, nrows * size))

# axes.flat walks the grid row by row, like subplot indices 1..nrows*ncols
for ii, ax in enumerate(axes.flat):
  ax.imshow(all_filepath_img_data_map[all_train_benign_filepath[ii]])
  ax.axis('off')

plt.tight_layout()
plt.savefig('img-breast-cancer-benign.png')

In [None]:
# some debug (malignant): draw the first nrows*ncols malignant training images
# in a grid and save the figure to disk
import matplotlib.pyplot as plt

size = 1.5  # width/height of one grid cell, in figure units
nrows = 3
ncols = 8

fig, axes = plt.subplots(nrows, ncols, figsize=(ncols * size, nrows * size))

# axes.flat walks the grid row by row, like subplot indices 1..nrows*ncols
for ii, ax in enumerate(axes.flat):
  ax.imshow(all_filepath_img_data_map[all_train_malignant_filepath[ii]])
  ax.axis('off')

plt.tight_layout()
plt.savefig('img-breast-cancer-malignant.png')

In [None]:
# Image data for the training rows, in the (shuffled) order of train_df.
# Direct indexing (not dict.get) so a path without loaded image data raises a
# KeyError naming that path, instead of silently putting None into the array.
X_train = np.array([all_filepath_img_data_map[curr_filepath] for curr_filepath in train_df['filepath']])
print('X_train.shape', X_train.shape)

In [None]:
# Labels of the training rows (values of the 'is_malignant' column), copied
# into a numpy array in the same row order as train_df.
y_train = train_df['is_malignant'].to_numpy(copy=True)
print('y_train.shape', y_train.shape)

In [None]:
# Image data for the test rows, in the (shuffled) order of test_df.
# Direct indexing (not dict.get) so a path without loaded image data raises a
# KeyError naming that path, instead of silently putting None into the array.
X_test = np.array([all_filepath_img_data_map[curr_filepath] for curr_filepath in test_df['filepath']])
print('X_test.shape', X_test.shape)

In [None]:
# Labels of the test rows (values of the 'is_malignant' column), copied into
# a numpy array in the same row order as test_df.
y_test = test_df['is_malignant'].to_numpy(copy=True)
print('y_test.shape', y_test.shape)

## Classification

In [None]:

def my_keras_code(
    units_feature_layers: int,
    total_feature_layers: int,
    activation_feature_layers: str,
    units_dense_layers: int,
    total_dense_layers: int,
    activation_dense_layers: str,
    optimizer: str,
    epoch: int,
) -> float:
    """Build, train and score one CNN candidate.

    The network is ``total_feature_layers`` blocks of
    Conv2D -> LeakyReLU -> MaxPooling2D, a Flatten layer,
    ``total_dense_layers`` blocks of Dense -> LeakyReLU, and one sigmoid
    output unit. It is trained on the module-level ``X_train``/``y_train``
    with ``X_test``/``y_test`` as validation data, then scored on
    ``X_test``/``y_test``.

    Args:
        units_feature_layers: Filter count of the first Conv2D layer; the
            i-th (1-based) Conv2D layer gets ``i * units_feature_layers``.
        total_feature_layers: Number of convolutional blocks.
        activation_feature_layers: Activation name given to every Conv2D.
        units_dense_layers: Unit count of every hidden Dense layer.
        total_dense_layers: Number of hidden Dense blocks.
        activation_dense_layers: Activation name given to every hidden Dense.
        optimizer: Optimizer name given to ``model.compile``.
        epoch: Number of training epochs (also the early-stopping patience).

    Returns:
        The fraction of test samples classified correctly after thresholding
        the sigmoid output at 0.5. Higher is better.
    """
    # This function is called once per trial and builds a new model each time;
    # release the state Keras keeps from earlier models so memory does not
    # keep growing over a long search.
    tf.keras.backend.clear_session()

    # Build model
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Input((img_square_size, img_square_size, 1)))

    # add X feature layers, each one wider than the previous
    for i in range(total_feature_layers):
        my_local_units = (i + 1) * units_feature_layers

        model.add(tf.keras.layers.Conv2D(
            my_local_units,
            kernel_size=(3, 3),
            activation=activation_feature_layers,
            padding='same',
        ))
        model.add(tf.keras.layers.LeakyReLU(alpha=0.1))
        model.add(tf.keras.layers.MaxPooling2D(
            pool_size=(2, 2), padding='same'))

    # now switch from feature maps to one flat vector for the dense layers
    model.add(tf.keras.layers.Flatten())

    # add X dense layers
    for _ in range(total_dense_layers):
        model.add(tf.keras.layers.Dense(
            units_dense_layers, activation=activation_dense_layers))
        model.add(tf.keras.layers.LeakyReLU(alpha=0.1))

    # output layer: one unit whose sigmoid output is later thresholded at 0.5
    # to decide the 'is_malignant' label
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    model.compile(
        loss=tf.keras.losses.binary_crossentropy,
        optimizer=optimizer,
        metrics=['accuracy']
    )

    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy',

        # here "patience" is the same as "epoch", so training is never cut
        # short -> we're just after the 'restore_best_weights' feature
        patience=epoch,

        restore_best_weights=True,
        verbose=0
    )

    model.fit(
        X_train,
        y_train,
        batch_size=32,
        validation_data=(X_test, y_test),
        callbacks=[early_stopping],
        epochs=epoch,
        verbose=0
    )

    y_pred = model.predict(X_test, verbose=0)
    y_pred = (y_pred > 0.5)

    # Accuracy = share of predictions equal to the true label. Both sides are
    # flattened first so the comparison is element-wise and cannot broadcast
    # a column of predictions against a row of labels.
    score = float(np.mean(np.ravel(y_pred) == np.ravel(y_test)))
    print("accuracy_score", score)

    # Return a single float as the objective value.
    return score  # higher is better


class MyTuner(keras_tuner.GridSearch):
    """Grid-search tuner whose trials are run by ``my_keras_code``.

    ``run_trial`` is overridden so the model building/training code stays in
    a plain function; the tuner only supplies the hyperparameter values.
    """

    # NOTE(review): `keras_tuner` is not imported in any visible notebook
    # cell - confirm an `import keras_tuner` is executed before this cell.

    def run_trial(self, trial, **kwargs):
        """Train and score the candidate described by ``trial``.

        Args:
            trial: The trial whose ``hyperparameters`` define the candidate.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            A list with one accuracy score per execution of the trial.
        """
        hp = trial.hyperparameters
        activation_choices = ["linear", "sigmoid", "relu", "tanh", "leaky_relu"]

        # Resolve every hyperparameter once, so that all executions of this
        # trial train exactly the same candidate.
        candidate = {
            "units_feature_layers": hp.Int("units_feature_layers", 32, 512, 32),
            "total_feature_layers": hp.Int("total_feature_layers", 1, 6, 1),
            "activation_feature_layers": hp.Choice(
                "activation_feature_layers", activation_choices),

            "units_dense_layers": hp.Int("units_dense_layers", 32, 512, 32),
            "total_dense_layers": hp.Int("total_dense_layers", 1, 6, 1),
            "activation_dense_layers": hp.Choice(
                "activation_dense_layers", activation_choices),

            "optimizer": hp.Choice(
                "optimizer", ["adam", "adadelta", "adamw", "adagrad"]),

            "epoch": 3,
        }

        # Overriding run_trial bypasses the base implementation, so the
        # `executions_per_trial` setting has to be honoured here; one score is
        # returned per execution.
        executions = max(1, getattr(self, "executions_per_trial", 1))
        return [my_keras_code(**candidate) for _ in range(executions)]


# The score being optimised is an accuracy, so larger is better: direction="max".
accuracy_objective = keras_tuner.Objective("val_accuracy", direction="max")

tuner = MyTuner(
    objective=accuracy_objective,
    max_trials=10000,
    executions_per_trial=2,
    # optional settings, currently disabled:
    # overwrite=True,
    # directory=f"{_get_current_folder()}/my_dir",
    # project_name="keep_code_separate",
)

# Show the search space, then run the search itself.
tuner.search_space_summary()
tuner.search()

# Fetch the best hyperparameter set found by the search
# (only the values are read here; no model is retrained).
ranked_hyperparameters = tuner.get_best_hyperparameters()
best_hp = ranked_hyperparameters[0]

print("best_hp", best_hp.values)