# MODEL - IMAGE LOADING & NEURAL NETWORK

In [None]:
#Import libraries
import csv
import os
import io
import cv2
from PIL import Image
import h5py
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras

In [2]:
#pip install Tensorflow

In [3]:
#pip install numpy==1.26.4

## 1) GENERAL FUNCTIONS

In [4]:
#Function to show image
def show_img(image):
    """Display a single image with matplotlib, without grid lines.

    Args:
        image: Image data in any form accepted by ``plt.imshow``.
    """
    plt.imshow(image, interpolation=None)
    # grid(None) *toggles* the grid (and can therefore switch it on);
    # grid(False) hides it regardless of the current state.
    plt.grid(False)
    plt.show()

In [5]:
#Image cropping
def _centre_bounds(size, target):
    """Return (start, stop) of a centred window of ``target`` items inside ``size`` items."""
    # Clamp at 0: a negative surplus would produce negative slice indices,
    # which wrap around and select an off-centre sliver instead of the image.
    surplus = max(size - target, 0)
    start = round(surplus / 2)
    stop = size - (surplus - start)
    return start, stop


def crop_image(images_list, nbPix = 100):
    """Centre-crop every image to at most ``nbPix`` x ``nbPix`` pixels.

    Args:
        images_list: Iterable of images that support 2-D slicing
            (``image[rows, cols]``), e.g. numpy arrays.
        nbPix: Side length, in pixels, of the square crop.

    Returns:
        numpy array stacking the cropped images. A dimension that is already
        smaller than ``nbPix`` is left uncropped.
    """
    output_images = []
    for image in images_list:
        top, bottom = _centre_bounds(len(image), nbPix)       #Height adjustments
        left, right = _centre_bounds(len(image[0]), nbPix)    #Width adjustments
        output_images.append(image[top:bottom, left:right])

    return np.array(output_images)

## 2) IMPORT DATA

### 2.1 - Declare file paths

In [6]:
#General file paths (every directory path ends with a trailing slash)
projectDir = f"{os.getcwd()}/"
parentDir = f"{os.path.abspath(os.path.join(projectDir, os.pardir))}/"
#The challenge data folder sits directly under the parent directory
dataPath = f"{parentDir}isic-2024-challenge/"

#Metadata file path (sample file; switch to train-metadata.csv for the full data)
#metaPath = dataPath + "train-metadata.csv"
metaPath = f"{dataPath}sample-metadata.csv"

#Image file path (sample file; switch to train-image.hdf5 for the full data)
#hdf5_file = dataPath + "train-image.hdf5"
hdf5_file = f"{dataPath}sample-image.hdf5"

### 2.2 - Load metadata from csv

In [None]:
#Import metadata and keep only the id, the target and the color/size features having no NAs
#NOTE: "isic_id" and "target" must remain the first two columns, because the
#feature columns are later selected by position (everything from the third column on)
metadata = pd.read_csv(metaPath, sep=",").loc[:, [
    "isic_id",
    "target",
    "clin_size_long_diam_mm",
    "tbp_lv_areaMM2",
    "tbp_lv_area_perim_ratio",
    "tbp_lv_eccentricity",
    "tbp_lv_minorAxisMM",
    "tbp_lv_color_std_mean",
    "tbp_lv_deltaLBnorm",
    "tbp_lv_radial_color_std_max",
]]

#Verify that there are no NAs (prints the NA count of every kept column)
print("-- X_meta NA counts --")
print(metadata.isna().sum())

In [8]:
#metadata["target_cheat"] = metadata["target"]

### 2.3 - Clean data

In [9]:
#Add code here


### 2.4a - Train, Validate, Test Split

In [10]:
#Function to perform train-validate or train-test-validate split on a list of isic_ids
def ttv_split(isic_ids, test_frac=0.2, validate_frac=0.2, random_state=88, shuffle=True, stratify=None):
    """Split ids into train / test / validate parts.

    Args:
        isic_ids: Sequence of ids to split (pandas Series, numpy array or list).
        test_frac: Fraction of all ids assigned to the test part.
        validate_frac: Fraction of all ids assigned to the validation part.
        random_state: Seed forwarded to ``train_test_split``.
        shuffle: Whether ``train_test_split`` shuffles before splitting.
        stratify: Optional labels, aligned with ``isic_ids``, to stratify on.

    Returns:
        ``(train, test, validate)`` lists when both fractions are non-zero,
        ``(train, holdout)`` lists when one of the fractions is 0, or ``None``
        (after printing an error) when the fractions are invalid.
    """
    if test_frac < 0 or validate_frac < 0:
        print("ERROR: Test or validate fraction is negative")
        return None
    if test_frac > 1 or validate_frac > 1:
        print("ERROR: Test or validate fraction is above 1")
        return None
    if test_frac + validate_frac >= 1:
        print("ERROR: Test and validate fractions sum to 1 or more.")
        return None

    def as_list(part):
        # train_test_split hands back the input's container type; plain lists have no .tolist()
        return part.tolist() if hasattr(part, "tolist") else list(part)

    #Split training from the rest
    holdout_frac = test_frac + validate_frac
    if stratify is None:
        train, holdout = train_test_split(isic_ids, test_size=holdout_frac, random_state=random_state,
                                          shuffle=shuffle)
        holdout_labels = None
    else:
        # Split the labels together with the ids so the second split can be
        # stratified on labels that match the (smaller) holdout part.
        train, holdout, _, holdout_labels = train_test_split(isic_ids, stratify, test_size=holdout_frac,
                                                             random_state=random_state, shuffle=shuffle,
                                                             stratify=stratify)

    #Only one holdout part requested: no second split needed
    if test_frac == 0 or validate_frac == 0:
        return as_list(train), as_list(holdout)

    #Split test and validate (test share is expressed relative to the holdout part)
    test_share = test_frac / holdout_frac
    test, validate = train_test_split(holdout, test_size=test_share, random_state=random_state,
                                      shuffle=shuffle, stratify=holdout_labels)
    return as_list(train), as_list(test), as_list(validate)

#Generate the splits of the isic_ids
#With the default fractions (0.2 test, 0.2 validate) ttv_split returns three lists: train, test, validate
train_ids, test_ids, val_ids = ttv_split(metadata["isic_id"])

In [None]:
"""
#Split train-test-validate portions of metadata into X and y
def Xy_split(metadata, ids):
    #Generate X (features without target)
    
    X = metadata[metadata["isic_id"].isin(ids)]
    X = X.loc[:, ~X.columns.isin(['isic_id', 'target'])]
    #Generate y
    y = metadata[metadata["isic_id"].isin(ids)]["target"]
    return X, y

X_train, y_train = Xy_split(metadata, train_ids)
X_test, y_test = Xy_split(metadata, test_ids)
X_validate, y_validate = Xy_split(metadata, val_ids)
"""

### 2.4b - Data augmentation
- Augment only the malignant cases in the training set
- Convert all id lists (train_ids, test_ids, val_ids) to one shared format: a list of (isic_id, augmentation tag) tuples

In [12]:
#Pair every id with an augmentation tag so all id lists share one format
#A tag of 0 marks base data, meaning the image should not be modified
train_ids_mods = [(isic_id, 0) for isic_id in train_ids]
test_ids_mods = [(isic_id, 0) for isic_id in test_ids]
val_ids_mods = [(isic_id, 0) for isic_id in val_ids]

In [None]:
#Identify the malignant cases (target == 1) and keep those that fell in the training split
all_pos = metadata.loc[metadata["target"] == 1, "isic_id"]
pos_in_train = all_pos.loc[all_pos.isin(train_ids)]
print("Number of positives in training data:", pos_in_train.shape[0])

In [None]:
#Augment the training list
#Each pass appends every malignant training id once more, tagged with a non-zero number
#(one number per pass); a non-zero tag tells the generator to apply augmentation
nb_of_augments = 2

rng = np.random.default_rng()
for i in range(nb_of_augments):
    #Option 1: use random float
    #rng.random() draws from [0, 1); mapping it to (0, 1] guarantees the tag is never 0,
    #since 0 is reserved for "original image, do not augment"
    rand_nb = 1.0 - rng.random()
    train_ids_mods += [(isic_id, rand_nb) for isic_id in pos_in_train]
    #Option 2: use integer
    #train_ids_mods += [(id, nb_of_augments + 1) for id in pos_in_train]

#Shuffle the list (in place)
np.random.shuffle(train_ids_mods)
train_ids_mods

### 2.5 - Load images and create hybrid tensorflow dataset

In [15]:
# GENERATOR FOR HDF5 AND METADATA
class hdf5_generator:
    """Callable yielding ``((image, metadata), target)`` samples read from an HDF5 file.

    Args:
        file: filepath to the hdf5 file containing the image data
        metadata: the full metadata dataframe (col1 = "isic_id", col2 = "target",
            remaining columns = numeric features)
        img_names: list of tuples, each containing the isic_id followed by a number, signifying:
            * 0 when original data is to be used
            * any other number - data augmentation is to be applied
        imgSize: images are to be adjusted to this size (square) in pixels

    Images that fail to load or process are reported, logged to
    ``image_errors.log`` and skipped.
    """
    def __init__(self, file, metadata, img_names, imgSize):
        self.file = file
        self.metadata = metadata
        self.img_names = img_names
        self.imgSize = imgSize

    def __call__(self):
        with h5py.File(self.file, 'r') as h5file:
            for img_name, mod in self.img_names:
                try:
                    # Load image data from HDF5 (stored as encoded image bytes)
                    img = np.array(Image.open(io.BytesIO(h5file[img_name][()])))

                    # Resize the image
                    img = tf.image.resize(img, [self.imgSize, self.imgSize])

                    # Standardize to [0, 1] BEFORE augmenting: the brightness delta
                    # below is meant for that scale and would be negligible on 0-255.
                    img = tf.cast(img, tf.float32) / 255.0

                    if mod != 0:
                        # Data Augmentation
                        img = tf.image.random_flip_left_right(img)
                        img = tf.image.random_flip_up_down(img)
                        img = tf.image.random_brightness(img, max_delta=0.2)
                        # Brightness shifts can leave the valid range; bring values back
                        img = tf.clip_by_value(img, 0.0, 1.0)

                    # Look the metadata row up once and reuse it for features and target
                    row = self.metadata[self.metadata["isic_id"] == img_name]

                    #Retrieve corresponding metadata (all columns after isic_id and target)
                    meta = row.iloc[:, 2:].to_numpy(dtype=np.float32)

                    #Retrieve corresponding target; the reshape raises (and the image is
                    #skipped) unless exactly one metadata row matches the id
                    target = row["target"].to_numpy().reshape(1, 1)

                    yield (img, meta), target

                except Exception as e:
                    print(f"Error loading image {img_name}: {e}")
                    # log the error to a file for later analysis
                    with open('image_errors.log', 'a') as f:
                        f.write(f"Error loading image {img_name}: {e}\n")
                    continue

#Generate the dataset with batch size and prefetching
def make_dataset(hdf5_file, metadata, img_names, imgSize=100, batch_size=32, shuffle=True):
    """Build a batched, prefetched tf.data pipeline over ``hdf5_generator``.

    Args:
        hdf5_file: Path of the HDF5 file holding the images.
        metadata: Metadata dataframe containing "isic_id", "target" and the features.
        img_names: List of (isic_id, augmentation tag) tuples to load.
        imgSize: Side length, in pixels, of the square images.
        batch_size: Number of samples per batch.
        shuffle: When True, shuffle with a buffer of up to 10000 samples before batching.

    Returns:
        Dataset of ``((image, metadata), target)`` batches.
    """
    # Every column except isic_id and target is a metadata feature
    num_features = metadata.shape[-1] - 2

    # Signature of one sample produced by the generator
    image_spec = tf.TensorSpec(shape=(imgSize, imgSize, 3), dtype=tf.float32)
    meta_spec = tf.TensorSpec(shape=(1, num_features), dtype=tf.float32)
    target_spec = tf.TensorSpec(shape=(1, 1), dtype=tf.int32)

    dataset = tf.data.Dataset.from_generator(
        hdf5_generator(hdf5_file, metadata, img_names, imgSize),
        output_signature=((image_spec, meta_spec), target_spec),
    )

    # Optional shuffling always happens before batching
    if shuffle:
        dataset = dataset.shuffle(buffer_size=min(len(img_names), 10000))

    return dataset.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

In [16]:
#Make datasets: one unshuffled pipeline per id list (train, validate, test)
train_dataset, validate_dataset, test_dataset = (
    make_dataset(hdf5_file, metadata, ids_mods, shuffle=False)
    for ids_mods in (train_ids_mods, val_ids_mods, test_ids_mods)
)

In [None]:
train_dataset

In [None]:
validate_dataset

## 3) CNN MODEL

### 3.1 - Model class

In [19]:
#Simple CNN model using only images and target
class CNN_model(tf.keras.Model):
    """Image-only classifier: conv -> max-pool -> flatten -> dense -> sigmoid output.

    Args:
        neurons: Number of units in the hidden dense layer.
        activ: Activation of the hidden dense layer.
        img_size: Side length, in pixels, of the square input images.
        img_channels: Number of image channels.
    """
    def __init__(self, neurons = 8, activ = 'leaky_relu', img_size = 100, img_channels=3):
        #Run the constructor of the parent class
        super().__init__()

        #Weight and bias initializers
        #Each layer gets its OWN initializer instance: a single unseeded instance
        #reused across layers can produce the same random values on every call.
        def kernel_init():
            return tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.05, seed=None)

        def bias_init():
            return tf.keras.initializers.RandomUniform(minval=-0.05, maxval=0.05, seed=None)

        #Image size declaration
        self.img_size = img_size
        self.img_channels = img_channels

        #Layers
        self.conv1 = tf.keras.layers.Conv2D(filters=16, kernel_size=5, strides=(1, 1), activation='relu', padding='same', input_shape=(img_size, img_size, img_channels),
                                            kernel_initializer=kernel_init(), bias_initializer=bias_init())
        self.pool1 = tf.keras.layers.MaxPool2D(pool_size=(2,2))
        self.flatten = tf.keras.layers.Flatten()
        self.dense1 = tf.keras.layers.Dense(neurons, activation = activ, kernel_initializer=kernel_init(), bias_initializer=bias_init())
        self.dense2 = tf.keras.layers.Dense(1, activation='sigmoid', kernel_initializer=kernel_init(), bias_initializer=bias_init())

    def call(self, inputs):
        """Return the sigmoid output for ``inputs = (x_image, x_meta)``; x_meta is ignored."""
        # The metadata is unpacked only so this model accepts the same input
        # structure as the other models; it is not used here.
        x_image, x_meta = inputs

        # Convolutions
        x1 = self.conv1(x_image)
        x1 = self.pool1(x1)

        # Flattening of images for input layer
        x1 = self.flatten(x1)

        # Hidden layers of neural network
        x1 = self.dense1(x1)

        # Output layer of neural network
        output = self.dense2(x1)

        return output

#Metadata Neural Network
class Meta_model(tf.keras.Model):
    """Metadata-only classifier: two dense layers, dropout (training only), sigmoid output.

    Args:
        neurons: Number of units in each hidden dense layer.
        activ: Activation of the hidden dense layers.
    """
    def __init__(self, neurons = 8, activ = 'tanh'):
        #Run the constructor of the parent class
        super().__init__()

        #Weight and bias initializers
        #Each layer gets its OWN initializer instance: a single unseeded instance
        #reused across layers can produce the same random values on every call.
        def kernel_init():
            return tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.05, seed=None)

        def bias_init():
            return tf.keras.initializers.RandomUniform(minval=-0.05, maxval=0.05, seed=None)

        #Layers
        self.dense1 = tf.keras.layers.Dense(neurons, activation = activ, kernel_initializer=kernel_init(), bias_initializer=bias_init())
        self.dense2 = tf.keras.layers.Dense(neurons, activation = activ, kernel_initializer=kernel_init(), bias_initializer=bias_init())
        self.dense3 = tf.keras.layers.Dense(1, activation='sigmoid', kernel_initializer=kernel_init(), bias_initializer=bias_init())
        self.dropout = tf.keras.layers.Dropout(0.25)

    def call(self, inputs, training=False):
        """Return the sigmoid output for ``inputs = (x_image, x_meta)``; x_image is ignored."""
        x_image, x_meta = inputs
        # Collapse the metadata to (batch, features) by dropping its middle axis
        x_all = tf.reshape(x_meta, (tf.shape(x_meta)[0], x_meta.shape[-1]))
        # Neural Network
        x_all = self.dense1(x_all)
        x_all = self.dense2(x_all)
        if training:
            x_all = self.dropout(x_all, training=training)
        output = self.dense3(x_all)
        return output

#Hybrid CNN model taking metadata
class Hybrid_model(tf.keras.Model):
    """Classifier combining convolutional image features with the metadata features.

    The image goes through two conv + max-pool stages and is flattened; the
    result is concatenated with the metadata and fed to two dense layers
    (each followed by dropout while training) and a sigmoid output unit.

    Args:
        neurons: Number of units in each hidden dense layer.
        activ: Activation of the hidden dense layers.
        img_size: Side length, in pixels, of the square input images.
        img_channels: Number of image channels.
    """
    def __init__(self, neurons = 8, activ = 'leaky_relu', img_size = 100, img_channels = 3):
        #Run the constructor of the parent class
        super().__init__()

        #Weight and bias initializers
        #Each layer gets its OWN initializer instance: a single unseeded instance
        #reused across layers can produce the same random values on every call.
        def kernel_init():
            return tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.05, seed=None)

        def bias_init():
            return tf.keras.initializers.RandomUniform(minval=-0.05, maxval=0.05, seed=None)

        #Image size declaration
        self.img_size = img_size
        self.img_channels = img_channels

        #Layers
        self.conv1 = tf.keras.layers.Conv2D(filters=32, kernel_size=5, strides=(1, 1), activation='relu', padding='same', input_shape=(img_size, img_size, img_channels),
                                            kernel_initializer=kernel_init(), bias_initializer=bias_init())
        self.conv2 = tf.keras.layers.Conv2D(64, 5, activation='relu', kernel_initializer=kernel_init(), bias_initializer=bias_init())
        self.pool = tf.keras.layers.MaxPool2D(pool_size=(2,2))
        self.flatten = tf.keras.layers.Flatten()
        self.dense1 = tf.keras.layers.Dense(neurons, activation = activ, kernel_initializer=kernel_init(), bias_initializer=bias_init())
        self.dropout1 = tf.keras.layers.Dropout(0.10)
        self.dense2 = tf.keras.layers.Dense(neurons, activation = activ, kernel_initializer=kernel_init(), bias_initializer=bias_init())
        self.dropout2 = tf.keras.layers.Dropout(0.10)
        self.dense3 = tf.keras.layers.Dense(1, activation='sigmoid', kernel_initializer=kernel_init(), bias_initializer=bias_init())
        self.concatenate = keras.layers.Concatenate(axis=1)

    def call(self, inputs, training=False):
        """Return the sigmoid output for ``inputs = (x_image, x_meta)``."""
        x_image, x_meta = inputs
        # Convolutions (the same pooling layer is applied after each convolution)
        x = self.conv1(x_image)
        x = self.pool(x)
        x = self.conv2(x)
        x = self.pool(x)
        # Flattening of images and concatenation with other data
        x = self.flatten(x)
        # Reshape metadata to (batch, features) so it can be concatenated on axis 1
        x_meta = tf.reshape(x_meta, (tf.shape(x_meta)[0], x_meta.shape[-1]))
        x_all = self.concatenate([x, x_meta])
        # Neural Network
        x_all = self.dense1(x_all)
        if training:
            x_all = self.dropout1(x_all, training=training)
        x_all = self.dense2(x_all)
        if training:
            x_all = self.dropout2(x_all, training=training)
        output = self.dense3(x_all)
        return output

### 3.2 - Model compiling

In [None]:
#Set seed
tf.random.set_seed(71)

#Initialize model (uncomment one of the alternatives to try another architecture)
#model = CNN_model(neurons=8, activ='tanh')
#model = Meta_model(neurons=18, activ='tanh')
model = Hybrid_model(neurons=36, activ='leaky_relu')

#Loss function: binary cross-entropy on probabilities (the models end in a sigmoid)
loss = tf.keras.losses.BinaryCrossentropy(
    from_logits=False,
    label_smoothing=0.0,
    axis=-1,
    reduction='sum_over_batch_size',
    name='binary_crossentropy',
)

#Optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

#Compile the model with optimizer, loss, and metrics
#(accuracy plus the four confusion-matrix counts)
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[
        tf.keras.metrics.BinaryAccuracy(),
        tf.keras.metrics.FalseNegatives(),
        tf.keras.metrics.FalsePositives(),
        tf.keras.metrics.TrueNegatives(),
        tf.keras.metrics.TruePositives(),
    ],
)

In [None]:
# Peek at the first batch of the training dataset
for batch in train_dataset.take(1):
    (img_batch, meta_batch), target_batch = batch

    # Report the shape of each component
    print("Image batch shape:", img_batch.shape)
    print("Metadata batch shape:", meta_batch.shape)
    print("Target batch shape:", target_batch.shape)

# Count the batches by running through the whole dataset once
batch_count = sum(1 for _ in train_dataset)

print(f"Total number of batches in the dataset: {batch_count}")

### 3.3 - Model fit

In [None]:
#Train for 25 epochs, evaluating on validate_dataset after each epoch;
#the object returned by fit is kept in mod so its history can be inspected
mod = model.fit(train_dataset, epochs=25, validation_data = validate_dataset)

In [None]:
model.weights

In [None]:
mod.history

**BATCHES**

In [None]:
# Walk through every batch of the training dataset and report its shapes
for i, batch in enumerate(train_dataset):
    (img_batch, meta_batch), target_batch = batch

    # Batches are numbered from 1 in the output
    print(f"Batch {i+1}:")
    print(f"  Image Batch Shape: {img_batch.shape}")
    print(f"  Metadata Batch Shape: {meta_batch.shape}")
    print(f"  Target Batch Shape: {target_batch.shape}")

### 3.4 - Predict Test Data

In [None]:
#Predicted probabilities for the test set
predictions = model.predict(test_dataset)
#Round every probability to the nearest class label (0 or 1)
y_pred = list(map(round, predictions.ravel()))
#True labels, collected batch by batch in dataset order
y_test = np.concatenate([labels for _, labels in test_dataset], axis=0).ravel()
#NOTE: despite its name, this is the fraction of misclassified test samples;
#it also rebinds the name "loss" that was used for the Keras loss object at compile time
loss = np.abs(y_test - y_pred).sum() / len(y_pred)
print("Shape of prediction data:", predictions.shape)
print("Loss on test data:", loss)

In [None]:
y_pred

In [None]:
y_test