# MODEL - IMAGE LOADING & NEURAL NETWORK

In [20]:
#Import libraries
import csv
import os
import io
import cv2
from PIL import Image
import h5py
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras

## 1) GENERAL FUNCTIONS

In [21]:
#Function to show image
def show_img(image):
    """Display a single image with matplotlib, with grid lines switched off.

    image: image data accepted by ``plt.imshow``.
    """
    plt.imshow(image, interpolation=None)
    # grid(None) *toggles* the current grid state (and would switch the grid ON
    # under a style where it is off); pass False to always hide it.
    plt.grid(False)
    plt.show()

In [22]:
#Image cropping
def _center_bounds(size, target):
    """Return (start, stop) of a centered window of at most ``target`` along an axis of length ``size``."""
    # Clamp at 0: a negative surplus would give a negative start index that
    # wraps around when slicing and produces an off-center, wrong-sized crop.
    surplus = max(size - target, 0)
    start = round(surplus / 2)
    stop = size - (surplus - start)
    return start, stop


def crop_image(images_list, nbPix = 100):
    """Center-crop every image to ``nbPix`` x ``nbPix`` pixels.

    images_list: iterable of images, each array-like with rows on axis 0 and
                 columns on axis 1 (any trailing axes, e.g. channels, are kept).
    nbPix: side length, in pixels, of the square crop.

    Returns a numpy array stacking the cropped images. An image dimension that
    is already smaller than or equal to ``nbPix`` is left uncropped.
    """
    output_images = []
    for image in images_list:
        image = np.asarray(image)  # allows nested lists as well as arrays
        top, bottom = _center_bounds(image.shape[0], nbPix)
        left, right = _center_bounds(image.shape[1], nbPix)
        output_images.append(image[top:bottom, left:right])

    return np.array(output_images)

## 2) IMPORT DATA

### 2.1 - Declare file paths

In [23]:
# Directory layout: the data folder sits next to the project folder,
# i.e. <parent>/isic-2024-challenge/. Every directory string ends with "/".
projectDir = f"{os.getcwd()}/"
parentDir = f"{os.path.abspath(os.path.join(projectDir, os.pardir))}/"
dataPath = f"{parentDir}isic-2024-challenge/"

# Metadata csv (the sample file is active; swap in the commented line for the full set)
#metaPath = f"{dataPath}train-metadata.csv"
metaPath = f"{dataPath}sample-metadata.csv"

# Image HDF5 container (the sample file is active; swap in the commented line for the full set)
#hdf5_file = f"{dataPath}train-image.hdf5"
hdf5_file = f"{dataPath}sample-image.hdf5"

### 2.2 - Load metadata from csv

In [None]:
# Read the metadata csv and keep only the identifier, the label and the
# size / colour features that are expected to contain no NAs (checked below).
metadata = pd.read_csv(metaPath, sep=",").loc[:, [
    "isic_id",
    "target",
    "clin_size_long_diam_mm",
    "tbp_lv_areaMM2",
    "tbp_lv_area_perim_ratio",
    "tbp_lv_eccentricity",
    "tbp_lv_minorAxisMM",
    "tbp_lv_color_std_mean",
    "tbp_lv_deltaLBnorm",
    "tbp_lv_radial_color_std_max",
]]

# Sanity check: print the number of NAs per retained column
print("-- X_meta NA counts --", metadata.isna().sum(), sep="\n")

### 2.3 - Clean data

In [25]:
#Add code here


### 2.4 - Train, Validate, Test Split

In [26]:
#Function to perform train-validate or train-test-validate split on a list of isic_ids
def ttv_split(isic_ids, test_frac=0.2, validate_frac=0.2, random_state=88, shuffle=True, stratify=None):
    """Split ``isic_ids`` into train / test / validate lists.

    isic_ids: sequence (list, array or pandas Series) of identifiers to split.
    test_frac, validate_frac: fractions of the data for the test and validate
        sets; each must be non-negative and their sum must be below 1.
    random_state, shuffle: forwarded to ``train_test_split`` for both splits.
    stratify: optional class labels, one per element of ``isic_ids``.

    Returns ``(train, test, validate)`` as lists, or the two-element
    ``(train, holdout)`` when one of the fractions is 0. Prints an error and
    returns ``None`` when the fractions are invalid.
    """
    if test_frac < 0 or validate_frac < 0:
        print("ERROR: Test or validate fraction is negative")
        return None
    if test_frac > 1 or validate_frac > 1:
        print("ERROR: Test or validate fraction is above 1")
        return None
    if test_frac + validate_frac >= 1:
        print("ERROR: Test and validate fractions sum to 1 or more.")
        return None

    #Split training from the rest
    holdout_frac = test_frac + validate_frac
    if stratify is None:
        train, temp = train_test_split(isic_ids, test_size=holdout_frac,
                                       random_state=random_state, shuffle=shuffle)
        temp_labels = None
    else:
        # Split the labels alongside the ids so that the second split can be
        # stratified with labels that match the held-out subset (the full-length
        # label array no longer lines up with ``temp``).
        train, temp, _, temp_labels = train_test_split(isic_ids, stratify, test_size=holdout_frac,
                                                       random_state=random_state, shuffle=shuffle,
                                                       stratify=stratify)

    #Only one held-out set requested: no second split
    if test_frac == 0 or validate_frac == 0:
        return np.asarray(train).tolist(), np.asarray(temp).tolist()

    #Split test and validate
    test_size = test_frac / holdout_frac
    test, validate = train_test_split(temp, test_size=test_size, random_state=random_state,
                                      shuffle=shuffle, stratify=temp_labels)
    # np.asarray(...).tolist() works for list, array and Series inputs alike
    return np.asarray(train).tolist(), np.asarray(test).tolist(), np.asarray(validate).tolist()

#Generate the splits of the isic_ids
# Called with the ttv_split defaults (test_frac=0.2, validate_frac=0.2), so three
# id lists are returned and unpacked here.
train_ids, test_ids, val_ids = ttv_split(metadata["isic_id"])

In [16]:
#Split train-test-validate portions of metadata into X and y
def Xy_split(metadata, ids):
    """Return the feature table X and the target column y for the given ids.

    metadata: DataFrame with an "isic_id" column, a "target" column and the
              feature columns.
    ids: iterable of isic_id values to select; ids absent from ``metadata``
         are ignored.

    Returns ``(X, y)``: X holds every column except "isic_id" and "target",
    y is the "target" column. Rows of both follow the order of ``ids`` so they
    stay aligned with anything else that iterates over the same id list.
    """
    ids = list(ids)
    selected = metadata[metadata["isic_id"].isin(ids)]

    # Reorder the selected rows to the order of ``ids`` (first occurrence wins)
    position = {}
    for pos, isic_id in enumerate(ids):
        position.setdefault(isic_id, pos)
    selected = selected.sort_values("isic_id", key=lambda col: col.map(position), kind="mergesort")

    #Generate X (the column filter must be assigned, otherwise id/target stay in X)
    X = selected.loc[:, ~selected.columns.isin(['isic_id', 'target'])]
    #Generate y
    y = selected["target"]
    return X, y

# Build the (X, y) pair for each split, in train / test / validate order
(X_train, y_train), (X_test, y_test), (X_validate, y_validate) = (
    Xy_split(metadata, split_ids) for split_ids in (train_ids, test_ids, val_ids)
)

### 2.5 - Load images and create hybrid tensorflow dataset

### OLD WORKING VERSION

In [18]:
#GENERATOR FOR HDF5
#Generates the image (standardized). Avoids multiple file open/read/close operations.
#file: full path for file
#img_names: list of image names (HDF5 keys) to load
#imgSize: number of pixels for size/resolution adjustment in square form
class hdf5_generator:
    """Callable that yields one scaled image tensor per name in ``img_names``.

    The HDF5 file is opened once per call and stays open while the images are
    iterated. Each yielded tensor is float32 with shape
    (1, imgSize, imgSize, 3) and pixel values divided by 255.
    """
    def __init__(self, file, img_names, imgSize):
        self.file = file
        self.img_names = img_names
        self.imgSize = imgSize
    def __call__(self):
        with h5py.File(self.file, 'r') as h5file:
            for img_name in self.img_names:
                # Each HDF5 entry holds the encoded image bytes; decode with PIL
                img = np.array(Image.open(io.BytesIO(h5file[img_name][()])))
                img = tf.image.resize(img, [self.imgSize, self.imgSize])
                # Reshape with the configured size (not a fixed 100x100) so that
                # any imgSize works; the leading 1 is a batch dimension.
                img = tf.constant(np.reshape(img/255,(1,self.imgSize,self.imgSize,3)), dtype=tf.float32) #standardized here
                yield img

#DATASET FUNCTION
def make_dataset(hdf5_file, meta, target, img_names, imgSize=100):
    """Build a tf.data dataset of ``((image, metadata), target)`` elements.

    hdf5_file: path of the HDF5 file holding the images.
    meta: 2-D table (rows = samples, columns = features) of numeric metadata.
    target: sequence with one label per sample.
    img_names: image names, passed to ``hdf5_generator``.
    imgSize: side length the images are resized to.

    Element shapes: image (1, imgSize, imgSize, 3), metadata (1, n_features),
    target (1, 1). The three inputs are zipped positionally, so they must be
    in the same sample order.
    """
    #Generate image dataset
    img_dataset = tf.data.Dataset.from_generator(
        hdf5_generator(hdf5_file, img_names, imgSize),
        output_types=tf.float32,
        output_shapes = tf.TensorShape([1, imgSize,imgSize,3])
        )

    #Generate target dataset: one (1, 1) int32 tensor per sample
    target = tf.cast(np.asarray(target), dtype=tf.int32)
    target = tf.reshape(target, shape=(-1, 1, 1))
    target = tf.data.Dataset.from_tensor_slices(target, name = "target")

    #Generate metadata set: one (1, n_features) float32 tensor per sample.
    #The sample and feature counts come from the data instead of a fixed (9, 1, 8).
    meta = tf.cast(meta, dtype=tf.float32)
    meta = tf.reshape(meta, shape=(-1, 1, meta.shape[-1]))
    meta = tf.data.Dataset.from_tensor_slices(meta, name = "metadata")

    #Combine datasets into one
    dataset = tf.data.Dataset.zip((img_dataset, meta))
    dataset = tf.data.Dataset.zip((dataset, target))

    return dataset

In [None]:
#Make datasets
# Training dataset built from the HDF5 images, the training metadata and the
# training targets; imgSize is left at make_dataset's default.
train_dataset = make_dataset(hdf5_file, X_train, y_train, train_ids)

In [14]:
#Delete duplicate data, keeping only the dataset
#del metadata
#del y_train
#del X_train

In [None]:
#Examine the dataset objects
# Print the dataset object itself, then pull and print its first element
print(train_dataset, "\n")

iterator = iter(train_dataset)
# Alternative way to fetch the first element, left disabled:
#print(iterator.get_next())
print(next(iterator))

## MARTIAL'S VERSION

In [14]:
# HDF5 Image Data Generator with Augmentation and Standardization
class hdf5_generator:
    """Callable that yields one float32 image tensor per name in ``img_names``.

    file: full path of the HDF5 file.
    img_names: image names (HDF5 keys) to load, in order.
    imgSize: side length (pixels) the images are resized to.
    augment: when True (default), apply random flips and a random brightness
             shift; pass False for data that must not be augmented.

    Yielded tensors have shape (imgSize, imgSize, channels) with values scaled
    to [0, 1]. Images that fail to load are logged and skipped.
    """
    def __init__(self, file, img_names, imgSize, augment=True):
        self.file = file
        self.img_names = img_names
        self.imgSize = imgSize
        self.augment = augment

    def _load_image(self, h5file, img_name):
        """Decode, resize, scale and optionally augment one image."""
        # Load image data from HDF5
        img = np.array(Image.open(io.BytesIO(h5file[img_name][()])))

        # Resize the image
        img = tf.image.resize(img, [self.imgSize, self.imgSize])

        # Standardize to [0, 1] BEFORE augmenting: a brightness delta of 0.2
        # is negligible on 0-255 data but meaningful on 0-1 data.
        img = tf.cast(img, tf.float32) / 255

        # Data Augmentation
        if self.augment:
            img = tf.image.random_flip_left_right(img)
            img = tf.image.random_flip_up_down(img)
            img = tf.image.random_brightness(img, max_delta=0.2)
            # The brightness shift can push values outside [0, 1]
            img = tf.clip_by_value(img, 0.0, 1.0)

        return img

    def __call__(self):
        with h5py.File(self.file, 'r') as h5file:
            for img_name in self.img_names:
                # Only the loading step is guarded; the yield stays outside the
                # try so errors raised by the consumer are not swallowed here.
                try:
                    img = self._load_image(h5file, img_name)
                except Exception as e:
                    print(f"Error loading image {img_name}: {e}")
                    # log the error to a file for later analysis
                    with open('image_errors.log', 'a') as f:
                        f.write(f"Error loading image {img_name}: {e}\n")
                    # NOTE(review): a skipped image shortens this stream; when it
                    # is zipped positionally with metadata/targets, the later
                    # samples no longer line up - confirm this is acceptable.
                    continue

                yield img

#Generate the dataset with batch size and prefetching
def make_dataset(hdf5_file, meta, target, img_names, imgSize=100, batch_size=5):
    """Build a shuffled, batched tf.data dataset of ``((image, metadata), target)``.

    hdf5_file: path of the HDF5 file holding the images.
    meta: 2-D table (rows = samples, columns = features) of numeric metadata.
    target: sequence with one label per sample.
    img_names: image names, passed to ``hdf5_generator``.
    imgSize: side length the images are resized to.
    batch_size: number of samples per batch.

    Per-sample shapes are image (imgSize, imgSize, 3), metadata (n_features,)
    and target (1,), so a batch is (B, imgSize, imgSize, 3), (B, n_features)
    and (B, 1). The three inputs are zipped positionally and must therefore be
    in the same sample order.
    """
    # Generate image dataset
    img_dataset = tf.data.Dataset.from_generator(
        hdf5_generator(hdf5_file, img_names, imgSize),
        output_types=tf.float32,
        output_shapes=tf.TensorShape([imgSize, imgSize, 3])
    )

    # Generate target dataset: one (1,) int32 tensor per sample, so that a
    # batch has shape (B, 1) like the model output
    target = tf.cast(np.asarray(target), dtype=tf.int32)
    target = tf.reshape(target, shape=(-1, 1))
    target = tf.data.Dataset.from_tensor_slices(target, name="target")

    # Generate metadata set: one (n_features,) float32 tensor per sample.
    # Sizes come from the data instead of a fixed (9, 1, 8), and there is no
    # extra singleton axis because .batch() below adds the batch dimension.
    meta = tf.cast(meta, dtype=tf.float32)
    meta = tf.reshape(meta, shape=(-1, meta.shape[-1]))
    meta = tf.data.Dataset.from_tensor_slices(meta, name="metadata")

    # Combine datasets into one
    dataset = tf.data.Dataset.zip((img_dataset, meta))
    dataset = tf.data.Dataset.zip((dataset, target))

    # Add shuffling, batching, and prefetching (the shuffle buffer must be >= 1)
    buffer_size = max(1, min(len(img_names), 10000))
    dataset = dataset.shuffle(buffer_size=buffer_size).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

    return dataset

In [None]:
#Make datasets
# Training dataset built from the HDF5 images, the training metadata and the
# training targets; imgSize and batch_size are left at make_dataset's defaults.
train_dataset = make_dataset(hdf5_file, X_train, y_train, train_ids)

## 3) CNN MODEL

### 3.1 - Model class

In [41]:
#Simple CNN model using only images and target
class CNN_model(tf.keras.Model):
    """Image-only CNN: one convolution + max-pool, one hidden dense layer, sigmoid output.

    neurons: number of units in the hidden dense layer.
    activ: activation of the hidden dense layer.
    """
    def __init__(self, neurons = 8, activ = 'tanh'):
        super().__init__()
        layers = tf.keras.layers
        self.conv1 = layers.Conv2D(
            filters=16,
            kernel_size=5,
            strides=(1, 1),
            activation='relu',
            padding='same',
            input_shape=(100, 100, 3),
        )
        self.pool1 = layers.MaxPool2D(pool_size=(2,2))
        self.flatten = layers.Flatten()
        self.dense1 = layers.Dense(neurons, activation = activ)
        self.dense2 = layers.Dense(1, activation='sigmoid')

    def call(self, inputs):
        """Run the forward pass on ``(image, metadata)``; the metadata part is ignored."""
        image_batch, _unused_meta = inputs

        # Convolution block, then flatten to a feature vector
        features = self.pool1(self.conv1(image_batch))
        flat = self.flatten(features)

        # Hidden layer followed by the sigmoid output layer
        hidden = self.dense1(flat)
        return self.dense2(hidden)

#Hybrid CNN model taking metadata
class Hybrid_model(tf.keras.Model):
    """CNN on the image whose flattened features are concatenated with the metadata.

    neurons: number of units in the hidden dense layers.
    activ: activation of the hidden dense layers.

    ``conv2`` and ``dense2`` are created but not applied in ``call``.
    """
    def __init__(self, neurons = 8, activ = 'tanh'):
        super().__init__()
        self.conv1 = tf.keras.layers.Conv2D(filters=16, kernel_size=5, strides=(1, 1), activation='relu', padding='same')
        self.conv2 = tf.keras.layers.Conv2D(32, 5, activation='relu')
        self.pool = tf.keras.layers.MaxPool2D(pool_size=(2,2))
        self.flatten = tf.keras.layers.Flatten()
        # Layers are created once here rather than on every call()
        self.flatten_meta = tf.keras.layers.Flatten()
        self.concat = tf.keras.layers.Concatenate(axis=1)
        self.dense1 = tf.keras.layers.Dense(neurons, activation = activ)
        self.dense2 = tf.keras.layers.Dense(neurons, activation = activ)
        self.dense3 = tf.keras.layers.Dense(1, activation='sigmoid')

    def call(self, inputs, training=False):
        """Run the forward pass on ``(image, metadata)`` and return the sigmoid output.

        ``training`` is accepted for Keras compatibility and currently unused.
        """
        x_image, x_meta = inputs
        # Convolutions
        x = self.conv1(x_image)
        x = self.pool(x)
        # Flattening of images and concatenation with other data
        x = self.flatten(x)
        # Flatten the metadata as well so that both (B, F) and (B, 1, F) inputs
        # have the rank-2 shape required for concatenation along axis 1
        x_meta = self.flatten_meta(x_meta)
        x_all = self.concat([x, x_meta])
        # Neural Network
        x_all = self.dense1(x_all)
        output = self.dense3(x_all)
        return output

### 3.2 - Model compiling

In [42]:
#Set seed
tf.random.set_seed(71)

#Initialize model
model = CNN_model(neurons=8, activ='tanh')
#model = Hybrid_model(neurons=8, activ='tanh')

#Define optimizer and loss function
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
# Both model classes end in Dense(1, activation='sigmoid'), i.e. they output
# probabilities, so the loss must NOT treat its input as logits (from_logits=True
# would apply a second sigmoid on top of the model's own).
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False,
                                          label_smoothing=0.0,
                                          axis=-1,
                                          reduction='sum_over_batch_size',
                                          name='binary_crossentropy')

#Compile the model with loss, optimizer, and metrics
model.compile(loss = loss,
              optimizer = optimizer,
              metrics = [
                  tf.keras.metrics.BinaryAccuracy(),
                  tf.keras.metrics.FalseNegatives(),
                  tf.keras.metrics.FalsePositives(),
                  tf.keras.metrics.TrueNegatives(),
                  tf.keras.metrics.TruePositives()
                  ]
)

### 3.3 - Model fit

In [None]:
#Fit the model
# Train for 4 epochs on the training dataset; the object returned by fit() is
# kept in `mod` so it can be inspected afterwards.
mod = model.fit(train_dataset, epochs=4)

In [None]:
# Show the training history stored on the object returned by model.fit()
# (displayed as the cell output)
mod.history