In [7]:
import sys
print(sys.executable)

import boto3
from botocore.exceptions import NoCredentialsError

# Function to download images from S3
def download_images_from_s3(bucket_name, image_keys, local_dir="images"):
    s3 = boto3.client('s3', aws_access_key_id='YOUR_AWS_ACCESS_KEY',
                      aws_secret_access_key='YOUR_AWS_SECRET_KEY')
    
    if not os.path.exists(local_dir):
        os.makedirs(local_dir)

    downloaded_images = []
    
    for key in image_keys:
        local_file = os.path.join(local_dir, key.split('/')[-1])
        try:
            s3.download_file(bucket_name, key, local_file)
            print(f"Downloaded {key} to {local_file}")
            # Assuming images are to be loaded using PIL after download
            image = Image.open(local_file)
            downloaded_images.append(np.array(image))
        except NoCredentialsError:
            print("Credentials not available")
    
    return downloaded_images


c:\Users\Cedric DJIVO\OneDrive\DATA SCIENCE\DSTI\Projects\deep-learning\.venv\Scripts\python.exe


# MODEL - IMAGE LOADING & NEURAL NETWORK

In [6]:
#Import libraries
import csv
import os
import io
import cv2
from PIL import Image
import h5py
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [2]:
#pip install Tensorflow

In [3]:
#pip install numpy==1.26.4

## 1) GENERAL FUNCTIONS

In [4]:
#Function to show image
def show_img(image):
    plt.imshow(image, interpolation=None)
    plt.grid(None)
    plt.show()

In [5]:
#Image cropping
def crop_image(images_list, nbPix = 100):
    output_images = []
    for image in images_list:
        #Height adjustments
        h = len(image)
        adj = len(image) - nbPix
        h1 = round(adj / 2) #Top
        h2 = h - (adj - h1) #Bottom

        #Width adjustments
        w = len(image[0])
        w_adj = w - nbPix
        w1 = round(w_adj / 2) #Left
        w2 = w - (w_adj - w1) #Right

        img = image[h1:h2,w1:w2]
        output_images.append(img)
        
    return np.array(output_images)

## 2) IMPORT DATA

### 2.1 - Declare file paths

In [6]:
#General file paths
projectDir = os.getcwd() + "/"
parentDir = os.path.abspath(os.path.join(projectDir, os.pardir)) + "/"
dataPath = os.path.abspath(os.path.join(projectDir, os.pardir)) + "/isic-2024-challenge/"

#Metadata file paths
#metaPath = dataPath + "train-metadata.csv"
metaPath = dataPath + "sample-metadata.csv"

#Image file path
#hdf5_file = dataPath + "train-image.hdf5"
hdf5_file = dataPath + "sample-image.hdf5"

### 2.2 - Load metadata from csv

In [7]:
#Import metadata
metadata = pd.read_csv(metaPath, sep=",")

#METADATA: color and size features having no NAs
metadata = metadata[["isic_id",
                     "target",
                     "clin_size_long_diam_mm",
                     "tbp_lv_areaMM2",
                     "tbp_lv_area_perim_ratio",
                     "tbp_lv_eccentricity",
                     "tbp_lv_minorAxisMM",
                     "tbp_lv_color_std_mean",
                     "tbp_lv_deltaLBnorm",
                     "tbp_lv_radial_color_std_max"]]

#Verify that there are no NAs
print("-- X_meta NA counts --")
print(metadata.isna().sum())

-- X_meta NA counts --
isic_id                        0
target                         0
clin_size_long_diam_mm         0
tbp_lv_areaMM2                 0
tbp_lv_area_perim_ratio        0
tbp_lv_eccentricity            0
tbp_lv_minorAxisMM             0
tbp_lv_color_std_mean          0
tbp_lv_deltaLBnorm             0
tbp_lv_radial_color_std_max    0
dtype: int64


### 2.3 - Clean data

In [8]:
#Add code here


### 2.4a - Train, Validate, Test Split

In [9]:
#Function to perform train-validate or train-test-validate split on a list of isic_ids
def ttv_split(isic_ids, test_frac=0.2, validate_frac=0.2, random_state=88, shuffle=True, stratify=None):
    if test_frac < 0 or validate_frac < 0:
        print("ERROR: Test of validate fraction is negative")
        return None
    if test_frac > 1 or validate_frac > 1:
        print("ERROR: Test of validate fraction is above 0")
        return None
    if test_frac + validate_frac >= 1:
        print("ERROR: Test and validate fractions sum to 1 or more.")
        return None

    #Split training from the rest
    test_size = test_frac + validate_frac
    train, temp = train_test_split(isic_ids, test_size = test_size, random_state=random_state, shuffle=shuffle, stratify=stratify)
    #Split test and validate
    if test_frac == 0 or validate_frac == 0:
        return train.tolist(), temp.tolist()
    else:
        test_size = test_frac / (test_frac + validate_frac)
        test, validate = train_test_split(temp, test_size = test_size, random_state=random_state, shuffle=shuffle, stratify=stratify)
        return train.tolist(), test.tolist(), validate.tolist()

#Generate the splits of the isic_ids
train_ids, test_ids, val_ids = ttv_split(metadata["isic_id"])

In [10]:
"""
#Split train-test-validate portions of metadata into X and y
def Xy_split(metadata, ids):
    #Generate X (features without target)
    
    X = metadata[metadata["isic_id"].isin(ids)]
    X = X.loc[:, ~X.columns.isin(['isic_id', 'target'])]
    #Generate y
    y = metadata[metadata["isic_id"].isin(ids)]["target"]
    return X, y

X_train, y_train = Xy_split(metadata, train_ids)
X_test, y_test = Xy_split(metadata, test_ids)
X_validate, y_validate = Xy_split(metadata, val_ids)
"""

'\n#Split train-test-validate portions of metadata into X and y\ndef Xy_split(metadata, ids):\n    #Generate X (features without target)\n    \n    X = metadata[metadata["isic_id"].isin(ids)]\n    X = X.loc[:, ~X.columns.isin([\'isic_id\', \'target\'])]\n    #Generate y\n    y = metadata[metadata["isic_id"].isin(ids)]["target"]\n    return X, y\n\nX_train, y_train = Xy_split(metadata, train_ids)\nX_test, y_test = Xy_split(metadata, test_ids)\nX_validate, y_validate = Xy_split(metadata, val_ids)\n'

### 2.4b - Data augmentation
- Augment only the malignant data in the training set
- Reformat all lists (train_ids, test_ids, val_ids) to be compatible: list of tuples

In [11]:
#Make list of ids compatible with data augmentations
#Base data takes a value of 0, meaning it should not be modified
train_ids_mods = [(id, 0) for id in train_ids]
test_ids_mods = [(id, 0) for id in test_ids]
val_ids_mods = [(id, 0) for id in val_ids]

In [12]:
#Identify the malignant cases in the training data
all_pos = metadata[metadata["target"]==1]["isic_id"]
pos_in_train = all_pos[all_pos.isin(train_ids)]
print("Number of positives in training data:", len(pos_in_train))

Number of positives in training data: 36


In [13]:
#Augment the training list
#Duplicates of ids will each have a different number, indicating a specific augmentation to be used
nb_of_augments = 2

rng = np.random.default_rng()
for i in range(nb_of_augments):
    rand_nb = rng.random()
    #Option 1: use random float between 0 and 1
    train_ids_mods += [(id, rand_nb) for id in pos_in_train]
    #Option 2: use integer
    #train_ids_mods += [(id, nb_of_augments + 1) for id in pos_in_train]

#Shuffle the list
np.random.shuffle(train_ids_mods)
train_ids_mods

[('ISIC_0104229', 0.23820295593020158),
 ('ISIC_6016458', 0),
 ('ISIC_9454701', 0.736650921942243),
 ('ISIC_9762220', 0.23820295593020158),
 ('ISIC_2023222', 0.23820295593020158),
 ('ISIC_7464445', 0.23820295593020158),
 ('ISIC_6269100', 0.736650921942243),
 ('ISIC_5609822', 0.736650921942243),
 ('ISIC_4027733', 0.23820295593020158),
 ('ISIC_6255931', 0),
 ('ISIC_3450877', 0.23820295593020158),
 ('ISIC_6448076', 0),
 ('ISIC_5609822', 0),
 ('ISIC_4567455', 0),
 ('ISIC_4114301', 0),
 ('ISIC_3472364', 0),
 ('ISIC_3156688', 0.23820295593020158),
 ('ISIC_6784337', 0.23820295593020158),
 ('ISIC_5609822', 0.23820295593020158),
 ('ISIC_7928246', 0.736650921942243),
 ('ISIC_5475654', 0),
 ('ISIC_4824767', 0),
 ('ISIC_3516326', 0),
 ('ISIC_1805261', 0),
 ('ISIC_2604412', 0.23820295593020158),
 ('ISIC_9762220', 0),
 ('ISIC_7928246', 0),
 ('ISIC_2439617', 0.23820295593020158),
 ('ISIC_0528190', 0),
 ('ISIC_4766910', 0),
 ('ISIC_7575093', 0.23820295593020158),
 ('ISIC_2604412', 0),
 ('ISIC_9972877'

### 2.5 - Load images and create hybrid tensorflow dataset

In [14]:
# GENERATOR FOR HDF5 AND METADATA
"""
file = filepath to the hdf5 file containing the image data
metadata = the full metadata dataframe (col1 = "isic_id", col2 = "target")
img_names = list of tuples, each containing the isic_id followed by a number, signifying:
            * 0 when original data is to be used
            * random number - data augmentation is to be applied
imgSize = images are to be adjusted to this size (square) in pixels
"""
class hdf5_generator:
    def __init__(self, file, metadata, img_names, imgSize):
        self.file = file
        self.metadata = metadata
        self.img_names = img_names
        self.imgSize = imgSize

    def __call__(self):
        with h5py.File(self.file, 'r') as h5file:
            for img_name_tuple in self.img_names:
                img_name, mod = img_name_tuple
                try:
                    # Load image data from HDF5
                    img = np.array(Image.open(io.BytesIO(h5file[img_name][()])))
                    
                    # Resize the image
                    img = tf.image.resize(img, [self.imgSize, self.imgSize])
                    
                    if mod != 0:
                        # Data Augmentation 
                        img = tf.image.random_flip_left_right(img)
                        img = tf.image.random_flip_up_down(img)
                        img = tf.image.random_brightness(img, max_delta=0.2)

                    # Standardize and return as TensorFlow constant
                    img = tf.constant(img / 255, dtype=tf.float32)  # Standardize here

                    #Retrieve corresponding metadata
                    meta = self.metadata[self.metadata["isic_id"] == img_name].iloc[:,2:]

                    #Retrieve corresponding target
                    target = self.metadata[self.metadata["isic_id"] == img_name]["target"]
                    target = np.reshape(target, (1, 1))
                    
                    yield (img, meta), target
                    
                except Exception as e:
                    print(f"Error loading image {img_name}: {e}")
                    # log the error to a file for later analysis
                    with open('image_errors.log', 'a') as f:
                        f.write(f"Error loading image {img_name}: {e}\n")
                    continue

#Generate the dataset with batch size and prefetching
def make_dataset(hdf5_file, metadata, img_names, imgSize=100, batch_size=32):
   
    # Get the number of metadata features (isic_id and target are present, so subtract)
    num_features = metadata.shape[-1] - 2
    
    # Generate image dataset
    element_spec = ((tf.TensorSpec(shape=(imgSize, imgSize, 3), dtype=tf.float32),
                 tf.TensorSpec(shape=(1, num_features), dtype=tf.float32)),
                tf.TensorSpec(shape=(1, 1), dtype=tf.int32))
    
    img_dataset = tf.data.Dataset.from_generator(
        hdf5_generator(hdf5_file, metadata, img_names, imgSize),
        output_signature=element_spec
    )

    # Add shuffling, batching, and prefetching
    dataset = img_dataset.shuffle(buffer_size=min(len(img_names), 10000)).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

    return dataset

In [15]:
#Make datasets
train_dataset = make_dataset(hdf5_file, metadata, train_ids_mods)
validate_dataset = make_dataset(hdf5_file, metadata, val_ids_mods)

In [16]:
train_dataset

<_PrefetchDataset element_spec=((TensorSpec(shape=(None, 100, 100, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None, 1, 8), dtype=tf.float32, name=None)), TensorSpec(shape=(None, 1, 1), dtype=tf.int32, name=None))>

In [17]:
validate_dataset

<_PrefetchDataset element_spec=((TensorSpec(shape=(None, 100, 100, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None, 1, 8), dtype=tf.float32, name=None)), TensorSpec(shape=(None, 1, 1), dtype=tf.int32, name=None))>

## 3) CNN MODEL

### 3.1 - Model class

In [18]:
#Simple CNN model using only images and target
class CNN_model(tf.keras.Model):
    def __init__(self, neurons = 8, activ = 'tanh'):
        super().__init__()
        self.conv1 = tf.keras.layers.Conv2D(filters=16, kernel_size=5, strides=(1, 1), activation='relu', padding='same', input_shape=(100, 100, 3))
        self.pool1 = tf.keras.layers.MaxPool2D(pool_size=(2,2))
        self.flatten = tf.keras.layers.Flatten()
        self.dense1 = tf.keras.layers.Dense(neurons, activation = activ)
        self.dense2 = tf.keras.layers.Dense(1, activation='sigmoid')

    def call(self, inputs):
        x_image, x_meta = inputs

        # Convolutions
        x1 = self.conv1(x_image)
        x1 = self.pool1(x1)

        # Flattening of images for input layer
        x1 = self.flatten(x1)

        # Hidden layers of neural network
        x1 = self.dense1(x1)

        # Output layer of neural network
        output = self.dense2(x1)

        return output

#Hybrid CNN model taking metadata
class Hybrid_model(tf.keras.Model):
    def __init__(self, neurons = 8, activ = 'tanh'):
        super().__init__()
        self.conv1 = tf.keras.layers.Conv2D(filters=16, kernel_size=5, strides=(1, 1), activation='relu', padding='same', input_shape=(100, 100, 3))
        self.conv2 = tf.keras.layers.Conv2D(32, 5, activation='relu')
        self.pool = tf.keras.layers.MaxPool2D(pool_size=(2,2))
        self.flatten = tf.keras.layers.Flatten()
        self.dense1 = tf.keras.layers.Dense(neurons, activation = activ)
        self.dense2 = tf.keras.layers.Dense(neurons, activation = activ)
        self.dense3 = tf.keras.layers.Dense(1, activation='sigmoid')
        #self.dropout = tf.keras.layers.dropout(0.25)

    def call(self, inputs, training=False):
        x_image, x_meta = inputs
        # Convolutions
        x = self.conv1(x_image)
        x = self.pool(x)
        #x = self.conv2(x)
        #x = self.pool(x)
        # Flattening of images and concatenation with other data
        x = self.flatten(x)
        # Reshape metadata to match dimensions
        x_meta = tf.reshape(x_meta, (tf.shape(x_meta)[0], 8))
        #x_all = tf.concat([x,x_meta], axis=1)
        x_all = keras.layers.Concatenate(axis=1)([x, x_meta])
        # Neural Network
        x_all = self.dense1(x_all)
        #x_all = self.dense2(x_all)
        #if training:
        #    x_all = self.dropout(x_all, training=training)
        output = self.dense3(x_all)
        return output

### 3.2 - Model compiling

In [19]:
#Set seed
tf.random.set_seed(71)

#Initialize model
#model = CNN_model(neurons=8, activ='tanh')
model = Hybrid_model(neurons=8, activ='tanh')

#Define optimizer and loss function
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True,
                                          label_smoothing=0.0,
                                          axis=-1,
                                          reduction='sum_over_batch_size',
                                          name='binary_crossentropy')

#Compile the model with loss, optimizer, and metrics
model.compile(loss = loss,
              optimizer = optimizer,
              metrics = [
                  tf.keras.metrics.BinaryAccuracy(),
                  tf.keras.metrics.FalseNegatives(),
                  tf.keras.metrics.FalsePositives(),
                  tf.keras.metrics.TrueNegatives(),
                  tf.keras.metrics.TruePositives()
                  ]
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [20]:
# Take 1 batch from the dataset and check its content
for batch in train_dataset.take(1):
    (img_batch, meta_batch), target_batch = batch
    
    # Print the shapes of the individual components
    print(f"Image batch shape: {img_batch.shape}")
    print(f"Metadata batch shape: {meta_batch.shape}")
    print(f"Target batch shape: {target_batch.shape}")

# To count the total number of batches
batch_count = 0
for _ in train_dataset:
    batch_count += 1

print(f"Total number of batches in the dataset: {batch_count}")

Image batch shape: (32, 100, 100, 3)
Metadata batch shape: (32, 1, 8)
Target batch shape: (32, 1, 1)


2024-09-19 12:09:47.540942: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Total number of batches in the dataset: 5


2024-09-19 12:09:48.386420: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


### 3.3 - Model fit

In [26]:
mod = model.fit(train_dataset, epochs=20, validation_data=validate_dataset)

Epoch 1/20


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 94ms/step - binary_accuracy: 0.7102 - false_negatives: 0.0000e+00 - false_positives: 29.6667 - loss: 0.6021 - true_negatives: 0.0000e+00 - true_positives: 74.0000 - val_binary_accuracy: 0.4815 - val_false_negatives: 0.0000e+00 - val_false_positives: 14.0000 - val_loss: 0.8045 - val_true_negatives: 0.0000e+00 - val_true_positives: 13.0000
Epoch 2/20


  self.gen.throw(typ, value, traceback)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 95ms/step - binary_accuracy: 0.7150 - false_negatives: 0.0000e+00 - false_positives: 28.8333 - loss: 0.5980 - true_negatives: 0.0000e+00 - true_positives: 74.8333 - val_binary_accuracy: 0.4815 - val_false_negatives: 0.0000e+00 - val_false_positives: 14.0000 - val_loss: 0.8081 - val_true_negatives: 0.0000e+00 - val_true_positives: 13.0000
Epoch 3/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 87ms/step - binary_accuracy: 0.7393 - false_negatives: 0.0000e+00 - false_positives: 28.5000 - loss: 0.5760 - true_negatives: 0.0000e+00 - true_positives: 75.1667 - val_binary_accuracy: 0.4815 - val_false_negatives: 0.0000e+00 - val_false_positives: 14.0000 - val_loss: 0.8105 - val_true_negatives: 0.0000e+00 - val_true_positives: 13.0000
Epoch 4/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 89ms/step - binary_accuracy: 0.7011 - false_negatives: 0.0000e+00 - false_positives: 30.6667 - loss: 0.6105 - tr

In [22]:
mod.history

{'binary_accuracy': [0.695364236831665,
  0.7152317762374878,
  0.7152317762374878,
  0.7152317762374878,
  0.7152317762374878,
  0.7152317762374878,
  0.7152317762374878,
  0.7152317762374878,
  0.7152317762374878,
  0.7152317762374878,
  0.7152317762374878,
  0.7152317762374878,
  0.7152317762374878,
  0.7152317762374878,
  0.7152317762374878,
  0.7152317762374878,
  0.7152317762374878,
  0.7152317762374878,
  0.7152317762374878,
  0.7152317762374878],
 'false_negatives': [16.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 'false_positives': [30.0,
  43.0,
  43.0,
  43.0,
  43.0,
  43.0,
  43.0,
  43.0,
  43.0,
  43.0,
  43.0,
  43.0,
  43.0,
  43.0,
  43.0,
  43.0,
  43.0,
  43.0,
  43.0,
  43.0],
 'loss': [0.6219391226768494,
  0.644404947757721,
  0.699401319026947,
  0.6613954305648804,
  0.6257321834564209,
  0.6011008620262146,
  0.5987812280654907,
  0.6016252040863037,
  0.74126201868057

**BATCHES**

In [23]:
# Iterate through all batches in the dataset and print their shapes
for i, batch in enumerate(train_dataset):
    (img_batch, meta_batch), target_batch = batch
    
    # Print the shapes of the current batch
    print(f"Batch {i+1}:")
    print("  Image Batch Shape:", img_batch.shape)
    print("  Metadata Batch Shape:", meta_batch.shape)
    print("  Target Batch Shape:", target_batch.shape)

Batch 1:
  Image Batch Shape: (32, 100, 100, 3)
  Metadata Batch Shape: (32, 1, 8)
  Target Batch Shape: (32, 1, 1)
Batch 2:
  Image Batch Shape: (32, 100, 100, 3)
  Metadata Batch Shape: (32, 1, 8)
  Target Batch Shape: (32, 1, 1)
Batch 3:
  Image Batch Shape: (32, 100, 100, 3)
  Metadata Batch Shape: (32, 1, 8)
  Target Batch Shape: (32, 1, 1)
Batch 4:
  Image Batch Shape: (32, 100, 100, 3)
  Metadata Batch Shape: (32, 1, 8)
  Target Batch Shape: (32, 1, 1)
Batch 5:
  Image Batch Shape: (23, 100, 100, 3)
  Metadata Batch Shape: (23, 1, 8)
  Target Batch Shape: (23, 1, 1)
