In [1]:
import re
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from functools import partial
from kaggle_datasets import KaggleDatasets
from sklearn.model_selection import train_test_split
import tempfile
import matplotlib.pyplot as plt


# Connect to a TPU when one is available; otherwise fall back to TensorFlow's
# default distribution strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Device:', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except Exception as tpu_error:
    # Catch Exception instead of using a bare `except:` so KeyboardInterrupt and
    # SystemExit still propagate, and report why the TPU was not used rather
    # than hiding the failure.
    print('TPU not available, using the default strategy:', tpu_error)
    strategy = tf.distribute.get_strategy()
print('Number of replicas:', strategy.num_replicas_in_sync)

print(tf.__version__)

2023-02-03 11:06:53.867635: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2023-02-03 11:06:53.867824: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Device: grpc://10.0.0.2:8470


2023-02-03 11:07:00.884767: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-02-03 11:07:00.887358: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2023-02-03 11:07:00.887398: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2023-02-03 11:07:00.887421: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (55cd33250aa8): /proc/driver/nvidia/version does not exist
2023-02-03 11:07:00.890166: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operation

Number of replicas: 8
2.4.1


In [2]:
# Image geometry: decoded images are reshaped to IMAGE_SIZE and later resized
# to IMAGE_RESIZE before being batched.
IMAGE_SIZE = [1024, 1024]
IMAGE_RESIZE = [256, 256]

# Let tf.data pick the level of parallelism for loading and preprocessing
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Global batch size: 16 images for every replica of the distribution strategy
BATCH_SIZE = strategy.num_replicas_in_sync * 16

# Google Cloud Storage path of the dataset
GCS_PATH = KaggleDatasets().get_gcs_path()

In [3]:
# Split the training TFRecord files into training and validation subsets.
# The glob results are sorted first: glob does not document the order of its
# results, and the seeded split is only reproducible if its input order is.
trainingRecords, validationRecords = train_test_split(
    sorted(tf.io.gfile.glob(GCS_PATH + '/tfrecords/train*.tfrec')),
    test_size=0.1, random_state=5
)
# Sorted as well so the ordered test pipeline reads files in a stable order
testRecords = sorted(tf.io.gfile.glob(GCS_PATH + '/tfrecords/test*.tfrec'))
print('Train TFRecord Files:', len(trainingRecords))
print('Validation TFRecord Files:', len(validationRecords))
print('Test TFRecord Files:', len(testRecords))

Train TFRecord Files: 14
Validation TFRecord Files: 2
Test TFRecord Files: 16


2023-02-03 11:07:06.753053: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.
2023-02-03 11:07:06.828202: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.


We now need to convert the images to tensors (tensors are similar to matrices and can have a specific shape and dimensionality).

In [4]:
def transformAndNormalise(img):
    """Decode a JPEG byte string into a float32 RGB image tensor.

    The decoded pixel values are divided by 255 and the result is reshaped to
    ``[*IMAGE_SIZE, 3]``.
    """
    decoded = tf.image.decode_jpeg(img, channels=3)
    scaled = tf.cast(decoded, tf.float32) / 255
    return tf.reshape(scaled, [*IMAGE_SIZE, 3])
    

In [5]:
def readTFRecord(example, labeled):
    """Parse one serialized TFRecord example.

    Returns ``(image, target)`` with a float32 target when ``labeled`` is true,
    otherwise ``(image, image_name)``.
    """
    # Every record carries the encoded image; the second feature depends on
    # whether the record is labeled.
    feature_spec = {"image": tf.io.FixedLenFeature([], tf.string)}
    if labeled:
        feature_spec["target"] = tf.io.FixedLenFeature([], tf.int64)
    else:
        feature_spec["image_name"] = tf.io.FixedLenFeature([], tf.string)

    parsed = tf.io.parse_single_example(example, feature_spec)
    image = transformAndNormalise(parsed['image'])

    if not labeled:
        # Unlabeled records: pair the image with its name
        return image, parsed['image_name']

    # Labeled records: pair the image with its target as float32
    target = tf.cast(parsed['target'], tf.int32)
    return image, tf.cast(target, tf.float32)

In [7]:
def loadDataset(filenames, labeled=True, ordered=False):
    """Create a dataset of parsed records from TFRecord files.

    Yields ``(image, label)`` pairs when ``labeled`` is true and
    ``(image, image_name)`` pairs otherwise. Unless ``ordered`` is true,
    deterministic ordering is switched off so records can be used as soon as
    they stream in.
    """
    options = tf.data.Options()
    if not ordered:
        options.experimental_deterministic = False  # trade ordering for speed

    parse_record = partial(readTFRecord, labeled=labeled)
    return (
        # reads from multiple files are interleaved automatically
        tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
        .with_options(options)
        .map(parse_record, num_parallel_calls=AUTOTUNE)
    )

In [8]:
def dataAugmentation(image, label):
    """Randomly perturb a training image, then resize it to ``IMAGE_RESIZE``.

    The label is returned unchanged.
    """
    # Random JPEG re-compression adds a little noise for robustness
    augmented = tf.image.random_jpeg_quality(
        image, min_jpeg_quality=75, max_jpeg_quality=95
    )
    # Random brightening / dimming
    augmented = tf.image.random_brightness(augmented, max_delta=0.3)
    # Random horizontal flip
    augmented = tf.image.random_flip_left_right(augmented)

    return tf.image.resize(augmented, IMAGE_RESIZE), label

In [9]:
def resizeImage(image, label):
    """Resize ``image`` to ``IMAGE_RESIZE``; the second element passes through unchanged."""
    return tf.image.resize(image, IMAGE_RESIZE), label

In [10]:
# Builds the training dataset
def get_training_dataset():
    """Return the augmented, endlessly repeating, shuffled and batched training pipeline."""
    return (
        loadDataset(trainingRecords, labeled=True)
        # augmentation is applied to every image
        .map(dataAugmentation, num_parallel_calls=AUTOTUNE)
        .repeat()
        # shuffle with a buffer of 2048 elements
        .shuffle(2048)
        # group the elements into batches of BATCH_SIZE
        .batch(BATCH_SIZE)
        .prefetch(AUTOTUNE)
    )

In [11]:
def get_validation_dataset(ordered=False):
    """Return the resized, batched and cached validation pipeline."""
    records = loadDataset(validationRecords, labeled=True, ordered=ordered)
    resized = records.map(resizeImage, num_parallel_calls=AUTOTUNE)
    return resized.batch(BATCH_SIZE).cache().prefetch(AUTOTUNE)

In [12]:
def get_test_dataset(ordered=False):
    """Return the resized and batched pipeline of unlabeled ``(image, image_name)`` test records."""
    records = loadDataset(testRecords, labeled=False, ordered=ordered)
    resized = records.map(resizeImage, num_parallel_calls=AUTOTUNE)
    return resized.batch(BATCH_SIZE).prefetch(AUTOTUNE)

In [15]:
# Takes the TFRecord file names as input and returns the number of images in those records
def count_data_items(filenames):
    """Sum the per-file image counts encoded in TFRecord file names.

    Each file name is expected to contain ``-<count>.`` (for example
    ``train00-2071.tfrec`` contributes 2071).

    Args:
        filenames: iterable of TFRecord file paths.

    Returns:
        Total count across all files (0 for an empty iterable).

    Raises:
        ValueError: if a file name does not contain a ``-<digits>.`` count.
    """
    count = 0
    for filename in filenames:
        # Only inspect the final path component so a "-<digits>." sequence in a
        # directory or bucket name cannot be mistaken for the count.
        match = re.search(r"-([0-9]+)\.", os.path.basename(filename))
        if match is None:
            raise ValueError(
                "Cannot read the image count from TFRecord file name: {!r}".format(filename)
            )
        count += int(match.group(1))
    return count

In [16]:
# Count the images in each split and display the totals
NUM_TRAINING_IMAGES, NUM_VALIDATION_IMAGES, NUM_TEST_IMAGES = map(
    count_data_items, (trainingRecords, validationRecords, testRecords)
)
# Number of full batches in one pass over the training images
STEPS_PER_EPOCH = NUM_TRAINING_IMAGES // BATCH_SIZE
print(
    'Dataset: {} training images, {} validation images, {} unlabeled test images'.format(
        NUM_TRAINING_IMAGES, NUM_VALIDATION_IMAGES, NUM_TEST_IMAGES
    )
)

Dataset: 28984 training images, 4142 validation images, 10982 unlabeled test images


In [17]:
# Load the training and test metadata tables
train_csv, test_csv = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/siim-isic-melanoma-classification/train.csv',
        '../input/siim-isic-melanoma-classification/test.csv',
    )
)

In [19]:
# Measure the disparity between malignant and benign images
totalImages = len(train_csv['target'])
# rows whose target is non-zero (i.e. 1) are malignant
malignant = np.count_nonzero(train_csv['target'].to_numpy())

# every remaining row has target 0 and is therefore benign
benign = totalImages - malignant

print(
    'Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
        totalImages, malignant, 100 * malignant / totalImages
    )
)

Examples:
    Total: 33126
    Positive: 584 (1.76% of total)



In [20]:
# Build the training and validation pipelines and check that the datasets are
# of the right shape
trainDataset = get_training_dataset()
validationDataset = get_validation_dataset()

# The displayed repr of the take(1) dataset lists the element shapes and types
trainDataset.take(1)

<TakeDataset shapes: ((None, 256, 256, 3), (None,)), types: (tf.float32, tf.float32)>

In [21]:
# Display the element shapes and types of the validation pipeline
validationDataset.take(1)

<TakeDataset shapes: ((None, 256, 256, 3), (None,)), types: (tf.float32, tf.float32)>

In [22]:
# Pull one batch of images and labels from the training pipeline
images, labels = next(iter(trainDataset))

In [23]:
from tensorflow.python.keras import backend as K
# Focal loss is a loss function that addresses class imbalance.
# It modulates the standard cross-entropy loss by down-weighting well-classified
# examples and up-weighting poorly classified ones.
def focal_loss(alpha=0.25, gamma=2.0):
    """Build a focal cross-entropy loss function.

    Args:
        alpha: weight applied where ``y_true`` is 1; ``1 - alpha`` is applied
            where it is 0.
        gamma: exponent of the modulating factor ``(1 - p_t) ** gamma``.

    Returns:
        A ``loss(y_true, y_pred)`` callable that averages the weighted
        cross-entropy over the last axis.
    """
    def focal_crossentropy(y_true, y_pred):
        # plain binary cross-entropy, taken before the predictions are clipped
        bce = K.binary_crossentropy(y_true, y_pred)

        # keep the predictions within [epsilon, 1 - epsilon]
        y_pred = K.clip(y_pred, K.epsilon(), 1 - K.epsilon())

        # p_t: the probability the model assigns to the true class
        p_t = y_true * y_pred + (1 - y_true) * (1 - y_pred)

        # per-example class weight
        class_weight = y_true * alpha + (1 - alpha) * (1 - y_true)

        # the modulating factor shrinks as p_t approaches 1
        focusing_term = K.pow((1 - p_t), gamma)

        return K.mean(class_weight * focusing_term * bce, axis=-1)
    return focal_crossentropy

In [34]:
import tensorflow_addons as tfa
def make_model(output_bias = None, metrics = None):
    """Build and compile a binary classifier on top of a frozen VGG16 base.

    Args:
        output_bias: optional initial value for the bias of the final sigmoid
            unit. When None, the bias starts at zero.
        metrics: metrics forwarded to ``model.compile``.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    # Passing ``bias_initializer=None`` to Dense would not give a zero bias
    # (Keras then falls back to its generic weight initializer), so use the
    # Dense default 'zeros' explicitly when no output bias is supplied.
    if output_bias is None:
        bias_initializer = 'zeros'
    else:
        bias_initializer = tf.keras.initializers.Constant(output_bias)

    # Base model with pre-trained ImageNet weights from the VGG16 architecture
    base_model = tf.keras.applications.vgg16.VGG16(input_shape=(*IMAGE_RESIZE, 3),
                                                include_top=False,
                                                weights='imagenet')
    # Freeze the base model's weights
    base_model.trainable = False

    # Create a new sequential model
    model = tf.keras.Sequential([
        # The base model is the first layer
        base_model,
        tf.keras.layers.Dense(8, activation='swish'),
        tf.keras.layers.Flatten(),
        # The bias of the last dense layer starts at the supplied output bias,
        # which adjusts the initial output of the model to account for
        # unbalanced classes.
        tf.keras.layers.Dense(1, activation='sigmoid',
                              bias_initializer=bias_initializer)
    ])

    model.compile(optimizer='adam',
                  loss= tfa.losses.SigmoidFocalCrossEntropy(),
                  metrics=metrics)

    return model

In [35]:
# Number of full training batches per epoch (the training dataset repeats)
STEPS_PER_EPOCH = NUM_TRAINING_IMAGES // BATCH_SIZE
# Round up for validation so the final, partially filled batch is evaluated too.
# Flooring would skip those images every epoch and stop the validation iterator
# before the end of the finite, cached dataset.
VALID_STEPS = -(-NUM_VALIDATION_IMAGES // BATCH_SIZE)

In [36]:
# To better handle the class imbalance, compute an initial output bias as the
# log of the malignant-to-benign ratio
initial_bias = np.log(np.array([malignant / benign]))
initial_bias

array([-4.02038586])

In [37]:
# Build and compile the model, and create the AUC metric, inside the
# distribution strategy scope
with strategy.scope():
    model = make_model(output_bias = initial_bias, metrics=tf.keras.metrics.AUC(name='auc'))

In [38]:
# Save the model to disk whenever the monitored validation loss improves
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    "melanoma_model4.h5",
    monitor='val_loss',
    save_best_only=True,
)
# Stop training once the validation loss has not improved for 10 epochs, then
# restore the weights of the best epoch
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

In [39]:
def exponential_decay(initial_learning_rate, steps):
    """Create a schedule that divides the learning rate by 10 every ``steps`` epochs.

    Returns a function mapping an epoch index to
    ``initial_learning_rate * 0.1 ** (epoch / steps)``.
    """
    def exponential_decay_fn(epoch):
        decay = 0.1 ** (epoch / steps)
        return initial_learning_rate * decay

    return exponential_decay_fn

# Starting learning rate, and the number of epochs over which it decays by a factor of 10
initial_learning_rate = 0.01
steps = 20

# Build the per-epoch schedule function
exponential_decay_fn = exponential_decay(
    initial_learning_rate=initial_learning_rate,
    steps=steps,
)

# Callback that applies the schedule to the learning rate
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(schedule=exponential_decay_fn)


In [40]:
# Train for up to 100 epochs with checkpointing, early stopping and the
# learning-rate schedule. The training dataset repeats, so steps_per_epoch
# sets the length of an epoch.
history = model.fit(
    trainDataset, epochs=100,
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_data=validationDataset,
    validation_steps=VALID_STEPS,
    callbacks=[checkpoint_cb, early_stopping_cb, lr_scheduler]
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100


In [41]:
# Get the test dataset in deterministic order and compute predictions on its
# images only (the image names are dropped for the predict call)
test_ds = get_test_dataset(ordered=True)
test_images_ds = test_ds.map(lambda image, idnum: image)

print('predicting')



probabilities = model.predict(test_images_ds)


predicting


2023-02-03 12:04:48.071114: W ./tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h:57] Ignoring an error encountered when deleting remote tensors handles: Invalid argument: Unable to find the relevant tensor remote_handle: Op ID: 38852, Output num: 0
Additional GRPC error information from remote target /job:worker/replica:0/task:0:
:{"created":"@1675425888.067590060","description":"Error received from peer ipv4:10.0.0.2:8470","file":"external/com_github_grpc_grpc/src/core/lib/surface/call.cc","file_line":1056,"grpc_message":"Unable to find the relevant tensor remote_handle: Op ID: 38852, Output num: 0","grpc_status":3}


In [44]:
# Load the sample submission, used below as the template for the submission file
sub = pd.read_csv('/kaggle/input/siim-isic-melanoma-classification/sample_submission.csv')
sub.head()

Unnamed: 0,image_name,target
0,ISIC_0052060,0
1,ISIC_0052349,0
2,ISIC_0058510,0
3,ISIC_0073313,0
4,ISIC_0073502,0


In [45]:


# Keep only the image names from the test dataset. This iterates test_ds a
# second time; it was built with ordered=True so the names line up with the
# predictions computed earlier.
id_numbers = test_ds.map(lambda image, idnum: idnum)

# Flatten the batches back into individual names
unbatched_ids = id_numbers.unbatch()

# Collect all names in a single batch of size NUM_TEST_IMAGES
all_ids = next(iter(unbatched_ids.batch(NUM_TEST_IMAGES)))

# Convert the batch of names to a numpy array of unicode strings
test_ids = all_ids.numpy().astype('U')

print('Generating submission.csv file...')


Generating submission.csv file...


In [50]:
# Pair each test image name with its prediction; np.concatenate joins the
# per-row prediction arrays into one flat array
pred_df = pd.DataFrame({'image_name': test_ids, 'target': np.concatenate(probabilities)})
pred_df.head()

Unnamed: 0,image_name,target
0,ISIC_6381819,0.064375
1,ISIC_5583376,0.133628
2,ISIC_6408546,0.139268
3,ISIC_6932354,0.156253
4,ISIC_8191278,0.125516


In [51]:
# Swap the placeholder target column for the predictions by joining the
# submission template with the prediction DataFrame on image_name
sub = sub.drop(columns='target').merge(pred_df, on='image_name')

# Write the merged DataFrame to a csv file without the index
sub.to_csv('submission.csv', index=False)

# Show the first 5 rows of the merged DataFrame
sub.head()

Unnamed: 0,image_name,target
0,ISIC_0052060,0.110213
1,ISIC_0052349,0.166452
2,ISIC_0058510,0.090335
3,ISIC_0073313,0.060395
4,ISIC_0073502,0.088133
