In [1]:
import re
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from functools import partial
from kaggle_datasets import KaggleDatasets
from sklearn.model_selection import train_test_split
import tempfile
import matplotlib.pyplot as plt


# Prefer a TPU when the runtime exposes one; otherwise fall back to the default
# distribution strategy so the notebook still runs on CPU / a single GPU.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Device:', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except Exception as tpu_error:
    # Catch `Exception` rather than using a bare `except:` so that kernel
    # interrupts are not swallowed, and report why the TPU path was skipped.
    print('TPU not available, using the default strategy:', tpu_error)
    strategy = tf.distribute.get_strategy()
print('Number of replicas:', strategy.num_replicas_in_sync)

print(tf.__version__)

2023-01-26 09:22:41.964921: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


Number of replicas: 1
2.4.1


In [3]:
# Let tf.data choose parallelism / prefetch buffer sizes at runtime.
AUTOTUNE = tf.data.experimental.AUTOTUNE
# GCS location of the attached Kaggle dataset; the TFRecord files are globbed from here.
GCS_PATH = KaggleDatasets().get_gcs_path()
# Global batch size: 16 examples per replica of the active strategy.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
# [height, width] that decoded images are reshaped to in transformAndNormalise.
IMAGE_SIZE = [1024, 1024]
# [height, width] that images are resized to before batching.
IMAGE_RESIZE = [256, 256]

In [4]:
# Hold out 10% of the training TFRecord files for validation. The split is done
# per file, not per image; random_state pins the shuffle used by the split.
trainingRecords, validationRecords = train_test_split(
    # glob returns the list of files matching the given pattern
    tf.io.gfile.glob(GCS_PATH + '/tfrecords/train*.tfrec'),
    test_size=0.1, random_state=5 
)
# Unlabeled test files are used only for the final predictions.
testRecords = tf.io.gfile.glob(GCS_PATH + '/tfrecords/test*.tfrec')
print('Train TFRecord Files:', len(trainingRecords))
print('Validation TFRecord Files:', len(validationRecords))
print('Test TFRecord Files:', len(testRecords))

2023-01-26 09:22:56.174106: W tensorflow/core/platform/cloud/google_auth_provider.cc:184] All attempts to get a Google authentication bearer token failed, returning an empty token. Retrieving token from files failed with "Not found: Could not locate the credentials file.". Retrieving token from GCE failed with "Failed precondition: Error executing an HTTP request: libcurl code 6 meaning 'Couldn't resolve host name', error details: Couldn't resolve host 'metadata'".


Train TFRecord Files: 14
Validation TFRecord Files: 2
Test TFRecord Files: 16


We now need to convert the images to tensors (tensors are similar to matrices and can have a specific shape and dimensionality).

In [5]:
def transformAndNormalise(img):
    """Decode JPEG bytes into a float32 RGB tensor with values divided by 255.

    Args:
        img: String tensor holding one JPEG-encoded image.

    Returns:
        A float32 tensor reshaped to ``[*IMAGE_SIZE, 3]``.
    """
    # channels=3 forces an RGB decode regardless of how the JPEG was stored.
    decoded = tf.image.decode_jpeg(img, channels=3)
    # Convert the pixel values to float32 and scale them by 1/255.
    scaled = tf.cast(decoded, tf.float32) / 255
    # Give the tensor the fixed [height, width, 3] layout defined by IMAGE_SIZE.
    return tf.reshape(scaled, [*IMAGE_SIZE, 3])
    

In [6]:
def readTFRecord(example,labeled):
    """Parse one serialized TFRecord example into an image plus its companion field.

    Args:
        example: A serialized example read from a TFRecord file.
        labeled: If truthy, the record is parsed with a ``target`` field;
            otherwise with an ``image_name`` field.

    Returns:
        ``(image, label)`` with ``label`` cast to int32 when ``labeled`` is
        truthy, else ``(image, image_name)``.
    """
    # Both layouts carry the encoded image; only the second feature differs.
    if labeled:
        feature_spec = {
            "image": tf.io.FixedLenFeature([], tf.string),
            "target": tf.io.FixedLenFeature([], tf.int64)
        }
    else:
        feature_spec = {
            "image": tf.io.FixedLenFeature([], tf.string),
            "image_name": tf.io.FixedLenFeature([], tf.string)
        }
    parsed = tf.io.parse_single_example(example, feature_spec)
    image = transformAndNormalise(parsed['image'])
    if not labeled:
        # Unlabeled records: pair the image with its name.
        return image, parsed['image_name']
    # Labeled records: pair the image with its int32 target.
    return image, tf.cast(parsed['target'], tf.int32)

In [7]:
def loadDataset(filenames,labeled=True, ordered = False):
    """Build a dataset of decoded examples from a list of TFRecord files.

    Args:
        filenames: TFRecord file paths to read.
        labeled: Forwarded to ``readTFRecord``; selects (image, label) vs
            (image, image_name) elements.
        ordered: When falsy, deterministic ordering is switched off so
            elements can be used as soon as they are available.

    Returns:
        A ``tf.data.Dataset`` of parsed, unbatched pairs.
    """
    read_options = tf.data.Options()
    if not ordered:
        # Trade ordering for throughput.
        read_options.experimental_deterministic = False
    parse_record = partial(readTFRecord, labeled=labeled)
    # num_parallel_reads interleaves reads across the input files.
    records = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
    return records.with_options(read_options).map(parse_record, num_parallel_calls=AUTOTUNE)

In [10]:
def dataAugmentation(image, label):
    """Apply a random horizontal flip, then resize to ``IMAGE_RESIZE``.

    Args:
        image: Image tensor to augment.
        label: Passed through untouched.

    Returns:
        ``(augmented_image, label)``.
    """
    flipped = tf.image.random_flip_left_right(image)
    resized = tf.image.resize(flipped, IMAGE_RESIZE)
    return resized, label

In [11]:
def resizeImage(image, label):
    """Resize ``image`` to ``IMAGE_RESIZE`` and pass ``label`` through unchanged."""
    return tf.image.resize(image, IMAGE_RESIZE), label

In [12]:
def get_training_dataset():
    """Build the infinite, shuffled, batched training pipeline.

    Returns:
        A ``tf.data.Dataset`` yielding ``(images, labels)`` batches of
        ``BATCH_SIZE`` augmented training examples, repeating forever.
    """
    dataset = loadDataset(trainingRecords, labeled=True)
    dataset = dataset.map(dataAugmentation, num_parallel_calls=AUTOTUNE)
    # Shuffle BEFORE repeat: repeating first lets the shuffle buffer mix
    # elements from neighbouring epochs, so some examples could appear twice
    # before others are seen once.
    dataset = dataset.shuffle(2048)
    dataset = dataset.repeat()
    dataset = dataset.batch(BATCH_SIZE)
    # Overlap input preparation with training steps.
    dataset = dataset.prefetch(AUTOTUNE)
    return dataset

In [13]:
def get_validation_dataset(ordered=False):
    """Build the batched, cached validation pipeline.

    Args:
        ordered: Forwarded to ``loadDataset``; keeps record order when truthy.

    Returns:
        A ``tf.data.Dataset`` of ``(images, labels)`` batches of ``BATCH_SIZE``.
    """
    records = loadDataset(validationRecords, labeled=True, ordered=ordered)
    return (
        records
        .map(resizeImage, num_parallel_calls=AUTOTUNE)
        .batch(BATCH_SIZE)
        .cache()  # cache whole batches for reuse on later passes
        .prefetch(AUTOTUNE)
    )

In [14]:
def get_test_dataset(ordered=False):
    """Build the batched test pipeline of ``(images, image_names)``.

    Args:
        ordered: Forwarded to ``loadDataset``; keeps record order when truthy.

    Returns:
        A ``tf.data.Dataset`` of unlabeled batches of ``BATCH_SIZE``.
    """
    records = loadDataset(testRecords, labeled=False, ordered=ordered)
    # resizeImage passes its second argument through, so it also works for
    # (image, image_name) pairs.
    return (
        records
        .map(resizeImage, num_parallel_calls=AUTOTUNE)
        .batch(BATCH_SIZE)
        .prefetch(AUTOTUNE)
    )

In [15]:
def count_data_items(filenames):
    """Sum the per-file example counts encoded in TFRecord file names.

    Each name is expected to contain ``-<count>.`` (for example
    ``train00-2071.tfrec`` contributes 2071).

    Args:
        filenames: Iterable of file names / paths.

    Returns:
        The total count as a NumPy integer (0 for an empty input).

    Raises:
        ValueError: If a file name does not contain a ``-<count>.`` segment.
    """
    # Compile once instead of once per file name.
    count_pattern = re.compile(r"-([0-9]*)\.")
    counts = []
    for filename in filenames:
        match = count_pattern.search(filename)
        if match is None:
            raise ValueError(
                "Cannot read an item count from file name: {!r}".format(filename)
            )
        counts.append(int(match.group(1)))
    # dtype=int keeps the result integral even when `counts` is empty
    # (np.sum([]) would otherwise return the float 0.0).
    return np.sum(counts, dtype=int)

In [16]:
# Image counts per split, read from the counts embedded in the TFRecord file names.
NUM_TRAINING_IMAGES, NUM_VALIDATION_IMAGES, NUM_TEST_IMAGES = (
    count_data_items(records)
    for records in (trainingRecords, validationRecords, testRecords)
)
# Whole batches per pass over the training images; the remainder is dropped.
STEPS_PER_EPOCH = NUM_TRAINING_IMAGES // BATCH_SIZE
print(
    'Dataset: {} training images, {} validation images, {} unlabeled test images'.format(
        NUM_TRAINING_IMAGES,
        NUM_VALIDATION_IMAGES,
        NUM_TEST_IMAGES,
    )
)

Dataset: 28984 training images, 4142 validation images, 10982 unlabeled test images


In [17]:
#Reading the csv's
train_csv = pd.read_csv('../input/siim-isic-melanoma-classification/train.csv')
test_csv = pd.read_csv('../input/siim-isic-melanoma-classification/test.csv')

In [18]:
# Measure the class imbalance between malignant and benign training examples.
totalImages = train_csv['target'].size

# Any non-zero target counts as malignant (positive); the rest are benign.
malignant = np.count_nonzero(train_csv['target'])
benign = totalImages - malignant

print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    totalImages, malignant, 100 * malignant / totalImages))

Examples:
    Total: 33126
    Positive: 584 (1.76% of total)



In [19]:
# Build the training and validation input pipelines.
trainDataset = get_training_dataset()
validationDataset = get_validation_dataset()

# Displaying a one-batch slice shows the element shapes and dtypes of the pipeline.
trainDataset.take(1)

<TakeDataset shapes: ((None, 256, 256, 3), (None,)), types: (tf.float32, tf.int32)>

In [20]:
# Same shape/dtype check for the validation pipeline.
validationDataset.take(1)

<TakeDataset shapes: ((None, 256, 256, 3), (None,)), types: (tf.float32, tf.int32)>

In [21]:
# Pull a single batch out of the training pipeline (e.g. for manual inspection).
images, labels = next(iter(trainDataset))

In [28]:
def make_model(output_bias = None, metrics = None):
    """Build and compile a binary classifier on a frozen EfficientNetB1 backbone.

    The model expects float images scaled to [0, 1] (what
    ``transformAndNormalise`` produces) of size ``IMAGE_RESIZE``.

    Args:
        output_bias: Optional initial value for the bias of the final sigmoid
            unit. When ``None`` the bias starts at zero.
        metrics: Forwarded unchanged to ``model.compile``.

    Returns:
        A compiled ``tf.keras.Sequential`` model (Adam optimizer, binary
        cross-entropy loss).
    """
    # Passing `bias_initializer=None` makes Keras fall back to its generic
    # default for float weights (glorot_uniform) instead of zeros, so pick
    # the zero initializer explicitly when no bias is supplied.
    if output_bias is not None:
        bias_initializer = tf.keras.initializers.Constant(output_bias)
    else:
        bias_initializer = 'zeros'

    input_shape = (*IMAGE_RESIZE, 3)

    # The Keras EfficientNet models embed their own input rescaling and expect
    # pixel values in [0, 255]. The input pipeline already divides by 255, so
    # scale back up before the backbone to avoid normalising twice.
    # `Rescaling` lives under `experimental.preprocessing` in older TF releases.
    rescaling_layer = getattr(tf.keras.layers, 'Rescaling', None)
    if rescaling_layer is None:
        rescaling_layer = tf.keras.layers.experimental.preprocessing.Rescaling

    # Backbone with pre-trained ImageNet weights and no classification head.
    base_model = tf.keras.applications.efficientnet.EfficientNetB1(input_shape=input_shape,
                                                include_top=False,
                                                weights='imagenet')
    # Freeze the backbone so only the new head is trained.
    base_model.trainable = False

    model = tf.keras.Sequential([
        tf.keras.Input(shape=input_shape),
        rescaling_layer(255.0),
        base_model,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(8, activation='swish'),
        # Single sigmoid unit: predicted probability of the positive class.
        tf.keras.layers.Dense(1, activation='sigmoid',
                              bias_initializer=bias_initializer)
    ])

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=metrics)

    return model

In [29]:


# Whole batches per training epoch and per validation pass; integer division
# drops any final partial batch.
STEPS_PER_EPOCH = NUM_TRAINING_IMAGES // BATCH_SIZE
VALID_STEPS = NUM_VALIDATION_IMAGES // BATCH_SIZE

In [30]:
# Log-odds of the positive class, log(malignant / benign). Used as the output
# layer's initial bias so the untrained sigmoid predicts roughly the observed
# positive rate.
initial_bias = np.log([malignant/benign])
initial_bias

array([-4.02038586])

In [31]:
# Weight each class inversely to its frequency. The totalImages / 2 factor
# keeps the summed weight over all examples equal to totalImages.
weight_for_0 = (1 / benign)*(totalImages)/2.0 
weight_for_1 = (1 / malignant)*(totalImages)/2.0

# Keyed by class label, as expected by the `class_weight` argument of model.fit.
class_weight = {0: weight_for_0, 1: weight_for_1}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

Weight for class 0: 0.51
Weight for class 1: 28.36


In [32]:
# Create the model inside the strategy scope so its variables are placed
# according to the selected distribution strategy. AUC is tracked as the metric.
with strategy.scope():
    model = make_model(output_bias = initial_bias, metrics=tf.keras.metrics.AUC(name='auc'))

In [34]:
# Keep only the best model seen so far on disk. No `monitor` is passed, so the
# Keras default quantity is used (val_loss per the Keras docs).
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint("melanoma_model.h5",
                                                    save_best_only=True)

# Stop after 10 epochs without improvement and roll back to the best weights.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=10,
                                                     restore_best_weights=True)

In [None]:
# Learning rate scheduling: make the learning rate shrink as training progresses.
# The decay here is exponential: the rate starts at lr0 and is divided by 10
# every `s` epochs.

def exponential_decay(lr0, s):
    """Create an exponential-decay schedule ``epoch -> learning rate``.

    Args:
        lr0: Learning rate at epoch 0.
        s: Number of epochs over which the rate drops by a factor of 10.

    Returns:
        A function mapping an epoch index to ``lr0 * 0.1 ** (epoch / s)``.
    """
    def schedule(epoch):
        decay_factor = 0.1 ** (epoch / s)
        return lr0 * decay_factor
    return schedule

# Start at a learning rate of 0.01 and divide it by 10 every 20 epochs.
exponential_decay_fn = exponential_decay(0.01, 20)

# Keras calls the schedule with the epoch index at the start of each epoch.
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(exponential_decay_fn)

In [None]:
# Train on the repeating training pipeline, which is why steps_per_epoch must
# be given explicitly.
# validation_steps is deliberately not passed: the validation dataset is
# finite, so Keras evaluates it until exhausted. A floor-divided step count
# would skip the final partial batch on every epoch and, as far as tf.data's
# cache semantics go, would keep the cached validation data from ever being
# fully read and reused.
history = model.fit(
    trainDataset, epochs=100,
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_data=validationDataset,
    callbacks=[checkpoint_cb, early_stopping_cb, lr_scheduler],
    # Up-weight the rare malignant class in the loss.
    class_weight=class_weight
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100

In [None]:
# ordered=True keeps the record order deterministic so predictions can be
# matched with the image names read from the same dataset afterwards.
test_ds = get_test_dataset(ordered=True)
# Drop the image-name component; predict() only needs the image tensors.
test_images_ds = test_ds.map(lambda image, idnum: image)

print('predicting')
# The model ends in a single sigmoid unit, so this is one probability per image.
probabilities = model.predict(test_images_ds)

In [None]:
# Load the sample submission; it supplies the `image_name` rows that the
# predictions are merged onto.
sub = pd.read_csv('/kaggle/input/siim-isic-melanoma-classification/sample_submission.csv')
sub.head()

In [None]:
# Keep only the image-name component of each (image, image_name) test batch
id_numbers = test_ds.map(lambda image, idnum: idnum)

# Flatten the batches back into individual names
unbatched_ids = id_numbers.unbatch()

# Re-batch with a batch size equal to the number of test images so that a
# single element holds every name
all_ids = next(iter(unbatched_ids.batch(NUM_TEST_IMAGES)))

# Convert the byte-string tensor to a numpy array of unicode strings
test_ids = all_ids.numpy().astype('U')

print('Generating submission.csv file...')

In [None]:
# Pair each image name with its predicted probability. np.concatenate joins the
# per-image prediction rows into one flat array; this relies on `test_ids` and
# `probabilities` being in the same order.
pred_df = pd.DataFrame({'image_name': test_ids, 'target': np.concatenate(probabilities)})
pred_df.head()

In [None]:
# Replace the placeholder target column with the model's predictions by
# joining on image_name (the row order of the sample submission is kept).
sub = sub.drop(columns='target').merge(pred_df, on='image_name')

# Write the submission file without the DataFrame index.
sub.to_csv('submission.csv', index=False)

# Preview the first 5 rows of the merged DataFrame.
sub.head()