In [None]:
# Download the ISIC 2020 training images and ground-truth labels into ./data.
!mkdir -p data
!wget https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_JPEG.zip
!mv ISIC_2020_Training_JPEG.zip data/jpeg.zip
!unzip data/jpeg.zip -d data/jpeg
# Rename the extracted *.jpg files to *.jpeg; get_dataset() builds paths ending in ".jpeg".
!rename.ul jpg jpeg data/jpeg/train/*.jpg
!wget https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_GroundTruth.csv
!mv ISIC_2020_Training_GroundTruth.csv data/train.csv
# The archive is no longer needed once it has been extracted.
!rm data/jpeg.zip


In [85]:

import os
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
#import keras_tuner as kt


# Default size for every matplotlib figure created in this notebook.
mpl.rcParams.update({'figure.figsize': (12, 10)})
# Colours of the active property cycle, kept for reuse when plotting.
colors = mpl.rcParams['axes.prop_cycle'].by_key()['color']






In [2]:
# Root folder populated by the download cell: images plus the label CSV.
DATA_DIR = 'data'
# Folder holding the extracted training JPEGs.
IMAGES_DIR = os.path.join(DATA_DIR, 'jpeg', 'train')
# Ground-truth CSV, read by CsvTransformer and by the split cell.
TRAIN_CSV = os.path.join(DATA_DIR, 'train.csv')
TF_PREFIX = 'train'  # NOTE(review): not referenced anywhere in the visible notebook
# Fraction of the non-validation rows used for training; the remainder is the test split.
TRAIN_SPLIT = 0.9
# Fraction of the non-validation rows used for the small split.
SMALL_SPLIT = 0.01
# Size passed to tf.image.resize as (height, width).
# NOTE(review): (224, 222) is not square - possibly a typo for (224, 224); confirm.
IMAGE_SIZE = (224, 222)
# Batch size passed to get_dataset().
BATCH_SIZE = 256



In [3]:
def age_normalization_fn(x):
    """Scale an age by 1/90; non-finite ages (e.g. NaN for missing) map to 0.5."""
    if not np.isfinite(x):
        return 0.5
    return x / 90


def sex_nomalization_fn(x):
    """Encode sex as a float: 1.0 for "male", 0.0 for any other value."""
    if x == "male":
        return 1.0
    return 0.0

class CsvTransformer:
    """Expose per-image metadata from the training CSV as TensorFlow lookup tables.

    The CSV is read once; ``age`` and ``sex`` are converted to floats with
    ``age_normalization_fn`` / ``sex_nomalization_fn``, ``target`` is cast to float64,
    and one ``tf.lookup.StaticHashTable`` keyed by ``image_name`` is built for every
    column listed in ``cols``.
    """

    # Columns for which a lookup table is built.
    cols = ['age', 'sex', 'target']

    def __init__(self, csv_path):
        """Read ``csv_path``, normalise its columns and build the lookup tables."""
        frame = pd.read_csv(csv_path)
        frame['age'] = frame['age_approx'].apply(age_normalization_fn).values
        frame['sex'] = frame['sex'].apply(sex_nomalization_fn).values
        frame['target'] = frame['target'].astype("float64")
        self.df = frame.set_index('image_name')
        self.init_tables()

    def init_tables(self):
        """Build ``self.lookup_tables``: one image_name -> value hash table per column."""
        image_names = list(self.df.index.values)
        # Value returned by a lookup when the image name is not present in the CSV.
        missing = tf.constant(-1.0, dtype=tf.float64)

        tables = {}
        for col in self.cols:
            column_values = self.df[col].to_numpy()
            initializer = tf.lookup.KeyValueTensorInitializer(
                keys=tf.constant(image_names),
                values=tf.constant(column_values),
            )
            tables[col] = tf.lookup.StaticHashTable(
                initializer=initializer,
                default_value=missing,
                name="class_weight",
            )
        self.lookup_tables = tables

    def get_vector_from_image_name(self, col, image_name):
        """Look up ``col`` for ``image_name``; unknown names yield -1.0."""
        table = self.lookup_tables[col]
        return table.lookup(image_name)
    
# Notebook-level instance; map_fn uses it to look up sex / age / target by image name.
transformer = CsvTransformer(TRAIN_CSV)



2022-01-18 09:14:57.415976: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [64]:
# DataFrame.sample() is called without random_state, so it draws from NumPy's global
# RNG; seeding that RNG makes the shuffle repeatable.
np.random.seed(0)
original_df = (
    pd.read_csv(TRAIN_CSV)
    .sample(frac=1)  # shuffle all rows
    .reset_index(drop=True)
    .set_index('image_name')
)

# The first 2000 shuffled rows are held out for validation; `original_df` keeps the rest.
validation_df, original_df = original_df.iloc[:2000], original_df.iloc[2000:]

remaining_rows = len(original_df)
split_point = int(remaining_rows * TRAIN_SPLIT)
small_split_point = int(remaining_rows * SMALL_SPLIT)

# train/test partition the remaining rows; small_df is the leading slice of the same
# rows, so it overlaps with train_df.
train_df = original_df.iloc[:split_point]
test_df = original_df.iloc[split_point:]
small_df = original_df.iloc[:small_split_point]

train_df.shape, test_df.shape, small_df.shape, validation_df.shape


((28013, 7), (3113, 7), (311, 7), (2000, 7))

In [65]:

def map_fn(filename):
    """Load one image and its metadata; intended for ``tf.data.Dataset.map``.

    Args:
        filename: scalar string tensor with the path of a JPEG file. The image name is
            the text after the last '/' and before the first '.' of that path.

    Returns:
        ``(features, target)`` where ``features`` is a dict with keys ``"image"``
        (RGB image resized to ``IMAGE_SIZE``), ``"image_name"``, ``"sex"`` and ``"age"``,
        and ``target`` is the label cast to int32. For names missing from the CSV the
        lookups return their default of -1.
    """
    # channels=3 forces RGB output: non-RGB JPEGs can no longer produce a different
    # channel count (which would break batching) and the channel dimension is static.
    image = tf.image.decode_jpeg(tf.io.read_file(filename), channels=3)
    image = tf.image.resize(image, IMAGE_SIZE)

    # "<dir>/<image_name>.jpeg" -> "<image_name>"
    image_name = tf.strings.split(filename, sep='/')[-1]
    image_name = tf.strings.split(image_name, sep='.')[0]
    sex = transformer.get_vector_from_image_name('sex', image_name)
    age = transformer.get_vector_from_image_name('age', image_name)
    target = tf.cast(transformer.get_vector_from_image_name('target', image_name), dtype=tf.int32)

    return {"image": image,
            "image_name": image_name,
            "sex": sex,
            "age": age}, target

def get_dataset(df: pd.DataFrame, batch_size: int):
    """Build a cached, batched ``tf.data.Dataset`` for the images listed in ``df``.

    Args:
        df: frame indexed by image name; every name is expected to exist as
            ``<IMAGES_DIR>/<image_name>.jpeg``.
        batch_size: number of examples per batch.

    Returns:
        Dataset yielding batches of the ``(features, target)`` pairs produced by
        ``map_fn``.
    """
    # '/' is used rather than os.path.join because map_fn splits the path on '/'.
    filenames = IMAGES_DIR + '/' + df.index.values + ".jpeg"

    ds = tf.data.Dataset.from_tensor_slices(filenames)
    ds = (
        # Decode images in parallel; element order is still preserved by default.
        ds.map(map_fn, num_parallel_calls=tf.data.AUTOTUNE)
        # Keep decoded examples in memory after the first full pass.
        .cache()
        .batch(batch_size)
        # Prefetch last so that whole batches are prepared while the model is busy
        # (prefetching before batch() only buffered single examples).
        .prefetch(tf.data.AUTOTUNE)
    )

    return ds



In [6]:
# The full training dataset is left disabled here.
#train_ds = get_dataset(train_df, BATCH_SIZE)
test_dataset = get_dataset(test_df, BATCH_SIZE)
small_dataset = get_dataset(small_df, BATCH_SIZE)
# Stored under its own name: rebinding `validation_df` to a Dataset made this cell fail
# when re-run (a Dataset has no `.index`) and discarded the validation DataFrame.
validation_dataset = get_dataset(validation_df, BATCH_SIZE)

In [7]:
# Report how many positive examples each split contains.
dfs = [(train_df, "Train"), (test_df, "test"), (small_df, "small")]

for df, df_name in dfs:
    # minlength=2 keeps the two-way unpacking valid when a split has no positive
    # example (bincount would otherwise return a single count).
    neg, pos = np.bincount(df['target'], minlength=2)
    total = neg + pos
    print('{} Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
        df_name, total, pos, 100 * pos / total))


Train Examples:
    Total: 29813
    Positive: 526 (1.76% of total)

test Examples:
    Total: 3313
    Positive: 58 (1.75% of total)

small Examples:
    Total: 331
    Positive: 5 (1.51% of total)



# Model

In [80]:
class MyModel(tf.keras.Model):
    """Binary classifier that fuses image features with tabular metadata columns.

    The image batch goes through ``preprocessor`` and a pretrained ``network`` backbone
    (ImageNet weights, no classification head, global pooling). The columns named in
    ``extra_cols`` are stacked and passed through a ReLU dense layer. Both feature
    vectors are concatenated, optionally passed through dropout plus an intermediate
    dense layer, and finally through dropout and a single sigmoid unit.

    Args:
        preprocessor: callable applied to the image batch before the backbone.
        network: backbone constructor; called with ``weights='imagenet'``,
            ``include_top=False`` and ``pooling``.
        pooling: pooling mode forwarded to ``network``.
        dense_intermediate: units of the optional intermediate dense layer; values
            ``<= 0`` disable it.
        dropout: rate shared by the dropout layers.
        extra_cols: keys of the input dict used as tabular features.
        extra_cols_out: units of the dense layer applied to the tabular features.
        bias: initializer for the bias of the output unit; ``None`` means zeros.
    """

    def __init__(self,  preprocessor = tf.keras.applications.mobilenet_v3.preprocess_input,
                        network = tf.keras.applications.MobileNetV3Large,
                        pooling = "max",
                        dense_intermediate = -1,
                        dropout = 0.5,
                        extra_cols = ["sex", "age"],
                        extra_cols_out = 32,
                        bias = None):

        super().__init__()
        # Copy so the shared default list (or a caller's list) is never aliased.
        self.extra_cols = list(extra_cols)
        self.image_preprocessor = preprocessor
        self.image_feature_extractor = network(weights='imagenet', include_top = False, pooling = pooling)

        self.image_size = IMAGE_SIZE

        self.extra_cols_dense = tf.keras.layers.Dense(extra_cols_out, activation = "relu", name="extra_cols_dense")
        self.extra_cols_flatten = tf.keras.layers.Flatten()

        self.use_intermediate = dense_intermediate > 0
        if self.use_intermediate:
            self.dropout1 = tf.keras.layers.Dropout(dropout)
            self.dense1 = tf.keras.layers.Dense(dense_intermediate, activation="relu", name = "final_dense_intermediate")

        self.dropout2 = tf.keras.layers.Dropout(dropout)
        # Passing bias_initializer=None is not the same as omitting it: Keras then falls
        # back to its generic default initializer for float weights instead of Dense's
        # usual "zeros". Map None to "zeros" explicitly.
        bias_initializer = "zeros" if bias is None else bias
        self.out = tf.keras.layers.Dense(1, activation = "sigmoid", bias_initializer=bias_initializer, name = "final_dense")

    def preprocess_images(self, images):
        """Apply the backbone's preprocessing, then resize to ``self.image_size``."""
        x = self.image_preprocessor(images)
        x = tf.image.resize(x, self.image_size)
        return x

    def call(self, inputs):
        """Return the sigmoid output for a batch.

        Args:
            inputs: dict with an ``"image"`` entry and one entry per name in
                ``self.extra_cols``.
        """
        preprocessed_images = self.preprocess_images(inputs['image'])
        # The backbone is always called in inference mode, also while fitting.
        image_output = self.image_feature_extractor(preprocessed_images, training = False)

        # Stack the tabular columns along a new last axis: one feature per column.
        tabular = tf.stack([inputs[col] for col in self.extra_cols], axis = -1)

        tabular = self.extra_cols_dense(tabular)
        tabular = self.extra_cols_flatten(tabular)

        x = tf.concat([tabular, image_output], axis = -1)
        if self.use_intermediate:
            x = self.dropout1(x)
            x = self.dense1(x)

        x = self.dropout2(x)
        out = self.out(x)

        return out



In [81]:
# Pick a distribution strategy: TPU when one can be resolved, otherwise all visible
# GPUs (mirrored), otherwise TensorFlow's default strategy.
try:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    print("All devices: ", tf.config.list_logical_devices('TPU'))
    # tf.distribute.experimental.TPUStrategy is deprecated in favour of this stable name.
    strategy = tf.distribute.TPUStrategy(resolver)
except ValueError:
    # A ValueError from the TPU setup is treated as "no TPU available".
    print("No TPU found")
    devices_found = tf.config.list_physical_devices('GPU')
    print("Devices:", devices_found, len(devices_found))
    if not devices_found:
        print("No devices found, using default")
        strategy = tf.distribute.get_strategy()
    else:
        print("Devices found, using mirrored")
        strategy = tf.distribute.MirroredStrategy()


No TPU found
Devices: [] 0
No devices found, using default


In [82]:

with strategy.scope():
    # Start the sigmoid output at the training-set prior. sigmoid(b) == pos / (pos + neg)
    # requires b = log(pos / neg), i.e. the log-odds, not log(pos / total); the counts
    # are taken from train_df instead of hard-coded approximations.
    n_pos = int(train_df['target'].sum())
    n_neg = len(train_df) - n_pos
    output_bias = tf.keras.initializers.Constant(np.log([n_pos / n_neg]))
    model = MyModel(bias=output_bias)




In [83]:
# Run every batch of the small dataset through the model once: the first call creates
# the model's weights (needed before summary()), and a complete pass also fills the
# dataset's cache.
for el, _ in small_dataset:
    model(el)

# summary() prints the table itself and returns None, so wrapping it in print() only
# appended a stray "None" line to the output.
model.summary()

Model: "my_model_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
MobilenetV3large (Functional (None, 1280)              4226432   
_________________________________________________________________
extra_cols_dense (Dense)     multiple                  96        
_________________________________________________________________
flatten (Flatten)            multiple                  0         
_________________________________________________________________
dropout_22 (Dropout)         multiple                  0         
_________________________________________________________________
final_dense (Dense)          multiple                  1313      
Total params: 4,227,841
Trainable params: 4,203,441
Non-trainable params: 24,400
_________________________________________________________________
None


In [84]:
# Baseline: evaluate the untrained classifier head on the small split.
ds = small_dataset

METRICS = [
        tf.keras.metrics.TruePositives(name='tp'),
        tf.keras.metrics.FalsePositives(name='fp'),
        tf.keras.metrics.TrueNegatives(name='tn'),
        tf.keras.metrics.FalseNegatives(name='fn'),
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        tf.keras.metrics.AUC(name='auc'),
        tf.keras.metrics.AUC(name='prc', curve='PR'), # precision-recall curve
]


model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=METRICS,
              )

# `ds` is already batched by get_dataset(); Keras documents that batch_size must not be
# specified for dataset inputs, so it is no longer passed.
baseline_results = model.evaluate(ds, verbose=0)

print("Train results before training")
for name, value in zip(model.metrics_names, baseline_results):
    print(name, ': ', value)
print()

Train results before training
loss :  6.3953166007995605
tp :  5.0
fp :  324.0
tn :  2.0
fn :  0.0
accuracy :  0.021148037165403366
precision :  0.015197568573057652
recall :  1.0
auc :  0.5352761149406433
prc :  0.01687612757086754



In [54]:
lr = 3e-5
epochs = 10
# Both names point at the small split, so the "validation" metrics of this run are
# measured on the training data itself.
train_ds = small_dataset
test_ds = small_dataset


# `learning_rate` is the supported keyword; `lr` is a deprecated alias and the tuner
# cell already uses `learning_rate`.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
            loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
            metrics=METRICS,
            )


# Stop on the validation PR-AUC ('prc' in METRICS) and restore the best weights.
# NOTE(review): patience equals `epochs`, so this callback cannot stop the run early.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_prc',
    verbose=1,
    patience=10,
    mode='max',
    restore_best_weights=True)


# Keep the History object so the per-epoch metrics stay available after training.
history = model.fit(train_ds,
                    epochs=epochs,
                    callbacks=[early_stopping],
                    validation_data=test_ds)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa52ad88940>

In [None]:
def model_builder(hp):
    """Build and compile a fresh ``MyModel`` from the hyperparameters sampled in ``hp``.

    Args:
        hp: Keras Tuner hyperparameter container; ``hp.Choice`` is used to register
            and sample ``extra_cols``, ``extra_cols_out``, ``intermediate`` and
            ``learning_rate``.

    Returns:
        The newly constructed, compiled model for this trial.
    """
    # Choice accepts only int/float/str/bool values, so the column subsets are encoded
    # as comma-separated strings and split back into lists here.
    hp_extra_cols = hp.Choice('extra_cols', values=["sex,age", "sex", "age"]).split(",")
    hp_extra_cols_out = hp.Choice('extra_cols_out', values=[1, 32, 128])
    hp_intermediate = hp.Choice('intermediate', values=[-1, 32, 128])

    # Bind the new model to a local name. Previously the constructed model was
    # discarded and the notebook-level `model` was compiled and returned, so every
    # trial trained the same model regardless of the sampled hyperparameters.
    model = MyModel(extra_cols=hp_extra_cols, extra_cols_out=hp_extra_cols_out, dense_intermediate=hp_intermediate)

    hp_learning_rate = hp.Choice('learning_rate', values=[1e-3, 1e-4, 3e-5, 1e-5])

    model.compile(  optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate),
                    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                    metrics=METRICS)

    return model


# NOTE: this cell needs `import keras_tuner as kt`, which is commented out in the
# imports cell; enable it there before running.
# NOTE(review): 'val_accuracy' is the objective while the training cell monitors
# 'val_prc'; confirm which metric should drive the search.
tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     max_epochs=10,
                     factor=3,
                     directory='my_dir',
                     project_name='intro_to_kt')


stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# train_ds is a tf.data.Dataset, for which Keras does not support `validation_split`;
# an explicit validation dataset is passed instead.
tuner.search(train_ds, epochs=50, validation_data=test_ds, callbacks=[stop_early])

best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Report the hyperparameters that model_builder actually registers; there is no
# 'units' hyperparameter, so best_hps.get('units') could not succeed.
print(f"""
The hyperparameter search is complete. The best trial used extra columns
{best_hps.get('extra_cols')}, {best_hps.get('extra_cols_out')} units in the extra-column layer,
an intermediate dense size of {best_hps.get('intermediate')} (-1 disables that layer)
and a learning rate of {best_hps.get('learning_rate')}.
""")
