In [None]:
# Install the `efficientnet` package; its `tfkeras` submodule is imported below as `efnet`.
# NOTE(review): the version is not pinned, so re-runs may pick up a different release.
!pip install -q efficientnet

In [None]:
import os
import re
import time
import typing

import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics, model_selection

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras 
from tensorflow.keras import backend as K
from efficientnet import tfkeras as efnet

from kaggle_datasets import KaggleDatasets

In [None]:
# USE DIFFERENT SEED FOR DIFFERENT STRATIFIED KFOLD
# SEED also feeds the KFold shuffle (random_state) when the folds are built.
SEED = 15
tf.random.set_seed(SEED)
np.random.seed(SEED)

# CROSS-VALIDATION AND EXTRA-DATA SWITCHES
# FOLDS = 5  (alternative setting kept for reference)

FOLDS = 10             # number of KFold splits
INCLUDE_2019 = 1       # truthy -> add the odd-numbered train shards found under GCS_PATH2
INCLUDE_2018 = 1       # truthy -> add the even-numbered train shards found under GCS_PATH2
INCLUDE_MALIGNANT = 0  # truthy -> add train shards 15..29 found under GCS_PATH3

# DATA PARAMS
IMG_READ_SIZE     = 384   # resolution embedded in the Kaggle dataset names; also the augmentation grid size
IMG_SIZE          = 384   # resolution images are resized to before batching; model input size
BALANCE_POS_RATIO = 0.01  # passed as `pos_ratio` to the balanced training dataset

# MODEL PARAMS
EFF_NET      = 0  # N in EfficientNetB<N>
# loss and loss params
LOSS_PARAMS  = dict(label_smoothing=0.09)  # keyword arguments forwarded to BinaryCrossentropy

# TRAINING PARAMS
BATCH_SIZE  = 32  # per-replica batch size; multiplied by the replica count to get GLOBAL_BATCH_SIZE
EPOCHS      = 12


# VALID AND TEST PARAMS
TBM        = 6   # multiplier applied to GLOBAL_BATCH_SIZE for validation/test batches
TTA        = 15  # number of passes over the validation/test data at prediction time
VALID_FREQ = 1   # passed to model.fit(validation_freq=...)


## Hardware Setting

In [None]:
# Pick the distribution strategy: TPU when one can be resolved AND initialized,
# otherwise the default (CPU / single GPU) strategy.
DEVICE = "TPU"
print("connecting to TPU...")
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    print("Could not connect to TPU")
    tpu = None
if tpu:
    try:
        print("initializing  TPU ...")
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.experimental.TPUStrategy(tpu)
        print("TPU initialized")
    except Exception as err:
        # The original `except _:` did not name an exception class, so an
        # initialization failure surfaced as an unrelated error and `strategy`
        # was never defined. Fall back to the default strategy instead.
        print("failed to initialize TPU")
        print(err)
        tpu = None
        DEVICE = "GPU"
else:
    DEVICE = "GPU"

if DEVICE != "TPU":
    print("Using default strategy for CPU and single GPU")
    strategy = tf.distribute.get_strategy()

if DEVICE == "GPU":
    print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
    
AUTO              = tf.data.experimental.AUTOTUNE
REPLICAS          = strategy.num_replicas_in_sync
# Batch size seen by the whole strategy: per-replica batch times replica count.
GLOBAL_BATCH_SIZE = BATCH_SIZE * REPLICAS
print("REPLICAS: %d" % REPLICAS)

## Preprocessing

Related Kaggle discussion: https://www.kaggle.com/c/siim-isic-melanoma-classification/discussion/169139

In [None]:
# Resolve the GCS location of each Kaggle dataset; every dataset name carries the
# read resolution (e.g. "...-384x384" when IMG_READ_SIZE is 384).
GCS_PATH1, GCS_PATH2, GCS_PATH3 = (
    KaggleDatasets().get_gcs_path(name_pattern % (IMG_READ_SIZE, IMG_READ_SIZE))
    for name_pattern in ('melanoma-%ix%i', 'isic2019-%ix%i', 'malignant-v2-%ix%i')
)

In [None]:
# Competition metadata tables. `df_base_train` is merged with the out-of-fold
# predictions later on (columns image_name, patient_id, target are used there).
df_base_train = pd.read_csv("../input/siim-isic-melanoma-classification/train.csv")

df_base_test = pd.read_csv("../input/siim-isic-melanoma-classification/test.csv")
# Last expression of the cell: displays (rows, columns) of the training table.
df_base_train.shape

In [None]:
# Start from the competition's own training shards.
train_files = tf.io.gfile.glob(os.path.join(GCS_PATH1, "train*.tfrec"))
print(len(train_files))

# Optional extra shards: (switch, dataset location, two-digit shard indices to glob).
_extra_sources = (
    (INCLUDE_2019, GCS_PATH2, range(1, 30, 2)),        # odd-numbered shards
    (INCLUDE_2018, GCS_PATH2, range(0, 30, 2)),        # even-numbered shards
    (INCLUDE_MALIGNANT, GCS_PATH3, range(15, 30, 1)),  # shards 15..29
)
for enabled, gcs_path, shard_ids in _extra_sources:
    if not enabled:
        continue
    shard_patterns = [os.path.join(gcs_path, "train%.2i*.tfrec" % i) for i in shard_ids]
    train_files.extend(tf.io.gfile.glob(shard_patterns))

print("%d train files found" % len(train_files))

In [None]:
# Test shards come from the same dataset location as the competition training shards.
test_files = tf.io.gfile.glob(os.path.join(GCS_PATH1, "test*.tfrec"))
print("%d test files found" % len(test_files))

## Data pipeline

In [None]:
def read_labeled_tfrecord(example):
    """Parse one serialized labelled record and return ``(image, target)``.

    ``image`` is returned exactly as stored (a scalar string tensor) and
    ``target`` as a scalar int64 tensor. ``image_name`` is declared in the
    parsing schema but is not part of the result.
    """
    schema = dict(
        image=tf.io.FixedLenFeature([], tf.string),
        image_name=tf.io.FixedLenFeature([], tf.string),
        target=tf.io.FixedLenFeature([], tf.int64),
    )
    parsed = tf.io.parse_single_example(example, schema)
    return parsed['image'], parsed['target']


def read_unlabeled_tfrecord(example, return_image_name=True):
    """Parse one serialized unlabelled record.

    Returns a pair whose first element is the stored image string and whose
    second element is the stored ``image_name`` when ``return_image_name`` is
    truthy, or the constant ``0`` otherwise.
    """
    schema = {
        key: tf.io.FixedLenFeature([], tf.string)
        for key in ('image', 'image_name')
    }
    parsed = tf.io.parse_single_example(example, schema)
    if return_image_name:
        companion = parsed['image_name']
    else:
        companion = 0
    return parsed['image'], companion


def prepare_image(img):
    """Decode a JPEG string into a 3-channel float32 image scaled by 1/255."""
    decoded = tf.image.decode_jpeg(img, channels=3)
    return tf.cast(decoded, tf.float32) / 255.0


def count_data_items(filenames):
    """Sum the record counts encoded in TFRecord shard file names.

    Each file name is expected to look like ``train00-2071.tfrec``: the digits
    between a ``-`` and the following ``.`` are taken as that shard's count.

    Only the last path component is inspected, so a directory such as
    ``data-2.0/`` can no longer be mistaken for the count, and at least one
    digit is required.

    Args:
        filenames: iterable of path strings (``/``-separated).

    Returns:
        The total count as a NumPy int64 (``0`` for an empty iterable).

    Raises:
        ValueError: if a file name carries no ``-<digits>.`` marker.
    """
    count_pattern = re.compile(r"-([0-9]+)\.")
    counts = []
    for filename in filenames:
        # Same basename convention the fold-splitting code uses: f.split("/")[-1].
        basename = filename.split("/")[-1]
        match = count_pattern.search(basename)
        if match is None:
            raise ValueError("Cannot read the record count from file name %r" % filename)
        counts.append(int(match.group(1)))
    return np.sum(counts, dtype=np.int64)

In [None]:
def Transform(
        dim=256,
        rot_mult=180.0,
        shear_mult=2.0,
        hzoom_mult=8.0,
        wzoom_mult=8.0,
        hshift_mult=8.0,
        wshift_mult=8.0):
    """Build a random geometric augmentation function for square images.

    Args:
        dim: side length of the (square) images the returned function handles.
        rot_mult: scale applied to a standard-normal sample to get the rotation.
        shear_mult: scale applied to a standard-normal sample to get the shear.
        hzoom_mult, wzoom_mult: zoom factors are ``1 + normal / <mult>``.
        hshift_mult, wshift_mult: scale applied to a standard-normal sample to
            get the shifts.

    Returns:
        ``_transform(image)``, which resamples one ``[dim, dim, 3]`` image
        through a randomly drawn transformation matrix.

    NOTE(review): ``_get_mat`` is not defined in the visible part of this
    notebook; it must be provided elsewhere before ``_transform`` is traced.
    The units of the rotation/shear values (presumably degrees) depend on that
    helper — confirm.
    """
    def _transform(image):
        # input image - is one image of size [dim,dim,3] not a batch of [b,dim,dim,3]
        # output - image randomly rotated, sheared, zoomed, and shifted
        XDIM = dim % 2  # fix for size 331

        # Draw one random value per transformation parameter (each has shape [1]).
        rot = rot_mult * tf.random.normal([1], dtype='float32')
        shr = shear_mult * tf.random.normal([1], dtype='float32')
        h_zoom = 1.0 + tf.random.normal([1], dtype='float32') / hzoom_mult
        w_zoom = 1.0 + tf.random.normal([1], dtype='float32') / wzoom_mult
        h_shift = hshift_mult * tf.random.normal([1], dtype='float32')
        w_shift = wshift_mult * tf.random.normal([1], dtype='float32')

        # GET TRANSFORMATION MATRIX
        # presumably a 3x3 matrix, since it is multiplied with the 3 x (dim*dim) index stack below
        m = _get_mat(rot, shr, h_zoom, w_zoom, h_shift, w_shift)

        # LIST DESTINATION PIXEL INDICES
        # x/y enumerate every pixel of the dim x dim grid in centred coordinates;
        # z is a row of ones (homogeneous coordinate).
        x = tf.repeat(tf.range(dim // 2, -dim // 2, -1), dim)
        y = tf.tile(tf.range(-dim // 2, dim // 2), [dim])
        z = tf.ones([dim * dim], dtype='int32')
        idx = tf.stack([x, y, z])

        # ROTATE DESTINATION PIXELS ONTO ORIGIN PIXELS
        idx2 = K.dot(m, tf.cast(idx, dtype='float32'))
        idx2 = K.cast(idx2, dtype='int32')
        # Clamp so that coordinates mapped outside the image reuse border pixels.
        idx2 = K.clip(idx2, -dim // 2 + XDIM + 1, dim // 2)

        # FIND ORIGIN PIXEL VALUES
        # Convert centred coordinates back to array indices and gather the source pixels.
        idx3 = tf.stack([dim // 2 - idx2[0, ], dim // 2 - 1 + idx2[1, ]])
        d = tf.gather_nd(image, tf.transpose(idx3))

        return tf.reshape(d, [dim, dim, 3])

    return _transform

def base_aug(img):
    """Apply the random photometric augmentations to one image.

    In order: horizontal flip, saturation in [0.7, 1.3], contrast in
    [0.8, 1.2], then brightness with max delta 0.1.
    """
    flipped = tf.image.random_flip_left_right(img)
    saturated = tf.image.random_saturation(flipped, 0.7, 1.3)
    contrasted = tf.image.random_contrast(saturated, 0.8, 1.2)
    return tf.image.random_brightness(contrasted, 0.1)



# Geometric augmentation sized for images at their read resolution; all other
# Transform parameters keep their defaults.
transform_aug = Transform(dim=IMG_READ_SIZE)


def basic_augmentation_pipeline(ds: tf.data.Dataset, dim=None, batch_size=None) -> tf.data.Dataset:
    """Apply the geometric then the photometric augmentation to a dataset of pairs.

    Only the first element of each ``(image, other)`` pair is modified; the
    second is passed through. ``dim`` and ``batch_size`` are accepted but not
    used here.
    """
    def _geometric(image, other):
        return transform_aug(image), other

    def _photometric(image, other):
        return base_aug(image), other

    return (ds
            .map(_geometric, num_parallel_calls=AUTO)
            .map(_photometric, num_parallel_calls=AUTO))

In [None]:
def get_dataset(files, augment=False, repeat=False, shuffle=False, labeled=True, batch_size=16, drop_remainder=False, 
                dim=256, read_dim=None
               ) -> tf.data.Dataset:
    """Build a batched ``tf.data`` pipeline over TFRecord files.

    Args:
        files: TFRecord file names.
        augment: falsy for no augmentation; truthy applies the basic
            augmentation pipeline; a list additionally maps each of its
            callables over the dataset, in order.
        repeat: repeat the (cached) records indefinitely.
        shuffle: shuffle with a buffer of 8192 and allow non-deterministic order.
        labeled: parse records as ``(image, target)``; otherwise as
            ``(image, image_name)``.
        batch_size, drop_remainder: forwarded to ``Dataset.batch``.
        dim: side length images are resized to before batching.
        read_dim: forwarded as ``dim`` to the augmentation pipeline; defaults to ``dim``.

    Returns:
        The batched, prefetched dataset.
    """
    read_dim = dim if read_dim is None else read_dim

    records = tf.data.TFRecordDataset(files, num_parallel_reads=AUTO).cache()
    if repeat:
        records = records.repeat()
    if shuffle:
        records = records.shuffle(1024 * 8)
        options = tf.data.Options()
        options.experimental_deterministic = False
        records = records.with_options(options)

    # Pick the parser once, then decode the image part of every pair.
    parser = read_labeled_tfrecord if labeled else read_unlabeled_tfrecord
    samples = records.map(parser, num_parallel_calls=AUTO)
    samples = samples.map(lambda raw, other: (prepare_image(raw), other), num_parallel_calls=AUTO)

    if augment:
        samples = basic_augmentation_pipeline(samples, batch_size=8 * batch_size, dim=read_dim)
        if isinstance(augment, list):
            for extra_step in augment:
                samples = samples.map(extra_step, num_parallel_calls=AUTO)

    def _resize(image, other):
        return tf.image.resize(image, [dim, dim]), other

    samples = samples.map(_resize, num_parallel_calls=AUTO)
    batches = samples.batch(batch_size, drop_remainder=drop_remainder)
    return batches.prefetch(AUTO)

def separate_by_target(ds: tf.data.Dataset, idx: int = 1, thr: float = 0.5
                       ) -> typing.Tuple[tf.data.Dataset, tf.data.Dataset]:
    """Split a dataset in two according to one component of each element.

    The return annotation needs the ``typing`` module at definition time; it
    is imported with the other standard-library modules at the top of the
    notebook.

    Args:
        ds: dataset whose elements are tuples.
        idx: position of the component to test (default: second component).
        thr: threshold the component, cast to float32, is compared against.

    Returns:
        ``(ds0, ds1)``: elements with component ``< thr`` and elements with
        component ``>= thr`` respectively.
    """
    def _below_threshold(*args):
        return tf.cast(args[idx], tf.float32) < thr

    def _at_or_above_threshold(*args):
        return tf.cast(args[idx], tf.float32) >= thr

    ds0 = ds.filter(_below_threshold)
    ds1 = ds.filter(_at_or_above_threshold)

    return ds0, ds1


def merge_dataset(ds0: tf.data.Dataset, ds1: tf.data.Dataset, pos_ratio: float = 0.5) -> tf.data.Dataset:
    """Interleave two datasets so that about ``pos_ratio`` of elements come from ``ds1``.

    A selector of 1000 choices (``int(1000 * pos_ratio)`` ones, the rest
    zeros) is shuffled, repeated forever and used to pick between ``ds0``
    (choice 0) and ``ds1`` (choice 1).
    """
    from_ds1 = int(1000 * pos_ratio)
    from_ds0 = 1000 - from_ds1
    choices = np.concatenate([
        np.zeros(from_ds0, dtype='int64'),
        np.ones(from_ds1, dtype='int64'),
    ])
    selector = tf.data.Dataset.from_tensor_slices(choices)
    selector = selector.shuffle(from_ds0 + from_ds1).repeat()
    selector = selector.prefetch(tf.data.experimental.AUTOTUNE)
    return tf.data.experimental.choose_from_datasets([ds0, ds1], selector)

def get_balanced_dataset(files, augment=False, repeat=False, shuffle=False, batch_size=16, drop_remainder=False, 
                         dim=256, read_dim=None, pos_ratio=False) -> tf.data.Dataset:
    """Build a batched training pipeline with a controlled share of positives.

    Labelled records are split by target into a negative and a positive
    stream, each stream is decoded, and the two are re-interleaved by
    ``merge_dataset`` so that about ``pos_ratio`` of the elements come from
    the positive stream.

    Fixes over the previous version:
      * the split helper is called directly (``separate_by_target`` is defined
        in this notebook; there is no ``balance`` module in scope);
      * the decode step is mapped over ``ds0`` / ``ds1`` rather than over the
        unsplit dataset, which silently threw the split away and made
        ``pos_ratio`` ineffective.

    Args:
        files: labelled TFRecord file names.
        augment: falsy for no augmentation; truthy applies the basic
            augmentation pipeline; a list additionally maps each of its
            callables over the dataset, in order.
        repeat: repeat the (cached) records indefinitely before splitting.
        shuffle: shuffle the merged stream with a buffer of 1024 and allow
            non-deterministic order.
        batch_size, drop_remainder: forwarded to ``Dataset.batch``.
        dim: side length images are resized to before batching.
        read_dim: forwarded as ``dim`` to the augmentation pipeline; defaults to ``dim``.
        pos_ratio: share of elements drawn from the positive stream (the
            default ``False`` behaves as 0, i.e. negatives only).

    Returns:
        The batched, prefetched dataset of ``(image, target)`` pairs.
    """
    if read_dim is None:
        read_dim = dim
    
    ds = tf.data.TFRecordDataset(files, num_parallel_reads=AUTO)
    ds = ds.cache()
    
    if repeat:
        ds = ds.repeat()
    
    ds = ds.map(read_labeled_tfrecord, num_parallel_calls=AUTO)

    def _decode(image, target):
        return prepare_image(image), target

    # Split on the label first (cheap), then decode each class stream separately.
    ds0, ds1 = separate_by_target(ds)
    ds0 = ds0.map(_decode, num_parallel_calls=AUTO)
    ds1 = ds1.map(_decode, num_parallel_calls=AUTO)
    ds = merge_dataset(ds0, ds1, pos_ratio)
    if shuffle: 
        ds = ds.shuffle(1024)
        opt = tf.data.Options()
        opt.experimental_deterministic = False
        ds = ds.with_options(opt)
    
    if augment:
        ds = basic_augmentation_pipeline(ds, batch_size=8 * batch_size, dim=read_dim) 
        if isinstance(augment, list):
            for a in augment:
                ds = ds.map(a, num_parallel_calls=AUTO)
    ds = ds.map(lambda i, o: (tf.image.resize(i, [dim, dim]), o), num_parallel_calls=AUTO)
        
    ds = ds.batch(batch_size, drop_remainder=drop_remainder)
    ds = ds.prefetch(AUTO)
    return ds

## Model building

In [None]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, GlobalAveragePooling2D, Activation
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.losses import BinaryCrossentropy
def build_model_batchnormalization(dim=128, ef=0):
    """Build and compile the EfficientNet classifier variant with three BatchNorm layers.

    Args:
        dim: side length of the square RGB input.
        ef: N in ``EfficientNetB<N>`` (ImageNet weights, no top).

    Returns:
        A compiled Keras model with a single sigmoid output (kept in float32),
        Adam(1e-3), binary cross-entropy built from ``LOSS_PARAMS`` and the
        AUC metric.
    """
    image_shape = (dim, dim, 3)
    inp = Input(shape=image_shape)
    backbone_factory = getattr(efnet, 'EfficientNetB%d' % ef)
    backbone = backbone_factory(input_shape=image_shape,
                                weights='imagenet',
                                include_top=False)
    x = backbone(inp)

    # Head layers are created lazily, in order, right before being applied.
    head_layers = (
        lambda: GlobalAveragePooling2D(),
        lambda: BatchNormalization(),
        lambda: Dense(512, activation='relu'),
        lambda: BatchNormalization(),
        lambda: Dense(256, activation='relu'),
        lambda: BatchNormalization(),
        lambda: Dense(256, activation='relu'),
        lambda: Dropout(0.3),
        lambda: Dense(128, activation='relu'),
        lambda: Dropout(0.3),
        lambda: Dense(1),
        lambda: Activation('sigmoid', dtype='float32'),
    )
    for make_layer in head_layers:
        x = make_layer()(x)

    model = Model(inputs=inp, outputs=x)
    model.compile(optimizer=Adam(learning_rate=1e-3),
                  loss=BinaryCrossentropy(**LOSS_PARAMS),
                  metrics=['AUC'])
    return model
def build_model(dim=128, ef=0):
    """Build and compile the EfficientNet classifier used for training.

    Args:
        dim: side length of the square RGB input.
        ef: N in ``EfficientNetB<N>`` (ImageNet weights, no top).

    Returns:
        A compiled Keras model with a single sigmoid output (kept in float32),
        Adam(1e-3), binary cross-entropy built from ``LOSS_PARAMS`` and the
        AUC metric.
    """
    image_shape = (dim, dim, 3)
    inp = Input(shape=image_shape)
    backbone_factory = getattr(efnet, 'EfficientNetB%d' % ef)
    backbone = backbone_factory(input_shape=image_shape,
                                weights='imagenet',
                                include_top=False)
    x = backbone(inp)

    # Head layers are created lazily, in order, right before being applied.
    head_layers = (
        lambda: GlobalAveragePooling2D(),
        lambda: BatchNormalization(),
        lambda: Dense(512, activation='relu'),
        lambda: BatchNormalization(),
        lambda: Dense(256, activation='relu'),
        lambda: Dropout(0.4),
        lambda: Dense(256, activation='relu'),
        lambda: Dropout(0.3),
        lambda: Dense(128, activation='relu'),
        lambda: Dropout(0.3),
        lambda: Dense(1),
        lambda: Activation('sigmoid', dtype='float32'),
    )
    for make_layer in head_layers:
        x = make_layer()(x)

    model = Model(inputs=inp, outputs=x)
    model.compile(optimizer=Adam(learning_rate=1e-3),
                  loss=BinaryCrossentropy(**LOSS_PARAMS),
                  metrics=['AUC'])
    return model

In [None]:
# build checkpoint folder
# Checkpoint folder. makedirs(exist_ok=True) replaces the check-then-mkdir pair:
# it is a no-op when the folder exists and also creates missing parent folders.
CKPT_FOLDER = "../working/ckpt"
os.makedirs(CKPT_FOLDER, exist_ok=True)
# build folds
# The split is over the 15 shard indices 0..14; files are assigned to a fold
# later by their shard number modulo 15.
folds = list(model_selection.KFold(n_splits=FOLDS, shuffle=True, random_state=SEED).split(np.arange(15)))
testiness = pd.read_csv("../input/spicv-spicy-vi-make-your-cv-more-testy/testiness.csv")
# Number of positive samples: a base count plus one term per enabled extra source.
# NOTE(review): the constants are hard-coded; confirm they match the datasets in use.
TOTAL_POS = 581 + 2858 * INCLUDE_2019 + 1651 * INCLUDE_2018 + 580 * INCLUDE_MALIGNANT

In [None]:
# Cross-validation driver: for every fold, train a fresh model, then predict the
# fold's validation files and the test files with TTA passes.
VERBOSE = 1  # forwarded to model.fit(verbose=...)
PLOT    = 1  # truthy -> plot loss/AUC curves at the end of each fold

histories = []  # one Keras History per fold
# df_oof collects per-pass validation predictions, df_res per-pass test predictions.
df_oof = pd.DataFrame(); df_res = pd.DataFrame()
t_start = time.time()
for fold, (idTrain, idValid) in enumerate(folds):
    print("#" * 68)
    print(("#" * 20 + "\t\tFold %d\t\t" + "#" * 20) % fold)
    print("#" * 68)
    # prepare TPU
    # Re-initialize the TPU system at the start of every fold.
    if DEVICE == 'TPU':
        if tpu: 
            tf.tpu.experimental.initialize_tpu_system(tpu)
    # build fold train-valid split   
    # A file belongs to the validation fold when its shard number modulo 15 is in idValid.
    fold_valid_files = [f for f in train_files if any([int(re.match("^train([0-9]+)", f.split("/")[-1]).group(1)) % 15 == i for i in idValid])]
    fold_valid_files = [f for f in fold_valid_files if GCS_PATH1 in f] # only data from the original dataset
    # fold_train_files = [f for f in train_files if any([int(re.match("^train([0-9]+)", f.split("/")[-1]).group(1)) % 15 == i for i in idTrain])]
    # Everything that is not a validation file is used for training.
    fold_train_files = [f for f in train_files if f not in fold_valid_files]
    np.random.shuffle(fold_train_files)
    print("Train files: %d\t\t Valid files: %d" % (len(fold_train_files), len(fold_valid_files)))
    # build model and set precision policy
    K.clear_session()   
    if DEVICE == 'TPU':
        keras.mixed_precision.experimental.set_policy('mixed_bfloat16')
    with strategy.scope():
        model = build_model(dim=IMG_SIZE, ef=EFF_NET)
   
    # build training dataset
    print("Using balanced dataset with pos_ratio = %d%%" % int(100 * BALANCE_POS_RATIO))
    ds_train = get_balanced_dataset(fold_train_files, repeat=True,  augment=True,  drop_remainder=True,                                                    shuffle=True,  
                                        pos_ratio=BALANCE_POS_RATIO,
                                        dim=IMG_SIZE, read_dim=IMG_READ_SIZE, batch_size=GLOBAL_BATCH_SIZE)
    # Steps per epoch: the number of batches in which FOLD_POS positives appear
    # when a fraction BALANCE_POS_RATIO of the samples is positive.
    FOLD_POS = TOTAL_POS * (FOLDS - 1) / FOLDS
    STEPS = int(FOLD_POS / BALANCE_POS_RATIO / GLOBAL_BATCH_SIZE)



    # Un-augmented validation data for the per-epoch validation inside fit().
    ds_valid = get_dataset(fold_valid_files, repeat=False, augment=False, drop_remainder=False, shuffle=False, 
                           dim=IMG_SIZE, read_dim=IMG_READ_SIZE, batch_size=GLOBAL_BATCH_SIZE * TBM)
    # callbacks
    FOLD_CKPT_FOLDER = os.path.join(CKPT_FOLDER, "fold%d" % fold)
    if not os.path.exists(FOLD_CKPT_FOLDER):
        os.mkdir(FOLD_CKPT_FOLDER)
    # Weights-only checkpoints; the integer save_freq equals three epochs' worth of batches.
    callbacks =[
        keras.callbacks.ModelCheckpoint(os.path.join(FOLD_CKPT_FOLDER, "model_fold%d_e{epoch:02d}.h5" % fold),                                             save_weights_only=True,
                                       save_freq= int(STEPS*3))]    
    # train
    
    print("Training...")
    history = model.fit(
                            ds_train,
        validation_data   = ds_valid,
        epochs            = EPOCHS,
        steps_per_epoch   = STEPS,
        verbose           = VERBOSE,
        callbacks         = callbacks,
        validation_freq   = VALID_FREQ
    )
    histories.append(history)    
    
    # Checkpoint clean-up.
    # NOTE(review): this section was labelled "SWA", but no weight averaging is
    # performed: the intermediate checkpoints are only listed and deleted, and the
    # final model is saved.
    ckpt_files = np.sort(tf.io.gfile.glob(os.path.join(FOLD_CKPT_FOLDER, "*.h5")))

    for f in ckpt_files:
        os.remove(f)
    model.save(os.path.join(CKPT_FOLDER, "model_fold%d.h5" % fold))
    # VALID
    # Predict TTA passes over the repeated (augmented when TTA >= 1) validation data.
    # STEPS is reused here as the number of prediction batches.
    ds_valid = get_dataset(fold_valid_files, augment=TTA >= 1, repeat=True, dim=IMG_SIZE, read_dim=IMG_READ_SIZE, batch_size=GLOBAL_BATCH_SIZE * TBM, drop_remainder=True)
    ct_valid = count_data_items(fold_valid_files); STEPS = int(np.ceil(TTA * ct_valid / GLOBAL_BATCH_SIZE / TBM))
    fold_valid_pred = model.predict(ds_valid, steps=STEPS, verbose=1)
    # Whole batches were predicted; keep only the first ct_valid * TTA rows.
    fold_valid_pred = fold_valid_pred[:ct_valid * TTA,]
    # Second, un-augmented pass only to read the image names.
    # NOTE(review): assumes this pass yields records in the same order as the prediction pass.
    ds_valid = get_dataset(fold_valid_files, augment=False, repeat=False, dim=IMG_SIZE, batch_size=GLOBAL_BATCH_SIZE * TBM, drop_remainder=False, labeled=False)
    fold_valid_names = np.concatenate([np.array([ni.decode("utf-8") for ni in n.numpy()]) for n in ds_valid.map(lambda i, n: n)], 0)
    
    # Names are tiled TTA times to line up with the TTA prediction passes.
    fold_df = pd.DataFrame({'image_name': np.tile(fold_valid_names, [TTA]), 'pred': fold_valid_pred.squeeze(), 'fold': fold})
    df_oof = pd.concat([df_oof, fold_df])
    fold_df['image_name'] = fold_df['image_name'].str.replace('_downsampled', '')
    # Average the passes per image, then attach ground truth and the testiness table.
    fold_df = fold_df.groupby('image_name').mean().reset_index()
    fold_df = fold_df.merge(df_base_train[['image_name', 'patient_id', 'target']], on='image_name').merge(testiness, on='image_name')
    fold_df['fold'] = fold
    auc  = metrics.roc_auc_score(fold_df.target, fold_df.pred)
    
    # TEST
    # Same procedure as for validation, on the unlabelled test files.
    ds_test = get_dataset(test_files, augment=TTA >= 1, repeat=True, dim=IMG_SIZE, read_dim=IMG_READ_SIZE, batch_size=GLOBAL_BATCH_SIZE * TBM, drop_remainder=True, labeled=False)
    ct_test = count_data_items(test_files); STEPS = int(np.ceil(TTA * ct_test / GLOBAL_BATCH_SIZE / TBM))
    # Drop the image names so that predict() only receives images.
    fold_test_pred = model.predict(ds_test.map(lambda i, l: i), steps=STEPS, verbose=1)
    fold_test_pred = fold_test_pred[:ct_test * TTA,]
    ds_test = get_dataset(test_files, augment=False, repeat=False, dim=IMG_SIZE, batch_size=GLOBAL_BATCH_SIZE * TBM, drop_remainder=False, labeled=False)
    fold_test_names = np.concatenate([np.array([ni.decode("utf-8") for ni in n.numpy()]) for n in ds_test.map(lambda i, n: n)], 0)
    
    fold_res = pd.DataFrame({'image_name': np.tile(fold_test_names, [TTA]), 'pred': fold_test_pred.squeeze(), 'fold': fold})
    df_res = pd.concat([df_res, fold_res])
    
    # time
    used_time_till_now = time.time() - t_start
    time_per_fold = used_time_till_now / (fold + 1)
    print("Validation AUC last epoch = %.4f" % history.history['val_auc'][-1])
    print("Validation AUC  (TTA %2d) = %.4f" % (TTA, auc))
    print("Total time = %ds\t\tTime per fold = %ds" % (int(used_time_till_now), int(time_per_fold)))
    
    # plot
    # Left panel: train/validation loss (log scale); right panel: train/validation AUC.
    # Validation points are placed at the epochs where validation actually ran.
    if PLOT:
        plt.figure(figsize=(16, 4))
        plt.subplot(1, 2, 1)
        plt.plot(history.history['loss'], color='tab:blue', marker='o')
        plt.plot(range(VALID_FREQ - 1, EPOCHS, VALID_FREQ), history.history['val_loss'], color='tab:blue', marker='x', linestyle=':')
        plt.yscale('log')
        plt.subplot(1, 2, 2)
        plt.plot(history.history['auc'], color='tab:red', marker='o')
        plt.plot(range(VALID_FREQ - 1, EPOCHS, VALID_FREQ), history.history['val_auc'], color='tab:red', marker='x', linestyle=':')
        plt.show()
    
    # Drop the references to the fold's model and datasets before the next fold.
    del model, ds_train, ds_valid, ds_test
    print("\n\n")
    


In [None]:
# Gather every fold's history into one array per metric: rows are the recorded
# epochs, columns are the folds.
_per_metric_columns = dict()
for history in histories:
    for k, values in history.history.items():
        _per_metric_columns.setdefault(k, []).append(np.array(values).reshape(-1, 1))
avgh = {k: np.concatenate(columns, 1) for k, columns in _per_metric_columns.items()}

# One panel per metric: faint per-fold curves plus the across-fold mean,
# solid for training and dotted for validation.
plt.figure(figsize=(16, 4))
valid_epochs = range(VALID_FREQ - 1, EPOCHS, VALID_FREQ)
panels = (
    ('Log Loss', 'loss', 'val_loss', 'tab:blue', True),
    ('AUC', 'auc', 'val_auc', 'tab:red', False),
)
for position, (title, train_key, valid_key, color, log_scale) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.title(title)
    plt.plot(avgh[train_key], marker='o', color=color, alpha=0.2)
    plt.plot(avgh[train_key].mean(1), color=color)
    plt.plot(valid_epochs, avgh[valid_key], color=color, alpha=0.2, linestyle=":")
    plt.plot(valid_epochs, avgh[valid_key].mean(1), marker='x', color=color, linestyle=":")
    if log_scale:
        plt.yscale('log')