In [None]:
# Fetch the ISIC 2020 training images and ground-truth labels into ./data.
# NOTE: not idempotent - re-running this cell downloads the archives again.
!mkdir -p data
!wget https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_JPEG.zip
!mv ISIC_2020_Training_JPEG.zip data/jpeg.zip
!unzip data/jpeg.zip -d data/jpeg
# Rename *.jpg -> *.jpeg: get_dataset() builds image paths with a ".jpeg" suffix.
# rename.ul is the util-linux rename binary as named on Debian/Ubuntu.
!rename.ul jpg jpeg data/jpeg/train/*.jpg
!wget https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_GroundTruth.csv
!mv ISIC_2020_Training_GroundTruth.csv data/train.csv
# Remove the archive once it has been extracted.
!rm data/jpeg.zip


In [1]:
# Sanity check: show which python executable the notebook's shell resolves.
!which python

/home/ubuntu/anaconda3/envs/mel/bin/python


In [2]:
# Sanity check: show which jupyter executable the notebook's shell resolves.
!which jupyter

/home/ubuntu/anaconda3/envs/mel/bin/jupyter


In [3]:
import os
import tensorflow as tf
import numpy as np
import pandas as pd
from model import MyModel
from data_utils import *


In [4]:
# Paths, relative to the notebook's working directory.
DATA_DIR = 'data'
# Directory holding the extracted (and renamed) *.jpeg training images.
IMAGES_DIR = os.path.join(DATA_DIR, 'jpeg', 'train')
# Ground-truth CSV; read below with pandas and passed to CsvTransformer.
TRAIN_CSV = os.path.join(DATA_DIR, 'train.csv')
# NOTE(review): TF_PREFIX is not referenced in the visible cells - confirm it is still needed.
TF_PREFIX = 'train'
# Fractions of the rows that remain after the validation rows are held out:
# TRAIN_SPLIT -> train_df (the rest becomes test_df), SMALL_SPLIT -> small_df.
TRAIN_SPLIT = 0.9
SMALL_SPLIT = 0.01
# Batch size used for every dataset built with get_dataset().
BATCH_SIZE = 32
# Size passed to tf.image.resize in map_fn.
IMAGE_SIZE = (224, 224)



In [5]:
# Shuffle the rows once with a fixed NumPy seed (DataFrame.sample draws from
# NumPy's global RNG when no random_state is given), then key rows by image name.
np.random.seed(0)
shuffled_df = (
    pd.read_csv(TRAIN_CSV)
    .sample(frac=1)
    .reset_index(drop=True)
    .set_index('image_name')
)

# The first rows of the shuffled frame are held out for validation; the
# remainder keeps the name original_df and feeds the train/test/small splits.
n_validation = 2000
validation_df = shuffled_df.iloc[:n_validation]
original_df = shuffled_df.iloc[n_validation:]

n_remaining = len(original_df)
split_point = int(n_remaining * TRAIN_SPLIT)
small_split_point = int(n_remaining * SMALL_SPLIT)

train_df = original_df.iloc[:split_point]
test_df = original_df.iloc[split_point:]
# small_df is a prefix of original_df, so its rows overlap with train_df.
small_df = original_df.iloc[:small_split_point]

train_df.shape, test_df.shape, small_df.shape, validation_df.shape


((28013, 7), (3113, 7), (311, 7), (2000, 7))

In [6]:
# Metadata/label lookup built from the training CSV; map_fn calls its
# get_data_vector() and get_vector_from_image_name() methods.
# NOTE(review): CsvTransformer presumably comes from the data_utils star import - confirm.
transformer = CsvTransformer(TRAIN_CSV)


In [7]:
def map_fn(filename):
    """Load one image file and pair it with its metadata vector and label.

    Args:
        filename: scalar string tensor with the path of a JPEG file. The path
            must use '/' separators and the basename (up to the first '.')
            must be the image name known to ``transformer``.

    Returns:
        A ``(features, target)`` tuple. ``features`` is a dict with keys
        ``"image"`` (decoded RGB image resized to ``IMAGE_SIZE``),
        ``"image_name"`` (basename without extension) and ``"data"`` (the
        value returned by ``transformer.get_data_vector``). ``target`` is the
        ``'target'`` lookup for the image cast to ``tf.int32``.
    """
    # channels=3 forces RGB output, so every element has the same, statically
    # known channel count even if a file is stored as grayscale.
    image = tf.image.decode_jpeg(tf.io.read_file(filename), channels=3)
    image = tf.image.resize(image, IMAGE_SIZE)

    # ".../<image_name>.jpeg" -> "<image_name>"
    basename = tf.strings.split(filename, sep='/')[-1]
    image_name = tf.strings.split(basename, sep='.')[0]

    data = transformer.get_data_vector(image_name)
    target = tf.cast(transformer.get_vector_from_image_name('target', image_name), dtype=tf.int32)

    return {"image": image,
            "image_name": image_name,
            "data": data}, target

def get_dataset(df: pd.DataFrame, images_dir, batch_size: int, cache = True):
    """Build a batched ``tf.data.Dataset`` of ``(features, target)`` pairs.

    Args:
        df: frame whose index holds the image names (one row per image).
        images_dir: directory containing ``<image_name>.jpeg`` files.
        batch_size: number of elements per batch.
        cache: when True, cache the mapped (decoded and resized) elements
            after the first pass over the dataset.

    Returns:
        A dataset yielding batches of the ``(features, target)`` structure
        produced by ``map_fn``, in the same order as ``df``.
    """
    # '/' is used explicitly because map_fn splits the path on '/'.
    filenames = images_dir + '/' + df.index.values + ".jpeg"

    ds = tf.data.Dataset.from_tensor_slices(filenames)
    # Decode/resize in parallel; element order is preserved by default.
    ds = ds.map(map_fn, num_parallel_calls=tf.data.AUTOTUNE)
    if cache:
        ds = ds.cache()
    # Batch first and prefetch last, so whole batches are prepared ahead of
    # the consumer rather than individual elements.
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    return ds

In [8]:
# The full training set is left disabled. get_dataset() takes the images
# directory as its second argument, so the call needs IMAGES_DIR if re-enabled:
#train_dataset = get_dataset(train_df, IMAGES_DIR, BATCH_SIZE, cache = True)
# Only the first 2000 training rows are used here.
small_train_dataset = get_dataset(train_df.iloc[:2000], IMAGES_DIR, BATCH_SIZE)
test_dataset = get_dataset(test_df, IMAGES_DIR, BATCH_SIZE)
small_dataset = get_dataset(small_df, IMAGES_DIR, BATCH_SIZE)
# The validation set is built with caching disabled.
validation_dataset = get_dataset(validation_df, IMAGES_DIR, BATCH_SIZE, cache = False)

In [9]:
# Report the class balance (count and share of positive rows) of each split.
dfs = [(test_df, "test"), (small_df, "small")]

for df, df_name in dfs:

    # minlength=2 keeps the two-way unpack valid when a split contains no
    # positive rows (bincount would otherwise return a single count).
    neg, pos = np.bincount(df['target'], minlength=2)
    total = neg + pos
    print('{} Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
        df_name, total, pos, 100 * pos / total))


test Examples:
    Total: 3113
    Positive: 52 (1.67% of total)

small Examples:
    Total: 311
    Positive: 3 (0.96% of total)



In [10]:
# Build the model, load weights from the local "weights/" directory and compile it.
# NOTE(review): create_standard_version lives in model.py (not visible here); the
# download message in the cell output suggests a MobileNetV3-Large backbone - confirm.
weights_model = MyModel.create_standard_version(load_weights_path="weights/", compile=True)
weights_model

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v3/weights_mobilenet_v3_large_224_1.0_float_no_top.h5


<model.MyModel at 0x7ffb14aefb10>

In [11]:
# Evaluate on the test split.
# NOTE(review): the meaning and order of the returned values depend on the loss and
# metrics compiled inside MyModel, which is not visible here - confirm before reading them.
weights_model.evaluate(test_dataset)



[0.06712061166763306,
 0.0,
 1.0,
 3060.0,
 52.0,
 0.982974648475647,
 0.0,
 0.0,
 0.8586528301239014,
 0.14941127598285675]

In [12]:
# Evaluate on the held-out validation rows.
# NOTE(review): the meaning and order of the returned values depend on the loss and
# metrics compiled inside MyModel, which is not visible here - confirm before reading them.
weights_model.evaluate(validation_dataset)



[0.07123943418264389,
 1.0,
 1.0,
 1963.0,
 35.0,
 0.9819999933242798,
 0.5,
 0.02777777798473835,
 0.8723480701446533,
 0.14258483052253723]

In [13]:

def get_stats(dataset, threshold, model=None):
    """Print prediction statistics and thresholded confusion counts.

    Runs ``model.predict`` on every batch of ``dataset`` and prints the mean,
    standard deviation and deciles of the raw predictions. It then binarises
    the predictions with ``threshold`` and prints the confusion counts with
    sensitivity and specificity as percentages.

    Args:
        dataset: iterable of ``(features, target)`` batches. ``target`` must
            expose ``.numpy()`` and hold binary 0/1 labels.
        threshold: predictions strictly greater than this count as positive.
        model: object with a ``predict(features)`` method returning one score
            per row. Defaults to the notebook-level ``weights_model`` so
            existing two-argument calls keep working.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: if ``dataset`` yields no batches.
    """
    if model is None:
        model = weights_model

    predictions, targets = [], []
    for el, target in dataset:
        predictions.append(model.predict(el))
        targets.append(target.numpy())
    if not predictions:
        raise ValueError("get_stats: dataset yielded no batches")

    predictions, targets = np.vstack(predictions), np.expand_dims(np.hstack(targets), 1)
    print(f"pred stats - mean: {predictions.mean()}, std: {predictions.std()}")
    deciles = np.percentile(predictions, np.arange(10, 100, 10))
    print(f"pred deciles: {deciles}")

    # Count the four outcomes with boolean masks. Unlike a confusion matrix
    # sized from the observed values, this still works when a class is absent
    # from both the labels and the thresholded predictions.
    predicted_positive = (predictions > threshold).flatten()
    actual_positive = targets.flatten() == 1
    true_positives = int((predicted_positive & actual_positive).sum())
    false_positives = int((predicted_positive & ~actual_positive).sum())
    true_negatives = int((~predicted_positive & ~actual_positive).sum())
    false_negatives = int((~predicted_positive & actual_positive).sum())
    print("true_positives: %d, false_positives: %d, true_negatives: %d, false_negatives: %d" % (true_positives, false_positives, true_negatives, false_negatives))

    # A rate is reported as nan when its denominator is empty.
    actual_positives = true_positives + false_negatives
    actual_negatives = true_negatives + false_positives
    sensitivity = true_positives / actual_positives * 100.0 if actual_positives else float("nan")
    specificity = true_negatives / actual_negatives * 100.0 if actual_negatives else float("nan")

    print(f"sensitivity: {sensitivity:.2f}%, specificity: {specificity:.2f}%")

In [14]:
# Stats on the 2000-row training subset.
# NOTE(review): 0.023 is a hard-coded decision threshold repeated in the cells below -
# consider promoting it to a named constant in the config cell.
get_stats(dataset = small_train_dataset, threshold = 0.023)

pred stats - mean: 0.016367629170417786, std: 0.03150975704193115
pred deciles: [0.00028777 0.0008963  0.00192593 0.00411637 0.00856157 0.01317607
 0.01849228 0.02461386 0.03629587]
true_positives: 24, false_positives: 424, true_negatives: 1544, false_negatives: 8
sensitivity: 75.00%, specificity: 78.46%


In [15]:
# Stats on the test split, using the same hard-coded 0.023 threshold.
get_stats(dataset = test_dataset, threshold = 0.023)

pred stats - mean: 0.017042944207787514, std: 0.031137915328145027
pred deciles: [0.0003391  0.00102918 0.00236574 0.00456951 0.00919554 0.01446005
 0.01885822 0.02452556 0.03598046]
true_positives: 40, false_positives: 649, true_negatives: 2412, false_negatives: 12
sensitivity: 76.92%, specificity: 78.80%


In [16]:
# Stats on the held-out validation rows, using the same hard-coded 0.023 threshold.
get_stats(dataset = validation_dataset, threshold = 0.023)

pred stats - mean: 0.017199963331222534, std: 0.032993678003549576
pred deciles: [0.00035127 0.00099075 0.00215774 0.00435485 0.00933594 0.01425113
 0.01867158 0.0247126  0.0367173 ]
true_positives: 29, false_positives: 417, true_negatives: 1547, false_negatives: 7
sensitivity: 80.56%, specificity: 78.77%
