In [None]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf



In [13]:
# --- Filesystem layout -------------------------------------------------------
DATA_DIR = "data"
IMAGES_DIR = os.path.join(DATA_DIR, "jpeg", "train")   # directory holding the JPEG files
TRAIN_CSV = os.path.join(DATA_DIR, "train.csv")        # per-image metadata and labels
TF_PREFIX = "train"                                    # not referenced by the cells visible here

# --- Data split --------------------------------------------------------------
# Both fractions are applied to the rows that remain after the validation
# hold-out has been removed.
TRAIN_SPLIT = 0.9
SMALL_SPLIT = 0.01

# --- Input pipeline ----------------------------------------------------------
# Passed as the `size` argument of tf.image.resize.
# NOTE(review): 222 may be a typo for 224 -- confirm before changing, since any
# saved weights were produced with this value.
IMAGE_SIZE = (224, 222)
BATCH_SIZE = 32

# --- Artifacts ---------------------------------------------------------------
MODEL_PATH = "models/model_v2"                         # not referenced by the cells visible here



In [4]:
def age_normalization_fn(x):
    """Scale an age by dividing it by 90.

    Non-finite inputs (NaN, +inf, -inf) are mapped to the fixed value 0.5
    instead of being scaled.
    """
    if not np.isfinite(x):
        return 0.5
    return x / 90


def sex_nomalization_fn(x):
    """Encode sex as a float: 1.0 for the exact string "male", 0.0 otherwise.

    Any other value, including a missing one, becomes 0.0.
    """
    # The identifier keeps its original spelling because other cells call it
    # by this name.
    if x == "male":
        return 1.0
    return 0.0

class CsvTransformer:
    """Expose per-image tabular metadata from a CSV as TensorFlow lookup tables.

    The CSV is read with pandas, ``age_approx`` and ``sex`` are converted to
    floats, ``anatom_site_general_challenge`` is one-hot encoded, and one
    ``tf.lookup.StaticHashTable`` (image name -> float64) is built for every
    entry of ``cols`` so the values can be fetched from inside a ``tf.data``
    pipeline. Image names that are not in the CSV look up to -1.0.
    """

    # Columns that get a lookup table. Every entry except 'target' is a model
    # input feature, emitted in this order by get_data_vector; 'target' is the
    # label.
    cols = ['sex', 'age', 'anatom_head/neck', 'anatom_lower extremity', 'anatom_oral/genital',
            'anatom_palms/soles', 'anatom_torso', 'anatom_upper extremity', 'target']

    def __init__(self, csv_path):
        """Load ``csv_path``, derive the feature columns and build the tables.

        Args:
            csv_path: path of a CSV containing at least the columns
                ``image_name``, ``age_approx``, ``sex``, ``target`` and
                ``anatom_site_general_challenge``.
        """
        df = pd.read_csv(csv_path)
        df['age'] = df['age_approx'].apply(age_normalization_fn)
        df['sex'] = df['sex'].apply(sex_nomalization_fn)
        df['target'] = df['target'].astype("float64")
        df = pd.get_dummies(df, prefix=['anatom'], columns=['anatom_site_general_challenge'], drop_first=False)

        # get_dummies only creates a column for body sites that actually occur
        # in the CSV. Add an all-zero column for any expected site that is
        # absent so the feature vector keeps a fixed layout instead of failing
        # with a KeyError while the tables are built.
        for col in self.cols:
            if col.startswith('anatom_') and col not in df.columns:
                df[col] = 0.0

        df = df.set_index('image_name')
        self.df = df
        self.init_tables()

    def init_tables(self):
        """Build one image-name -> float64 lookup table per entry of ``cols``."""
        # The keys are the same for every column, so build the tensor once.
        keys = tf.constant(self.df.index.tolist())

        self.lookup_tables = {}
        for col in self.cols:
            # Column names contain '/' and spaces; keep only alphanumerics so
            # the table name is a plain identifier that tells the tables apart.
            safe_col = ''.join(ch if ch.isalnum() else '_' for ch in col)
            values = tf.constant(self.df[col].to_numpy(dtype="float64"))

            self.lookup_tables[col] = tf.lookup.StaticHashTable(
                initializer=tf.lookup.KeyValueTensorInitializer(keys=keys, values=values),
                # Returned for image names that are not present in the CSV.
                default_value=tf.constant(-1.0, dtype=tf.float64),
                name="lookup_" + safe_col,
            )

    def get_data_vector(self, image_names):
        """Return the feature values (every column except 'target') for the given names.

        Args:
            image_names: string tensor of image names (a scalar or a 1-D batch).

        Returns:
            A float64 tensor with the features on the last axis, in the order
            of ``cols``: shape ``(F,)`` for a scalar name and ``(N, F)`` for a
            batch of N names, where F is ``len(cols) - 1``.
        """
        features = [
            self.lookup_tables[col].lookup(image_names)
            for col in self.cols
            if col != 'target'
        ]
        # Stacking on the last axis gives the same result as
        # transpose(stack(...)) for scalar and 1-D inputs, without reversing
        # the leading dimensions of higher-rank inputs.
        return tf.stack(features, axis=-1)

    def get_vector_from_image_name(self, col, image_name):
        """Look up a single column's value for ``image_name``.

        Args:
            col: one of the names in ``cols``.
            image_name: string tensor holding the image name(s).

        Returns:
            The float64 lookup result (-1.0 where the name is unknown).
        """
        return self.lookup_tables[col].lookup(image_name)
    
# Module-level instance built from the training CSV; map_fn reads its lookup
# tables through this global name.
transformer = CsvTransformer(TRAIN_CSV)


2022-01-19 10:15:33.801917: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
# Seed NumPy's global RNG before shuffling; sample() is called without an
# explicit random_state.
np.random.seed(0)
original_df = (
    pd.read_csv(TRAIN_CSV)
    .sample(frac=1)
    .reset_index(drop=True)
    .set_index('image_name')
)

# The first 2000 shuffled rows are held out for validation; every other split
# is cut from the rows that remain.
validation_df, original_df = original_df.iloc[:2000], original_df.iloc[2000:]

split_point = int(len(original_df) * TRAIN_SPLIT)
small_split_point = int(len(original_df) * SMALL_SPLIT)

train_df = original_df.iloc[:split_point]
test_df = original_df.iloc[split_point:]
# small_df is a leading slice of the same frame, so its rows are also in train_df.
small_df = original_df.iloc[:small_split_point]

train_df.shape, test_df.shape, small_df.shape, validation_df.shape


((28013, 7), (3113, 7), (311, 7), (2000, 7))

In [6]:

def map_fn(filename):
    """Turn one image path into a ``(features, target)`` training example.

    Args:
        filename: scalar string tensor with the path of a JPEG file. The image
            name is the last '/'-separated component up to its first '.'.

    Returns:
        A tuple ``(features, target)`` where ``features`` is a dict with
        ``"image"`` (the decoded image resized to ``IMAGE_SIZE``),
        ``"image_name"`` (string tensor) and ``"data"`` (float64 vector of the
        non-target CSV columns), and ``target`` is the label cast to int32.
    """
    # Force three channels: with the default, a grayscale JPEG decodes to a
    # single channel, which would not batch with RGB images or fit the
    # ImageNet backbones. It also gives the tensor a static channel dimension.
    image = tf.image.decode_jpeg(tf.io.read_file(filename), channels=3)
    image = tf.image.resize(image, IMAGE_SIZE)

    # "<dir>/<name>.jpeg" -> "<name>"
    image_name = tf.strings.split(filename, sep='/')[-1]
    image_name = tf.strings.split(image_name, sep='.')[0]

    data = transformer.get_data_vector(image_name)
    target = tf.cast(transformer.get_vector_from_image_name('target', image_name), dtype=tf.int32)

    return {"image": image,
            "image_name": image_name,
            "data": data}, target

def get_dataset(df: pd.DataFrame, batch_size: int, cache = True):
    """Build a batched ``tf.data.Dataset`` of examples for the images indexed by ``df``.

    Args:
        df: frame whose index holds the image names; each name is resolved to
            ``IMAGES_DIR/<name>.jpeg``.
        batch_size: number of examples per batch.
        cache: when true, the decoded examples are cached after the first pass.

    Returns:
        A dataset yielding batches of ``(features, target)`` as produced by
        ``map_fn``, in the row order of ``df``.
    """
    filenames = [IMAGES_DIR + '/' + str(name) + ".jpeg" for name in df.index]
    # An explicit string dtype keeps an empty frame from producing an untyped tensor.
    ds = tf.data.Dataset.from_tensor_slices(tf.constant(filenames, dtype=tf.string))

    # Decode images in parallel; map() keeps element order by default.
    ds = ds.map(map_fn, num_parallel_calls=tf.data.AUTOTUNE)
    if cache:
        ds = ds.cache()
    # Batch first and prefetch last, so whole batches are prepared ahead of
    # the consumer rather than individual examples.
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    return ds



In [7]:
# The full training pipeline is left disabled; only a 2000-row subset of
# train_df is built below.
# train_dataset = get_dataset(train_df, batch_size=BATCH_SIZE, cache=True)
small_train_dataset = get_dataset(train_df.head(2000), batch_size=BATCH_SIZE)
test_dataset = get_dataset(test_df, batch_size=BATCH_SIZE)
small_dataset = get_dataset(small_df, batch_size=BATCH_SIZE)
# The validation pipeline is the only one built without caching.
validation_dataset = get_dataset(validation_df, batch_size=BATCH_SIZE, cache=False)

In [8]:
# (frame, label) pairs whose class balance is printed below.
dfs = [(test_df, "test"), (small_df, "small")]

for df, df_name in dfs:
    # minlength=2 guarantees both a negative and a positive bin, so the
    # unpacking still works when a split contains no positive examples
    # (plausible for the small split, where positives are around 1%).
    neg, pos = np.bincount(df['target'], minlength=2)
    total = neg + pos
    print('{} Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
        df_name, total, pos, 100 * pos / total))


test Examples:
    Total: 3113
    Positive: 52 (1.67% of total)

small Examples:
    Total: 311
    Positive: 3 (0.96% of total)



In [24]:
from tensorflow.keras.applications import *

class MyModel(tf.keras.Model):
    """Binary classifier combining a pretrained image backbone with tabular features.

    The image goes through an ImageNet-pretrained backbone (without its
    classification head, with global pooling); the tabular vector goes through
    a small dense layer. Both outputs are concatenated, optionally passed
    through a dropout + dense stage, and reduced to one sigmoid unit.

    Inputs are a dict with the keys ``"image"`` and ``"data"``.
    """

    # Backbone key -> (preprocess_input function, model constructor).
    models = {"mobilenet":  (mobilenet_v3.preprocess_input, MobileNetV3Large),
              "xception":   (xception.preprocess_input, Xception),
              "resnet": (resnet_v2.preprocess_input, resnet_v2.ResNet50V2) }


    def __init__(self,  network_model = "mobilenet",
                        pooling = "max",
                        dense_intermediate = -1,
                        dropout = 0.5,
                        extra_cols_out = 32,
                        bias = tf.keras.initializers.Constant(np.log([500 / 30000]))
                ):
        """Create the layers.

        Args:
            network_model: key of ``models`` selecting the backbone.
            pooling: global pooling mode handed to the backbone constructor.
            dense_intermediate: units of the optional dense layer placed before
                the output; any value <= 0 disables that stage.
            dropout: rate used by the dropout layers.
            extra_cols_out: units of the dense layer applied to the tabular input.
            bias: initializer for the output layer's bias. The default is
                log(500 / 30000) -- presumably an approximate class prior for
                the imbalanced target; TODO confirm.

        Raises:
            KeyError: if ``network_model`` is not a key of ``models``.
        """
        super().__init__()
        preprocessor_and_network = self.models[network_model]
        self.image_preprocessor = preprocessor_and_network[0]
        self.image_feature_extractor = preprocessor_and_network[1](weights='imagenet', include_top = False, pooling = pooling)

        self.image_size = IMAGE_SIZE

        self.extra_cols_dense = tf.keras.layers.Dense(extra_cols_out, activation = "relu", name="extra_cols_dense")
        self.extra_cols_flatten = tf.keras.layers.Flatten()

        # dropout1/dense1 only exist when the intermediate stage is enabled.
        self.use_intermediate = dense_intermediate > 0
        if self.use_intermediate:
            self.dropout1 = tf.keras.layers.Dropout(dropout)
            self.dense1 = tf.keras.layers.Dense(dense_intermediate, activation="relu", name = "final_dense_intermediate")

        self.dropout2 = tf.keras.layers.Dropout(dropout)
        self.out = tf.keras.layers.Dense(1, activation = "sigmoid", bias_initializer=bias, name = "final_dense")

    def preprocess_images(self, images):
        """Apply the backbone's ``preprocess_input`` and resize to ``self.image_size``."""
        x = self.image_preprocessor(images)
        x = tf.image.resize(x, self.image_size)
        return x

    def call(self, inputs, training = None):
        """Run the forward pass.

        Args:
            inputs: dict with ``"image"`` (batch of images) and ``"data"``
                (batch of tabular feature vectors).
            training: Keras training flag; forwarded to the dropout layers.
                ``None`` lets Keras resolve it from the calling context.

        Returns:
            The sigmoid output of the final single-unit dense layer.
        """
        preprocessed_images = self.preprocess_images(inputs['image'])
        # The backbone always runs in inference mode, whatever `training` is.
        # Its weights are not frozen anywhere in the visible code.
        image_output = self.image_feature_extractor(preprocessed_images, training = False)

        data = inputs['data']
        data = self.extra_cols_dense(data)
        data = self.extra_cols_flatten(data)

        x = tf.concat([data, image_output], axis = -1)
        if self.use_intermediate:
            x = self.dropout1(x, training = training)
            x = self.dense1(x)

        x = self.dropout2(x, training = training)
        out = self.out(x)

        return out



In [37]:
# MobileNetV3 backbone with max pooling, a 128-unit dense layer for the tabular
# input and a 32-unit intermediate dense stage. This instance is not referenced
# by the later cells visible here, which use `weights_model` instead.
model = MyModel(network_model = "mobilenet", pooling = "max", extra_cols_out = 128, dense_intermediate = 32)





In [None]:
# Metrics attached at compile time. The list order is kept as-is.
METRICS = [
    # Confusion-matrix counts
    tf.keras.metrics.TruePositives(name="tp"),
    tf.keras.metrics.FalsePositives(name="fp"),
    tf.keras.metrics.TrueNegatives(name="tn"),
    tf.keras.metrics.FalseNegatives(name="fn"),
    # Threshold-based summary metrics
    tf.keras.metrics.BinaryAccuracy(name="accuracy"),
    tf.keras.metrics.Precision(name="precision"),
    tf.keras.metrics.Recall(name="recall"),
    # Area under the ROC curve and under the precision-recall curve
    tf.keras.metrics.AUC(name="auc"),
    tf.keras.metrics.AUC(name="prc", curve="PR"),
]

# Model used for evaluation. It has a 256-unit intermediate dense stage.
weights_model = MyModel(
    network_model="mobilenet",
    pooling="max",
    extra_cols_out=128,
    dense_intermediate=256,
)
# from_logits=False matches the sigmoid activation on MyModel's output layer.
# No optimizer is passed to compile().
weights_model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=METRICS,
)

In [None]:
# Evaluate on the test split before load_weights() is called in the next cell.
weights_model.evaluate(test_dataset)

In [None]:
# Load saved weights from the 'weights/' path into weights_model.
weights_model.load_weights('weights/')

In [None]:
# Repeat the test-split evaluation, this time after load_weights() has run.
weights_model.evaluate(test_dataset)

In [None]:
# Evaluate on the dataset built from validation_df (the 2000 rows held out
# first in the split cell).
weights_model.evaluate(validation_dataset)