In [1]:
from utils import *

In [20]:
AUTOTUNE = tf.data.experimental.AUTOTUNE
data_path = Path('../')
BATCH_SIZE = 32
IMAGE_SIZE = [512, 512]

In [4]:
len(tf.io.gfile.glob(str(data_path) + '/train*.tfrec'))

30

In [13]:
TRAIN_FILENAMES, VALID_FILENAMES = train_test_split(get_files_from_dir(data_path, 'train', 'tfrec'), test_size=0.1, random_state=8)
TEST_FILENAMES = get_files_from_dir(data_path, 'test', 'tfrec')

print('Train TFRecord Files:', len(TRAIN_FILENAMES))
print('Validation TFRecord Files:', len(VALID_FILENAMES))
print('Test TFRecord Files:', len(TEST_FILENAMES))

Train TFRecord Files: 27
Validation TFRecord Files: 3
Test TFRecord Files: 16


In [18]:
def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy() # BytesList won't unpack a string from an EagerTensor.
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
    """Returns a float_list from a float / double."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
    """Returns an int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


In [19]:
feature = {
  'image': _bytes_feature,
  'image_name': _bytes_feature,
  'patient_id': _int64_feature,
  'sex': _int64_feature,
  'age_approx': _int64_feature,
  'anatom_site_general_challenge': _int64_feature,
  'source': _int64_feature,
  'target': _int64_feature
}

In [21]:
def decode_image(image, channels=3):
    image = tf.image.decode_jpeg(image, channels=channels)
    image = tf.cast(image, tf.float32) / 255.0
    image = tf.reshape(image, [*IMAGE_SIZE, channels])
    return image

In [25]:
def read_tfrecord(example, for_train=True):
    feature = {
        'image': _bytes_feature,
        'image_name': _bytes_feature,
        'patient_id': _int64_feature,
        'sex': _int64_feature,
        'age_approx': _int64_feature,
        'anatom_site_general_challenge': _int64_feature,
        'source': _int64_feature,
        'target': _int64_feature
    } if for_train else {
          'image': _bytes_feature,
          'image_name': _bytes_feature,
          'patient_id': _int64_feature,
          'sex': _int64_feature,
          'age_approx': _int64_feature,
          'anatom_site_general_challenge': _int64_feature
    }
    example = tf.io.parse_single_example(example, feature)
    image = decode_image(example['image'])
    return image, example['target']

In [23]:
def augmentation_pipeline(image, label):
    image = tf.image.random_flip_left_right(image)
    return image, label

In [26]:
def load_dataset(filenames, for_train=True):
    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
    dataset = dataset.map(read_tfrecord, num_parallel_calls=AUTOTUNE)
    return dataset

In [27]:
def get_training_dataset():
    dataset = load_dataset(TRAIN_FILENAMES)
    dataset = dataset.map(augmentation_pipeline, num_parallel_reads=AUTOTUNE)
    dataset = dataset.repeat().shuffle(2048).batch(BATCH_SIZE)
    return dataset.prefetch(AUTOTUNE)

In [30]:
def get_validation_dataset():
    dataset = load_dataset(VALID_FILENAMES)
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(AUTOTUNE)
    return dataset.prefetch(AUTOTUNE)

In [31]:
def get_test_dataset():
    dataset = load_dataset(TEST_FILENAMES, for_train=False)
    dataset = dataset.batch(BATCH_SIZE)
    return dataset.prefetch(AUTOTUNE)

In [36]:
import re
def count_data_items(filenames):
    n = [int(re.compile(r"-([0-9]*)\.").search(filename).group(1)) for filename in filenames]
    return np.sum(n)

In [35]:
NUM_TRAINING_IMAGES = count_data_items(TRAIN_FILENAMES)
NUM_VALIDATION_IMAGES = count_data_items(VALID_FILENAMES)
NUM_TEST_IMAGES = count_data_items(TEST_FILENAMES)
STEPS_PER_EPOCH = NUM_TRAINING_IMAGES // BATCH_SIZE
print(
    'Dataset: {} training images, {} validation images, {} unlabeled test images'.format(
        NUM_TRAINING_IMAGES, NUM_VALIDATION_IMAGES, NUM_TEST_IMAGES
    )
)

Dataset: 55917 training images, 4570 validation images, 10982 unlabeled test images


In [37]:
def make_model(output_bias = None, metrics = None):
    base_model = tf.keras.applications.VGG16(input_shape=(*IMAGE_SIZE, 3),
                                             include_top=False,
                                             weights='imagenet')
    
    base_model.trainable = False
    
    model = tf.keras.Sequential([
        base_model,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(8, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid',
                              bias_initializer=output_bias)
    ])
    
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
                  loss='binary_crossentropy',
                  metrics=metrics)
    
    return model

In [38]:
train_dataset = get_training_dataset()
valid_dataset = get_validation_dataset()

ValueError: in user code:

    <ipython-input-25-694f1c32a7a8>:19 read_tfrecord  *
        example = tf.io.parse_single_example(example, feature)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/parsing_ops.py:447 parse_single_example_v2  **
        return parse_example_v2(serialized, features, example_names, name)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/parsing_ops.py:309 parse_example_v2
        RaggedFeature
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/parsing_config.py:461 from_features
        (type(feature).__name__, feature))

    ValueError: Unsupported function <function _int64_feature at 0x7f21d7ded730>.
