In [1]:
# Esc + II to interrupt kernel
# ESC + 00 to restart kernel
import json
import os
import lib
from lib.taxonomy.utils import SynonymsList
from lib.notebooks.vis_utils import tic, toc
from lib.taxonomy.loading import getEntryValues, gatherSynset, gatherPathsAndLabels
from lib.taxonomy.loading import TRAINING_SET, TESTING_SET, NO_SET, VALIDATION_SET
import numpy as np

%matplotlib inline

%load_ext autoreload
%autoreload 2

In [2]:
# Directory in which the image named by each entry's 'filename' is looked up.
dataset_directory = '/archive/esteva/skindata4/images/'
# JSON file holding the metadata entries (loaded into meta_all).
meta_file = '/archive/esteva/skindata4/meta.json'

In [3]:
def imageExists(m, directory):
    """Check whether the image file of a metadata entry is present on disk.

        Args:
            m (dict): A dictionary in 'skindata' format; its 'filename' value is used.
            directory (str): The directory in which the image file is looked up.

        Returns:
            True if the file `directory`/`m['filename']` exists, False otherwise.
    """
    return os.path.exists(os.path.join(directory, m['filename']))
    

In [4]:
# Load every metadata entry; the context manager closes the file handle
# as soon as the JSON has been parsed.
with open(meta_file) as meta_fp:
    meta_all = json.load(meta_fp)

In [5]:
# Thresholds an entry has to reach in order to be kept
skin_prob = 0.4
tax_path_score = 0.8

# Entries whose image exists on our filesystem
meta_exists = [m for m in meta_all if imageExists(m, dataset_directory)]

# Of those, keep the entries with a high enough taxonomy-path score,
# a non-empty taxonomy path, and a high enough skin probability
meta = [
    m for m in meta_exists
    if 'tax_path_score' in m and m['tax_path_score'] >= tax_path_score
    and m['tax_path']
    and 'skin_prob' in m and m['skin_prob'] >= skin_prob
]

# Fix the naming convention issues of the top 9 categories:
# the root node of each taxonomy path is replaced by its synonym
syns = SynonymsList()
for m in meta:
    m['tax_path'][0] = syns.synonymOf('-'.join(m['tax_path'][0])).split('-')

# Move the 'label' field to 'disease_name' and blank out 'label'
for m in meta:
    if 'label' in m:
        m['disease_name'] = m['label']
        m['label'] = None

print("Total Meta Entries: %d" % len(meta_all))
print("Kept Meta Entries: %d" % len(meta))

Total Meta Entries: 300289
Kept Meta Entries: 135393


In [6]:
def rootNodeClasses(meta):
    """Derive integer class labels from the taxonomy root node of every entry.

        Args:
            meta(list): a list of dictionaries in 'skindata' format.

        Returns:
            A dictionary of [label : class_name] pairs, where labels follow the
                sorted order of the distinct root node names.
            A numpy array holding the integer label of each entry in meta.
    """
    names = [rootNode(entry) for entry in meta]
    ordered = np.sort(list(set(names)))
    label_of = {name: idx for idx, name in enumerate(ordered)}
    classes = {idx: name for idx, name in enumerate(ordered)}
    return classes, np.array([label_of[name] for name in names])


def rootNode(data_point):
    """Name of the taxonomy root node of a data point.

        Args:
            data_point(dict): a dict in skindata format; the first element of
                its 'tax_path' is joined with '-'.
        Returns:
            The name of the root node.
    """
    root_parts = data_point['tax_path'][0]
    return '-'.join(root_parts)

def setEntries(meta, key, values):
    """Set the key in each entry of meta to its corresponding entry in values.

        Args:
            meta (list): A list of dictionaries in 'skindata' format.
            key (string): the key to add/alter in each dictionary of meta.
            values (iterable, or single value): the values to assign, one per entry of meta.
                A non-iterable value, or a string, is treated as a single value and
                assigned to every entry. One-shot iterables (e.g. generators) are accepted.

        Raises:
            AssertionError: if the number of values differs from the number of entries.
    """
    # Strings expose __iter__ on Python 3 but not on Python 2; always treat them
    # as one value so the behaviour does not depend on the interpreter.
    is_single_value = isinstance(values, (str, bytes)) or not hasattr(values, '__iter__')
    if is_single_value:
        values = len(meta) * [values]
    elif not hasattr(values, '__len__'):
        # Generators and similar iterables have no len(); materialize them first.
        values = list(values)

    assert len(meta) == len(values), \
        'expected %d values, got %d' % (len(meta), len(values))
    for m, v in zip(meta, values):
        m[key] = v

# Label every entry with the integer index of its root node, then list the classes
classes, labels = rootNodeClasses(meta)
for field in ('label', 'clinical_label'):
    setEntries(meta, field, labels)
for k, v in classes.iteritems():
    print('%s %s' % (k, v))
    

0 cutaneous-lymphoma
1 dermal-tumor-benign
2 dermal-tumor-malignant
3 epidermal-tumor-benign
4 epidermal-tumor-malignant
5 genodermatosis
6 inflammatory
7 pigmented-lesion-benign
8 pigmented-lesion-malignant


In [10]:
# Calculate time-camera edge matrix as a sparse matrix (about 2 minutes)
# NOTE(review): wildcard import in the middle of the notebook. The helpers used from here on
# (insert_datetime_field, insert_abs_time_field, getEntries, extract_cameras, edge_matrix,
# squareform_sparse, ...) presumably come from this module -- confirm, then import them
# explicitly in the top import cell.
from lib.taxonomy.edge_extraction import *

# Work on every entry whose image exists, not only on the thresholded subset.
meta = meta_exists
# Remember each entry's position in meta; later cells use it as the row/column
# of that entry in the N x N edge matrix.
for i,m in enumerate(meta):
    m['index'] = i
    
# Presumably these add the 'datetime' and 'abs_time' fields to the entries in place
# (both fields are read below) -- TODO confirm against lib.taxonomy.edge_extraction.
insert_datetime_field(meta)
insert_abs_time_field(meta)

# Presumably the entries that carry a 'datetime' field -- TODO confirm what a value of None
# means for getEntries.
meta_datetime = getEntries(meta, 'datetime', None)
cameras, camera_models = extract_cameras(meta_datetime)
# Both inputs of edge_matrix are reshaped to column vectors with one row per entry of meta_datetime.
abs_times = np.array([m['abs_time'] for m in meta_datetime]).reshape((-1,1))
cam_indx = np.array([camera_models[c] for c in cameras]).reshape((-1,1))
print '%d Entries have the datetime metadata' % len(meta_datetime)

print 'Calculating time-camera edge matrix...'
# M: number of entries with datetime metadata; N: number of entries in meta.
M = len(meta_datetime)
N = len(meta)

# tic/toc time the edge computation.
t = tic()
# Only the first of the three values returned by edge_matrix is used.
edge, _, _= edge_matrix(abs_times, cam_indx)
# NOTE(review): looks like this expands a condensed result into an M x M sparse matrix -- confirm.
edge = squareform_sparse(edge, M)
toc(t)

[FUNC: insert_datetime_field] Skipping 172 entries that could not load datetime
46288 Entries have the datetime metadata
Calculating time-camera edge matrix...




40000 / 43591 Elapsed Time: 5.56227016449 Time Remaining: 0.499201267231 Elapsed Time:  100.792287827


In [11]:
# Initialize the N x N Edge matrix
# (will add edges to it, below)
# N is len(meta), so row/column k corresponds to the entry whose 'index' is k.
# The matrix is a boolean scipy LIL sparse matrix that the following cells fill in.

# NOTE(review): this import would normally live in the import cell at the top of the notebook.
import scipy.sparse as sp
E = sp.lil_matrix((N,N), dtype=bool)

In [12]:
# Copy the datetime edges into E, translating positions within
# meta_datetime into positions within meta through each entry's 'index'
c = 0
for i, j, v in sparse_matrix_iterator(edge):
    if not v:
        continue
    E[meta_datetime[i]['index'], meta_datetime[j]['index']] = v
    c += 1
print('Adding %d edges to the graph' % c)

Adding 43591 edges to the graph


In [13]:
# JSON files listing the duplicates found in the turk1 and turk2 rounds;
# the loop below adds them to the edge matrix.
turk_results = [
    '/archive/esteva/skindata4/duplicate_urls_turk1.json',
    '/archive/esteva/skindata4/duplicate_urls_turk2.json',    
    ]

def dict2list(dict_):
    """Flatten a {key: iterable} mapping into a list of lists.

        Args:
            dict_ (dict): mapping from a key to an iterable of values.

        Returns:
            One list per item of dict_, made of the key followed by its values.
    """
    return [[key] + list(value) for key, value in dict_.iteritems()]

for tr in turk_results:
    # Close each results file as soon as it has been parsed
    with open(tr, 'r') as turk_file:
        turk = json.load(turk_file)
    # Each group is a key followed by the values listed for it; entries are matched by filename
    duplicates = dict2list(turk)
    insert_edges_into_edge_matrix(E, duplicates, meta, field_name='filename')
    print('Adding %d edges to the graph' % np.sum([len(v)-1 for v in duplicates]))

Adding 5183 edges to the graph
Adding 2647 edges to the graph


In [14]:
# Link the dermquest entries that belong to the same case.
dermquest = getEntries(getEntries(meta, 'database', 'dermquest'), 'case', None)
case2meta = Field2meta(dermquest, field='case')
cases = np.unique(getEntryValues(dermquest, 'case'))
# One group of meta indices per distinct case
duplicates = []
for case in cases:
    duplicates.append([m['index'] for m in case2meta(case)])
insert_edges_into_edge_matrix(E, duplicates, meta, field_name='index')
print('Adding %d edges to the graph' % np.sum([len(group) - 1 for group in duplicates]))

Adding 17204 edges to the graph


In [15]:
# Add meta entries that share the same filenames as edges
filename2meta = Field2meta(meta, field='filename')
filenames = np.unique([m['filename'] for m in meta])
duplicates = []
for fn in filenames:
    meta_filename = filename2meta(fn)
    # Every filename was taken from meta, so an empty lookup means the index is broken.
    # Fail loudly: stopping early would silently drop the remaining duplicate edges.
    if len(meta_filename) == 0:
        raise RuntimeError('No meta entries found for filename %r' % fn)
    if len(meta_filename) > 1:
        duplicates.append([m['index'] for m in meta_filename])
insert_edges_into_edge_matrix(E, duplicates, meta, field_name='index')
print('Adding %d edges to the graph' % np.sum([len(v)-1 for v in duplicates]))

Adding 21434 edges to the graph


In [16]:
# Find the connected components of the (undirected) edge graph and
# store on every entry the id of the component it belongs to
n_components, connected_components = sp.csgraph.connected_components(E, directed=False)
unique_component_numbers, component_sizes = np.unique(
    connected_components, return_counts=True)

for m, component_id in zip(meta, connected_components):
    m['connected_component'] = component_id
print('We find %d connected components' % n_components)

We find 238254 connected components


In [17]:
# Propose a test set (from the turked set)
test_set_file = '/archive/esteva/skindata4/duplicate_urls_turk2.json'
# Only the keys of the turk2 listing are used; they are looked up as filenames below.
# The context manager closes the file once it has been parsed.
with open(test_set_file, 'r') as test_set_fp:
    test_set = list(json.load(test_set_fp))

filename2meta = Field2meta(meta, field='filename')
cc2meta = Field2meta(meta, field='connected_component')

# Everything starts in the training set; entries matching a proposed filename move to test
meta_test = [m for fn in test_set for m in filename2meta(fn)]
setEntries(meta, 'set_identifier', TRAINING_SET)
setEntries(meta_test, 'set_identifier', TESTING_SET)

print('Proposed Test Set has %d entries' % len(meta_test))

Proposed Test Set has 20958 entries


In [18]:
# Iterate over elements of the test set and push connected components to train or test

def component_is_split(comp):
    """Tell whether a connected component is spread over two sets.

        Entries marked NO_SET are ignored; the component counts as split when
        exactly two distinct set identifiers remain among its entries.
    """
    assigned_sets = {m['set_identifier'] for m in comp if m['set_identifier'] != NO_SET}
    return len(assigned_sets) == 2

def random_partition(meta_test, return_splits=False):
    """Send every split component reachable from meta_test to train or test by a coin flip.

        Args:
            meta_test (list): entries whose connected components are inspected.
            return_splits (bool): when True, return the components that were reassigned.

        Returns:
            The list of reassigned components if return_splits is True, otherwise None.
    """
    split_comps = []
    for m in meta_test:
        comp = cc2meta(m['connected_component'])
        if not component_is_split(comp):
            continue
        target = TRAINING_SET if np.random.rand() < 0.5 else TESTING_SET
        setEntries(comp, 'set_identifier', target)
        split_comps.append(comp)
    return split_comps if return_splits else None
    
def maxset_partition(meta_test):
    """Move each split component entirely into the set that already holds more of its entries.

        Ties go to the test set. Components are looked up through the
        'connected_component' of every entry in meta_test.
    """
    for m in meta_test:
        comp = cc2meta(m['connected_component'])
        if not component_is_split(comp):
            continue
        n_test = len(getEntries(comp, 'set_identifier', TESTING_SET))
        n_train = len(getEntries(comp, 'set_identifier', TRAINING_SET))
        majority = TESTING_SET if n_test >= n_train else TRAINING_SET
        setEntries(comp, 'set_identifier', majority)

# Resolve the split components, then re-collect the entries that ended up in the test set
maxset_partition(meta_test)
meta_test = getEntries(meta, 'set_identifier', TESTING_SET)

print('Partitioned Test Set has %d meta entries' % len(meta_test))

Partitioned Test Set has 19644 meta entries


In [20]:
# Assign NO_SET to meta entries that don't make the thresholds, and reduce meta to just those that do.
# The pruned entries are marked NO_SET instead of being dropped, so meta_exists (and meta_all, which
# shares the same dicts) still holds every entry.

meta = meta_exists
pruned = 0
for m in meta:
    missing_field = 'skin_prob' not in m or 'tax_path_score' not in m or 'tax_path' not in m
    # An entry with an empty tax_path has no root node to label, so it is pruned too
    # (the same rule the initial filtering of meta applies).
    if (missing_field or not m['tax_path']
            or m['skin_prob'] < skin_prob or m['tax_path_score'] < tax_path_score):
        m['set_identifier'] = NO_SET
        pruned += 1

# Re-derive the labels on the surviving train/test entries
meta = [m for m in meta if m['set_identifier'] in [TRAINING_SET, TESTING_SET]]
classes, labels = rootNodeClasses(meta)
setEntries(meta, 'label', labels)
setEntries(meta, 'clinical_label', labels)

meta_train = getEntries(meta, 'set_identifier', TRAINING_SET)
meta_test = getEntries(meta, 'set_identifier', TESTING_SET)
synset = gatherSynset(meta_train)

print('Pruning out %d / %d meta entries to NO_SET and assigning labels and clinical labels' % (pruned, len(meta_exists)))
print('Size of meta_train %d' % len(meta_train))
print('Size of meta_test %d' % len(meta_test))


Pruning out 157244 / 292637 meta entries to NO_SET and assigning labels and clinical labels
Final size of meta_train 117293
Final size of meta_test 18100


In [46]:
# Keep only the test set entries that have passed manual curation
# (the context manager closes the listing once it has been read)
with open('/archive/esteva/skindata4/splits/nine-way/test_curated.txt') as curated_file:
    curated_test = [line.strip() for line in curated_file]
# The first whitespace-separated token of each line is the image path; blank lines are skipped
curated_test = np.array([os.path.basename(t.split()[0]) for t in curated_test if t])

# Flag the test entries whose filename appears in the curated listing
filename2meta = Field2meta(meta_test, field='filename')
for fn in curated_test:
    for m in filename2meta(fn):
        m['cc_keep'] = True

# Everything in the test set that was not flagged is moved to NO_SET
for m in meta_test:
    if 'cc_keep' not in m:
        m['set_identifier'] = NO_SET

meta_test = getEntries(meta, 'set_identifier', TESTING_SET)
print(len(meta_test))

14839


In [48]:
print('Generating Training, Validation, and Testing sets.')

# Gather each dataset's paths and labels. Since some images have multiple
# diseases, only the unique 'path [label]' entries are kept.
trainset = np.unique(gatherPathsAndLabels(meta, dataset_directory, TRAINING_SET))
valset = np.unique(gatherPathsAndLabels(meta, dataset_directory, VALIDATION_SET))
testset = np.unique(gatherPathsAndLabels(meta, dataset_directory, TESTING_SET))
no_set = gatherPathsAndLabels(meta, dataset_directory, NO_SET)
noset = np.unique(no_set)

# Sanity check: no image file name may appear in both train and test
trainpaths = np.unique([os.path.basename(entry.split()[0]) for entry in trainset])
testpaths = np.unique([os.path.basename(entry.split()[0]) for entry in testset])
intersection = np.intersect1d(trainpaths, testpaths)
print('Train and test share %d images, according to filenames' % len(intersection))

# Selectors returning the entries of meta that belong to a given set and carry clinical label c.
getClassFromValidationSet = lambda meta, c: [m for m in meta if m['set_identifier'] == VALIDATION_SET and m['clinical_label'] == c]
getClassFromTrainingSet = lambda meta, c: [m for m in meta if m['set_identifier'] == TRAINING_SET and m['clinical_label'] == c]
getClassFromTestingSet = lambda meta, c: [m for m in meta if m['set_identifier'] == TESTING_SET and m['clinical_label'] == c]
# Per-class counts of meta entries. Iterating classes yields its keys, i.e. the integer labels.
print 'Dataset sizes (Based on Metadata):'
print 'Train,\tVal,\tTest,\tTotal'
for c in classes:
    v = len(getClassFromValidationSet(meta, c))
    t = len(getClassFromTrainingSet(meta, c))
    te = len(getClassFromTestingSet(meta, c))
    print t, '\t', v, '\t', te, '\t', v + t + te

# Totals per set; the trailing commas (Python 2 print statement) keep the three numbers on one line.
print ''
print len(getEntries(meta, 'set_identifier', TRAINING_SET)),
print len(getEntries(meta, 'set_identifier', VALIDATION_SET)),
print len(getEntries(meta, 'set_identifier', TESTING_SET))
print ''

# Same per-class table, counting distinct filenames instead of meta entries.
print 'Dataset sizes (Based on unique images):'
print 'Train,\tVal,\tTest,\tTotal'
for c in classes:    
    v = len(np.unique([m['filename'] for m in getClassFromValidationSet(meta, c)]))
    t = len(np.unique([m['filename'] for m in getClassFromTrainingSet(meta, c)]))
    te = len(np.unique([m['filename'] for m in getClassFromTestingSet(meta, c)]))
    print t, '\t', v, '\t', te, '\t', v + t + te
    
# NOTE(review): trainset/valset/testset/noset hold the unique 'path [label]' entries gathered
# earlier, so an image carrying several labels is presumably counted once per label here -- confirm.
print '# Unique Images in Training:', len(trainset)
print '# Unique Images in Validation:', len(valset)
print '# Unique Images in Testing:', len(testset)
print '# Unique Images tossed out:', len(noset)
print ''

Generating Training, Validation, and Testing sets.
Train and test share 0 images, according to filenames
Dataset sizes (Based on Metadata):
Train,	Val,	Test,	Total
1036 	0 	118 	1154
7919 	0 	872 	8791
950 	0 	76 	1026
4567 	0 	641 	5208
8808 	0 	1461 	10269
5085 	0 	368 	5453
81907 	0 	10740 	92647
2777 	0 	380 	3157
4244 	0 	183 	4427

117293 0 14839

Dataset sizes (Based on unique images):
Train,	Val,	Test,	Total
978 	0 	117 	1095
7618 	0 	858 	8476
925 	0 	76 	1001
4426 	0 	635 	5061
8443 	0 	1449 	9892
4959 	0 	366 	5325
77165 	0 	10666 	87831
2699 	0 	375 	3074
4161 	0 	170 	4331
# Unique Images in Training: 111374
# Unique Images in Validation: 0
# Unique Images in Testing: 14712
# Unique Images tossed out: 3245



In [73]:
# Make training directory structure
def make_directory_structure(dir_name, subfolders):
    """Create dir_name together with one sub-directory per entry of subfolders.

        Directories that already exist are left untouched, so the call can be
        repeated and subfolders may contain repeated names.

        Args:
            dir_name (str): the directory to create.
            subfolders (iterable of str): names of the sub-directories of dir_name.
    """
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    for s in subfolders:
        p = os.path.join(dir_name, s)
        # os.makedirs raises when the folder is already there (re-run, repeated name)
        if not os.path.isdir(p):
            os.makedirs(p)
    print('Creating %s' % dir_name)
    
# The second whitespace-separated token of each synset line names the class sub-folder
subclasses = [s.split()[1] for s in synset]
make_directory_structure('./train', subclasses)
make_directory_structure('./val', subclasses)
# The test symlinks are generated under './test', so that tree has to exist as well
make_directory_structure('./test', subclasses)

Creating ./train
Creating ./val


In [76]:
def generate_symlinks(dataset, dirname, subclasses):
    """Build the 'source destination' description of one symlink per dataset entry.

        Args:
            dataset (iterable of str): entries whose first whitespace-separated token
                is an image path and whose second token is an integer label.
            dirname (str): directory under which the links are placed.
            subclasses (list of str): sub-folder name for every integer label.

        Returns:
            A list of strings '<path> <dirname>/<subclass>/<basename of path>'.
    """
    syms = []
    for entry in dataset:
        fields = entry.split()
        src = fields[0]
        label = int(fields[1])
        dst = os.path.join(dirname, subclasses[label], os.path.basename(src))
        syms.append(" ".join((src, dst)))
    return syms

# 'source destination' strings for the training links (under ./train) and the test links (under ./test).
syms_train = generate_symlinks(trainset, './train', subclasses)
syms_test = generate_symlinks(testset, './test', subclasses)

In [75]:
def create_symlinks(symlinks):
    """Create the symbolic links described by 'source destination' strings.

        A destination that already is a symlink to the same source is skipped, so
        the call can be repeated. Any other existing destination still makes
        os.symlink raise OSError.

        Args:
            symlinks (iterable of str): entries whose first whitespace-separated token
                is the link source and whose second token is the link destination.
    """
    for entry in symlinks:
        parts = entry.split()
        src = parts[0]
        dst = parts[1]
        # A link left over from a previous run is fine when it points at the same source
        if os.path.islink(dst) and os.readlink(dst) == src:
            continue
        os.symlink(src, dst)
        
# Materialize the links described by syms_train and syms_test on the filesystem.
create_symlinks(syms_train)
create_symlinks(syms_test)