# Connected Components Partition
 - Partitioning Strategy: Recursive Dividing N=1000

In [1]:
import json
import os
import numpy as np

import lib
from lib.taxonomy.utils import SynonymsList
from lib.notebooks.vis_utils import tic, toc
from lib.taxonomy.loading import getEntryValues, gatherSynset, gatherPathsAndLabels, rootNode, rootNodeClasses
from lib.taxonomy.loading import setEntries, getEntries
from lib.taxonomy.loading import imageExists, Field2meta
from lib.taxonomy.loading import TRAINING_SET, TESTING_SET, NO_SET, VALIDATION_SET
from lib.taxonomy.graph_structure import Taxonomy, recursive_division
from lib.taxonomy.edge_extraction import partition_connected_components
from lib.taxonomy.io import generate_symlinks, make_directory_structure, create_symlinks
from lib.taxonomy.io import print_partition_statistics

import scipy.sparse as sp


In [2]:
# Input locations: image directory and the JSON metadata describing the images.
dataset_directory = '/ssd/esteva/skindata4/images/'
meta_file = '/ssd/esteva/skindata4/meta.json'

# Output locations for this split: symlink directories and the class-labels file.
train_dir = '/ssd/esteva/skindata4/splits/recursive_dividing_N=1000/train'
test_dir = '/ssd/esteva/skindata4/splits/recursive_dividing_N=1000/test'
labels_file = '/ssd/esteva/skindata4/splits/recursive_dividing_N=1000/labels.txt'

# Minimum 'skin_prob' and 'tax_path_score' an entry must have to be kept.
skin_prob = 0.4 
tax_path_score = 0.8 
# Parameter handed to recursive_division when the training classes are built.
N=1000

# List of manually curated images; only test entries whose basename appears
# in this file stay in the test set.
curated_test_file = '/ssd/esteva/skindata4/test_sets/validation_set.txt'

# Files with entries of the form [path/to/image] [label]
# All basenames listed in excluded_datasets will be omitted from train/val
excluded_datasets = [ 
        '/ssd/esteva/skindata4/test_sets/dermoscopy_test.txt',
        '/ssd/esteva/skindata4/test_sets/epidermal_test.txt',
        '/ssd/esteva/skindata4/test_sets/melanocytic_test.txt'
        ]


In [3]:
# Load the metadata and keep only the entries whose image exists on our filesystem.
# The file is opened in a `with` block so the handle is closed right after parsing.
with open(meta_file) as meta_fp:
    meta = json.load(meta_fp)
meta = [m for m in meta if imageExists(m, dataset_directory)]

# Connected components partition assigns one of TRAINING_SET or TESTING_SET to field 'set_identifier'
partition_connected_components(meta)


[FUNC: insert_datetime_field] Skipping 172 entries that could not load datetime
46288 Entries have the datetime metadata
Calculating time-camera edge matrix...




40000 / 43591 Elapsed Time: 5.61479592323 Time Remaining: 0.503915336227 Elapsed Time:  596.342364073
Adding 43591 edges to the graph
Adding 5183 turk edges to the graph
Adding 2647 turk edges to the graph
Adding 17204 dermquest edges to the graph
Adding 21434 edges to the graph based on identical filenames
We find 250181 connected components
Proposing test set from duplicate_urls_turk2.json
Proposed Test Set has 20958 entries
Partitioned Test Set has 19644 meta entries


In [4]:
# ISIC entries that carry an explicit benign/malignant label. They are taken
# from the unfiltered metadata and appended again after the score filters.
isic = [
    entry for entry in getEntries(meta, 'database', 'isic')
    if entry.get('label') in ('benign', 'malignant')
]

In [5]:
# Keep entries that meet the tax-path-score threshold, have a non-empty
# tax_path, and meet the skin-probability threshold (checked in that order).
meta = [
    m for m in meta
    if 'tax_path_score' in m and m['tax_path_score'] >= tax_path_score
    and m['tax_path']
    and 'skin_prob' in m and m['skin_prob'] >= skin_prob
]

# The labelled ISIC entries are added back regardless of the thresholds above.
# NOTE(review): isic was drawn from the unfiltered meta, so an ISIC entry that
# also passed the filters appears twice after this line -- confirm intended.
meta += isic

# Drop anything that the partition did not place in train or test.
meta = [m for m in meta if m['set_identifier'] in (TRAINING_SET, TESTING_SET)]


In [6]:
# Fix the naming convention issues of the top 9 categories (to dermal-tumor-benign, etc.)
# The root element of each tax_path is a list of name parts: join it with '-',
# map it through the synonyms list, and store the result split back into parts.
syns = SynonymsList()
for m in meta:
    root_parts = m['tax_path'][0]
    canonical = syns.synonymOf('-'.join(root_parts))
    m['tax_path'][0] = canonical.split('-')


In [8]:
# Rename 'label' field to 'disease_name'. 'label' will be used for integer labels.
for m in meta:
    if 'label' in m:
        m['disease_name'] = m['label']
        m['label'] = None

print "Kept Meta Entries: %d" % len(meta)

# Assign nine-way rootnode classes.
classes, labels = rootNodeClasses(meta)
setEntries(meta, 'label', labels)
setEntries(meta, 'clinical_label', labels)

meta_train = getEntries(meta, 'set_identifier', TRAINING_SET)
meta_test = getEntries(meta, 'set_identifier', TESTING_SET)

# Reassign to the training set new labels based on a recursive dividing treelearning partition.
taxonomy = Taxonomy(meta_train)
new_classes, new_names = recursive_division(taxonomy.top_node, N)
sort_indices = np.argsort(new_names)
new_classes = [new_classes[i] for i in sort_indices]
new_names = [new_names[i] for i in sort_indices]
for i, (new_class, new_name) in enumerate(zip(new_classes, new_names)):
    new_name = new_name.strip('/').replace('/', '_')
    for entry in new_class:
        entry['label'] = i
        entry['label_name'] = new_name
print 'Applying TreeLearning: Recursive Dividing with N=%d' % N

def collectSynset(meta_train):
    """Build the synset of training classes and verify that it is sorted.

    Args:
        meta_train (list): list of dicts in skindata format. Each dict must
            contain the fields 'label_name' and 'label'.

    Returns:
        list of str: one "[label_name] [label]" line per distinct
        (label_name, label) pair, ordered by label_name.

    Raises:
        AssertionError: if the name-ordered lines differ from a plain
            lexicographic sort of the same lines.
    """
    # Reduce the entries to their distinct (label_name, label) pairs.
    pairs = set((m['label_name'], m['label']) for m in meta_train)

    # Order by class name only, then render each pair as "name label".
    ordered = sorted(pairs, key=lambda pair: pair[0])
    synset = [" ".join([str(name), str(label)]) for name, label in ordered]

    # Sort check 1: the rendered lines are already in lexicographic order.
    resorted = np.sort(synset)
    assert all(expected == actual for expected, actual in zip(resorted, synset))

    # Sort check 2: the label column agrees between both orderings.
    resorted_labels = [line.split()[1] for line in resorted]
    synset_labels = [line.split()[1] for line in synset]
    assert all(expected == actual
               for expected, actual in zip(resorted_labels, synset_labels))

    return synset


# Sorted "[label_name] [label]" lines for the relabelled training entries.
synset = collectSynset(meta_train)


Kept Meta Entries: 146138
Initializing Taxonomy
Creating vertices...
Distributing metadata entries...
Initializing vertex variables...
Identifying root nodes...
Adding top node...
Applying TreeLearning: Recursive Dividing with N=1000


In [9]:
print 'Size of meta_train %d' % len(meta_train)
print 'Size of meta_test %d' % len(meta_test)

# Keep only the test set entries that have passed manual curation
print 'Keeping test set images that have been manually curated.',
print 'Using curated test file: %s' % curated_test_file
curated_test = [line.strip() for line in
                open(curated_test_file).readlines()]
curated_test = np.array([os.path.basename(t.split()[0]) for t in curated_test])

filename2meta = Field2meta(meta_test, field='filename')
for fn in curated_test:
    ms = filename2meta(fn)
    for m in ms:
        m['cc_keep'] = True

for m in meta_test:
    if 'cc_keep' not in m:
        m['set_identifier'] = NO_SET

# Exclude all specified datasets
for exclusion_file in excluded_datasets:
    filenames = [os.path.basename(line.strip().split()[0]) for line in open(exclusion_file).readlines()]

    for fn in filenames:
        ms = filename2meta(fn)
        for m in ms:
            m['set_identifier'] = NO_SET

meta_test = getEntries(meta, 'set_identifier', TESTING_SET)
print len(meta_test)


Size of meta_train 128038
Size of meta_test 18100
Keeping test set images that have been manually curated. Using curated test file: /ssd/esteva/skindata4/test_sets/validation_set.txt
14839


In [10]:
print 'Gathering paths and labels from the metadata'
trainset = np.unique(gatherPathsAndLabels(meta, dataset_directory, TRAINING_SET))
valset = np.unique(gatherPathsAndLabels(meta, dataset_directory, VALIDATION_SET))
testset = np.unique(gatherPathsAndLabels(meta, dataset_directory, TESTING_SET))
no_set = np.unique(gatherPathsAndLabels(meta, dataset_directory, NO_SET))

print_partition_statistics(meta, classes, dataset_directory)

# Make testing directory structure - rootnode classes
subclasses = np.unique([s.split()[0].split('_')[0] for s in synset])
make_directory_structure(test_dir, subclasses)
syms_test = generate_symlinks(testset, test_dir, subclasses)
create_symlinks(syms_test)


# Make training directory structure - taxonomy training classes
subclasses = np.unique([s.replace(' ', '_') for s in synset])
make_directory_structure(train_dir, subclasses)
syms_train = generate_symlinks(trainset, train_dir, subclasses)
create_symlinks(syms_train)

print 'Directory created: %s' % train_dir
print 'Directory created: %s' % test_dir

with open(labels_file, 'w') as f:
    prefix = ""
    for s in subclasses:
        f.write(prefix)
        f.write(s)
        prefix = "\n"
print 'Labels file created: %s' % labels_file


Gathering paths and labels from the metadata
Train and test share 0 images, according to filenames
Train and val share 0 images, according to filenames
Test and val share 0 images, according to filenames
Dataset sizes (Based on Metadata):
Train,	Val,	Test,	Total
1036 	0 	118 	1154
7919 	0 	872 	8791
950 	0 	76 	1026
4567 	0 	641 	5208
8808 	0 	1461 	10269
5085 	0 	368 	5453
81907 	0 	10740 	92647
13088 	0 	380 	13468
4678 	0 	183 	4861

128038 0 14839

Dataset sizes (Based on unique images):
Train,	Val,	Test,	Total
978 	0 	117 	1095
7618 	0 	858 	8476
925 	0 	76 	1001
4426 	0 	635 	5061
8443 	0 	1449 	9892
4959 	0 	366 	5325
77165 	0 	10666 	87831
13010 	0 	375 	13385
4595 	0 	170 	4765
# Unique Images in Training: 123162
# Unique Images in Validation: 0
# Unique Images in Testing: 14712

Directory created: /ssd/esteva/skindata4/splits/recursive_dividing_N=1000/train
Directory created: /ssd/esteva/skindata4/splits/recursive_dividing_N=1000/test
Labels file created: /ssd/esteva/skindata