# Drawing Board

In [74]:
import json
import os
import numpy as np

import lib
from lib.taxonomy.utils import SynonymsList
from lib.notebooks.vis_utils import tic, toc
from lib.taxonomy.loading import getEntryValues, gatherSynset, gatherPathsAndLabels, rootNode, rootNodeClasses
from lib.taxonomy.loading import setEntries
from lib.taxonomy.loading import imageExists
from lib.taxonomy.loading import TRAINING_SET, TESTING_SET, NO_SET, VALIDATION_SET
from lib.taxonomy.graph_structure import Taxonomy, recursive_division
from lib.taxonomy.edge_extraction import *
from lib.taxonomy.io import generate_symlinks, make_directory_structure, create_symlinks
from lib.taxonomy.io import print_partition_statistics

import scipy.sparse as sp

dataset_directory = '/archive/esteva/skindata4/images/'
meta_file = '/archive/esteva/skindata4/meta.json'

train_dir = '/archive/esteva/skindata4/splits/recursive_dividing_N=1000/train-tmp'
test_dir = '/archive/esteva/skindata4/splits/recursive_dividing_N=1000/test-tmp'
labels_file = '/archive/esteva/skindata4/splits/recursive_dividing_N=1000/labels-tmp.txt'

skin_prob = 0.4
tax_path_score = 0.8
N=1000


In [76]:
if True:
    if os.path.exists(train_dir):
        print 'Train dir %s exists, exiting' % train_dir
        raise ValueError('gadfg')
    if os.path.exists(test_dir):
        print
        'Test dir %s exists, exiting' % test_dir
        raise ValueError('gadfg')

    # We load in images that exist on our filesystem,
    meta = json.load(open(meta_file))
    meta = [m for m in meta if imageExists(m, dataset_directory)]

    # Connected components partition assigns one of TRAINING_SET or TESTING_SET to field 'set_identifier'
    partition_connected_components(meta)

    # Keep meta with desired skin probs and tax path scores
    meta = [m for m in meta if 'tax_path_score' in m and m['tax_path_score'] >= tax_path_score]
    meta = [m for m in meta if m['tax_path']]
    meta = [m for m in meta if 'skin_prob' in m and m['skin_prob'] >= skin_prob]
    meta = [m for m in meta if m['set_identifier'] in [TRAINING_SET, TESTING_SET]]

    # Fix the naming convention issues of the top 9 categories (to dermal-tumor-benign, etc.)
    syns = SynonymsList()
    for m in meta:
        rootname = '-'.join(m['tax_path'][0])
        rootrename = syns.synonymOf(rootname).split('-')
        m['tax_path'][0] = rootrename

    # Rename 'label' field to 'disease_name'. 'label' will be used for integer labels.
    for m in meta:
        if 'label' in m:
            m['disease_name'] = m['label']
            m['label'] = None

    print "Kept Meta Entries: %d" % len(meta)

    # Assign nine-way rootnode classes.
    classes, labels = rootNodeClasses(meta)
    setEntries(meta, 'label', labels)
    setEntries(meta, 'clinical_label', labels)

    meta_train = getEntries(meta, 'set_identifier', TRAINING_SET)
    meta_test = getEntries(meta, 'set_identifier', TESTING_SET)


[FUNC: insert_datetime_field] Skipping 172 entries that could not load datetime
46288 Entries have the datetime metadata
Calculating time-camera edge matrix...




40000 / 43591 Elapsed Time: 6.21266889572 Time Remaining: 0.557573094064 Elapsed Time:  589.259970903
Adding 43591 edges to the graph
Adding 5183 turk edges to the graph
Adding 2647 turk edges to the graph
Adding 17204 dermquest edges to the graph
Adding 21434 edges to the graph based on identical filenames
We find 238254 connected components
Proposing test set from /archive/esteva/skindata4/duplicate_urls_turk2.json
Proposed Test Set has 20958 entries
Partitioned Test Set has 19644 meta entries
Kept Meta Entries: 135393


In [116]:
if True:
    taxonomy = Taxonomy(meta_train)
    print 'Applying TreeLearning: Recursive Dividing with N=%d' % N
    new_classes, new_names = recursive_division(taxonomy.top_node, N)

Initializing Taxonomy
Creating vertices...
Distributing metadata entries...
Initializing vertex variables...
Identifying root nodes...
Adding top node...
Applying TreeLearning: Recursive Dividing with N=1000


In [181]:
    # Reorder both parallel lists by class name (np.argsort order); a class's
    # position in that order becomes the integer label of all its entries.
    sort_indices = np.argsort(new_names)
    new_names = [new_names[idx] for idx in sort_indices]
    new_classes = [new_classes[idx] for idx in sort_indices]
    for label, members in enumerate(new_classes):
        # Turn the '/'-separated name into a flat one: trim the outer
        # slashes, then replace the inner ones with '_'.
        flat_name = new_names[label].strip('/').replace('/', '_')
        for entry in members:
            entry['label'] = label
            entry['label_name'] = flat_name

In [186]:
def collectSynset(meta_train):
    """Build the de-duplicated synset, ordered by class name, and verify its order.

    Args:
        meta_train (list): list of dicts in skindata format. Every dict must
            contain the fields 'label_name' and 'label'.

    Returns:
        List of strings in the format "[label_name] [label]", one per unique
        (label_name, label) pair, ordered by label_name.

    Raises:
        AssertionError: if the formatted strings are not in sorted order.
    """
    # Collapse repeated (name, label) pairs, then order the pairs by name only.
    unique_pairs = set((entry['label_name'], entry['label']) for entry in meta_train)
    ordered_pairs = sorted(unique_pairs, key=lambda pair: pair[0])
    synset = [" ".join(str(field) for field in pair) for pair in ordered_pairs]

    # Sanity check 1: sorting the formatted lines must not move anything.
    resorted = np.sort(synset)
    assert all(expected == actual for expected, actual in zip(resorted, synset))

    # Sanity check 2: the second token of every line must agree as well.
    resorted_tokens = [line.split()[1] for line in resorted]
    synset_tokens = [line.split()[1] for line in synset]
    assert all(expected == actual
               for expected, actual in zip(resorted_tokens, synset_tokens))

    return synset

# Collect the "[label_name] [label]" class list from the training entries.
# collectSynset reads the 'label_name' and 'label' fields of every entry.
synset = collectSynset(meta_train)


In [188]:
    # Keep only the test set entries that have passed manual curation
    curated_test_file = '/archive/esteva/skindata4/splits/test_curated.txt'
    print 'Keeping test set images that have been manually curated.',
    print 'Using curated test file: %s' % curated_test_file
    curated_test = [line.strip() for line in
                    open(curated_test_file).readlines()]
    curated_test = np.array([os.path.basename(t.split()[0]) for t in curated_test])

    filename2meta = Field2meta(meta_test, field='filename')
    for fn in curated_test:
        ms = filename2meta(fn)
        for m in ms:
            m['cc_keep'] = True

    for m in meta_test:
        if 'cc_keep' not in m:
            m['set_identifier'] = NO_SET

    meta_test = getEntries(meta, 'set_identifier', TESTING_SET)
    print len(meta_test)

    print 'Gathering paths and labels from the metadata'
    trainset = np.unique(gatherPathsAndLabels(meta, dataset_directory, TRAINING_SET))
    valset = np.unique(gatherPathsAndLabels(meta, dataset_directory, VALIDATION_SET))
    testset = np.unique(gatherPathsAndLabels(meta, dataset_directory, TESTING_SET))
    no_set = np.unique(gatherPathsAndLabels(meta, dataset_directory, NO_SET))

    print_partition_statistics(meta, classes, dataset_directory)


Keeping test set images that have been manually curated. Using curated test file: /archive/esteva/skindata4/splits/test_curated.txt
14839
Gathering paths and labels from the metadata
Train and test share 0 images, according to filenames
Train and val share 0 images, according to filenames
Test and val share 0 images, according to filenames
Dataset sizes (Based on Metadata):
Train,	Val,	Test,	Total
1036 	0 	118 	1154
7919 	0 	872 	8791
950 	0 	76 	1026
4567 	0 	641 	5208
8808 	0 	1461 	10269
5085 	0 	368 	5453
81907 	0 	10740 	92647
2777 	0 	380 	3157
4244 	0 	183 	4427

117293 0 14839

Dataset sizes (Based on unique images):
Train,	Val,	Test,	Total
978 	0 	117 	1095
7618 	0 	858 	8476
925 	0 	76 	1001
4426 	0 	635 	5061
8443 	0 	1449 	9892
4959 	0 	366 	5325
77165 	0 	10666 	87831
2699 	0 	375 	3074
4161 	0 	170 	4331
# Unique Images in Training: 112417
# Unique Images in Validation: 0
# Unique Images in Testing: 14712



In [207]:
    # Make testing directory structure
    # Each test directory name is the first whitespace-delimited token of a
    # synset line, cut at its first '_'; np.unique de-duplicates and sorts them.
    test_names = [line.split()[0].split('_')[0] for line in synset]
    subclasses = np.unique(test_names)
    make_directory_structure(test_dir, subclasses)
    syms_test = generate_symlinks(testset, test_dir, subclasses)
    create_symlinks(syms_test)

In [205]:
subclasses

array(['cutaneous-lymphoma', 'dermal-tumor-benign',
       'dermal-tumor-malignant 90', 'epidermal-tumor-benign',
       'epidermal-tumor-malignant', 'genodermatosis', 'inflammatory',
       'inflammatory 210', 'pigmented-lesion-benign',
       'pigmented-lesion-benign 710', 'pigmented-lesion-malignant'], 
      dtype='|S27')

In [208]:
    # Make training directory structure
    # Each training directory name is a full synset line with its spaces
    # replaced by '_'; np.unique de-duplicates and sorts them.
    train_names = [line.replace(' ', '_') for line in synset]
    subclasses = np.unique(train_names)
    make_directory_structure(train_dir, subclasses)
    syms_train = generate_symlinks(trainset, train_dir, subclasses)
    create_symlinks(syms_train)


In [209]:
    print 'Directory created: %s' % train_dir
    print 'Directory created: %s' % test_dir

    with open(labels_file, 'w') as f:
        prefix = ""
        for s in subclasses:
            f.write(prefix)
            f.write(s)
            prefix = "\n"
    print 'Labels file created: %s' % labels_file


Directory created: /archive/esteva/skindata4/splits/recursive_dividing_N=1000/train-tmp
Directory created: /archive/esteva/skindata4/splits/recursive_dividing_N=1000/test-tmp
Labels file created: /archive/esteva/skindata4/splits/recursive_dividing_N=1000/labels-tmp.txt


In [216]:
v[1] == u

array([ True, False, False, False], dtype=bool)

In [219]:
# Scratch: map each distinct value of v to its position among the sorted
# unique values returned by np.unique.
v = ['a', 'a', 'b', 'c', 'd']
u = dict((value, position) for position, value in enumerate(np.unique(v)))

u

{'a': 0, 'b': 1, 'c': 2, 'd': 3}

In [221]:
# Scratch: iterate over u (for a dict this visits its keys); the loop body
# does nothing.
for uu in u:
    pass

In [224]:
x = 1

In [232]:
# Scratch check of the assert-with-message form: whenever x != 0 this raises
# AssertionError carrying the string below.
assert x == 0, \
    'you jackass'

AssertionError: you jackass