# Drawing Board

In [3]:
import os
import shutil
import sys
import time

import numpy as np

def bufferFillPerGroup(entries, groups=None):
    """Oversamples path-label entries so that every supergroup is equally represented.

    Args:
        entries (list): A list of entries in the format 'path/to/stuff [label]',
            where the label is an integer. The label is read as the second
            whitespace-separated token, so paths must not contain whitespace.
        groups (List of Lists, or None): Each sublist must contain a set of integers
            denoting the labels of 'entries' to cluster together into a supergroup.
            If None, every distinct label forms its own supergroup, in ascending
            label order.

    Returns:
        A new list holding exactly m entries per supergroup, where
        m = max(instances(supergroup_i), over i). The entries of a supergroup are
        repeated cyclically, in their input order, until m of them are emitted.
        A supergroup that matches no entry contributes nothing, and an empty
        'entries' yields [].
    """
    def label(entry):
        return int(entry.strip().split()[1])

    # Parse every label once instead of once per (entry, group) pair.
    labels = [label(e) for e in entries]
    if groups is None:
        groups = [[g] for g in sorted(set(labels))]

    # Bucket the entries of each supergroup, preserving input order.
    members = [[e for e, l in zip(entries, labels) if l in group] for group in groups]

    # 'Enough' of a group means as many entries of that group as of the biggest group.
    # The `or [0]` keeps max() defined when there are no groups at all.
    m = max([len(bucket) for bucket in members] or [0])

    new_entries = []
    for group, bucket in zip(groups, members):
        # Single-argument parenthesised form prints the same text on Python 2 and 3.
        print('Generating group %s' % (group,))
        if not bucket:
            # Nothing to repeat: cycling through an empty bucket would raise IndexError.
            continue
        # Walk the bucket round-robin until this group holds m entries.
        new_entries.extend(bucket[i % len(bucket)] for i in range(m))
    return new_entries


def rename_file(filename):
    """Appends a '_' to a filename until the new filename no longer exists.

    Args:
        filename (str): The desired path.

    Returns:
        'filename' unchanged if nothing exists at that path, otherwise 'filename'
        followed by the smallest number of '_' characters that yields an unused path.
    """
    # Iterative rather than recursive: every existing duplicate would otherwise
    # add a stack frame, and a heavily duplicated file can exceed the
    # interpreter's recursion limit.
    while os.path.exists(filename):
        filename += '_'
    return filename
    

def copy(src, dst):
    """Copies src to dst, recreating a symlink as a symlink instead of following it.

    The destination is first passed through rename_file, so an existing dst is
    not overwritten: a '_' is appended until the path is unused.
    """
    target = rename_file(dst)
    if not os.path.islink(src):
        # Regular file: plain copy.
        shutil.copy(src, target)
        return
    # Symlink: point the new link at the same target as the original one.
    os.symlink(os.readlink(src), target)
        

def make_dir_structure(new_dir, classes):
    """Creates new_dir with one subdirectory per class.

    Args:
        new_dir (str): Root directory of the structure.
        classes (iterable of str): Names of the subdirectories to create in new_dir.

    If new_dir already exists a notice is printed and only the class
    subdirectories that are still missing are created; existing ones are left
    untouched.
    """
    if os.path.exists(new_dir):
        print('Directory exists: %s' % new_dir)
    else:
        os.makedirs(new_dir)
    for c in classes:
        class_dir = os.path.join(new_dir, c)
        # Checked per class so a partially built tree is completed instead of
        # being left without some of its class directories.
        if not os.path.isdir(class_dir):
            os.makedirs(class_dir)



data_dir = '/archive/esteva/skindata4/splits/nine-way/train'
new_dir = data_dir + '-even'
labels_file = '/archive/esteva/skindata4/splits/nine-way/labels.txt'

print('Processing directory %s' % data_dir)
# One class name per line; the line index is used as the integer label of the class.
with open(labels_file) as f:
    classes = np.array([line.strip() for line in f])

# Classes sharing the text before their first '_' form one supergroup. The root
# names must be compared with each other: comparing a root with the full class
# names would leave every class that contains '_' out of all groups.
root_classes = np.array([c.split('_')[0] for c in classes])
unique_classes = np.unique(root_classes)
groups = [np.where(root_classes == c)[0].tolist() for c in unique_classes]

# Build 'path label' entries for every file of every class directory.
dataset = []
for i, c in enumerate(classes):
    # Sorted so the entry order, and hence which files get duplicated, is reproducible.
    data = sorted(os.listdir(os.path.join(data_dir, c)))
    for d in data:
        dataset.append(' '.join([os.path.join(data_dir, c, d), str(i)]))

new_dataset = bufferFillPerGroup(dataset, groups)

print('Creating directory %s' % new_dir)
# Start from a clean output tree; any previous result is discarded.
if os.path.exists(new_dir):
    shutil.rmtree(new_dir)
make_dir_structure(new_dir, classes)

for i, entry in enumerate(new_dataset):
    if i % 1000 == 0:
        # '\r' rewrites the same console line; no newline is emitted.
        sys.stdout.write('\rLinking new entry %d/%d' % (i, len(new_dataset)))
        sys.stdout.flush()
    src = entry.split()[0]
    # Keep the path relative to data_dir ('<class>/<file>') under new_dir.
    dst = os.path.join(new_dir, src[len(data_dir)+1:])
    copy(src, dst)


Processing directory /archive/esteva/skindata4/splits/nine-way/train
Generating group [0]
Generating group [1]
Generating group [2]
Generating group [3]
Generating group [4]
Generating group [5]
Generating group [6]
Generating group [7]
Generating group [8]
Creating directory /archive/esteva/skindata4/splits/nine-way/train-even
Linking new entry 18000

KeyboardInterrupt: 

# Create ISIC metadata for /archive/esteva/isic-dermoscopy-curated

In [51]:
# Read the curated CSV as a list of stripped lines; the context manager closes
# the file as soon as it has been read.
with open('/archive/esteva/isic-dermoscopy-curated/curated_dermoscopy.csv') as f:
    names_labels = [line.strip() for line in f]
def name(entry):
    """Returns the text of entry before its first comma (all of it if there is none)."""
    first_field, _, _ = entry.partition(',')
    return first_field

def label(entry):
    """Returns the second comma-separated field of entry.

    Raises:
        IndexError: if entry contains no comma.
    """
    fields = entry.split(',')
    second_field = fields[1]
    return second_field

# Walks every CSV line. The body only rebinds `n`, so once the loop finishes
# `n` holds the name field of the last line in names_labels.
for entry in names_labels:
    n = name(entry)
    


In [52]:
n

'ISIC_0012079.png'

# Parse ISIC Metadata

In [38]:
import json
import os
import lib
from lib.notebooks.parse_isic_helper import *

# for auto-reloading external modules
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Load the ISIC metadata; the context manager closes the file once it is parsed.
with open('/archive/isic_meta.json', 'r') as f:
    meta = json.load(f)

In [18]:
def get_isic_id(m):
    """Returns, as an int, the second '_'-separated field of the basename of m['path']."""
    basename = os.path.basename(m['path'])
    id_token = basename.split('_')[1]
    return int(id_token)

# Extract the numeric id of every metadata record, then count the distinct ids;
# a count equal to len(meta) means no id occurs twice.
ids = [get_isic_id(m) for m in meta]
len(set(ids))

12086

In [21]:
# Check that the record at position i carries id i, i.e. ids is exactly 0, 1, 2, ...
for i, elem in enumerate(ids):
    assert i== elem

In [10]:
len(meta)

12086

In [34]:
meta[100].keys()


[u'files', u'p1a', u'p1b', u'p1c', u'exif', u'label', u'meta', u'path']

In [30]:
diagnoses[0]

u'benign'

In [47]:
# Compare the diagnosis computed by get_diagnosis (presumably provided by the
# star import from lib.notebooks.parse_isic_helper -- confirm) with the record's
# own 'label' field, for the records that have one.
diagnoses = [get_diagnosis(m) for m in meta]
for d, m in zip(diagnoses, meta):
    if 'label' in m:
        # On a mismatch the loop stops here with `m` bound to the offending
        # record, so it can be inspected afterwards.
        assert d == m['label']


AssertionError: 

In [48]:
m

{u'files': [u'ISIC_0001102.tif',
  u'UDA2_pilot_052-p1b.png',
  u'UDA2_pilot_052-tile-p1b.png',
  u'UDA2_pilot_052-p1c.png',
  u'UDA2_pilot_052-p1b.json',
  u'UDA2_pilot_052-tile-p1c.png',
  u'UDA2_pilot_052-tile-p1a.png',
  u'UDA2_pilot_052-p1a.json',
  u'UDA2_pilot_052-p1c.json',
  u'UDA2_pilot_052-p1a.png',
  u'ISIC_0001102.jpg'],
 u'label': u'malignant',
 u'meta': {u'convertedFilename': u'UDA2_pilot_052.new.tif',
  u'convertedMimeType': u'image/tiff',
  u'originalFilename': u'UDA2_pilot_052.jpg',
  u'originalMimeType': u'image/jpeg',
  u'p1a_folder_id': u'54e7571ebae47850e86ce0f8',
  u'p1a_result': u'ok',
  u'p1a_start_time': u'2015-02-20 17:13:23.413000',
  u'p1a_stop_time': u'2015-02-20 17:13:43.646000',
  u'p1a_user': u'5450e996bae47865794e4d0d',
  u'p1b_folder_id': u'54e759cabae47850e86ce100',
  u'p1b_result': u'ok',
  u'p1b_start_time': u'2015-02-20 18:03:12.439000',
  u'p1b_stop_time': u'2015-02-20 18:03:21.749000',
  u'p1b_user': u'54cb967ebae47819d8e4c724',
  u'qc_folder_id

In [49]:
get_diagnosis(m)

In [50]:
m['label']

u'malignant'

In [15]:
len([m for m in meta if 'p1a' in m])

10293

In [16]:
len([m for m in meta if 'label' in m])

10917

In [17]:
len([m for m in meta if 'p1a' in m])

10293

In [25]:
# Collect the 'path' of every metadata record as a sorted numpy array.
paths = np.sort([m['path'] for m in meta])

In [27]:
# Write one path per line, with no newline after the last path.
with open('isic_paths.txt', 'w') as f:
    f.write('\n'.join(paths))
        