# Partition a detection baseline for Yunzhu
 - Edinburgh dataset (four lesion classes) + an 'other' class, i.e. a five-way split:
 - pigmented lesion malignant
 - pigmented lesion benign
 - epidermal lesion malignant
 - epidermal lesion benign
 - ImageNet validation data (added as the fifth class, 'imagenet')

In [2]:
import os
import json
import numpy as np

import lib
from lib.taxonomy import loading 
from lib.taxonomy.utils import SynonymsList
from lib.taxonomy import io

# Root of the skindata4 dataset. The other skindata4 paths below are derived
# from it, so relocating the data only requires editing this one line.
D = '/media/esteva/ExtraDrive1/ThrunResearch/data/skindata4'
dataset_directory = os.path.join(D, 'images/')
meta_file = os.path.join(D, 'meta.json')

# Output directory of the five-way detection training split.
train_dir = os.path.join(D, 'splits/detection/five-way/train')

# ILSVRC2014 validation images, added further down as an extra class.
imagenet_dir = '/media/esteva/ExtraDrive1/ILSVRC2014/val'

### Load Metadata

In [4]:
# Parse the metadata JSON inside a context manager so the file handle is
# closed as soon as parsing finishes rather than whenever it is collected.
with open(meta_file) as meta_fp:
    meta = json.load(meta_fp)

# Keep only the entries that loading.imageExists accepts for dataset_directory.
meta = [m for m in meta if loading.imageExists(m, dataset_directory)]

### Extract Edinburgh data

In [5]:
# Pull the Edinburgh entries out of the metadata (getEntries is queried with
# key 'database' and value 'edinburgh').
edinburgh = loading.getEntries(meta, 'database', 'edinburgh')
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(len(edinburgh))

# Fix the naming convention issues of the top 9 categories (to dermal-tumor-benign, etc.)
# The first taxonomy path component is joined with '-', mapped through the
# synonym list, and split back into its parts. This runs on every Edinburgh
# entry, before the class filter below.
syns = SynonymsList()
for m in edinburgh:
    rootname = '-'.join(m['tax_path'][0])
    rootrename = syns.synonymOf(rootname).split('-')
    m['tax_path'][0] = rootrename

# Keep the 4 classes of interest: entries whose label mentions either keyword.
wanted_keywords = ('epidermal', 'pigmented')
edinburgh = [m for m in edinburgh
             if any(word in m['label'] for word in wanted_keywords)]
print(len(edinburgh))

# Set 'set_identifier' to loading.TRAINING_SET on the remaining entries.
loading.setEntries(edinburgh, 'set_identifier', loading.TRAINING_SET)

1300
1114


In [6]:
# Preserve the textual label under 'disease_name' and clear 'label', which is
# overwritten with the values returned by rootNodeClasses below.
for m in edinburgh:
    if 'label' in m:
        m['disease_name'] = m['label']
        m['label'] = None

# Assign root-node classes (the cell output shows four after the filtering).
classes, labels = loading.rootNodeClasses(edinburgh)
# Build the name list in class-index order instead of relying on dict
# iteration order; this also yields a real list on Python 3, where
# dict.values() is a view.
classnames = [classes[idx] for idx in sorted(classes)]
loading.setEntries(edinburgh, 'label', labels)

# Extract training paths and labels into a single de-duplicated list.
trainset = np.unique(
    loading.gatherPathsAndLabels(edinburgh, dataset_directory, loading.TRAINING_SET)
).tolist()

# Notebook display of the index -> class-name mapping.
classes

{0: 'epidermal-tumor-benign',
 1: 'epidermal-tumor-malignant',
 2: 'pigmented-lesion-benign',
 3: 'pigmented-lesion-malignant'}

### Add Imagenet data

In [8]:
# ImageNet images form one extra class. Its index is the next free one after
# the existing classes (4 with the four classes shown above), derived here
# rather than hard-coded so it stays correct if the class set changes.
imagenet_label = len(classes)

# Entries are "<path> <label>" strings.
# NOTE(review): presumably the same format gatherPathsAndLabels produces - confirm.
imagenet = ['%s %d' % (os.path.join(imagenet_dir, im), imagenet_label)
            for im in os.listdir(imagenet_dir)]
trainset.extend(imagenet)

# Rebuild the class names in index order as a real list, so that appending
# works on Python 3 too and 'imagenet' lands at position imagenet_label.
classnames = [classes[idx] for idx in sorted(classes)]
classnames.append('imagenet')

# Notebook display of the final class-name list.
classnames

### Create directory structure with symlinks

In [10]:
# Lay out the class directories under train_dir, then ask the io helper for
# the list of links to create.
# NOTE(review): presumably one sub-directory per class name - confirm in lib.taxonomy.io.
io.make_directory_structure(train_dir, classnames)
syms_train = io.generate_symlinks(trainset, train_dir, classnames)

# Each entry is split on whitespace: the first field is the link source and
# the second is the link destination.
for link_spec in syms_train:
    fields = link_spec.split()
    os.symlink(fields[0], fields[1])