# Sets up N-way cross validation, using a raw split
 - no connected componenets
 - assumes every image is unique and has no duplicates
 - the purpose of this is to determine an error bar for the connected_components validation accuracy, as requested by reviewer 3 from nature
 - we don't both with recursive_dividing here

In [2]:
import json
import os
import numpy as np

import lib
from lib.taxonomy import loading as ld
from lib.taxonomy.loading import TRAINING_SET, TESTING_SET, NO_SET, VALIDATION_SET
from lib.taxonomy import utils
from lib.taxonomy import io

In [13]:
dataset_directory = '/ssd/esteva/skindata4/images/'
meta_file = '/ssd/esteva/skindata4/meta.json'

crossval_dir = '/ssd/esteva/skindata4/splits/recursive_dividing_N=1000_crossvalidation_rawsplit'
labels_file = '/ssd/esteva/skindata4/splits/recursive_dividing_N=1000_crossvalidation_rawsplit/labels.txt'

skin_prob = 0.4 
tax_path_score = 0.8 
N=1000

curated_test_file = '/ssd/esteva/skindata4/test_sets/validation_set.txt'

# Files with entries of the form [path/to/image] [label]
# All basenames listed in excluded_datasets will be ommitted from train/val
excluded_datasets = [ 
        '/ssd/esteva/skindata4/test_sets/dermoscopy_test.txt',
        '/ssd/esteva/skindata4/test_sets/epidermal_test.txt',
        '/ssd/esteva/skindata4/test_sets/melanocytic_test.txt'
        ]


In [5]:
# We load in images that exist on our filesystem,
meta = json.load(open(meta_file))
meta = [m for m in meta if ld.imageExists(m, dataset_directory)]

In [6]:
# Keep only isic entries that are labeled.
isic = ld.getEntries(meta, 'database', 'isic')
isic = [i for i in isic if 'label' in i and i['label'] in ['benign', 'malignant']]

# Keep meta with desired skin probs and tax path scores
meta = [m for m in meta if 'tax_path_score' in m and m['tax_path_score'] >= tax_path_score]
meta = [m for m in meta if m['tax_path']]
meta = [m for m in meta if 'skin_prob' in m and m['skin_prob'] >= skin_prob]
meta.extend(isic)

# Fix the naming convention issues of the top 9 categories (to dermal-tumor-benign, etc.)
syns = utils.SynonymsList()
for m in meta:
    rootname = '-'.join(m['tax_path'][0])
    rootrename = syns.synonymOf(rootname).split('-')
    m['tax_path'][0] = rootrename

    
# Rename 'label' field to 'disease_name'. 'label' will be used for integer labels.
for m in meta:
    if 'label' in m:
        m['disease_name'] = m['label']
        m['label'] = None

print "Kept Meta Entries: %d" % len(meta)

Kept Meta Entries: 147438


In [7]:
# Assign nine-way rootnode classes.
classes, labels = ld.rootNodeClasses(meta)
ld.setEntries(meta, 'label', labels)
ld.setEntries(meta, 'clinical_label', labels)
synset = classes.values()

In [8]:
print 'synset:'
for s in synset:
    print s

synset:
cutaneous-lymphoma
dermal-tumor-benign
dermal-tumor-malignant
epidermal-tumor-benign
epidermal-tumor-malignant
genodermatosis
inflammatory
pigmented-lesion-benign
pigmented-lesion-malignant


In [9]:
# Exclude all specified datasets
filename2meta = ld.Field2meta(meta, field='filename')
for exclusion_file in excluded_datasets:
    filenames = [os.path.basename(line.strip().split()[0]) for line in open(exclusion_file).readlines()]

    for fn in filenames:
        ms = filename2meta(fn)
        for m in ms:
            m['set_identifier'] = NO_SET

In [10]:
# Gather unique paths and labels
ld.setEntries(meta, 'set_identifier', TRAINING_SET)
dataset = np.unique(ld.gatherPathsAndLabels(meta, dataset_directory, TRAINING_SET))

print '# unique images/labels by filename: %s' % len(dataset)

# unique images/labels by filename: 141376


In [14]:
crossval_dir

'/ssd/esteva/skindata4/splits/recursive_dividing_N=1000_crossvalidation_rawsplit'

In [20]:
# Cross validation
N_folds = 9

def write_crossval_folds(dataset, N, dirname, synset):
    """Divides the dataset into N train/val folds, writing the folds into dirname.
    
    Args:
        dataset (list): path-label pairs in the format "path/to/image 1"
        N (int): the number of folds to use. 
            The last fold will be of size len(dataset) - (N-1) * len(dataset)//N 
            It is necessarily be smaller than the rest of the folds
        dirname (str): the directory     
    """
    random.shuffle(dataset)
    fold_size = len(dataset) // N
    indices = [i * fold_size for i in range(N)]
    indices.append(len(dataset))
    
    for i, (start, end) in enumerate(zip(indices[:-1], indices[1:])):
        print 'Generating Fold %d' % i
        val = dataset[start:end]
        train = dataset[:start] + dataset[end:]
        
        train_dir = os.path.join(dirname, 'fold_' + str(i+1), 'train')
        val_dir = os.path.join(dirname, 'fold_' + str(i+1), 'val')
        
        io.make_directory_structure(train_dir, synset)
        io.make_directory_structure(val_dir, synset)
        
        syms_train = io.generate_symlinks(train, train_dir, synset)
        syms_val = io.generate_symlinks(val, val_dir, synset)
        
        io.create_symlinks(syms_train)
        io.create_symlinks(syms_val)
        
        
write_crossval_folds(dataset.tolist(), N_folds, crossval_dir, synset)


Generating Fold 0
Generating Fold 1
Generating Fold 2
Generating Fold 3
Generating Fold 4
Generating Fold 5
Generating Fold 6
Generating Fold 7
Generating Fold 8


In [21]:
with open(labels_file, 'w') as f:
    prefix = ""
    for s in synset:
        f.write(prefix)
        f.write(s)
        prefix = "\n"
print 'Labels file created: %s' % labels_file

Labels file created: /ssd/esteva/skindata4/splits/recursive_dividing_N=1000_crossvalidation_rawsplit/labels.txt
