# Make Inflammatory Training and Validation: Initial Setup

 - Copy the inflammatory part of the connected_components partition into its own folder: train-orig. Expected format:

```
inflammatory_venous-insuficiency-vascular-disorders-and-lymphedema_phlebitis-superficial_705
inflammatory_venous-insuficiency-vascular-disorders-and-lymphedema_raynauds-disease_706
inflammatory_venous-insuficiency-vascular-disorders-and-lymphedema_stasis-edema_707
inflammatory_venous-insuficiency-vascular-disorders-and-lymphedema_varicosis_708
inflammatory_xanthomas_709
```
 
 - Run some version of the following: 
 
 ```
 python /ssd/esteva/lib/taxonomy/partition_scripts/split_to_leafs.py \
  --dataset_directory=/ssd/esteva/skindata4/splits/nine-way/val/inflammatory \
  --new_dir_location=/ssd/esteva/skindata4/splits/inflammatory \
  --meta_file=/ssd/esteva/skindata4/meta.json
```

This script creates /ssd/esteva/skindata4/splits/inflammatory/inflammatory-taxsplit, which is essentially the validation set split down to the leaf nodes, in class-1_subclass-1_subclass-2 format, where underscores delimit the taxonomy. 


In [3]:
import os
import shutil
import numpy as np

# Classes to keep in {classname : [disease1-keyword, disease2-keyword, ...]} format

# Inflammatory split:
# Mapping of class name -> keywords that identify the class inside a taxonomy
# directory name. Written as (classname, keywords) pairs and turned into a dict.
classes = dict([

    # Inflammatory
    ('acne', ['acne']),
    ('bullous', ['bullous']),
    ('rosacea', ['rosacea']),
    ('erythema', ['erythema']),
    ('lichen-planus', ['lichen-planus']),
    ('psoriasis', ['psoriasis', 'pilaris', 'papulosquamous']),
    ('eczema', ['eczema']),
    ('pustular', ['pustular']),

    # STDs
    ('warts-hpv', ['warts']),
    ('herpes', ['herpes-simplex']),
    ('molluscum', ['molluscum-contagiosum']),
    ('scabies', ['scabies']),
    ('syphilis', ['syphilis']),

    # Fungal Infections
    ('fungal-infections', ['fungal-infections']),

])

# Diseases we'd like to include but don't have enough validation data for
#     'crabs' : ['pediculosis-lids'],
#     'gonorrhea' : ['gonorrhea'],
#     'chancroid' : ['chancroid'],
#     'purpura' : ['vasculopathy', 'vasculitis', 'purpura'],

# Make Validation

 - Scan the subdirectories of 'original_taxsplit_valdir' for the keywords (classes.values()) of the 'classes' dictionary defined above.
 - When a keyword is found, the contents of that subdirectory are copied into the directory of its corresponding class (classes.keys()).
 - Along the way we print the number of images for each class, counted as directory entries summed over the leaf directories assigned to that class (duplicate basenames across leaves are not removed).

In [4]:
# Directory whose subdirectories (one per taxonomy leaf) are scanned for class keywords.
original_taxsplit_valdir = '/ssd/esteva/skindata4/splits/inflammatory3/inflammatory-val-taxsplit'
# Destination root; a subdirectory named after each class is created underneath it.
new_val_dir = '/ssd/esteva/skindata4/splits/inflammatory3/val'


def resolve_valclasses(entry, tags, class_keywords=None):
    """Pick, among several candidate classes, the one matched earliest in entry.

    For entries with multiple potential classes (tags), we choose the one
    higher up in the taxonomy, i.e. the class one of whose keywords starts at
    the smallest character offset of the taxonomy path name.

    Args:
        entry (string): the taxonomy path / directory name to inspect.
        tags (list of string): candidate class names; each must be a key of
            the keyword table.
        class_keywords (dict, optional): {classname: [keyword, ...]} table.
            Defaults to the notebook-level `classes` dictionary.

    Returns:
        The element of `tags` whose keyword occurs earliest in `entry`. On a
        tie the tag listed first wins. The tag is returned as passed in (a
        plain string rather than a numpy scalar).

    Raises:
        KeyError: if a tag is not present in the keyword table.
        ValueError: if none of the tags has a keyword occurring in `entry`.
    """
    if class_keywords is None:
        # Fall back to the keyword table defined at the top of the notebook.
        class_keywords = classes

    best_tag = None
    best_loc = None
    for tag in tags:
        offsets = [entry.find(keyword) for keyword in class_keywords[tag]]
        offsets = [loc for loc in offsets if loc >= 0]
        if not offsets:
            # This candidate does not occur in the entry at all; skipping it
            # avoids taking the minimum of an empty list.
            continue
        loc = min(offsets)
        # Strict comparison keeps the first-listed tag on ties.
        if best_loc is None or loc < best_loc:
            best_tag, best_loc = tag, loc

    if best_tag is None:
        raise ValueError('none of the tags %s occur in entry %s' % (list(tags), entry))
    return best_tag


def copy(src, dst):
    """Copy a single path, recreating symlinks instead of following them.

    Args:
        src (string): path of the file or symlink to copy.
        dst (string): path to create.

    A symlink is reproduced at dst with the exact target string read from
    src; anything else is handed to shutil.copy.
    """
    if not os.path.islink(src):
        shutil.copy(src, dst)
        return
    # Re-create the link rather than copying the file it points to.
    os.symlink(os.readlink(src), dst)

        
def copy_dir_contents(src_dir, dst_dir):
    """Copy every immediate entry of src_dir into dst_dir, keeping symlinks.

    Args:
        src_dir (string): directory whose entries are copied (no recursion).
        dst_dir (string): destination directory; created when it is missing,
            so several source directories can be merged into one destination.
    """
    destination_missing = not os.path.exists(dst_dir)
    if destination_missing:
        os.makedirs(dst_dir)
    for name in os.listdir(src_dir):
        copy(os.path.join(src_dir, name), os.path.join(dst_dir, name))

        
dirs = os.listdir(original_taxsplit_valdir)

# (class name, leaf directory name) pairs for every leaf directory whose name
# contains at least one class keyword.
valclass_leafclass = []
for entry in dirs:
    # All classes with a keyword occurring in the directory name. Sorting the
    # de-duplicated names keeps the warning output and the tie-breaking order
    # reproducible between runs.
    valclasses = sorted({
        classname
        for classname, keywords in classes.items()
        if any(k in entry for k in keywords)
    })
    if not valclasses:
        # No keyword matched: this leaf is not part of any kept class.
        continue
    if len(valclasses) > 1:
        # Single-argument print calls behave the same on Python 2 and 3.
        print('WARNING: more than one valclass, resolving by first listing: %s'
              % (valclasses,))
        print(entry)
        valclass = resolve_valclasses(entry, valclasses)
        print(valclass)
    else:
        valclass = valclasses[0]
    valclass_leafclass.append((valclass, entry))
    

eczema-spongiotic-dermatitis_erythema-craquele
eczema
infections_viral-diseaseshpv-herpes-molluscum-exanthems-and-others_herpes-simplex_eczema-herpeticum_eczema-herpeticatum
herpes
acne-folliculitis-hidradenitis-and-diseases-of-appendegeal-structures_folliculitis_eosinophilic-pustular-folliculitis_eosinophilic-pustular-folliculitis
acne
pustular-skin-disease-non-infectious_erythema-toxicum-neonatorum_erythema-toxicum-neonatorum
pustular
infections_viral-diseaseshpv-herpes-molluscum-exanthems-and-others_herpes-simplex_eczema-herpeticum_eczema-herpeticum
herpes
pustular-skin-disease-non-infectious_pustular-psoriasis
pustular
lichen-planus-and-lichenoid-eruptions_lichen-planus_lichen-planopilaris_lichen-planopilaris
lichen-planus
acne-folliculitis-hidradenitis-and-diseases-of-appendegeal-structures_acne-rosacea
acne


In [13]:
# Print statistics on class counts

# Number of directory entries per class, summed over the leaf directories
# assigned to that class.
counts = {name: 0 for name in classes}
for valclass, leafclass in valclass_leafclass:
    leaf_dir = os.path.join(original_taxsplit_valdir, leafclass)
    counts[valclass] += len(os.listdir(leaf_dir))

# Single-argument print calls and dict.items() work on both Python 2 and 3.
print('Class counts:')
for name, n_entries in counts.items():
    print('%s %s' % (name, n_entries))

Class counts:
psoriasis 800
rosacea 695
warts-hpv 118
lichen-planus 238
fungal-infections 1206
acne 1431
syphilis 99
scabies 70
bullous 208
pustular 64
molluscum 69
erythema 250
eczema 1007
herpes 76


In [21]:
# Total number of counted entries across all classes, in units of 1024
# (NOTE(review): 1024 is presumably a batch size -- confirm).
float(sum(counts.values())) / 1024

6.1826171875

In [14]:
# Build directory structure, copying sym links over.
# Every leaf directory is merged into the directory of the class it was assigned to.
for valclass, leafclass in valclass_leafclass:
    copy_dir_contents(
        os.path.join(original_taxsplit_valdir, leafclass),
        os.path.join(new_val_dir, valclass),
    )

# Single-argument print call: valid on both Python 2 and 3.
print('%s created' % new_val_dir)

/ssd/esteva/skindata4/splits/inflammatory3/val created


# Make Training

In [15]:
# Directory whose subdirectories are scanned for class keywords and re-tagged.
original_train_dir = '/ssd/esteva/skindata4/splits/inflammatory3/train-orig'
# Destination root; each re-tagged directory is created underneath it.
new_train_dir = '/ssd/esteva/skindata4/splits/inflammatory3/train'


def manual_fixes(tagged_entry):
    """Apply hand-written corrections to a tagged directory name.

    We fix a few things, per Rob's suggestion, given that the taxonomy isn't a
    true tree.

    Args:
        tagged_entry (string): directory name that already carries its class tag(s).

    Returns:
        The replacement name of the first fix whose pattern is contained in
        tagged_entry (the whole name is replaced, not just the matching part),
        or tagged_entry unchanged when no fix applies.
    """
    # (pattern to look for, name to use instead)
    fixes = (
        ('erythema_purpura_purpura-vasculitis-and-vasculopathy_erythema-elevatum-diutinum_662',
         'purpura_purpura-vasculitis-and-vasculopathy_erythema-elevatum-diutinum_662'),
        ('acne_pustular_acne-folliculitis-hidradenitis-and-diseases-of-appendegeal-structures_acne-pustular_227',
         'acne_acne-folliculitis-hidradenitis-and-diseases-of-appendegeal-structures_acne-pustular_227'),
        ('acne_pustular_acne-folliculitis-hidradenitis-and-diseases-of-appendegeal-structures_folliculitis_eosinophilic-pustular-folliculitis_232',
         'acne_acne-folliculitis-hidradenitis-and-diseases-of-appendegeal-structures_folliculitis_eosinophilic-pustular-folliculitis_232'),
        ('acne_rosacea_acne-folliculitis-hidradenitis-and-diseases-of-appendegeal-structures_acne-rosacea_228',
         'acne_acne-folliculitis-hidradenitis-and-diseases-of-appendegeal-structures_acne-rosacea_228'),
        ('lichen-planus_psoriasis_lichen-planus-and-lichenoid-eruptions_lichen-planus_lichen-planopilaris_582',
         'lichen-planus_lichen-planus-and-lichenoid-eruptions_lichen-planus_lichen-planopilaris_582'),
    )
    matching = (fixed for pattern, fixed in fixes if pattern in tagged_entry)
    return next(matching, tagged_entry)


def remove_tag(entry, tag):
    """Strip a tax-path tag from an entry.

    Args:
        entry (string): the tax-path entry
        tag (string): something like 'inflammatory'

    Returns:
        The entry with every occurrence of 'tag_' removed (not only a
        leading one).

    Raises:
        ValueError: if 'tag_' does not occur in the entry.
    """
    marker = tag + '_'
    if marker in entry:
        return entry.replace(marker, '')
    raise ValueError('tag %s not in entry %s' % (marker, entry))


def add_tag(entry, tag):
    """Prefix entry with 'tag_'. Same format as remove_tag.

    Args:
        entry (string): the tax-path entry
        tag (string or list): a single tag, or a list of tags (lists may be
            nested). Tags of a list are prepended one after another, so the
            last tag of the list ends up first in the result.

    Returns:
        The entry with the tag prefix(es) added.
    """
    if not isinstance(tag, list):
        return tag + '_' + entry
    tagged = entry
    for single_tag in tag:
        tagged = add_tag(tagged, single_tag)
    return tagged
    
    
def copy(src, dst):
    """Copy one path to dst; symlinks are re-created, other files are copied.

    Args:
        src (string): path of the file or symlink to copy.
        dst (string): path to create.
    """
    if not os.path.islink(src):
        shutil.copy(src, dst)
        return
    # The link target string is reused exactly as stored in the source link.
    target = os.readlink(src)
    os.symlink(target, dst)

        
def copy_dir_contents(src_dir, dst_dir):
    """Copies files from src_dir into dst_dir, preserving symlinks.

    Args:
        src_dir (string): directory whose immediate entries are copied
            (no recursion).
        dst_dir (string): destination directory; created when it is missing.
    """
    # This definition replaces the validation-side helper of the same name, so
    # it keeps that helper's behavior of creating the destination only when it
    # is missing. An unconditional os.makedirs would make the validation copy
    # step fail on re-run once a class directory exists.
    if not os.path.exists(dst_dir):
        os.makedirs(dst_dir)
    for entry in os.listdir(src_dir):
        src = os.path.join(src_dir, entry)
        dst = os.path.join(dst_dir, entry)
        copy(src, dst)


dirs = os.listdir(original_train_dir)

# Add class tag, removing 'inflammatory' tag, for each entry in the original train directory:
# olddir_newdir collects (original directory name, re-tagged directory name) pairs.
olddir_newdir = []
for entry in dirs:
    # All classes with a keyword occurring in the directory name. Sorting the
    # de-duplicated names keeps the warning output and the tie-breaking order
    # reproducible between runs.
    tags = sorted({
        classname
        for classname, keywords in classes.items()
        if any(k in entry for k in keywords)
    })
    if not tags:
        # No keyword matched: this directory is not part of any kept class.
        continue
    new_dir = remove_tag(entry, 'inflammatory')
    if len(tags) > 1:
        tag = resolve_valclasses(new_dir, tags)
        new_dir = add_tag(new_dir, tag)
        # Single-argument print calls behave the same on Python 2 and 3.
        print('WARNING: more than one tag, check that its fixed: %s' % (tags,))
        print('\t' + new_dir)
    else:
        tag = tags[0]
        new_dir = add_tag(new_dir, tag)
    new_dir = manual_fixes(new_dir)
    olddir_newdir.append((entry, new_dir))


# Build directory structure, copying sym links over:
# each original directory is copied to its re-tagged name under new_train_dir.
for old_dir, new_dir in olddir_newdir:
    copy_dir_contents(
        os.path.join(original_train_dir, old_dir),
        os.path.join(new_train_dir, new_dir),
    )


	acne_acne-folliculitis-hidradenitis-and-diseases-of-appendegeal-structures_acne-rosacea_228
	acne_acne-folliculitis-hidradenitis-and-diseases-of-appendegeal-structures_acne-pustular_227
	herpes_infections_viral-diseaseshpv-herpes-molluscum-exanthems-and-others_herpes-simplex_eczema-herpeticum_528
	eczema_eczema-spongiotic-dermatitis_erythema-craquele_431
	lichen-planus_lichen-planus-and-lichenoid-eruptions_lichen-planus_lichen-planopilaris_582
	acne_acne-folliculitis-hidradenitis-and-diseases-of-appendegeal-structures_folliculitis_eosinophilic-pustular-folliculitis_232


In [26]:
# Show the first re-tagged training directory name (new_dirs is assigned in the statistics cell).
new_dirs[0]

'psoriasis_psoriasis-pityriasis-rubra-pilaris-and-papulosquamous-disorders_seborrheic-dermatitis_657'

In [28]:
# Print statistics on class counts

new_dirs = [new_dir for _, new_dir in olddir_newdir]
counts = {name: 0 for name in classes}
for new_dir in new_dirs:
    # The class tag is the first underscore-delimited token of the new name.
    valclass = new_dir.split('_')[0]
    counts[valclass] += len(os.listdir(os.path.join(new_train_dir, new_dir)))

# Single-argument print calls and dict.items() work on both Python 2 and 3.
print('Class counts:')
for name, n_entries in counts.items():
    print('%s %s' % (name, n_entries))
total = sum(counts.values())
print('Total: %d' % total)

Class counts:
psoriasis 6897
rosacea 1627
warts-hpv 1721
lichen-planus 2307
fungal-infections 5982
acne 8487
syphilis 756
scabies 925
bullous 2619
pustular 573
molluscum 600
erythema 2607
eczema 8516
herpes 1373
Total: 44990
