# Make STDs Training and Validation: Initial Setup

 - Copy the inflammatory part of the connected_components partition into its own folder: train-orig. Expected format:

```
inflammatory_venous-insuficiency-vascular-disorders-and-lymphedema_phlebitis-superficial_705
inflammatory_venous-insuficiency-vascular-disorders-and-lymphedema_raynauds-disease_706
inflammatory_venous-insuficiency-vascular-disorders-and-lymphedema_stasis-edema_707
inflammatory_venous-insuficiency-vascular-disorders-and-lymphedema_varicosis_708
inflammatory_xanthomas_709
```
 
 - Run some version of the following: 
 
 ```
 python /ssd/esteva/lib/taxonomy/partition_scripts/split_to_leafs.py \
  --dataset_directory=/ssd/esteva/skindata4/splits/nine-way/val/inflammatory \
  --new_dir_location=/ssd/esteva/skindata4/splits/inflammatory \
  --meta_file=/ssd/esteva/skindata4/meta.json
```

This script creates /ssd/esteva/skindata4/splits/inflammatory/inflammatory-taxsplit, which is essentially the validation set split down to the leaf nodes. Directory names use the class-1_subclass-1_subclass-2 format, where underscores separate the levels of the taxonomy.


In [7]:
import os
import re
import shutil

import numpy as np

# Classes to keep, in {classname : [disease1-keyword, disease2-keyword, ...]} format.
# A taxonomy directory is assigned to a class when any of that class's keywords
# occurs as a substring of the directory name.

# Inflammatory split (commented-out entries are classes that are currently excluded):
classes = {
#     'chancroid' : ['chancroid'],
    'warts-hpv' : ['warts'],
#     'gonorrhea' : ['gonorrhea'],
    'herpes' : ['herpes-simplex'],
    'molluscum' : ['molluscum-contagiosum'],
#     'crabs' : ['pediculosis-lids'],
    'scabies' : ['scabies'],
    'syphilis' : ['syphilis']
}

# Make Validation

In [8]:
# Source directory; presumably the validation set already split down to
# taxonomy leaf directories -- confirm against how it was generated.
original_taxsplit_valdir = '/media/esteva/ExtraDrive1/ThrunResearch/data/skindata4/splits/stds/inflammatory-val-taxsplit'
# Destination root: the per-class validation directories are created under it.
new_val_dir = '/media/esteva/ExtraDrive1/ThrunResearch/data/skindata4/splits/stds/val'

# Names of the entries (leaf-class directories) found in the source directory.
dirs = os.listdir(original_taxsplit_valdir)

def resolve_valclasses(entry, tags, class_keywords=None):
    """Pick one class for an entry that matches several candidate classes.

    The entry is a taxonomy path read from root to leaf, so the class whose
    keyword occurs earliest in the string is the one higher up in the taxonomy;
    that class wins. Ties go to the candidate listed first in `tags`.

    Args:
        entry (string): the tax-path entry (underscore-delimited directory name)
        tags (list of string): candidate class names; each must be a key of the
            keyword mapping
        class_keywords (dict): optional {classname: [keyword, ...]} mapping.
            Defaults to the module-level `classes` dict.

    Returns:
        The element of `tags` whose keyword appears earliest in `entry`.

    Raises:
        KeyError: if a tag is not a key of the keyword mapping.
        ValueError: if no keyword of any candidate occurs in `entry` (this
            includes an empty `tags`).
    """
    if class_keywords is None:
        class_keywords = classes

    best_tag = None
    best_location = None
    for candidate in tags:
        hits = [entry.find(keyword) for keyword in class_keywords[candidate]]
        hits = [position for position in hits if position >= 0]
        if not hits:
            # No keyword of this candidate occurs in the entry, so it cannot
            # win; skipping it avoids taking the minimum of an empty list.
            continue
        location = min(hits)
        # Strict '<' keeps the first-listed candidate on ties.
        if best_location is None or location < best_location:
            best_tag = candidate
            best_location = location

    if best_tag is None:
        raise ValueError(
            'no keyword of tags %s found in entry %s' % (tags, entry))
    return best_tag


def copy(src, dst):
    """Copy src to dst; a symlink is re-created as a symlink to the same target.

    Args:
        src (string): path of the file or symlink to copy
        dst (string): path to create
    """
    if not os.path.islink(src):
        # Regular file: plain content copy.
        shutil.copy(src, dst)
        return
    # Symlink: reproduce the link itself rather than the file it points to.
    os.symlink(os.readlink(src), dst)

def copy_dir_contents(src_dir, dst_dir):
    """Copy every entry of src_dir into dst_dir, creating dst_dir if missing.

    A "<src basename> -> <dst basename>" progress line is printed first. Each
    entry is handed to copy(), so symlinks stay symlinks.

    Args:
        src_dir (string): directory whose entries are copied
        dst_dir (string): directory that receives the copies
    """
    print("%s -> %s" % (os.path.basename(src_dir), os.path.basename(dst_dir)))
    if not os.path.exists(dst_dir):
        os.makedirs(dst_dir)
    for name in os.listdir(src_dir):
        copy(os.path.join(src_dir, name), os.path.join(dst_dir, name))

# Pair each leaf directory with the class it belongs to. Directories whose
# name contains no class keyword are left out.
valclass_leafclass = []
for entry in dirs:
    matches = [classname
               for classname, keywords in classes.iteritems()
               for k in keywords
               if k in entry]
    valclasses = list(set(matches))
    if not valclasses:
        continue
    if len(valclasses) == 1:
        valclass = valclasses[0]
    else:
        # Several classes matched: report the conflict and the chosen class.
        print('WARNING: more than one valclass, resolving by first listing:' + ' ' + str(valclasses))
        print(entry)
        valclass = resolve_valclasses(entry, valclasses)
        print(valclass)
    valclass_leafclass.append((valclass, entry))
    

In [9]:
# Merge every leaf-class directory into the directory of its class under
# new_val_dir; symlinks are copied as symlinks.
for valclass, leafclass in valclass_leafclass:
    copy_dir_contents(os.path.join(original_taxsplit_valdir, leafclass),
                      os.path.join(new_val_dir, valclass))

print('%s created' % new_val_dir)

infections_viral-diseaseshpv-herpes-molluscum-exanthems-and-others_warts_planar-warts_planar-warts -> warts-hpv
infections_viral-diseaseshpv-herpes-molluscum-exanthems-and-others_herpes-simplex_herpes-buttocks -> herpes
infections_viral-diseaseshpv-herpes-molluscum-exanthems-and-others_herpes-simplex_herpes-simplex -> herpes
infections_bacterial-infections_syphilis_syphilis-secondary -> syphilis
infections_viral-diseaseshpv-herpes-molluscum-exanthems-and-others_warts_warts-vulgaris -> warts-hpv
infections_viral-diseaseshpv-herpes-molluscum-exanthems-and-others_warts_papulose-bowenoide_bowenoid-papulosis -> warts-hpv
infections_viral-diseaseshpv-herpes-molluscum-exanthems-and-others_warts_warts-filiformis -> warts-hpv
infections_viral-diseaseshpv-herpes-molluscum-exanthems-and-others_warts_verruca-vulgaris -> warts-hpv
infections_viral-diseaseshpv-herpes-molluscum-exanthems-and-others_herpes-simplex_eczema-herpeticum_eczema-herpeticum -> herpes
infections_viral-diseaseshpv-herpes-mollus

# Make Training

In [10]:
# Source directory whose entries are scanned for class keywords.
original_train_dir = '/media/esteva/ExtraDrive1/ThrunResearch/data/skindata4/splits/stds/train-orig'
# Destination root: the re-tagged training directories are created under it.
new_train_dir = '/media/esteva/ExtraDrive1/ThrunResearch/data/skindata4/splits/stds/train'


def manual_fixes(tagged_entry):
    """Hook for hand-applied corrections to a tagged entry (per Rob's suggestion).

    The taxonomy isn't a true tree, so some entries may need manual adjustment.
    No fixes are implemented at the moment: the entry is returned unchanged.

    Args:
        tagged_entry (string): the tax-path entry with its class tag added

    Returns:
        tagged_entry, unmodified
    """
    return tagged_entry


def remove_tag(entry, tag):
    """Removes tax-path tags from the entry.

    Only whole path components are removed: 'tag_' has to start the entry or
    directly follow an underscore. A component that merely ends with the tag
    (e.g. 'post-inflammatory_' for tag 'inflammatory') is left intact.

    Args:
        entry (string): the tax-path entry
        tag (string): something like 'inflammatory'

    Returns:
        The entry with every component-aligned 'tag_' removed

    Raises:
        ValueError: if the entry contains no component-aligned 'tag_'
    """
    tag = tag + '_'
    # (?<![^_]) means "not preceded by a non-underscore character", i.e. the
    # match sits at the start of the entry or right after a '_' delimiter.
    pattern = r'(?<![^_])' + re.escape(tag)
    stripped, count = re.subn(pattern, '', entry)
    if count == 0:
        raise ValueError('tag %s not in entry %s' % (tag, entry))
    return stripped


def add_tag(entry, tag):
    """Prefix entry with 'tag_'.

    Args:
        entry (string): the tax-path entry
        tag (string or list): a single tag, or a list of tags (lists may be
            nested). List items are prepended one after another, so the last
            item ends up leftmost.

    Returns:
        The entry with the tag(s) prepended
    """
    if not isinstance(tag, list):
        return tag + '_' + entry
    tagged = entry
    for item in tag:
        tagged = add_tag(tagged, item)
    return tagged
    
    
def copy(src, dst):
    """Copy a file, or duplicate a symlink as a symlink.

    Args:
        src (string): path of the file or symlink to copy
        dst (string): path to create
    """
    if os.path.islink(src):
        # Point the new link at the same target as the source link.
        os.symlink(os.readlink(src), dst)
        return
    shutil.copy(src, dst)

        
def copy_dir_contents(src_dir, dst_dir):
    """Copies files from src_dir into dst_dir, preserving symlinks.

    dst_dir is created when it does not exist yet. An existing dst_dir is
    reused, so several source directories can be merged into one destination
    and the function can be called again on the same destination without the
    directory creation itself failing.

    Args:
        src_dir (string): directory whose entries are copied
        dst_dir (string): directory that receives the copies
    """
    # os.makedirs raises when the leaf directory already exists, so only call
    # it when the destination is missing.
    if not os.path.isdir(dst_dir):
        os.makedirs(dst_dir)
    for entry in os.listdir(src_dir):
        copy(os.path.join(src_dir, entry), os.path.join(dst_dir, entry))


dirs = os.listdir(original_train_dir)

# For each entry in the original train directory that matches a class keyword:
# strip the 'inflammatory' tag and prepend the class tag.
olddir_newdir = []
for entry in dirs:
    matches = [classname
               for classname, keywords in classes.iteritems()
               for k in keywords
               if k in entry]
    tags = list(set(matches))
    if not tags:
        continue
    new_dir = remove_tag(entry, 'inflammatory')
    if len(tags) == 1:
        tag = tags[0]
        new_dir = add_tag(new_dir, tag)
    else:
        # Several classes matched: resolve to one and report the outcome.
        tag = resolve_valclasses(new_dir, tags)
        new_dir = add_tag(new_dir, tag)
        print('WARNING: more than one tag, check that its fixed:' + ' ' + str(tags))
        print('\t' + str(new_dir))
    new_dir = manual_fixes(new_dir)
    olddir_newdir.append((entry, new_dir))


# Create each re-tagged directory under new_train_dir and fill it with the
# contents of its original directory; symlinks are copied as symlinks.
for old_dir, new_dir in olddir_newdir:
    copy_dir_contents(os.path.join(original_train_dir, old_dir),
                      os.path.join(new_train_dir, new_dir))
