# Integrate ISIC metadata into the general meta
 - ~10,000 images that Brett collected
 - keep p1a images (crops)

In [1]:
import json
import os
import lib
from lib.notebooks.parse_isic_helper import *
from lib.taxonomy.loading import getEntries, setEntries, getEntryValues

# for auto-reloading external modules
%load_ext autoreload
%autoreload 2

In [2]:
# Load the ISIC metadata and the existing combined metadata from disk.
# Each file is opened in a `with` block so its handle is closed as soon as the
# JSON has been parsed, instead of being left open for the life of the kernel.
with open('/archive/isic_meta.json', 'r') as f:
    meta = json.load(f)
with open('/archive/esteva/skindata4/meta_skindata3+derm101.json', 'r') as f:
    big_meta = json.load(f)

In [3]:
b = []
for m in big_meta:
    keep = True
    if 'database' in m and m['database'] == 'isic':
        keep = False
    if keep:
        b.append(m)
big_meta = b
print 'Eliminating isic meta from big_meta'

Eliminating isic meta from big_meta


In [4]:
def get_isic_id(m):
    """Return the integer ISIC id encoded in the last component of m['path'].

    The last path component is split on '_' and its second piece is converted
    to an int (so leading zeros are dropped).
    """
    last_component = os.path.basename(m['path'])
    pieces = last_component.split('_')
    return int(pieces[1])

ids = [get_isic_id(m) for m in meta]
print 'The number of unique isic ids contained: %d' % len(set(ids))

The number of unique isic ids contained: 12086


In [5]:
# Check that brett and I agree
diagnoses = [get_diagnosis(m) for m in meta]
for d, m in zip(diagnoses, meta):
    if 'label' in m and d:
        assert d == m['label']
print 'Brett and I agree on diagnosis. He has more, because he scraped from the web and some diagnoses are online but not in the meta.'


Brett and I agree on diagnosis. He has more, because he scraped from the web and some diagnoses are online but not in the meta.


### Adjust isic metadata to integrate into original meta
 - remove non-pigmented lesions
 - add tax_path and database keys
 - add filename key, joining the full ISIC/ path the image comes from, and using the p1a crop
 

In [15]:
def isNotPigmentedLesions(m):
    """Tell whether meta entry m should be kept.

    Looks up 'pathology diagnosis' in m via finditem and inspects the first
    result. Returns False when that diagnosis contains any of the excluded
    substrings below (treated as non-pigmented lesions), and True otherwise,
    including when finditem finds nothing.
    """
    excluded_terms = (
        'arthropod', 'acanthoma', 'carcinoma', 'keratosis', 'angiofibroma',
        'angiokeratoma', 'collision', 'dermatofibroma', 'hemangioma',
        'hematoma', 'hemorrhage', 'tattoo', 'dermatitis',
    )

    found = finditem(m, 'pathology diagnosis')
    if not found:
        return True

    diagnosis = found[0]
    return not any(term in diagnosis for term in excluded_terms)


def removeChars(string, chars_list):
    """Return string with every occurrence of each item of chars_list removed.

    chars_list may be any iterable of substrings; passing a plain string
    removes each of its characters.
    """
    cleaned = string
    for unwanted in chars_list:
        cleaned = cleaned.replace(unwanted, '')
    return cleaned

def getp1aFilename(m):
    """Return the single p1a image name listed in m['files'], or None.

    A name qualifies when it contains '-p1a.' and contains neither 'tile' nor
    '.json'. Raises ValueError when more than one name qualifies.
    """
    def _is_p1a(name):
        return '-p1a.' in name and not ('tile' in name or '.json' in name)

    matches = list(filter(_is_p1a, m['files']))
    if not matches:
        return None
    if len(matches) > 1:
        raise ValueError('found multiple matching filenames for p1a %s' % (' '.join(matches)))
    return matches[0]
        
def getBestFilename(m):
    """Return the preferred image name for meta entry m, or None.

    The p1a name from getp1aFilename(m) wins when there is one. Otherwise the
    single name in m['files'] that contains '.jpg' and neither 'tile' nor
    '.json' is returned. Raises ValueError when several such jpg names exist.
    """
    cropped = getp1aFilename(m)
    if cropped is not None:
        return cropped

    jpgs = []
    for name in m['files']:
        if '.jpg' in name and 'tile' not in name and '.json' not in name:
            jpgs.append(name)

    if not jpgs:
        return None
    if len(jpgs) != 1:
        raise ValueError('found multiple matching filenames for jpg %s' % (' '.join(jpgs)))
    return jpgs[0]
    

In [10]:
# Remove lesions that seem to be non-pigmented lesions
meta_kept = [entry for entry in meta if isNotPigmentedLesions(entry)]

# Tag every kept entry with its database; entries that carry a 'label' also get
# a tax_path built from that label.
for entry in meta_kept:
    entry['database'] = 'isic'
    if 'label' not in entry:
        continue
    label = entry['label']
    entry['tax_path'] = [[u'pigmented', 'lesions', '%s' % label], ['dermoscopy']]

            
# Add filename
for m in meta_kept:
    filename = m['path']
    filename = filename.replace('/','_')
    filename = removeChars(filename, '() ')
#     p1a = getp1aFilename(m)
    best = getBestFilename(m)
    fucks = 0    
    if best is not None:
        filename += '-'    
        filename += best
    else: 
        fucks += 1  
    m['original_filename'] = best
    m['filename'] = filename
    
            
print 'Keeping %d meta entries' % len(meta_kept)
print 'Unable to extract %d p1a images' % fucks
print 'Example filename used: %s' % filename
print 'Example original filename used: %s' % best

Keeping 11927 meta entries
Unable to extract 0 p1a images
Example filename used: ISIC_LesionImages_ISIC_MSK-1_1_ISIC_0012085-kopf2656-p1a.png
Example original filename used: kopf2656-p1a.png


### Save to meta

In [11]:
# Merge the non-ISIC metadata with the kept ISIC entries.
# Concatenation builds a new list, so big_meta is left untouched and re-running
# this cell does not append the ISIC entries a second time.
all_meta = big_meta + meta_kept

In [12]:
# Write the merged metadata to disk. The `with` block guarantees the file is
# flushed and closed once the dump finishes (or fails).
with open('/archive/esteva/skindata4/meta.json', 'w') as f:
    json.dump(all_meta, f)

In [13]:
# Sanity check: number of entries in the merged metadata whose database is 'isic'.
isic_in_all_meta = getEntries(all_meta, 'database', 'isic')
len(isic_in_all_meta)

11927

# Copy images from /archive/ISIC into skindata4/images
 - this is the release that Brett used when he created the metadata, above

In [12]:
import json
import lib
import os
import shutil
from lib.taxonomy import loading

In [2]:
# Root directory that the ISIC 'path' values are joined onto, and the merged
# metadata file. The file is read inside a `with` block so its handle is closed
# right after parsing.
rootdir = '/archive'
with open('/archive/esteva/skindata4/meta.json', 'r') as f:
    meta = json.load(f)

In [3]:
# Select the ISIC entries from the merged metadata. For each of them the
# 'best image' (typically the p1a crop, recorded under 'original_filename')
# is copied from the entry's 'path' into /archive/esteva/skindata4/isic_images.
isic = loading.getEntries(meta, 'database', 'isic')

In [17]:
copydir = '/archive/esteva/skindata4/isic_images'
copy_pairs = []
for i, m in enumerate(isic):
    if i % 100: 
        print '\r', i,        
    path = m['path']
    isicid = os.path.split(path)[-1]        
    src = os.path.join(rootdir, path, m['original_filename'])
    new_fn = m['filename']        
    new_fn = removeChars(new_fn, '(), ')
    dst = os.path.join(copydir, new_fn)
    assert os.path.exists(src), "%s not found" % src
    copy_pairs.append((src, dst))

11926


In [18]:
for i, pair in enumerate(copy_pairs):
    if i % 100 == 0: 
        print '\r', i,     
    src = pair[0]
    dst = pair[1]
    shutil.copy(src, dst)

11900
