In [1]:
import pandas as pd
import numpy as np
import os
from collections import defaultdict

In [2]:
# Directory that contains the TRAINING/ folder with all metadata CSVs.
# The MELANOMA_ROOT environment variable overrides it so the notebook can run
# on other machines; the original absolute path is kept as the fallback.
root = os.environ.get('MELANOMA_ROOT',
                      '/media/bramiozo/DATA-FAST/kaggle/image_classification/MEDICAL/melanoma')
# All relative paths used below (reads and the final CSV) resolve against root.
os.chdir(root)
# True: rows missing filename/sites/age/sex are dropped from the final table.
# False: rows are kept and missing age values are mean-imputed instead.
remove_nan = False

In [3]:
# Load the three training metadata tables (paths are relative to the working
# directory set in the configuration cell).
meta_lumc, meta_isic2019, meta_isic2020 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'TRAINING/lumc_train_metadata.csv',
        'TRAINING/ISIC_2019_Training_Metadata.csv',
        'TRAINING/ISIC_2020_train.csv',
    )
)

In [4]:
# ISIC 2019 ground truth: one row per image with per-class indicator columns
# (MEL, NV, BCC, ...) that are collapsed into a single `target` label below.
groundtruth_isic2019 = pd.read_csv('TRAINING/ISIC_2019_Training_GroundTruth.csv')

# Targets

In [5]:
# Harmonise the LUMC diagnosis labels: every label maps to itself, except
# 'nv_mel' (renamed to 'anv') and 'misc' (treated as missing).
# Note: despite its name, `map_sites` holds a diagnosis mapping in this cell.
lumc_diagnoses = meta_lumc['Diagnose'].unique()
map_sites = dict(zip(lumc_diagnoses, lumc_diagnoses))
map_sites.update({'nv_mel': 'anv', 'misc': np.nan})
meta_lumc['target'] = meta_lumc['Diagnose'].map(map_sites)

In [6]:
# ISIC 2020 free-text diagnosis -> harmonised short class label.
# Several benign pigmented/keratosis diagnoses are grouped under 'bkl'.
bkl_diagnoses = ('seborrheic keratosis',
                 'lichenoid keratosis',
                 'lentigo NOS',
                 'cafe-au-lait macule',
                 'solar lentigo')

class_map = {'unknown': 'misc', 'melanoma': 'mel'}
class_map.update(dict.fromkeys(bkl_diagnoses, 'bkl'))
class_map.update({'nevus': 'nv',
                  'atypical melanocytic proliferation': 'anv'})

# Translate each ISIC 2020 diagnosis into its harmonised label; diagnoses not
# present in class_map become NaN.
meta_isic2020['target'] = meta_isic2020['diagnosis'].map(class_map)

In [7]:
# Positional index -> harmonised label (kept for reference / backward compatibility).
keys = {0: 'mel', 1: 'nv', 2: 'bcc', 3: 'akiec', 4: 'bkl', 5: 'df', 6: 'vasc', 7: 'scc', 8: 'misc'}

# ISIC 2019 ground-truth column name -> harmonised label. Uses the same label
# vocabulary as `keys`, so 'AK' becomes 'akiec' and 'UNK' becomes 'misc'
# rather than the raw lower-cased column names 'ak' / 'unk'.
isic2019_labels = {'MEL': 'mel', 'NV': 'nv', 'BCC': 'bcc', 'AK': 'akiec', 'BKL': 'bkl',
                   'DF': 'df', 'VASC': 'vasc', 'SCC': 'scc', 'UNK': 'misc'}


def get_class(x):
    """Return the harmonised class label of one ground-truth row.

    Parameters
    ----------
    x : mapping-like with ``.items()`` (e.g. a pandas Series or a dict)
        Maps class column names to indicator values.

    Returns
    -------
    str or None
        Label of the first column whose value equals 1. Known ISIC 2019
        columns are translated through ``isic2019_labels``; any other column
        falls back to its lower-cased name. ``None`` when no column equals 1.
    """
    for k, v in x.items():
        if v == 1:
            return isic2019_labels.get(k, k.lower())
    return None
    
# Collapse the per-class indicator columns into one label per image by
# applying get_class row-wise.
isic2019_class_columns = ['MEL', 'NV', 'BCC', 'AK', 'BKL', 'DF', 'VASC', 'SCC', 'UNK']
groundtruth_isic2019['target'] = (
    groundtruth_isic2019[isic2019_class_columns].apply(get_class, axis=1)
)

In [8]:
# Attach the derived target to the ISIC 2019 metadata; the left join keeps
# every metadata row even when no ground-truth entry matches its image id.
# Note: re-running this cell merges `target` a second time.
isic2019_targets = groundtruth_isic2019[['image', 'target']]
meta_isic2019 = meta_isic2019.merge(isic2019_targets, how='left', on='image')

## Add test metadata

In [9]:
# Test-set metadata has no labels, so `target` is filled with NaN to give the
# test frames the same column as the training frames.
test_isic2019 = pd.read_csv('TRAINING/ISIC_2019_Test_Metadata.csv').assign(target=np.nan)
test_isic2020 = pd.read_csv('TRAINING/ISIC_2020_test.csv').assign(target=np.nan)

In [10]:
# Remove training-only columns before the training and test frames are stacked.
meta_isic2019 = meta_isic2019.drop(columns='lesion_id')
meta_isic2020 = meta_isic2020.drop(columns=['diagnosis', 'benign_malignant'])

In [11]:
# Stack training and test metadata per ISIC edition; ignore_index renumbers
# the rows 0..n-1 so the combined frames have a clean, unique index.
meta_isic2019 = pd.concat([meta_isic2019, test_isic2019], axis=0, ignore_index=True)
meta_isic2020 = pd.concat([meta_isic2020, test_isic2020], axis=0, ignore_index=True)

# Age, gender, sites

In [12]:
# LUMC stores sex as 'M'/'F' in `Geslacht`; convert to the 'male'/'female'
# spelling. Any other value becomes NaN.
gender_map = dict(M='male', F='female')
meta_lumc['sex'] = meta_lumc['Geslacht'].map(gender_map)

In [13]:
# Right-closed bin edges: (-1, 17], (17, 26], ..., (80, 120] -> integer codes 0..8.
age_bin_edges = [-1, 17, 26, 35, 44, 53, 62, 71, 80, 120]
for isic_frame in (meta_isic2020, meta_isic2019):
    isic_frame['age'] = pd.cut(isic_frame['age_approx'], age_bin_edges, labels=False)

# LUMC already provides age categories as text; give them the same integer
# codes as the ISIC bins (position in this list == bin code).
lumc_age_categories = ['0-17', '18-26', '27-35', '36-44',
                       '45-53', '54-62', '63-71', '72-80',
                       '81-100']
age_map = {category: code for code, category in enumerate(lumc_age_categories)}
meta_lumc['age'] = meta_lumc['Leeftijdscategorie'].map(age_map)

In [14]:
# Inspect the distinct anatomical-site labels in ISIC 2020
# (column: anatom_site_general_challenge).
meta_isic2020.anatom_site_general_challenge.unique()

array(['head/neck', 'upper extremity', 'lower extremity', 'torso', nan,
       'palms/soles', 'oral/genital'], dtype=object)

In [15]:
# Inspect the distinct anatomical-site labels in ISIC 2019
# (column: anatom_site_general).
meta_isic2019.anatom_site_general.unique()

array(['anterior torso', 'upper extremity', 'posterior torso',
       'lower extremity', nan, 'lateral torso', 'head/neck',
       'palms/soles', 'oral/genital'], dtype=object)

In [16]:
# ISIC 2019 distinguishes anterior/posterior/lateral torso; fold these into the
# single 'torso' label. Every other site maps to itself.
isic2019_sites = meta_isic2019['anatom_site_general'].unique()
map_sites = dict(zip(isic2019_sites, isic2019_sites))
map_sites.update(dict.fromkeys(('anterior torso', 'posterior torso', 'lateral torso'), 'torso'))
meta_isic2019['sites'] = meta_isic2019['anatom_site_general'].map(map_sites)

# ISIC 2020 site labels are copied over unchanged.
meta_isic2020['sites'] = meta_isic2020['anatom_site_general_challenge']

In [17]:
# LUMC `Trefwoord` (Dutch keyword, optionally combined with 'dermatoscopie')
# -> harmonised anatomical site label. Keywords not listed here map to NaN.
map_sites = {
    # been = leg
    'been': 'lower extremity', 'been | dermatoscopie': 'lower extremity',
    # schouder = shoulder
    'schouder': 'torso', 'schouder | dermatoscopie': 'torso',
    'dermatoscopie | schouder': 'torso',
    # borst = chest
    'borst': 'torso', 'borst | dermatoscopie': 'torso',
    # buik = abdomen
    'buik': 'torso', 'buik | dermatoscopie': 'torso',
    # arm
    'arm': 'upper extremity', 'arm | dermatoscopie': 'upper extremity',
    # rug = back
    'rug': 'torso', 'rug | dermatoscopie': 'torso', 'dermatoscopie | rug': 'torso',
    # behaarde hoofd = scalp: part of the head, so 'head/neck' rather than 'torso'
    'behaarde hoofd': 'head/neck', 'behaarde hoofd | dermatoscopie': 'head/neck',
    # hals = neck
    'hals': 'head/neck', 'hals | dermatoscopie': 'head/neck',
    'dermatoscopie | hals': 'head/neck',
    # gelaat = face
    'gelaat': 'head/neck', 'gelaat | dermatoscopie': 'head/neck',
    # flank
    'flank': 'torso', 'flank | dermatoscopie': 'torso', 'dermatoscopie | flank': 'torso',
    # voet = foot
    'voet': 'palms/soles', 'voet | dermatoscopie': 'palms/soles',
    'dermatoscopie | voet': 'palms/soles',
    # bil = buttock
    'bil': 'torso', 'bil | dermatoscopie': 'torso',
    # NOTE(review): a bare 'dermatoscopie' names no body site; 'torso' is kept
    # from the original mapping as a default - confirm this is intended.
    'dermatoscopie': 'torso',
}
# Translate the LUMC keyword into the shared site vocabulary; keywords absent
# from map_sites become NaN.
meta_lumc['sites'] = meta_lumc['Trefwoord'].map(map_sites)

In [18]:
# For each source: build the image file name from its identifier column and
# record which dataset the row came from.
for source_frame, id_column, dataset_name in (
        (meta_lumc, 'Bestandsnummer', 'LUMC'),
        (meta_isic2020, 'image_name', 'ISIC2020'),
        (meta_isic2019, 'image', 'ISIC2019')):
    source_frame['filename'] = source_frame[id_column].map(lambda stem: str(stem) + '.jpg')
    source_frame['dataset'] = dataset_name

In [19]:
# Columns shared by all three sources after harmonisation.
final_columns = ['filename', 'sex', 'age', 'sites', 'target', 'dataset']

# Stack the three sources; ignore_index gives every row a unique label instead
# of repeating each source frame's own 0..n-1 index.
final = pd.concat([meta_isic2019[final_columns],
                   meta_isic2020[final_columns],
                   meta_lumc[final_columns]], axis=0, ignore_index=True)

if remove_nan:
    # Keep only rows that have every metadata feature present.
    final = final.dropna(subset=['filename', 'sites', 'age', 'sex'], axis=0)
else:
    # Keep all rows and replace a missing age bin with the mean of the observed
    # bins. This is the same imputation SimpleImputer(strategy='mean')
    # performed, done with pandas so no import is needed mid-notebook.
    final['age'] = final['age'].fillna(final['age'].mean())

# Age is a bin code; the int cast truncates an imputed fractional mean.
final['age'] = final['age'].astype(int)

In [20]:
# Write the combined metadata as a semicolon-separated CSV without the index
# column; the relative path resolves against the working directory set by
# os.chdir(root).
final.to_csv("ISIC_LUMC_META.csv", sep=";", index=False)