In [3]:
import pandas as pd

import io
import os
import os.path as pt
import csv
import zipfile
import random
from multiprocessing.dummy import Pool as ThreadPool

import numpy as np
from PIL import Image
from PIL import ImageChops
from simplejpeg import decode_jpeg
from simplejpeg import encode_jpeg

from datadings.writer import FileWriter
from datadings.tools import yield_threaded
from datadings.tools import document_keys

from sklearn.model_selection import train_test_split

### Creates the 5Class Derm7pt dataset also used in Kawahare et al.(2019).
# https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=8333693&casa_token=fE8RRfpkEq0AAAAA:0GaEzwzYsMyTaa0-ZLtzT0H0BD4FK0qaU5BXKzqo24S6LjygoYgcWfgaU9ia1DZX8jUnG0zXIw&tag=1
# and in Fu et al.
# https://arxiv.org/ftp/arxiv/papers/2104/2104.00201.pdf

# __doc__ += document_keys(
#     Derm7pt
#     )

def Derm7pt(
        key,
        image,
        annotations
):
    """
Returns a dictionary::

    {
        'key': key,
        'image': image,
        'diagnosis': diagnosis
        'seven_point_score': seven_point_score
        'pigment_network': pigment_network
        ...
    }
    """
    return {
        'key': key,
        'image': image,
        'diagnosis': annotationstring2int('diagnosis', annotations['diagnosis']),
        'seven_point_score': annotations['seven_point_score'],
        'pigment_network': annotationstring2int('pigment_network', annotations['pigment_network']),
        'streaks': annotationstring2int('streaks', annotations['streaks']),
        'pigmentation': annotationstring2int('pigmentation', annotations['pigmentation']),
        'regression_structures': annotationstring2int('regression_structures', annotations['regression_structures']),
        'dots_and_globules': annotationstring2int('dots_and_globules', annotations['dots_and_globules']),
        'blue_whitish_veil': annotationstring2int('blue_whitish_veil', annotations['blue_whitish_veil']),
        'vascular_structures': annotationstring2int('vascular_structures', annotations['vascular_structures']),
        'level_of_diagnostic_difficulty': annotationstring2int('level_of_diagnostic_difficulty', annotations['level_of_diagnostic_difficulty']),
        'elevation': annotationstring2int('elevation', annotations['elevation']),
        'location': annotationstring2int('location', annotations['location']),
        'sex': annotationstring2int('sex', annotations['sex']),
        'management': annotationstring2int('management', annotations['management'])        
    }

def annotationstring2int(key, value):
        dictionary = {
            'diagnosis': {
                'basal cell carcinoma': 0, 
                'blue nevus': 1, 
                'clark nevus': 1,
                'combined nevus': 1, 
                'congenital nevus': 1, 
                'dermal nevus': 1,
                'dermatofibroma': 4, 
                'lentigo': 4, 
                'melanoma (in situ)': 2,
                'melanoma (less than 0.76 mm)': 2, 
                'melanoma (0.76 to 1.5 mm)': 2,
                'melanoma (more than 1.5 mm)': 2, 
                'melanoma metastasis': 2, 
                'melanosis': 4,
                'miscellaneous': 4, 
                'recurrent nevus': 1, 
                'reed or spitz nevus': 1,
                'seborrheic keratosis': 3, 
                'vascular lesion': 4, 
                'melanoma': 2
            },
            'pigment_network': {
                'absent': 0,
                'typical': 1,
                'atypical': 2
            },
            'streaks': {
                'absent': 0,
                'regular': 1,
                'irregular': 2
            },
            'pigmentation': {
                'absent': 0, 
                'diffuse regular': 1, 
                'localized regular': 2,
                'diffuse irregular': 3, 
                'localized irregular': 4,
            },
            'regression_structures': {
                'absent': 0, 
                'blue areas': 1, 
                'white areas': 2,
                'combinations': 3
            },
            'dots_and_globules': {
                'absent': 0,
                'regular': 1,
                'irregular': 2
            },
            'blue_whitish_veil': {
                'absent': 0,
                'present': 1
            },
            'vascular_structures': {
                'absent': 0, 
                'arborizing': 1, 
                'within regression': 1, 
                'hairpin': 1, 
                'dotted': 1,
                'comma': 1, 
                'linear irregular': 1, 
                'wreath': 1
            },
            'level_of_diagnostic_difficulty': {
                'low': 0,
                'medium': 1,
                'high': 2
            },
            'elevation': {
                'flat': 0,
                'nodular': 1, 
                'palpable': 2                
            },
            'location': {
                'abdomen': 0, 
                'head neck': 1, 
                'lower limbs': 2, 
                'upper limbs': 3, 
                'back': 4,
                'chest': 5, 
                'acral': 6, 
                'buttocks': 7, 
                'genital areas': 8
            },
            'sex': {
                'female': 0, 
                'male': 1
            },
            'management': {
                'no further examination': 0,
                'clinical follow up': 1,
                'excision': 2
            },
        }
        
        return dictionary[key][value]

def __transform_image(im, size=64):
    return im.resize(
        (256, 256),
        Image.ANTIALIAS,
    )


def __decode(data):
    return Image.fromarray(decode_jpeg(
        data, fastupsample=False, fastdct=False
    ), 'RGB')


def __tobytes(im):
    bio = io.BytesIO()
    im.save(bio, 'PNG', optimize=True)
    return bio.getvalue()


def yield_samples(partition_data, augmentation):
    for index, sample in partition_data.iterrows():
        filename = sample.derm
        filename = os.path.join(f'../augmentations/{augmentation}/', filename)
        
        imagedata = Image.open(filename)
        
        yield filename, imagedata, sample


def create_sample(item):
    filename, imagedata, annotation = item

    #image = __decode(imagedata)
    image = __transform_image(imagedata)
    image_binary = __tobytes(image)
    
    filename = filename.split(os.sep)[-1]
    return Derm7pt(
        filename,
        image_binary,
        annotation,
    )

def write_set(partition, partition_data, t_name, augmentation):
    outdir = f'../datadings/Augmentations/{augmentation}/Derm7pt_strat_perTarget/Derm7pt_strat_{t_name}'

    gen = yield_threaded(yield_samples(partition_data, augmentation))
    
    outfile = pt.join(outdir, partition + '.msgpack')
    filelength = partition_data.size
    with FileWriter(outfile, total=filelength, overwrite=True) as writer:
        pool = ThreadPool(8)
        for sample in pool.imap_unordered(create_sample, gen):
            writer.write(sample)

def write_sets():
    
    target_names = ['diagnosis']
    
    target_augmentations = [
        'textures-only_IMGSize-256_RasterSize-32_CropSize-200',
        'textures-removed_IMGSize-256_CropSize-200',
        'shape-removed_IMGSize-256_RasterSize-16_CropSize-200',
    ]
    
    random_i = 42
    np.random.seed(random_i)
    random.seed(random_i)
    
    data = pd.read_csv('../original/meta/meta.csv')
    
    for t_name in target_names:
        print(t_name)
        
        # Get relevent indices and labels
        all_indices = data.index.tolist()
        all_labels = data[t_name].tolist()
        all_labels = [annotationstring2int(t_name, x) for x in all_labels]
        
        # Stratified split
        train_indices, test_indices, train_labels, _ = train_test_split(all_indices, all_labels, test_size=0.33, stratify=all_labels, random_state=random_i)
        train_indices, val_indices, _, _ = train_test_split(train_indices, train_labels, test_size=0.33, stratify=train_labels, random_state=random_i)

        try:
            for partition, partition_indexes in zip(['train', 'val', 'test'], [train_indices, val_indices, test_indices]):
                partition_data = data.loc[partition_indexes]

                for augmentation in target_augmentations:
                    print(augmentation)
                    
                    write_set(partition, partition_data, t_name, augmentation)
        except FileExistsError:
            pass

        print('')

In [4]:
write_sets()

train.msgpack   0% 00:00<?, ?it/s

diagnosis
textures-only_IMGSize-256_RasterSize-32_CropSize-200


train.msgpack   5% 00:15<04:40, 29.09it/s
train.msgpack   0% 00:00<?, ?it/s

453 samples written
textures-removed_IMGSize-256_CropSize-200


train.msgpack   5% 00:06<01:57, 69.54it/s
train.msgpack   0% 00:00<?, ?it/s

453 samples written
shape-removed_IMGSize-256_RasterSize-16_CropSize-200


train.msgpack   5% 00:12<03:51, 35.21it/s
val.msgpack   0% 00:00<?, ?it/s

453 samples written
textures-only_IMGSize-256_RasterSize-32_CropSize-200


val.msgpack   5% 00:07<02:23, 28.12it/s
val.msgpack   0% 00:00<?, ?it/s

224 samples written
textures-removed_IMGSize-256_CropSize-200


val.msgpack   5% 00:02<00:52, 77.16it/s
val.msgpack   0% 00:00<?, ?it/s

224 samples written
shape-removed_IMGSize-256_RasterSize-16_CropSize-200


val.msgpack   5% 00:05<01:44, 38.56it/s
test.msgpack   0% 00:00<?, ?it/s

224 samples written
textures-only_IMGSize-256_RasterSize-32_CropSize-200


test.msgpack   5% 00:10<03:17, 30.36it/s
test.msgpack   0% 00:00<?, ?it/s

334 samples written
textures-removed_IMGSize-256_CropSize-200


test.msgpack   5% 00:04<01:19, 75.54it/s
test.msgpack   0% 00:00<?, ?it/s

334 samples written
shape-removed_IMGSize-256_RasterSize-16_CropSize-200


test.msgpack   5% 00:09<02:46, 36.13it/s

334 samples written




