In [1]:
%load_ext autoreload
%autoreload 2
import json
import os

import numpy as np
import pandas as pd

from studio.data.datasets import Dataset
from studio.data.modelmap import ModelMap
from studio.data.ontology import Ontology
from studio.data.review_algorithms import distribute_ancestors_to_dn_outliers_to_uncertainty, reject_outliers
from studio.data.snapshots import Snapshot
from studio.utils.utils import mkdir



Using TensorFlow backend.


In [2]:
def filter_dataframe(df, include_tags=None, exclude_tags=None):
    """Return the rows of ``df`` whose ``tags`` satisfy an include/exclude query.

    A row is kept when its ``tags`` contain every tag in ``include_tags`` and
    none of the tags in ``exclude_tags``::

        (include.a & include.b & include.c) & not (exclude.d or exclude.e)

    Args:
        df: DataFrame with a ``tags`` column holding the collection of tags of each row.
        include_tags: a single tag or a list of tags that must all be present.
            ``None`` means "no inclusion constraint".
        exclude_tags: a single tag or a list of tags, none of which may be present.
            ``None`` means "no exclusion constraint".

    Returns:
        A new DataFrame without the rejected rows. The number of rejected rows is printed.
    """
    if include_tags is None:
        include_tags = []
    elif not isinstance(include_tags, list):
        include_tags = [include_tags]
    if exclude_tags is None:
        exclude_tags = []
    elif not isinstance(exclude_tags, list):
        exclude_tags = [exclude_tags]

    def _row_matches(tags):
        # A missing value (None / NaN) is treated as "no tags" instead of raising on `in`.
        if tags is None or isinstance(tags, float):
            tags = ()
        has_all_included = all(tag in tags for tag in include_tags)
        has_any_excluded = any(tag in tags for tag in exclude_tags)
        return has_all_included and not has_any_excluded

    # Previously `include_tags=None` left every row flagged for dropping, so an
    # exclude-only query returned an empty frame; now it only applies the exclusion.
    keep_mask = np.array([_row_matches(tags) for tags in df['tags']], dtype=bool)
    print('Dropped %i rows' % int((~keep_mask).sum()))
    return df.loc[keep_mask]

## Create macroscopic dataset snapshot
Query:
image-type:macroscopic and not (dataset:aip-test or studio:possible-duplicate or test:internal or test:aip-risk-benchmark or test-app-incoming)

In [3]:
data_df = pd.read_json('/data/datasets/all_data_processed/skin_images_manifest.json')

In [4]:
# Some storage_keys need cleaning

In [5]:
# Keep only the last path component of every storage key ('a/b/key' -> 'key').
# str.split always returns at least one element, so no key can fall outside this rule.
values = [key.split('/')[-1] for key in data_df['storage_key']]
# A key changes exactly when it contained at least one '/'.
count = sum(new_key != old_key for new_key, old_key in zip(values, data_df['storage_key']))

# The count has to be interpolated with `%`; passing it as a second print argument
# printed the raw format string followed by the number.
print('There were %i rows that changed path' % count)
data_df['storage_key'] = values

There were % rows that changed path 0


In [18]:
len(data_df)

280055

In [143]:
# Compare storage keys with os.listdir(): rows whose image file is not on disk are removed.
filenames = set(os.listdir('/data/old-data/images/'))
# Collect the index labels of the rows whose storage key has no matching file.
drop_rows = [
    row_label
    for row_label, storage_key in zip(data_df.index, data_df['storage_key'])
    if storage_key not in filenames
]
# Interpolate the count into the message (it used to be printed next to a literal '%i').
print('%i rows storage key was not in the server' % len(drop_rows))
data_df = data_df.drop(drop_rows)

%i rows storage key was not in the server 2069


In [144]:
len(data_df)

280055

In [19]:
# Training query: macroscopic images that do not belong to any held-out / test collection.
# The benchmark tag is spelled 'dataset:test-app-incoming' in the other queries of this notebook;
# the bare 'test-app-incoming' would not match that tag, letting benchmark images into training.
data_df = filter_dataframe(
    data_df,
    include_tags='image-type:macroscopic',
    exclude_tags=['dataset:aip-test', 'studio:possible-duplicate',
                  'test:internal', 'test:aip-risk-benchmark', 'dataset:test-app-incoming'],
)

Dropped 130277 rows


In [20]:
len(data_df)

149778

In [23]:
data_df.head()

Unnamed: 0,id,width,height,tags,crop,reviews,storage_key
3,1017765,333,333,"[AIP:0001423, SNOMEDID:68225006, source:region...","[83, 0, 333, 333]",,9BPQxu5g41QpS1wU8ETe4wB8
6,1020510,983,983,"[AIP:0000973, SNOMEDID:254731001, source:regio...","[255, 0, 983, 983]",,1GLHFfLUsgS4pv8xQRNjjJfa
23,47744,317,317,"[SNOMEDID:186289000, ICD-9-CM:111.1, ICD-10:B3...",,,A3w1jJc2CfLQtAsTP2pzLUeB
36,44277,260,260,"[SNOMEDID:403194002, ICD-10:L55.0, DERMO:00028...",,,atkRdxCpmPjq5hoMbsTax7cJ
37,44861,305,305,"[SNOMEDID:76272004, ICD-10:A53.9, DERMO:000138...",,,WpnYATzVJV7WogcrWdnBDrgM


In [24]:
data_df.to_json('/data/datasets/macroscopic/all_training_images.json', orient='records', indent=2)

## Create ModelMap

In [3]:
training_snapshot_path = '/data/datasets/macroscopic/all_training_images.json'

In [4]:
training_snapshot = Snapshot(training_snapshot_path)

There have been 22 rows with a tie in the image type. Access the indices in snapshot.rows_tie_image_type
There have been 12644 rows missing AIP codes. Access the indices in snapshot.rows_missing_AIP_code
There have been 1410 rows containing a AIP code and a review. Access the indices in snapshot.rows_multiple_sources
There have been 402 rows containing rows_multiple AIP codes. Access the indices in snapshot.rows_multiple_AIP_code


In [5]:
# Both ontology artefacts are read from the same directory of the local checkout.
ontology_dir = '/home/albert/dermaip/ai-studio/data/ontology'
ontology_path = ontology_dir + '/ontology.json'
macroscopic_diagnosis_filename = os.path.join(ontology_dir, 'macroscopic_diagnosis_nodes.csv')

In [6]:
node_counts = training_snapshot.compute_node_frequency(uncertainty_threshold=0.5, uncertainty_mode='keep')

In [7]:
macroscopic_diagnosis_df = pd.read_csv(macroscopic_diagnosis_filename)

In [8]:
ontology = Ontology(ontology_path, root_id='AIP:root')
ontology.set_diagnosis_nodes(macroscopic_diagnosis_df.id)

Ontology was created without errors.


[]

In [9]:
ontology.set_node_count(node_counts)

There are 8 `node_id`'s that occur in `nodes_frequency`, but not in our tree. This likely indicates that the IDs are wrong, or we need to add nodes to our tree. This is the list:['old-AIP:0002492', 'old-AIP:0000007', 'review-AIP-old:0002810', 'old-AIP:0002810', 'uncertainty', 'old-AIP:0000964', 'old-AIP:0003286', 'old-AIP:0000337']


In [10]:
old_conditions_df = pd.read_json('/home/albert/dermaip/ai-studio/139_way_june_2020/conditions_df.json')
old_model_map = ModelMap(old_conditions_df)

In [11]:
old_dns = old_model_map.diagnosis_df['diagnosis_id'].tolist()

In [12]:
# Exclude this diagnosis id from the list inherited from the previous model map.
# Rebuilding the list (instead of list.remove) keeps the cell re-runnable: a second
# execution is a no-op rather than a ValueError.
old_dns = [diagnosis_id for diagnosis_id in old_dns if diagnosis_id != 'AIP:0001949']

In [13]:
len(old_dns)

138

In [14]:
conditions_df = ontology.compute_conditions_df(min_diagnosis_images=150, force_diagnosis_ids=old_dns, constrain_diagnosis_ids=old_dns)

The following 10 diagnosis IDs were excluded:
 ['AIP:0000178', 'AIP:0000436', 'AIP:0001041', 'AIP:0001199', 'AIP:0001580', 'AIP:0001883', 'AIP:0002163', 'AIP:0002898', 'AIP:0003043', 'AIP:0012300']


In [15]:
model_map = ModelMap(conditions_df)

In [16]:
model_map.diagnosis_df

Unnamed: 0,class_index,diagnosis_id,diagnosis_name,malignancy,condition_id,condition_name,n_samples
0,0,AIP:0000010,ichthyosis vulgaris,benign,[AIP:0000010],[ichthyosis vulgaris],240.0
1,1,AIP:0000064,darier disease,benign,"[AIP:0000064, AIP:0003333, AIP:0003339, AIP:00...","[darier disease, acral hemorrhagic type darier...",359.0
2,2,AIP:0000065,hailey-hailey disease,benign,"[AIP:0000065, AIP:0003343, AIP:0003344, AIP:00...","[hailey-hailey disease, hailey-hailey disease ...",151.0
3,3,AIP:0000119,scabies,benign,"[AIP:0000119, AIP:0002327, AIP:000119a, AIP:00...","[scabies, crusted scabies, scabies-crusty-or-e...",1113.0
4,4,AIP:0000120,pediculosis,benign,"[AIP:0000120, AIP:0002805, AIP:0002804, AIP:00...","[pediculosis, body lice, crab lice, head lice,...",124.0
...,...,...,...,...,...,...,...
133,133,AIP:0100249,hang nail,benign,[AIP:0100249],[hang nail],174.0
134,134,AIP:0100255,palmoplantar-psoriasis,benign,"[AIP:0100255, AIP:100255c, AIP:100255a, AIP:10...","[palmoplantar-psoriasis, palmoplantar-psoriasi...",386.0
135,135,AIP:0101021,intertrigo,benign,"[AIP:0101021, AIP:0002272, AIP:0002089]","[intertrigo, candidal intertrigo, streptococca...",172.0
136,136,AIP:0103006,lentigo,benign,"[AIP:0103006, AIP:0000928, AIP:0103007, AIP:00...","[lentigo, lentigo simplex, lentigo-ink-spot, s...",560.0


In [17]:
model_map.save('/data/datasets/macroscopic/138_way_may_2021/conditions_df', mode='conditions', format='json')

[{'class_index': 0,
  'diagnosis_name': 'ichthyosis vulgaris',
  'diagnosis_id': 'AIP:0000010',
  'condition_name': 'ichthyosis vulgaris',
  'condition_id': 'AIP:0000010',
  'malignancy': 'benign',
  'n_samples': 240.0},
 {'class_index': 1,
  'diagnosis_name': 'darier disease',
  'diagnosis_id': 'AIP:0000064',
  'condition_name': 'darier disease',
  'condition_id': 'AIP:0000064',
  'malignancy': 'benign',
  'n_samples': 359.0},
 {'class_index': 1,
  'diagnosis_name': 'darier disease',
  'diagnosis_id': 'AIP:0000064',
  'condition_name': 'acral hemorrhagic type darier disease',
  'condition_id': 'AIP:0003333',
  'malignancy': 'benign',
  'n_samples': 0.0},
 {'class_index': 1,
  'diagnosis_name': 'darier disease',
  'diagnosis_id': 'AIP:0000064',
  'condition_name': 'cornifying darier disease',
  'condition_id': 'AIP:0003339',
  'malignancy': 'benign',
  'n_samples': 0.0},
 {'class_index': 1,
  'diagnosis_name': 'darier disease',
  'diagnosis_id': 'AIP:0000064',
  'condition_name': 'dari

In [18]:
model_map.save('/data/datasets/macroscopic/138_way_may_2021/diagnosis_df', mode='diagnosis', format='json')

[{'class_index': 0,
  'diagnosis_id': 'AIP:0000010',
  'diagnosis_name': 'ichthyosis vulgaris',
  'malignancy': 'benign',
  'condition_id': ['AIP:0000010'],
  'condition_name': ['ichthyosis vulgaris'],
  'n_samples': 240.0},
 {'class_index': 1,
  'diagnosis_id': 'AIP:0000064',
  'diagnosis_name': 'darier disease',
  'malignancy': 'benign',
  'condition_id': ['AIP:0000064',
   'AIP:0003333',
   'AIP:0003339',
   'AIP:0003334',
   'AIP:0003335',
   'AIP:0003342',
   'AIP:0003340',
   'AIP:0003336',
   'AIP:0003338',
   'AIP:0003341',
   'AIP:0003337',
   'AIP:0003411'],
  'condition_name': ['darier disease',
   'acral hemorrhagic type darier disease',
   'cornifying darier disease',
   'darier disease segmental type 1',
   'darier disease segmental type 2',
   'darier disease with alopecia and cutic verticis gyrata',
   'darier disease with comedones, facial cysts and acne conglobata',
   'darier disease with guttate hypopigmentation',
   'darier disease with keratoderma',
   'darier dis

## Create Dataset with AIP codes

In [19]:
dataset_folder = '/data/datasets/macroscopic/'
dataset_name = '138_way_may_2021'

In [20]:
# Create ModelMap
conditions_df = pd.read_json('/data/datasets/macroscopic/138_way_may_2021/conditions_df.json')
model_map = ModelMap(conditions_df)

In [24]:
snapshot_manifest_path = '/data/datasets/macroscopic/all_training_images.json'
ontology_path = "data/ontology/ontology.json"

In [25]:
# Create Dataset
dataset = Dataset(dataset_folder=dataset_folder, 
                  dataset_name=dataset_name, 
                  ontology_path=ontology_path,
                  model_map=model_map)

Ontology was created without errors.


In [26]:
# Create mapping function to distribute ancestors probability.
# The review_algorithms helpers are imported in the notebook's top import cell.
ancestors_diagnosis_ids_map = dataset.ontology.get_ancestors_diagnosis_ids_map()
distribute_ancestors_probs = distribute_ancestors_to_dn_outliers_to_uncertainty(ancestors_diagnosis_ids_map)

In [27]:
training_df = dataset.process_training_data(snapshot_manifest_path=snapshot_manifest_path,
                                            mapping_function=distribute_ancestors_probs,
                                            image_type='macroscopic',
                                            processed_name='training_data_processed',
                                            min_reviews=3,
                                            uncertainty_as_class=False,
                                            uncertainty_mode='distribute',
                                            uncertainty_threshold=0.5,
                                            images_root_directory='/data/old-data/images/')

Processing Snapshot
There have been 22 rows with a tie in the image type. Access the indices in snapshot.rows_tie_image_type
There have been 12644 rows missing AIP codes. Access the indices in snapshot.rows_missing_AIP_code
There have been 1410 rows containing a AIP code and a review. Access the indices in snapshot.rows_multiple_sources
There have been 402 rows containing rows_multiple AIP codes. Access the indices in snapshot.rows_multiple_AIP_code
The number of total samples contained in the snapshot data is 149778
Processing Snapshot - Done!
The number of total samples contained in the snapshot data after filtering by image type macroscopic is 149755

Computing Node Frequency and setting ontology counts
There are 8 `node_id`'s that occur in `nodes_frequency`, but not in our tree. This likely indicates that the IDs are wrong, or we need to add nodes to our tree. This is the list:['old-AIP:0002492', 'old-AIP:0000007', 'review-AIP-old:0002810', 'old-AIP:0002810', 'uncertainty', 'old-AI

In [28]:
len(training_df)

112333

In [29]:
experiment_name = '138_training_15_class_percentage_distribute_ancestors'

In [30]:
# Split training_df into a training and a validation frame using mode='class_fraction'
# with split_ratio=0.15, registered under `experiment_name`.
# NOTE(review): no random seed is passed here — confirm whether perform_dataset_split is
# deterministic; otherwise the split cannot be reproduced on a re-run.
train_df, val_df = dataset.perform_dataset_split(training_df,
                                                 experiment_name=experiment_name,
                                                 mode='class_fraction',
                                                 split_ratio=0.15,
                                                 split_class_count=None,
                                                )

Training Set Size: 95485
Validation Set Size: 16848
Training dataset split saved in /data/datasets/macroscopic/138_way_may_2021/138_training_15_class_percentage_distribute_ancestors/training.json
Validation dataset split saved in /data/datasets/macroscopic/138_way_may_2021/138_training_15_class_percentage_distribute_ancestors/validation.json


### Create app incoming Manifest

Query: first-gate-top1:macroscopic and review-status:dermatologist-reviewed and not (dataset:test-app-incoming or studio:possible-duplicate or test:internal or test:aip-risk-benchmark or review-status:dermatologist-reviewed-paid-image)

In [31]:
app_incoming_df = pd.read_json('/data/datasets/all_data_processed/skin_images_manifest.json')

In [32]:
app_incoming_df = filter_dataframe(app_incoming_df, 
                                   include_tags=['first-gate-top1:macroscopic','review-status:dermatologist-reviewed'], 
                                   exclude_tags=['dataset:test-app-incoming', 'studio:possible-duplicate', 
                                                 'test:internal', 'test:aip-risk-benchmark', 'review-status:dermatologist-reviewed-paid-image'])

Dropped 273020 rows


In [33]:
print(len(app_incoming_df))

7035


In [34]:
app_incoming_df.to_json('/data/datasets/macroscopic/app_incoming_training_manifest.json', orient='records', indent=2)

### Incorporate review data to training and validation dataset

In [35]:
snapshot_manifest_path = "/data/datasets/macroscopic/app_incoming_training_manifest.json"

In [36]:
reviews_df = dataset.create_data_partition(snapshot_manifest_path=snapshot_manifest_path,
                                           mapping_function=distribute_ancestors_probs,
                                           partition_name=None,
                                           image_type='macroscopic',
                                           min_reviews=3,
                                           uncertainty_as_class=False,
                                           uncertainty_mode='distribute',
                                           uncertainty_threshold=0.5,
                                           root_directory='/data/old-data/images/')

Processing Snapshot
There have been 9 rows containing a AIP code and a review. Access the indices in snapshot.rows_multiple_sources
The number of total samples contained in the lab snapshot is 7035
Processing Snapshot - Done!
The number of total samples contained in the snapshot data after filtering by image type macroscopic is 7034

Processing Dataset
0 samples were dropped due to a lack of reviews or AIP code
0 samples were discarded because the number of reviews was below 3 and 2151 because of high uncertainty 
The number of total samples contained in the dataset is 4883


In [37]:
# Only take the ones coming from the review pipeline
reviews_df = reviews_df[reviews_df['labels_source'] == 'review_pipeline']

In [38]:
reviews_df

Unnamed: 0,index,id,width,height,tags,crop,reviews,storage_key,labels_source,image_type,filename,class_probabilities,mapped_reviews,average_diagnosis_reviews
0,0,6948946,3024,3024,"[model:first-gate, app-id:3677fe47-aeca-42ec-9...","[61, 0, 3024, 3024]","[{'image_type': 'macroscopic', 'diagnoses': {'...",nnjhpkr3lkt2e9g0sjff1pek2zsi,review_pipeline,macroscopic,/data/old-data/images/nnjhpkr3lkt2e9g0sjff1pek...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.005794871794871795...","[{'reviewer_email': 44, 'diagnoses': {'AIP:010...","{'AIP:0100152': 0.18846153846153846, 'AIP:0001..."
1,1,6948960,3024,3024,"[model:first-gate, app-id:244a04fa-6b81-4370-8...","[62, 0, 3024, 3024]","[{'image_type': 'macroscopic', 'diagnoses': {'...",hwibl0fxay4k5nm3j4eajmap5rwp,review_pipeline,macroscopic,/data/old-data/images/hwibl0fxay4k5nm3j4eajmap...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[{'reviewer_email': 44, 'diagnoses': {'AIP:000...",{'AIP:0000183': 1.0}
2,2,6948964,2217,2217,"[model:macroscopic, model:first-gate, app-id:9...","[0, 1472, 2217, 2217]","[{'image_type': 'macroscopic', 'diagnoses': {'...",jvd70m45x6bmrvlu48nmoh819xde,review_pipeline,macroscopic,/data/old-data/images/jvd70m45x6bmrvlu48nmoh81...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1166666666666...","[{'reviewer_email': 44, 'diagnoses': {'AIP:000...","{'AIP:0000445': 0.8833333333333333, 'AIP:00001..."
3,3,7032577,1376,1376,"[model:macroscopic, model:first-gate, app-id:y...","[656, 1383, 1376, 1376]","[{'image_type': 'macroscopic', 'diagnoses': {'...",ndc6d5niwgq1t3pap9ynpswlkgj7,review_pipeline,macroscopic,/data/old-data/images/ndc6d5niwgq1t3pap9ynpswl...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.01, 0.01, 0.01, 0....","[{'reviewer_email': 100, 'diagnoses': {'uncert...","{'AIP:0001341': 0.01, 'AIP:0001406': 0.01, 'AI..."
4,4,6949199,1475,1475,"[model:first-gate, app-id:bc6f00c9-b8c4-448f-9...","[117, 144, 1475, 1475]","[{'image_type': 'macroscopic', 'diagnoses': {'...",smu0vr8gy56e1mejkqzblzd2k0tv,review_pipeline,macroscopic,/data/old-data/images/smu0vr8gy56e1mejkqzblzd2...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[{'reviewer_email': 44, 'diagnoses': {'AIP:000...","{'AIP:0002471': 0.6666666666666666, 'AIP:00035..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4878,7029,6866883,2978,2978,"[model:macroscopic, model:first-gate, first-ga...","[0, 1053, 2978, 2978]","[{'image_type': 'macroscopic', 'diagnoses': {'...",02d0j7zupzy1j3l9ietwp887zmw2,review_pipeline,macroscopic,/data/old-data/images/02d0j7zupzy1j3l9ietwp887...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[{'reviewer_email': 45, 'diagnoses': {'AIP:000...","{'AIP:0000457': 0.3333333333333333, 'AIP:00030..."
4879,7030,6867059,1407,1407,"[model:macroscopic, model:first-gate, first-ga...","[714, 1100, 1407, 1407]","[{'image_type': 'macroscopic', 'diagnoses': {'...",l5jyazrh0jhdkafbuhlc5olxjsoa,review_pipeline,macroscopic,/data/old-data/images/l5jyazrh0jhdkafbuhlc5olx...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[{'reviewer_email': 45, 'diagnoses': {'AIP:000...","{'AIP:0000510': 0.5416666666666666, 'AIP:00032..."
4880,7031,6868350,1932,1932,"[model:first-gate, first-gate-top1:macroscopic...","[54, 0, 1932, 1932]","[{'image_type': 'macroscopic', 'diagnoses': {'...",e96x1aspgqjud4iykfba3gh7060q,review_pipeline,macroscopic,/data/old-data/images/e96x1aspgqjud4iykfba3gh7...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[{'reviewer_email': 100, 'diagnoses': {'uncert...",{'AIP:0002471': 1.0}
4881,7032,6869344,2417,2417,"[model:macroscopic, model:first-gate, first-ga...","[357, 1073, 2417, 2417]","[{'image_type': 'macroscopic', 'diagnoses': {'...",gpzabdj8swqsayqlbn99dm7uw2gq,review_pipeline,macroscopic,/data/old-data/images/gpzabdj8swqsayqlbn99dm7u...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.38333333333333336,...","[{'reviewer_email': 45, 'diagnoses': {'AIP:000...","{'AIP:0001341': 0.5, 'AIP:0000122': 0.38333333..."


In [39]:
reviews_train_df, reviews_val_df = dataset.perform_dataset_split(reviews_df,
                                                                 experiment_name=None,
                                                                 mode='class_fraction',
                                                                 split_ratio=0.15,
                                                                 split_class_count=None,
                                                                )

Training Set Size: 4144
Validation Set Size: 731


In [40]:
experiment_name = '138_training_15_class_percentage_distribute_ancestors'
# Add the review-pipeline samples to the existing splits.
# DataFrame.append is deprecated (removed in pandas 2.0); pd.concat is the documented replacement
# and, like append, keeps the original index labels of both frames.
training_df = pd.concat([train_df, reviews_train_df])
print('Total training images: ', len(training_df))
validation_df = pd.concat([val_df, reviews_val_df])
print('Total validation images: ', len(validation_df))

Total training images:  99629
Total validation images:  17579


In [41]:
# Persist the review data and the merged splits side by side in one experiment sub-folder.
plus_reviews_dir = os.path.join(dataset.dataset_dir, experiment_name, 'plus_reviews_distribute')
frames_to_save = (
    ('reviews_data.json', reviews_df),
    ('training.json', training_df),
    ('validation.json', validation_df),
)
for output_name, output_frame in frames_to_save:
    dataset.save_dataframe(os.path.join(plus_reviews_dir, output_name), output_frame)

### Create internal test dataset
Query: image-type:macroscopic and dataset:aip-test

In [42]:
test_df = pd.read_json('/data/datasets/all_data_processed/skin_images_manifest.json')

In [43]:
test_df = filter_dataframe(test_df, 
                      include_tags=['image-type:macroscopic', 'dataset:aip-test'], 
                      exclude_tags=None)

Dropped 260504 rows


In [44]:
len(test_df)

19551

In [45]:
test_df.to_json('/data/datasets/macroscopic/internal_test_set_images.json', orient='records', indent=2)

Query: image-type:macroscopic and test:internal

In [46]:
test_df = pd.read_json('/data/datasets/all_data_processed/skin_images_manifest.json')

In [47]:
test_df = filter_dataframe(test_df, 
                      include_tags=['image-type:macroscopic', 'test:internal'], 
                      exclude_tags=None)

Dropped 276049 rows


In [48]:
len(test_df)

4006

In [49]:
test_df.to_json('/data/datasets/macroscopic/internal_curated_test_set_images.json', orient='records', indent=2)

In [50]:
snapshot_manifest_path = '/data/datasets/macroscopic/internal_test_set_images.json'

In [51]:
test_df = dataset.create_data_partition(snapshot_manifest_path=snapshot_manifest_path,
                                        mapping_function=reject_outliers,
                                        partition_name='internal_test_set',
                                        image_type='macroscopic',
                                        min_reviews=3,
                                        uncertainty_as_class=False,
                                        uncertainty_mode='keep',
                                        uncertainty_threshold=0.5,
                                        root_directory='/data/old-data/images/')

Processing Snapshot
There have been 440 rows missing AIP codes. Access the indices in snapshot.rows_missing_AIP_code
There have been 10 rows containing a AIP code and a review. Access the indices in snapshot.rows_multiple_sources
There have been 52 rows containing rows_multiple AIP codes. Access the indices in snapshot.rows_multiple_AIP_code
The number of total samples contained in the lab snapshot is 19551
Processing Snapshot - Done!
The number of total samples contained in the snapshot data after filtering by image type macroscopic is 19551

Processing Dataset
4256 samples were dropped due to a lack of reviews or AIP code
0 samples were discarded because the number of reviews was below 3 and 5 because of high uncertainty 
The number of total samples contained in the dataset is 15290
Dataset processed and saved in /data/datasets/macroscopic/138_way_may_2021/internal_test_set.json


In [52]:
snapshot_manifest_path = '/data/datasets/macroscopic/internal_curated_test_set_images.json'

In [53]:
test_df = dataset.create_data_partition(snapshot_manifest_path=snapshot_manifest_path,
                                        mapping_function=reject_outliers,
                                        partition_name='internal_curated_test_set',
                                        image_type='macroscopic',
                                        min_reviews=3,
                                        uncertainty_as_class=False,
                                        uncertainty_mode='keep',
                                        uncertainty_threshold=0.5,
                                        root_directory='/data/old-data/images/')

Processing Snapshot
There have been 6 rows missing AIP codes. Access the indices in snapshot.rows_missing_AIP_code
There have been 1 rows containing a AIP code and a review. Access the indices in snapshot.rows_multiple_sources
The number of total samples contained in the lab snapshot is 4006
Processing Snapshot - Done!
The number of total samples contained in the snapshot data after filtering by image type macroscopic is 4006

Processing Dataset
72 samples were dropped due to a lack of reviews or AIP code
0 samples were discarded because the number of reviews was below 3 and 1 because of high uncertainty 
The number of total samples contained in the dataset is 3933
Dataset processed and saved in /data/datasets/macroscopic/138_way_may_2021/internal_curated_test_set.json


### Create app incoming testing datasets

### 100 images benchmark

Query: dataset:test-app-incoming

In [54]:
app_incoming_df = pd.read_json('/data/datasets/all_data_processed/skin_images_manifest.json')

In [55]:
app_incoming_df = filter_dataframe(app_incoming_df, 
                                   include_tags=['dataset:test-app-incoming'], 
                                   exclude_tags=None)

Dropped 279955 rows


In [56]:
len(app_incoming_df)

100

In [57]:
app_incoming_df.to_json('/data/datasets/macroscopic/100_images_benchmark.json', orient='records', indent=2)

In [58]:
snapshot_manifest_path = '/data/datasets/macroscopic/100_images_benchmark.json'

In [59]:
test_df = dataset.create_data_partition(snapshot_manifest_path=snapshot_manifest_path,
                                        mapping_function=reject_outliers,
                                        partition_name='100_benchmark_test_set',
                                        image_type='macroscopic',
                                        min_reviews=3,
                                        uncertainty_as_class=False,
                                        uncertainty_mode='keep',
                                        uncertainty_threshold=0.5,
                                        root_directory='/data/old-data/images/')

Processing Snapshot
The number of total samples contained in the lab snapshot is 100
Processing Snapshot - Done!
The number of total samples contained in the snapshot data after filtering by image type macroscopic is 100

Processing Dataset
0 samples were dropped due to a lack of reviews or AIP code
1 samples were discarded because the number of reviews was below 3 and 8 because of high uncertainty 
The number of total samples contained in the dataset is 91
Dataset processed and saved in /data/datasets/macroscopic/138_way_may_2021/100_benchmark_test_set.json


### App Incoming paid images test set

Query: review-status:dermatologist-reviewed-paid-image

In [60]:
app_incoming_df = pd.read_json('/data/datasets/all_data_processed/skin_images_manifest.json')

In [61]:
app_incoming_df = filter_dataframe(app_incoming_df, 
                                   include_tags=['review-status:dermatologist-reviewed-paid-image'], 
                                   exclude_tags=None)

Dropped 278489 rows


In [62]:
print(len(app_incoming_df))

1566


In [63]:
app_incoming_df.to_json('/data/datasets/macroscopic/app_incoming_paid_benchmark.json', orient='records', indent=2)

In [64]:
snapshot_manifest_path = '/data/datasets/macroscopic/app_incoming_paid_benchmark.json'

In [65]:
# Build the 'app_incoming_paid_test' partition from the manifest in snapshot_manifest_path.
# NOTE(review): this call maps labels with distribute_ancestors_probs, whereas the other test
# partitions created in this notebook (internal_test_set, internal_curated_test_set,
# 100_benchmark_test_set) use reject_outliers — confirm the difference is intentional.
test_df = dataset.create_data_partition(snapshot_manifest_path=snapshot_manifest_path,
                                        mapping_function=distribute_ancestors_probs,
                                        partition_name='app_incoming_paid_test',
                                        image_type='macroscopic',
                                        min_reviews=3,
                                        uncertainty_as_class=False,
                                        uncertainty_mode='keep',
                                        uncertainty_threshold=0.5,
                                        root_directory='/data/old-data/images/')

Processing Snapshot
The number of total samples contained in the lab snapshot is 1566
Processing Snapshot - Done!
The number of total samples contained in the snapshot data after filtering by image type macroscopic is 1566

Processing Dataset
0 samples were dropped due to a lack of reviews or AIP code
0 samples were discarded because the number of reviews was below 3 and 479 because of high uncertainty 
The number of total samples contained in the dataset is 1087
Dataset processed and saved in /data/datasets/macroscopic/138_way_may_2021/app_incoming_paid_test.json
