In [1]:
import os, glob
import numpy as np
import pandas as pd
import pickle

from ce_utils.data import train_valid_split
from ce_utils.preprocessing import extract_aug_suffix
from ce_utils.record import printProgress

root = '/mnt/disk2/data/private_data/SMhospital/capsule'

import argparse

def parse_arguments():
    
    parser = argparse.ArgumentParser()
    
    parser.add_argument('--data_dir', action="store", type=str, 
                        default='/1 preprocessed/database', help='database directory')
    parser.add_argument('--label_dir', action="store", type=str, 
                        default='/1 preprocessed', help='label directory')
    parser.add_argument('--target_sources', action="store", nargs='+', type=str, 
                        default='p3_2', help='target source **default p3_2')
    # '+' == 1 or more.
    # '*' == 0 or more.
    # '?' == 0 or 1.
    parser.add_argument('--aug_frb', action="store", nargs='?',
                        default='0 0 0', help='flip, rotate, blurring control switch')
    parser.add_argument('--aug_sv', action="store", type=bool, 
                        default=True, help='saturation and value control switch')
    parser.add_argument('--save_name', action="store", type=str, default='data_config', help='file name')
    
    args = parser.parse_args()
    print('args={}'.format(args))
    
    return args

def __main__(args):
    
    data_dir = root + args.data_dir 
    label_dir = root + args.label_dir 

    img_files = sorted(os.listdir(data_dir))
    label = pd.read_csv(label_dir + '/label.csv', index_col = 0)

    sources = []
    for source in args.target_sources:
        sources.append([source_ for source_ in sorted(set(label.source.values)) if source in source_])

    sources = np.concatenate(sources)

    data_config = {'positive': [], 'negative': []}
    
    for name in label.index.values:  
        if label.loc[name]['source'] in sources:
            if (label.loc[name]['hemorrhagic'] == 1 and label.loc[name]['depressed'] == 0 and label.loc[name]['negative'] == 0 
                or label.loc[name]['hemorrhagic'] == 0 and label.loc[name]['depressed'] == 1 and label.loc[name]['negative'] == 0):
                data_config['positive'].append(name)
            elif label.loc[name]['positive'] == 0 and label.loc[name]['negative'] == 1:
                data_config['negative'].append(name)

    aug_suffixes = extract_aug_suffix(args.aug_frb, args.aug_sv, mode = 'preprocessing')
    
    from sklearn.model_selection import KFold
    kf = KFold(n_splits = 5, shuffle = True, random_state = 44)

    for i, [(n_train_idx, n_test_idx), (p_train_idx, p_test_idx)] in enumerate(zip(kf.split(data_config['negative']), kf.split(data_config['positive']))):

        train_neg_id = list(np.asarray(data_config['negative'])[n_train_idx])  
        train_pos_id = list(np.asarray(data_config['positive'])[p_train_idx])

        train_neg_id, valid_neg_id = train_valid_split(train_neg_id)
        train_pos_id, valid_pos_id = train_valid_split(train_pos_id)

        test_neg_id = list(np.asarray(data_config['negative'])[n_test_idx])
        test_pos_id = list(np.asarray(data_config['positive'])[p_test_idx])

        train_pos_files = []
        for name in train_pos_id:
            for aug_suf in aug_suffixes:
                train_pos_files.append(name.split('.jpg')[0] + '_' + aug_suf + '.jpg')

        train_neg_files = []
        for name in train_neg_id:
            for aug_suf in aug_suffixes:
                train_neg_files.append(name.split('.jpg')[0] + '_' + aug_suf + '.jpg')

        valid_pos_files = [name.split('.jpg')[0] + '__c_-_-_-.jpg' for name in valid_pos_id]
        valid_neg_files = [name.split('.jpg')[0] + '__c_-_-_-.jpg' for name in valid_neg_id]

        test_pos_files = [name.split('.jpg')[0] + '__c_-_-_-.jpg' for name in test_pos_id]
        test_neg_files = [name.split('.jpg')[0] + '__c_-_-_-.jpg' for name in test_neg_id]

        data_config['train_aug_files_{:02d}'.format(i+1)] = [train_neg_files, train_pos_files]
        data_config['valid_files_{:02d}'.format(i+1)] = [valid_neg_files, valid_pos_files]
        data_config['test_files_{:02d}'.format(i+1)] = [test_neg_files, test_pos_files]

    with open(label_dir + '{}.pkl'.format(args.save_name), "wb") as f:
        pickle.dump(data_config, f)
        
    return None

if __name__ == '__main__':
    # invoke the main function of the script
    __main__(parse_arguments())
    
    
"""
python3 'data_configuration_np-hd_cv.py' --data_dir '/1 preprocessed/database' --label_dir '/1 preprocessed' --target_sources p3_2 --aug_frb 0 0 0 --aug_sv False --save_name data_config_p3_2_np-hd_---_--_5f_cv
"""

In [1]:
import os, glob
import numpy as np
import pandas as pd
import pickle

from ce_utils.data import train_valid_split
from ce_utils.preprocessing import extract_aug_suffix
from ce_utils.record import printProgress

root = '/mnt/disk2/data/private_data/SMhospital/capsule'

In [2]:
label = pd.read_csv(root +'/1 preprocessed' + '/label.csv', index_col = 0)
label

Unnamed: 0,negative,positive,hemorrhagic,red_spot,angioectasia,active_bleeding,depressed,erosion,ulcer,stricture,protruded,ampulla_of_vater,lymphoid_follicles,small_bowel_tumor,etc,phlebectasia,lymphangiectasia,source
BK___02-02-14___1014671.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___00-42-54___1005149.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___00-42-54___1005150.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___00-48-17___1005795.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___01-01-12___1007346.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___01-01-13___1007348.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___01-01-56___1007433.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___01-02-46___1007534.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___01-31-27___1010976.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___01-36-40___1011602.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2


In [3]:
sorted(set(label.source.values))

['190520 p3_2',
 '190520 p3_2, 190814 negative',
 '190520 p3_2, 200713 clip for AI re-screening/01/FN',
 '190520 p3_2, 200713 clip for AI re-screening/02/FN',
 '190814 negative',
 '200121 validation',
 '200713 clip for AI re-screening/01',
 '200713 clip for AI re-screening/01/FN',
 '200713 clip for AI re-screening/02',
 '200713 clip for AI re-screening/02/FN',
 '200713 clip for AI re-screening/03',
 '200713 clip for AI re-screening/03/FN']

In [12]:
ex_val = label[label['source'] == '200121 validation']

ex_val[ex_val['protruded'] == 1]

Unnamed: 0,negative,positive,hemorrhagic,red_spot,angioectasia,active_bleeding,depressed,erosion,ulcer,stricture,protruded,ampulla_of_vater,lymphoid_follicles,small_bowel_tumor,etc,phlebectasia,lymphangiectasia,source
달조___03-16-31___2031172.jpg,0,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,200121 validation
달조___03-16-32___2031175.jpg,0,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,200121 validation
달조___03-16-33___2031177.jpg,0,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,200121 validation
달조___00-15-49___2003515.jpg,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,200121 validation
승염___00-27-38___1003317.jpg,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,200121 validation
광배___06-40-12___2065116.jpg,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,200121 validation
병김___04-54-30___2052693.jpg,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,200121 validation
병김___05-02-31___2053656.jpg,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,200121 validation
주김___00-52-20___1006280.jpg,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,200121 validation
주김___00-52-21___1006282.jpg,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,200121 validation


In [14]:
len(ex_val[ex_val['protruded'] == 1])

49

In [5]:
label[label['source'] == '190520 p3_2']

Unnamed: 0,negative,positive,hemorrhagic,red_spot,angioectasia,active_bleeding,depressed,erosion,ulcer,stricture,protruded,ampulla_of_vater,lymphoid_follicles,small_bowel_tumor,etc,phlebectasia,lymphangiectasia,source
BK___02-02-14___1014671.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___00-42-54___1005149.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___00-42-54___1005150.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___00-48-17___1005795.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___01-01-12___1007346.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___01-01-13___1007348.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___01-01-56___1007433.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___01-02-46___1007534.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___01-31-27___1010976.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___01-36-40___1011602.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2


In [6]:
p3_2 = label[label['source'] == '190520 p3_2']

p3_2[p3_2['negative'] == 1]

Unnamed: 0,negative,positive,hemorrhagic,red_spot,angioectasia,active_bleeding,depressed,erosion,ulcer,stricture,protruded,ampulla_of_vater,lymphoid_follicles,small_bowel_tumor,etc,phlebectasia,lymphangiectasia,source
AJ___02-07-37___1015318.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
AJ___02-07-38___1015319.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
AJ___02-07-42___1015327.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
AJ___02-08-16___1015395.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
AJ___02-08-46___1015456.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
AJ___02-09-44___1015572.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
AJ___02-09-47___1015577.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
AJ___02-09-48___1015579.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
AJ___02-09-50___1015583.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
AJ___02-09-56___1015595.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2


In [7]:
p3_2[p3_2['positive'] == 1]

Unnamed: 0,negative,positive,hemorrhagic,red_spot,angioectasia,active_bleeding,depressed,erosion,ulcer,stricture,protruded,ampulla_of_vater,lymphoid_follicles,small_bowel_tumor,etc,phlebectasia,lymphangiectasia,source
BK___02-02-14___1014671.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___00-42-54___1005149.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___00-42-54___1005150.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___00-48-17___1005795.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___01-01-12___1007346.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___01-01-13___1007348.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___01-01-56___1007433.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___01-02-46___1007534.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___01-31-27___1010976.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2
BS___01-36-40___1011602.jpg,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,190520 p3_2


In [8]:
label[label['source'] == '200713 clip for AI re-screening/01']

Unnamed: 0,negative,positive,hemorrhagic,red_spot,angioectasia,active_bleeding,depressed,erosion,ulcer,stricture,protruded,ampulla_of_vater,lymphoid_follicles,small_bowel_tumor,etc,phlebectasia,lymphangiectasia,source
김 민경 (19998616) 31 Aug 18_1_frame_04111.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,200713 clip for AI re-screening/01
김 민경 (19998616) 31 Aug 18_1_frame_00679.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,200713 clip for AI re-screening/01
김 민경 (19998616) 31 Aug 18_1_frame_06261.jpg,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,200713 clip for AI re-screening/01
김 민경 (19998616) 31 Aug 18_1_frame_07904.jpg,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,200713 clip for AI re-screening/01
김 민경 (19998616) 31 Aug 18_1_frame_00025.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,200713 clip for AI re-screening/01
김 민경 (19998616) 31 Aug 18_1_frame_06190.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,200713 clip for AI re-screening/01
김 민경 (19998616) 31 Aug 18_1_frame_04163.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,200713 clip for AI re-screening/01
김 민경 (19998616) 31 Aug 18_1_frame_07731.jpg,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,200713 clip for AI re-screening/01
김 민경 (19998616) 31 Aug 18_1_frame_04213.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,200713 clip for AI re-screening/01
김 민경 (19998616) 31 Aug 18_1_frame_07921.jpg,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,200713 clip for AI re-screening/01


In [9]:
label[label['source'] == '200713 clip for AI re-screening/01/FN']

Unnamed: 0,negative,positive,hemorrhagic,red_spot,angioectasia,active_bleeding,depressed,erosion,ulcer,stricture,protruded,ampulla_of_vater,lymphoid_follicles,small_bowel_tumor,etc,phlebectasia,lymphangiectasia,source
민김___05-19-54___2045630.jpg,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,200713 clip for AI re-screening/01/FN


In [10]:
label[label['source'] == '200713 clip for AI re-screening/02']

Unnamed: 0,negative,positive,hemorrhagic,red_spot,angioectasia,active_bleeding,depressed,erosion,ulcer,stricture,protruded,ampulla_of_vater,lymphoid_follicles,small_bowel_tumor,etc,phlebectasia,lymphangiectasia,source
박 명자 (28372171) 10 Dec 18_1_frame_01740.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,200713 clip for AI re-screening/02
박 명자 (28372171) 10 Dec 18_1_frame_09662.jpg,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,200713 clip for AI re-screening/02
박 명자 (28372171) 10 Dec 18_1_frame_09755.jpg,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,200713 clip for AI re-screening/02
박 명자 (28372171) 10 Dec 18_1_frame_14794.jpg,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,200713 clip for AI re-screening/02
박 명자 (28372171) 10 Dec 18_1_frame_09706.jpg,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,200713 clip for AI re-screening/02
박 명자 (28372171) 10 Dec 18_1_frame_09058.jpg,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,200713 clip for AI re-screening/02
박 명자 (28372171) 10 Dec 18_1_frame_03251.jpg,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,200713 clip for AI re-screening/02
박 명자 (28372171) 10 Dec 18_1_frame_13663.jpg,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,200713 clip for AI re-screening/02
박 명자 (28372171) 10 Dec 18_1_frame_14281.jpg,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,200713 clip for AI re-screening/02
박 명자 (28372171) 10 Dec 18_1_frame_09620.jpg,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,200713 clip for AI re-screening/02


In [4]:
for id in label.index.values:
    label

array(['BK___02-02-14___1014671.jpg', 'BS___00-42-54___1005149.jpg',
       'BS___00-42-54___1005150.jpg', ...,
       '오 종임 (23245336) 27 Aug 18_1_frame_03047.jpg',
       '박 명자 (28372171) 10 Dec 18_1_frame_11025.jpg',
       '박 명자 (28372171) 10 Dec 18_1_frame_15182.jpg'], dtype=object)

In [18]:
sources = []
for source in ['p3_2','190814 negative']:
    sources.append([source_ for source_ in sorted(set(label.source.values)) if source in source_])
    
sources = np.unique(np.concatenate(sources))

In [19]:
sources

array(['190520 p3_2', '190520 p3_2, 190814 negative',
       '190520 p3_2, 200713 clip for AI re-screening/01/FN',
       '190520 p3_2, 200713 clip for AI re-screening/02/FN',
       '190814 negative'], dtype='<U50')

In [17]:
np.unique(sources)

array(['190520 p3_2', '190520 p3_2, 190814 negative',
       '190520 p3_2, 200713 clip for AI re-screening/01/FN',
       '190520 p3_2, 200713 clip for AI re-screening/02/FN',
       '190814 negative'], dtype='<U50')

In [7]:
sources

array(['190520 p3_2', '190520 p3_2, 190814 negative',
       '190520 p3_2, 200713 clip for AI re-screening/01/FN',
       '190520 p3_2, 200713 clip for AI re-screening/02/FN',
       '190520 p3_2, 190814 negative', '190814 negative'], dtype='<U50')

In [10]:
np.concatenate(set(sources))

TypeError: The first input argument needs to be a sequence