In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os, glob
import cv2
import h5py

import sys
sys.path.append('/mnt/disk1/project/SMhospital/capsule/ce_packages')

from ce_utils import one_hot

In [2]:
from itertools import product
import glob

def target_preprocessings(phase_a_switch = [1, 1, 1], phase_b_switch = True, mode = 'load'):
    """
    phase_a_switch = [1, 1, 1], [0, 0 ,1], [1, 1, 0].... 
    that means [flip, rotate, blur_sharp]
    """
    phase0 = ['_c']
    phase1 = {1: ['-', 'f'], 0: ['-']}
    phase2 = {1: ['-', 'r1', 'r2', 'r3'], 0: ['-']}
    phase3 = {1: ['-', 'ab', 'mb', 'eh'], 0: ['-']}
    phase4 = ['s_-30_v_30', 's_-30_v_-30', 's_30_v_-30', 's_30_v_30']
    
    if mode == 'load':
        phase_a_items = [phase1[phase_a_switch[0]], phase2[phase_a_switch[1]], phase3[phase_a_switch[2]]]
    elif mode == 'preprocessing':
        phase_a_items = [phase0, phase1[phase_a_switch[0]], phase2[phase_a_switch[1]], phase3[phase_a_switch[2]]]
    
    phase_a = []
    for i in list(product(*phase_a_items)):
        phase_a.append('_'.join(i))

    if not phase_b_switch != True:
        phase_b = []
        for i in list(product(*[phase_a, phase4])):
            phase_b.append('_'.join(i))
        return list(np.hstack([phase_a, phase_b]))
    else:
        return phase_a 

class ce_load_dataset:
    def __init__(self, phase, data, pre_a, pre_b, img_ch = 'bgr', ext_name = True):
        self.phase = phase        # 'train' or 'test'
        self.data = data          # 'sm', 'sm_core', 'sm_v2', 'sm_x160', ...
        self.pre_a = pre_a        # [1, 1, 1], [0, 0 ,1], [1, 1, 0].... 
        self.pre_b = pre_b        # True or False
        self.img_ch = img_ch      # 'bgr', 'rgb', and 'hsv'
        self.ext_name = ext_name  # True or False

    def load_path(self, cls, les, data_dir = '/mnt/disk2/data/private_data/SMhospital/capsule/1 preprocessed'):
        """
        phase = 'train', 'test'
        cls: [les]  
          'n': ['neg']
          'h': ['redspot', 'angio', 'active'], 
          'd': ['ero', 'ulc', 'str'],
          'p': ['amp', 'lym', 'tum']}
        pre_a[0] must be 0
        """
        lesions = dict(neg = 'negative', 
                       redspot = 'red_spot', angio = 'angioectasia', active = 'active_bleeding', 
                       ero = 'erosion', ulcer = 'ulcer', str = 'stricture', 
                       amp = 'ampulla_of_vater', lym = 'lymphoid_follicles', tum = 'small_bowel_tumor')
        classes = dict(n = 'negative', h = 'hemorrhagic', d = 'depressed', p = 'protruded')

        path = os.path.join(data_dir, self.data, self.phase, classes[cls], lesions[les])
        pathlist = glob.glob(path + '/*.jpg')
        if self.pre_b != True:
            path_in_phase = []
            for p in pathlist:
                name = os.path.basename(p)
                if (name.split('c_')[-1])[:-4] in target_preprocessings(self.pre_a, self.pre_b):
                    path_in_phase.append(p)   
            return np.asarray(path_in_phase)
        else:
            return np.asarray(pathlist)

    def load_image_from_path(self, pathlist,image_ch = 'bgr', extract_name = False):
        data = []
        for i in pathlist:
            temp = cv2.imread(i)
            if image_ch == 'bgr':
                pass
            elif image_ch == 'rgb':
                temp = cv2.cvtColor(temp, cv2.COLOR_BGR2RGB)
            elif image_ch == 'hsv':
                temp = cv2.cvtColor(temp, cv2.COLOR_BGR2HSV)
            data.append(temp)
        if extract_name != False:
            name = []
            for i in pathlist:
                name.append(os.path.basename(i))
            return np.asarray(data), np.asarray(name)
        else:
            return np.asarray(data) 

    def load_data(self, cls, les):
        pathlist = self.load_path(cls, les)
        return  self.load_image_from_path(pathlist, image_ch = self.img_ch, extract_name = self.ext_name)

In [3]:
ce = ce_load_dataset(phase = 'test', data = 'sm_x160_v2', pre_a = [0, 0, 0], pre_b = False)

In [4]:
neg_dataset = ce.load_data('n', 'neg')
redspot_dataset = ce.load_data('h', 'redspot')
angio_dataset = ce.load_data('h', 'angio')
active_dataset = ce.load_data('h', 'active')
ero_dataset = ce.load_data('d', 'ero')
ulcer_dataset = ce.load_data('d', 'ulcer')
str_dataset = ce.load_data('d', 'str')

neg_label = one_hot(neg_dataset[0], 0)
redspot_label = one_hot(redspot_dataset[0], 1)
angio_label = one_hot(angio_dataset[0], 1)
active_label = one_hot(active_dataset[0], 1)
ero_label = one_hot(ero_dataset[0], 1)
ulcer_label = one_hot(ulcer_dataset[0], 1)
str_label = one_hot(str_dataset[0], 1)

print("negative:",neg_dataset[0].shape, neg_label.shape)
print("red spot:",redspot_dataset[0].shape, redspot_label.shape)
print("angioectasia:",angio_dataset[0].shape, angio_label.shape)
print("active_bleeding:",active_dataset[0].shape, active_label.shape)
print("erosion:",ero_dataset[0].shape, ero_label.shape)
print("ulcer:",ulcer_dataset[0].shape, ulcer_label.shape)
print("stricture:",str_dataset[0].shape, str_label.shape)

negative: (1562, 512, 512, 3) (1562, 2)
red spot: (139, 512, 512, 3) (139, 2)
angioectasia: (5, 512, 512, 3) (5, 2)
active_bleeding: (108, 512, 512, 3) (108, 2)
erosion: (185, 512, 512, 3) (185, 2)
ulcer: (183, 512, 512, 3) (183, 2)
stricture: (26, 512, 512, 3) (26, 2)


In [5]:
def label_lesion(label, lesion):
    return np.array([lesion for i in range(len(label))])

In [6]:
neg_lesion = label_lesion(neg_label, 'neg')

In [7]:
neg_lesion

array(['neg', 'neg', 'neg', ..., 'neg', 'neg', 'neg'], dtype='<U3')

In [8]:
imgs = np.vstack([neg_dataset[0], redspot_dataset[0], angio_dataset[0], active_dataset[0], 
                      ero_dataset[0], ulcer_dataset[0], str_dataset[0]])

filename = np.hstack([neg_dataset[1], redspot_dataset[1], angio_dataset[1], active_dataset[1], 
                      ero_dataset[1], ulcer_dataset[1], str_dataset[1]])

labels = np.vstack([neg_label, redspot_label, angio_label, active_label, ero_label, ulcer_label, str_label])

lesion = np.hstack([label_lesion(neg_label, 'neg'), 
                    label_lesion(redspot_label, 'redspot'), label_lesion(angio_label, 'angio'), label_lesion(active_label, 'active'), 
                    label_lesion(ero_label, 'ero'), label_lesion(ulcer_label, 'ulcer'), label_lesion(str_label, 'str')])

In [9]:
imgs = imgs.reshape(imgs.shape[0], -1)

In [10]:
N = imgs.shape[0]

In [11]:
hf = h5py.File('/mnt/disk2/data/private_data/SMhospital/capsule/1 preprocessed/sm_x160_v2/testset', 'w')

hf.create_dataset('image', (N, 512*512*3), data = imgs)   
hf.create_dataset('label', (N, 2), data = labels)     
hf.create_dataset('lesion', (N,), data = lesion)
hf.create_dataset('filename', (N,), data = filename)

TypeError: No conversion path for dtype: dtype('<U7')

In [19]:
hf.create_dataset('label', (2208, 2), data = labels)     
hf.create_dataset('lesion', (2208,), data = lesion)
hf.create_dataset('filename', (2208,), data = filename)

TypeError: No conversion path for dtype: dtype('<U7')

In [58]:
with h5py.File('/mnt/disk2/data/private_data/SMhospital/capsule/1 preprocessed/sm_x160_v2/testset', 'w') as f:

    f.create_dataset('image', (2208, 512*512*3), dtype='uint8')    # 1000개의 32x32 RGB 이미지를 담는 데이터 공간을 생성한다. 
    f.create_dataset('label', (2208,), dtype='float32')              # 1000개의 float을 담는 데이터 공간을 생성한다. 
    f.create_dataset('filename', (2208,))
    imgs = f['image']    # 실 데이터 공간에 접근할 변수를 할당한다. 
    labels = f['label']
    filename = f['filename']

OSError: Unable to create file (unable to truncate a file which is already open)

In [62]:
hf2 = h5py.File('/mnt/disk2/data/private_data/SMhospital/capsule/1 preprocessed/sm_x160_v2/testset', 'w')

hf2.create_dataset('image', (2208, 512*512*3), data = imgs)   
hf2.create_dataset('label', (2208,), data = labels)              
hf2.create_dataset('filename', (2208,), data = filename)

OSError: Unable to create file (unable to truncate a file which is already open)

In [43]:
hf = h5py.File('/mnt/disk2/data/private_data/SMhospital/capsule/1 preprocessed/sm_x160_v2/testset', 'r')

In [48]:
hf.keys()

<KeysViewHDF5 ['filename', 'image', 'label']>

In [49]:
imgs_from_hf = np.array(hf['image'])

In [56]:
imgs.shape

(2208, 786432)

In [55]:
imgs_from_hf.shape

(2208, 786432)

In [57]:
np.mean(np.equal(imgs, imgs_from_hf))

0.09440492374309595