In [3]:
import numpy as np
import matplotlib.pyplot as plt
import sys
import os
import glob
import cv2
import pandas as pd
from itertools import product

In [4]:
def load_filename(phase, cls, les = None, data = 'sm', data_dir = '/mnt/disk2/data/private_data/SMhospital/capsule/1 preprocessed'):
    """
    phase = 'train', 'test'
    cls: [les]  
      'n': ['neg']
      'h': ['redspot', 'angio', 'active'], 
      'd': ['ero', 'ulc', 'str'],
      'p': ['amp', 'lym', 'tum']}
    """
    lesions = dict(neg = 'negative', 
                   redspot = 'red_spot', angio = 'angioectasia', active = 'active_bleeding', 
                   ero = 'erosion', ulcer = 'ulcer', str = 'stricture', 
                   amp = 'ampulla_of_vater', lym = 'lymphoid_follicles', tum = 'small_bowel_tumor')
    classes = dict(n = 'negative', h = 'hemorrhagic', d = 'depressed', p = 'protruded')

    path = os.path.join(data_dir, data, phase, classes[cls], lesions[les])
    pathlist = glob.glob(path + '/*.jpg')
    name = []
    for path in pathlist:
        name.append(os.path.basename(path))
    
    return np.asarray(name)

# Train / Test with no redundacny of patient

## Split Training / Testing set by Patient

### Check Number of Patients

In [5]:
data_dir = '/mnt/disk2/data/private_data/SMhospital/capsule/0 data/labeled'

neg_1_filename = load_filename('p3_2', 'n', 'neg', data = '190520 p3_2', data_dir = data_dir)
neg_2_filename = load_filename('raw', 'n', 'neg', data = '190814 negative', data_dir = data_dir)
redspot_filename = load_filename('p3_2', 'h', 'redspot', data = '190520 p3_2', data_dir = data_dir)
angio_filename = load_filename('p3_2', 'h', 'angio', data = '190520 p3_2', data_dir = data_dir)
active_filename = load_filename('p3_2', 'h', 'active', data = '190520 p3_2', data_dir = data_dir)
ero_filename = load_filename('p3_2', 'd', 'ero', data = '190520 p3_2', data_dir = data_dir)
ulcer_filename = load_filename('p3_2', 'd', 'ulcer', data = '190520 p3_2', data_dir = data_dir)
str_filename = load_filename('p3_2', 'd', 'str', data = '190520 p3_2', data_dir = data_dir)

In [6]:
len(neg_1_filename), len(neg_2_filename), len(redspot_filename), len(angio_filename), len(active_filename), len(ero_filename), len(ulcer_filename), len(str_filename)

(2536, 5270, 695, 23, 536, 922, 915, 129)

In [26]:
def extract_patient_name(filenames):
    patient_names = []

    for filename in filenames:
        patient_names.append(filename.split('___')[0])
    return patient_names

In [27]:
neg_1_patient_name = extract_patient_name(neg_1_filename)
neg_2_patient_name = extract_patient_name(neg_2_filename)
redspot_patient_name = extract_patient_name(redspot_filename)
angio_patient_name = extract_patient_name(angio_filename)
active_patient_name = extract_patient_name(active_filename)
ero_patient_name = extract_patient_name(ero_filename)
ulcer_patient_name = extract_patient_name(ulcer_filename)
str_patient_name = extract_patient_name(str_filename)

In [28]:
len(neg_1_patient_name), len(neg_2_patient_name), len(redspot_patient_name), len(angio_patient_name), len(active_patient_name), len(ero_patient_name), len(ulcer_patient_name), len(str_patient_name)

(2536, 5270, 695, 23, 536, 922, 915, 129)

In [29]:
redspot_patient_name

['EA',
 'BK',
 'EA',
 'BS',
 'EA',
 'BS',
 'EA',
 'BS',
 'GB',
 'BS',
 'GB',
 'BS',
 'GB',
 'BS',
 'GK',
 'BS',
 'HG',
 'BS',
 'HG',
 'BS',
 'HG',
 'CGB',
 'HG',
 'CGB',
 'HG',
 'CGB',
 'HG',
 'CGB',
 'HG',
 'CGB',
 'HG',
 'CGB',
 'HG',
 'CGB',
 'HG',
 'CGB',
 'HG',
 'CGB',
 'HG',
 'CGB',
 'HG',
 'CH',
 'HG',
 'CH',
 'HG',
 'CH',
 'HG',
 'CH',
 'HS',
 'CH',
 'HS',
 'CH',
 'Hr',
 'CH',
 'Hr',
 'CH',
 'IH',
 'CJ',
 'IK',
 'CJ',
 'IK',
 'CJ',
 'IK',
 'CY',
 'IK',
 'DK',
 'IK',
 'DO',
 'IS',
 'DO',
 'JB',
 'DO',
 'JB',
 'DO',
 'JB',
 'DO',
 'JB',
 'DP',
 'JB',
 'EA',
 'JB',
 'EA',
 'EA',
 'JB',
 'EA',
 'JH',
 'EA',
 'JH',
 'GB',
 'jb',
 'JH',
 'JH',
 'JL',
 'JL',
 'JL',
 'JL',
 'JL',
 'JL',
 'Jy',
 'KH',
 'KH',
 'KH',
 'KH',
 'KH',
 'KH',
 'KH',
 'KH',
 'KH',
 'KI',
 'KI',
 'KI',
 'KJ',
 'KJ',
 'KJ',
 'KJ',
 'KJ',
 'KJ',
 'KJ',
 'KJ',
 'KJ',
 'KK',
 'KK',
 'KK',
 'KK',
 'KM',
 'KM',
 'KM',
 'LE',
 'LE',
 'LE',
 'LE',
 'LE',
 'LE',
 'LE',
 'LE',
 'LE',
 'LE',
 'LE',
 'LE',
 'LE',
 'LE',
 'L

In [30]:
neg_1_patient = np.unique(neg_1_patient_name)
neg_2_patient = np.unique(neg_2_patient_name)
redspot_patient = np.unique(redspot_patient_name)
angio_patient = np.unique(angio_patient_name)
active_patient = np.unique(active_patient_name)
ero_patient = np.unique(ero_patient_name)
ulcer_patient = np.unique(ulcer_patient_name)
str_patient = np.unique(str_patient_name)

In [11]:
len(neg_1_patient), len(neg_2_patient), len(redspot_patient), len(angio_patient), len(active_patient), len(ero_patient), len(ulcer_patient), len(str_patient)

(160, 62, 142, 4, 56, 101, 64, 16)

In [12]:
len(neg_1_patient), len(neg_2_patient), len(redspot_patient), len(angio_patient), len(active_patient), len(ero_patient), len(ulcer_patient), len(str_patient)

(160, 62, 142, 4, 56, 101, 64, 16)

In [13]:
len(np.intersect1d(neg_1_patient, neg_2_patient))

41

In [14]:
active_patient

array(['BK', 'CH', 'CJ', 'CK', 'EA', 'GK', 'HG', 'JH', 'JP', 'KH', 'KI',
       'KJ', 'KK', 'LO', 'Lh', 'SJ', 'YK', 'YM', 'mK', 'mj', '강은', '경김',
       '광김', '광배', '금황', '기김', '명최', '범정', '복이', '상이', '서황', '순김', '순황',
       '승김', '승박', '승이', '안창', '양임', '영김', '영이', '옥서', '옥이', '용이', '윤이',
       '인이', '정김', '정박', '종문', '종신', '진김', '진박', '창정', '철김', '태김', '해정',
       '현이'], dtype='<U2')

In [15]:
total_patient = np.hstack([neg_1_patient, neg_2_patient, 
                           redspot_patient, angio_patient,active_patient, 
                           ero_patient, ulcer_patient, str_patient])
total_patient_wo_redundancy = np.unique(total_patient)

In [16]:
total_patient_wo_redundancy

array(['AJ', 'BK', 'BS', 'CE', 'CGB', 'CH', 'CJ', 'CK', 'CP', 'CW', 'CY',
       'DG', 'DJ', 'DK', 'DO', 'DP', 'EA', 'EC', 'EG', 'EK', 'EL', 'GB',
       'GK', 'GN', 'HG', 'HL', 'HN', 'HS', 'Hr', 'IH', 'IK', 'IS', 'Is',
       'JB', 'JG', 'JH', 'JJ', 'JK', 'JL', 'JO', 'JP', 'JS', 'JY', 'Jy',
       'KH', 'KI', 'KJ', 'KK', 'KM', 'KS', 'LE', 'LO', 'LS', 'Lh', 'Ly',
       'ML', 'MN', 'MP', 'NT', 'Nm', 'OD', 'PI', 'PJ', 'PM', 'SH', 'SJ',
       'SK', 'SL', 'SP', 'TI', 'TY', 'WK', 'YB', 'YJ', 'YK', 'YM', 'Yn',
       'ck', 'cp', 'dG', 'hk', 'iu', 'jb', 'jh', 'mK', 'mj', 'oP', 'sK',
       '강은', '건박', '경김', '경박', '경백', '경양', '경윤', '경이', '계신', '고상', '광김',
       '광배', '귀박', '규김', '규심', '금황', '기강', '기김', '기박', '기오', '기유', '김혜',
       '낙최', '달조', '대권', '대염', '동김', '동신', '맹신', '명김', '명나', '명박', '명이',
       '명채', '명최', '미김', '미장', '민강', '민김', '민방', '민최', '범안', '범정', '병김',
       '병문', '보김', '복이', '복지', '봉오', '분김', '사고', '삼조', '상이', '상최', '서미',
       '서황', '석강', '석김', '성김', '성박', '성서', '성양', '

In [17]:
len(total_patient_wo_redundancy)

274

### Split (train_patient: test_patient = 8:2)

In [21]:
np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [22]:
def train_test_patient_split(patients, train_rate = 0.80):
    train_idx = np.sort(np.random.choice(len(patients), round(len(patients)*train_rate), replace = False))
    test_idx = np.setxor1d(train_idx, np.arange(len(patients)))
    return patients[train_idx], patients[test_idx]

In [24]:
neg_1_train_patient, neg_1_test_patient = train_test_patient_split(neg_1_patient)
neg_2_train_patient, neg_2_test_patient = train_test_patient_split(neg_2_patient)
redspot_train_patient, redspot_test_patient = train_test_patient_split(redspot_patient)
angio_train_patient, angio_test_patient = train_test_patient_split(angio_patient)
active_train_patient, active_test_patient = train_test_patient_split(active_patient)
ero_train_patient, ero_test_patient = train_test_patient_split(ero_patient)
ulcer_train_patient, ulcer_test_patient = train_test_patient_split(ulcer_patient)
str_train_patient, str_test_patient = train_test_patient_split(str_patient)

In [25]:
np.intersect1d(neg_1_train_patient, neg_1_test_patient)

array([], dtype='<U3')

In [26]:
np.intersect1d(neg_2_train_patient, neg_2_test_patient)

array([], dtype='<U2')

In [27]:
np.intersect1d(redspot_train_patient, redspot_test_patient)

array([], dtype='<U3')

In [28]:
np.intersect1d(angio_train_patient, angio_test_patient)

array([], dtype='<U2')

In [29]:
np.intersect1d(active_train_patient, active_test_patient)

array([], dtype='<U2')

In [30]:
np.intersect1d(ero_train_patient, ero_test_patient)

array([], dtype='<U3')

In [31]:
np.intersect1d(ulcer_train_patient, ulcer_test_patient)

array([], dtype='<U3')

In [1277]:
np.intersect1d(str_train_patient, str_test_patient)

array([], dtype='<U3')

### Save Patient Info

In [931]:
import pandas as pd

In [976]:
df = pd.DataFrame(columns = ['patient', 'negative1', 'negative2', 
                             'red spot', 'angioectasia', 'active bleeding',
                             'erosion', 'ulcer', 'stricture'])

In [977]:
for i, patient in zip(range(len(total_patient_wo_redundancy)), total_patient_wo_redundancy):
    including_label = np.zeros([8])
    
    if patient in neg_1_train_patient: including_label[0] = 1
    if patient in neg_2_train_patient: including_label[1] = 1
    if patient in redspot_train_patient: including_label[2] = 1
    if patient in angio_train_patient: including_label[3] = 1
    if patient in active_train_patient: including_label[4] = 1
    if patient in ero_train_patient: including_label[5] = 1
    if patient in ulcer_train_patient: including_label[6] = 1
    if patient in str_train_patient: including_label[7] = 1
        
    if patient in neg_1_test_patient: including_label[0] = 2
    if patient in neg_2_test_patient: including_label[1] = 2
    if patient in redspot_test_patient: including_label[2] = 2
    if patient in angio_test_patient: including_label[3] = 2
    if patient in active_test_patient: including_label[4] = 2
    if patient in ero_test_patient: including_label[5] = 2
    if patient in ulcer_test_patient: including_label[6] = 2
    if patient in str_test_patient: including_label[7] = 2
        
    if np.sum(including_label) == 0:
        print("{} - {} is missed".format(i, patient))
    
    df.loc[i] = [patient, including_label[0], including_label[1], including_label[2], including_label[3],
                 including_label[4], including_label[5], including_label[6], including_label[7]]

In [978]:
df

Unnamed: 0,patient,negative1,negative2,red spot,angioectasia,active bleeding,erosion,ulcer,stricture
0,AJ,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,BK,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0
2,BS,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
3,CE,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,CGB,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
5,CH,0.0,0.0,1.0,0.0,1.0,1.0,2.0,0.0
6,CJ,0.0,0.0,1.0,0.0,2.0,1.0,0.0,0.0
7,CK,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8,CP,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9,CW,2.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [981]:
df.to_csv("./patient_info(train_1_test_2).csv", mode='w', encoding = 'utf-8-sig')

### Load save info

In [50]:
df = pd.read_csv("./patient_info(train_1_test_2).csv")
df = df.drop(columns = 'Unnamed: 0')
df.head()

Unnamed: 0,patient,neg_1,neg_2,redspot,angio,active,ero,ulcer,str
0,AJ,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,BK,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0
2,BS,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
3,CE,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,CGB,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0


In [49]:
df.to_csv("./patient_info(train_1_test_2).csv", mode='w', encoding = 'utf-8-sig')

In [74]:
def find_patient(col, phase = 'train'):
    if phase == 'train':
        return df[df[col] == 1]['patient'].tolist()
    elif phase == 'test':
        return df[df[col] == 2]['patient'].tolist()

In [75]:
neg_1_train_patient = find_patient('neg_1', 'train')
neg_2_train_patient = find_patient('neg_2', 'train')
redspot_train_patient = find_patient('redspot', 'train')
angio_train_patient = find_patient('angio', 'train')
active_train_patient = find_patient('active', 'train')
ero_train_patient = find_patient('ero', 'train')
ulcer_train_patient = find_patient('ulcer', 'train')
str_train_patient = find_patient('str', 'train')

In [76]:
neg_1_test_patient = find_patient('neg_1', 'test')
neg_2_test_patient = find_patient('neg_2', 'test')
redspot_test_patient = find_patient('redspot', 'test')
angio_test_patient = find_patient('angio', 'test')
active_test_patient = find_patient('active', 'test')
ero_test_patient = find_patient('ero', 'test')
ulcer_test_patient = find_patient('ulcer', 'test')
str_test_patient = find_patient('str', 'test')

### split into train and test filenames which belongs to train and test patients respectively

<font size = 4> by whether it belongs to train_patient or test_patient </font>

In [79]:
def train_test_by_patient(filenames, train_patient, test_patient):
    train_filename = []
    test_filename = []
    for filename in filenames:
        if filename.split('___')[0] in train_patient:
            train_filename.append(filename)
        elif filename.split('___')[0] in test_patient:
            test_filename.append(filename)
    return train_filename, test_filename

In [80]:
neg_1_train_filename, neg_1_test_filename = train_test_by_patient(neg_1_filename, neg_1_train_patient, neg_1_test_patient)
neg_2_train_filename, neg_2_test_filename = train_test_by_patient(neg_2_filename, neg_2_train_patient, neg_2_test_patient)
redspot_train_filename, redspot_test_filename = train_test_by_patient(redspot_filename, redspot_train_patient, redspot_test_patient)
angio_train_filename, angio_test_filename = train_test_by_patient(angio_filename, angio_train_patient, angio_test_patient)
active_train_filename, active_test_filename = train_test_by_patient(active_filename, active_train_patient, active_test_patient)
ero_train_filename, ero_test_filename = train_test_by_patient(ero_filename, ero_train_patient, ero_test_patient)
ulcer_train_filename, ulcer_test_filename = train_test_by_patient(ulcer_filename, ulcer_train_patient, ulcer_test_patient)
str_train_filename, str_test_filename = train_test_by_patient(str_filename, str_train_patient, str_test_patient)

In [923]:
len(neg_1_filename), len(neg_2_filename), len(redspot_filename), len(angio_filename), len(active_filename), len(ero_filename), len(ulcer_filename), len(str_filename)

(2536, 5270, 695, 23, 536, 922, 915, 129)

In [62]:
len(neg_1_filename), len(neg_2_filename), len(redspot_filename), len(angio_filename), len(active_filename), len(ero_filename), len(ulcer_filename), len(str_filename)

(2536, 5270, 695, 23, 536, 922, 915, 129)

In [924]:
len(neg_1_train_filename), len(neg_2_train_filename), len(redspot_train_filename), len(angio_train_filename), len(active_train_filename), len(ero_train_filename), len(ulcer_train_filename), len(str_train_filename)

(2029, 4328, 581, 20, 414, 801, 774, 98)

In [81]:
len(neg_1_train_filename), len(neg_2_train_filename), len(redspot_train_filename), len(angio_train_filename), len(active_train_filename), len(ero_train_filename), len(ulcer_train_filename), len(str_train_filename)

(2029, 4328, 581, 20, 414, 801, 774, 98)

In [925]:
len(neg_1_test_filename), len(neg_2_test_filename), len(redspot_test_filename), len(angio_test_filename), len(active_test_filename), len(ero_test_filename), len(ulcer_test_filename), len(str_test_filename)

(507, 942, 114, 3, 122, 121, 141, 31)

In [82]:
len(neg_1_test_filename), len(neg_2_test_filename), len(redspot_test_filename), len(angio_test_filename), len(active_test_filename), len(ero_test_filename), len(ulcer_test_filename), len(str_test_filename)

(507, 942, 114, 3, 122, 121, 141, 31)

### Preprocssing and Save

In [83]:
def printProgress(iteration, total, prefix = '', suffix = '', decimals = 1, barLength = 100):
    formatStr = "{0:." + str(decimals) + "f}"
    percent = formatStr.format(100 * (iteration / float(total)))
    filledLength = int(round(barLength * iteration / float(total)))
    bar = '#' * filledLength + '-' * (barLength - filledLength)
    sys.stdout.write('\r{} |{} | {}{} {}'.format(prefix, bar, percent, '%', suffix)),
    if iteration == total:
        sys.stdout.write('\n')
    sys.stdout.flush()

def target_preprocessings(phase_a_switch = [1, 1, 1], phase_b_switch = True, mode = 'load'):
    """
    phase_a_switch = [1, 1, 1], [0, 0 ,1], [1, 1, 0].... 
    that means [flip, rotate, blur_sharp]
    """
    phase0 = ['_c']
    phase1 = {1: ['-', 'f'], 0: ['-']}
    phase2 = {1: ['-', 'r1', 'r2', 'r3'], 0: ['-']}
    phase3 = {1: ['-', 'ab', 'mb', 'eh'], 0: ['-']}
    phase4 = ['s_-30_v_30', 's_-30_v_-30', 's_30_v_-30', 's_30_v_30']

    if mode == 'load':
        phase_a_items = [phase1[phase_a_switch[0]], phase2[phase_a_switch[1]], phase3[phase_a_switch[2]]]
    elif mode == 'preprocessing':
        phase_a_items = [phase0, phase1[phase_a_switch[0]], phase2[phase_a_switch[1]], phase3[phase_a_switch[2]]]

    phase_a = []
    for i in list(product(*phase_a_items)):
        phase_a.append('_'.join(i))

    if not phase_b_switch != True:
        phase_b = []
        for i in list(product(*[phase_a, phase4])):
            phase_b.append('_'.join(i))
        return list(np.hstack([phase_a, phase_b]))
    else:
        return phase_a 

In [84]:
class ce_preprocessing:
    def __init__(self, data_dir, save_dir):
        self.data_dir = data_dir
        self.save_dir = save_dir

    def cropping(self, img):
        img = np.array(img, dtype = 'f4')
        img_pre = img[32:544, 32:544, :]
        for i in range(100):
            for j in range(100):
                if i + j > 99:
                    pass
                else :
                    img_pre[i, j, :] = 0
                    img_pre[i, -j, :] = 0
        return img_pre.astype('uint8')

    def rotate(self, img, degree):
        rows, cols = img.shape[:2]
        M = cv2.getRotationMatrix2D(center = (cols/2, rows/2), angle = degree, scale = 1)
        img_rotated = cv2.warpAffine(img, M, dsize = (rows, cols))
        return img_rotated
    
    def blur_and_sharp(self, img):
        img_avg_blur = cv2.blur(img, (5,5)).astype('uint8')
        
        kernel_size = 15
        
        kernel_motion_blur = np.zeros((kernel_size, kernel_size))
        kernel_motion_blur[int((kernel_size-1)/2), :] = np.ones(kernel_size)
        kernel_motion_blur = kernel_motion_blur / kernel_size
        img_mb = cv2.filter2D(img, -1, kernel_motion_blur).astype('uint8')
        
        kernel_edge_enhancement = np.array([[-1,-1,-1,-1,-1],[-1,2,2,2,-1],[-1,2,8,2,-1],[-1,2,2,2,-1],[-1,-1,-1,-1,-1]])/8.0
        img_eh = cv2.filter2D(img, -1, kernel_edge_enhancement).astype('uint8')    
        return img_avg_blur, img_mb, img_eh
    
    def bgr2_h_s_v(self, img):
        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
        h, s, v = cv2.split(hsv)
        return h, s, v

    def hsv_control(self, ch_data, ctr_value, ch_name):
        """
        ch_data: data of channel (h, s, or v) which you want to revise by ctr_value / shape: image.shape[0:2]
        ctr_value: the value that will be added to corresponding channel.
        ch_name: 'h', 's', or 'v'
        """
        ch_data_rev = ch_data.copy()
        if ctr_value > 0:
            ch_data_rev[np.where(ch_data <= 255 - ctr_value)] += ctr_value
        else:
            ch_data_rev[np.where(ch_data + ctr_value >= 0)] -= abs(ctr_value)
        return ch_data_rev
    
    def pre_aug(self, img, phase = 'x160'):  
        """
        The image will be preprocessed and augmented at one go 
        by an entire process consisting of  the repetitive statement (for loop) per the processing phase 
        """
        preprocessed_imgs = []
        preprocessed_nots = []
        
        crop = self.cropping(img)
        if phase == 'crop':
            return [crop], ['_c_-_-_-']
        else:
            sv_ctr_values = [-30, 30]
            c_r1, c_r2, c_r3 = self.rotate(crop, 90), self.rotate(crop, 180), self.rotate(crop, 270)
            for r, r_n in zip([crop, c_r1, c_r2, c_r3], ['-', 'r1', 'r2', 'r3']):
                r_f = np.flipud(r)
                for f,  f_n in zip([r, r_f], ['-', 'f']): 
                    f_ab, f_mb, f_edge = self.blur_and_sharp(f)
                    for b, b_n in zip([f, f_ab, f_mb, f_edge], ['-', 'ab', 'mb', 'eh']):                    
                        preprocessed_imgs.append(b)
                        not_ = '_c_{}_{}_{}'.format(f_n, r_n, b_n)
                        preprocessed_nots.append(not_)
                        h, s, v = self.bgr2_h_s_v(b)
                        for s_value in sv_ctr_values:
                            s_rev = self.hsv_control(s, s_value, ch_name = 's')
                            for v_value in sv_ctr_values:
                                v_rev = self.hsv_control(v, v_value, ch_name = 'v')
                                v_rev[np.where(v <= 7)] = 0
                                b_sv = cv2.merge((h, s_rev, v_rev))
                                b_sv = cv2.cvtColor(b_sv, cv2.COLOR_HSV2BGR)
                                preprocessed_imgs.append(b_sv)
                                not_ = '_c_{}_{}_{}_s_{}_v_{}'.format(f_n, r_n, b_n, s_value, v_value)
                                preprocessed_nots.append(not_) 
                if not phase != 'before_rotation':
                    break
            return preprocessed_imgs, preprocessed_nots
    
    def avg_blur(self, img):
        return cv2.blur(img, (5,5)).astype('uint8')
    
    def motion_blur(self, img):
        kernel_size = 15
        kernel_motion_blur = np.zeros((kernel_size, kernel_size))
        kernel_motion_blur[int((kernel_size-1)/2), :] = np.ones(kernel_size)
        kernel_motion_blur = kernel_motion_blur / kernel_size
        return cv2.filter2D(img, -1, kernel_motion_blur).astype('uint8')
    
    def edge_enhancement(self, img):
        kernel_edge = np.array([[-1,-1,-1,-1,-1],[-1,2,2,2,-1],[-1,2,8,2,-1],[-1,2,2,2,-1],[-1,-1,-1,-1,-1]])/8.0
        return cv2.filter2D(img, -1, kernel_edge).astype('uint8') 
    
    def s_rev(self, img, s_value):
        h, s, v = self.bgr2_h_s_v(img)
        s_rev = self.hsv_control(s, s_value, ch_name = 's')
        
        return [h, s_rev, v]
    def v_rev_after_s_rev(self, s_rev_outputs, v_value):
        h, s_rev, v = s_rev_outputs
        v_rev = self.hsv_control(v, v_value, ch_name = 'v')
        v_rev[np.where(v <= 7)] = 0
        img_sv = cv2.merge((h, s_rev, v_rev))
        return cv2.cvtColor(img_sv, cv2.COLOR_HSV2BGR)
            
    def pre_aug_target_phase(self, img, phase = 'c'): 
        
        """
        phase, ex) 'c_f_-_mb_s_-30_v_30' -> 'c_f_-_mb_s-30_v30' -> ['c', 'f', '-', 's-30','v30']
        It allows to preprocess the image in specific phase, but slower it is fit to check preprocessing with small data
        """
        function = {'': (lambda x: x), '-': (lambda x: x),
                    'c': (lambda x: self.cropping(x)),
                    'f': (lambda x: np.flipud(x)), 
                    'r1': (lambda x: self.rotate(x, 90)), 
                    'r2': (lambda x: self.rotate(x, 180)), 
                    'r3': (lambda x: self.rotate(x, 270)),
                    'ab': (lambda x: self.avg_blur(x)),
                    'mb': (lambda x: self.motion_blur(x)),
                    'eh': (lambda x: self.edge_enhancement(x)),
                    's-30': (lambda x: self.s_rev(x, -30)),
                    's30': (lambda x: self.s_rev(x, 30)),
                    'v-30': (lambda x: self.v_rev_after_s_rev(x, -30)),
                    'v30': (lambda x: self.v_rev_after_s_rev(x, 30))}
        values = ['-30', '30']
        for i in values:
            if i in phase:
                phase = phase.replace('_{}'.format(i), str(i))
        phase_seg = phase.split('_')  
        for i, p in zip(range(len(phase_seg)), phase_seg):
            if i == 0:
                p_img = function[p](img)
            else:
                p_img = function[p](p_img)
        return p_img
    
    def pre_aug_and_save(self, phase, cls, les, filename, preprocessing_phase = 'x160', pre_aug_type = 'for_loop',
                         phase_a = [1, 1, 1], phase_b = True):
        
        """
        phase = 'train', 'test'
        cls: [les]  
          'n': ['neg']
          'h': ['redspot', 'angio', 'active'], 
          'd': ['ero', 'ulc', 'str'],
          'p': ['amp', 'lym', 'tum']}
        preprocessing_phase = 'x160', 'crop', 'before_rotation' for pre_aug
        phase_a = [1, 1, 1], [1, 0, 1], [1, 1, 0] .... [flip, rotate, blur_sharp]
        phase_b = True -> phase_a (max. x32) + phase_a * sv_control (max. x32x4) => max, 32 x 5
        """
        lesions = dict(neg = 'negative', 
                       redspot = 'red_spot', angio = 'angioectasia', active = 'active_bleeding', 
                       ero = 'erosion', ulcer = 'ulcer', str = 'stricture', 
                       amp = 'ampulla_of_vater', lym = 'lymphoid_follicles', tum = 'small_bowel_tumor')
        classes = dict(n = 'negative', h = 'hemorrhagic', d = 'depressed', p = 'protruded')
        
        save_path = os.path.join(self.save_dir, phase, classes[cls], lesions[les])
        import_path = os.path.join(self.data_dir, classes[cls], lesions[les])

        if not(os.path.isdir(save_path)):
            os.makedirs(save_path)
        
        for i, f in zip(range(1, len(filename)+1), filename) :
            img = cv2.imread(import_path + '/' + f)
            if pre_aug_type == 'for_loop':
                p_imgs, p_nots = self.pre_aug(img, phase = preprocessing_phase)  
                for img_, not_ in zip(p_imgs, p_nots):
                    save_filename = os.path.join(save_path, '{}_{}{}'.format(f[:-4], not_, f[-4:]))
                    if not(os.path.isfile(save_filename)):
                        cv2.imwrite(save_filename, img_)
            elif pre_aug_type == 'target_phase':
                for not_ in target_preprocessings(phase_a, phase_b, mode = 'preprocessing'):
                    save_filename = os.path.join(save_path, '{}_{}{}'.format(f[:-4], not_, f[-4:]))
                    if not(os.path.isfile(save_filename)):
                        p_img = self.pre_aug_target_phase(img, phase = not_)
                        cv2.imwrite(save_filename, p_img)
            printProgress(i, len(filename), prefix = '{:05d}'.format(len(filename)))

In [85]:
data_dir = '/mnt/disk2/data/private_data/SMhospital/capsule/0 data/labeled/190520 p3_2/p3_2'
save_dir =  '/mnt/disk2/data/private_data/SMhospital/capsule/1 preprocessed/sm_x160'

ce_pre = ce_preprocessing(data_dir, save_dir)

In [86]:
ce_pre.pre_aug_and_save(phase = 'train', cls = 'n', les = 'neg', filename = neg_1_train_filename,
                        phase_a = [1, 1, 1], phase_b = True, pre_aug_type ='target_phase')

02029 |#################################################################################################### | 100.0% 


In [1373]:
ce_pre.pre_aug_and_save(phase = 'train', cls = 'h', les = 'redspot', filename = redspot_train_filename)
ce_pre.pre_aug_and_save(phase = 'train', cls = 'h', les = 'angio', filename = angio_train_filename)
ce_pre.pre_aug_and_save(phase = 'train', cls = 'h', les = 'active', filename = active_train_filename)
ce_pre.pre_aug_and_save(phase = 'train', cls = 'd', les = 'ero', filename = ero_train_filename)
ce_pre.pre_aug_and_save(phase = 'train', cls = 'd', les = 'ulcer', filename = ulcer_train_filename)
ce_pre.pre_aug_and_save(phase = 'train', cls = 'd', les = 'str', filename = str_train_filename)
ce_pre.pre_aug_and_save(phase = 'train', cls = 'n', les = 'neg', filename = neg_1_train_filename,
                        phase_a = [1, 0, 1], phase_b = False, pre_aug_type ='by_phase')

00581 |#################################################################################################### | 100.0% 
00020 |#################################################################################################### | 100.0% 
00414 |#################################################################################################### | 100.0% 
00801 |#################################################################################################### | 100.0% 
00774 |#################################################################################################### | 100.0% 
00098 |#################################################################################################### | 100.0% 
02029 |#################################################################################################### | 100.0% 


In [1372]:
ce_pre.pre_aug_and_save(phase = 'test', cls = 'h', les = 'redspot', filename = redspot_test_filename)
ce_pre.pre_aug_and_save(phase = 'test', cls = 'h', les = 'angio', filename = angio_test_filename)
ce_pre.pre_aug_and_save(phase = 'test', cls = 'h', les = 'active', filename = active_test_filename)
ce_pre.pre_aug_and_save(phase = 'test', cls = 'd', les = 'ero', filename = ero_test_filename)
ce_pre.pre_aug_and_save(phase = 'test', cls = 'd', les = 'ulcer', filename = ulcer_test_filename)
ce_pre.pre_aug_and_save(phase = 'test', cls = 'd', les = 'str', filename = str_test_filename)
ce_pre.pre_aug_and_save(phase = 'test', cls = 'n', les = 'neg', filename = neg_1_test_filename, 
                        phase_a = [1, 0, 1], phase_b = False, pre_aug_type ='by_phase')

00114 |#################################################################################################### | 100.0% 
00003 |#################################################################################################### | 100.0% 
00122 |#################################################################################################### | 100.0% 
00121 |#################################################################################################### | 100.0% 
00141 |#################################################################################################### | 100.0% 
00031 |#################################################################################################### | 100.0% 
00507 |#################################################################################################### | 100.0% 


In [1376]:
neg_2_dir = '/mnt/disk2/data/private_data/SMhospital/capsule/0 data/labeled/190814 negative/raw'

ce_pre = ce_preprocessing(neg_2_dir, save_dir)
ce_pre.pre_aug_and_save(phase = 'train', cls = 'n', les = 'neg', filename = neg_2_train_filename,
                        phase_a = [1, 1, 0, 1], phase_b = False, pre_aug_type ='by_phase') 
ce_pre.pre_aug_and_save(phase = 'test', cls = 'n', les = 'neg', filename = neg_2_test_filename, 
                        phase_a = [1, 1, 0, 1], phase_b = False, pre_aug_type ='by_phase') 

04328 |#################################################################################################### | 100.0% 
00942 |#################################################################################################### | 100.0% 


In [1185]:
def num_of_jpg_file(dir_):
    print(len(glob.glob(dir_ + '/*.jpg')))

In [1514]:
num_of_jpg_file(save_dir + '/train/negative/negative' )
num_of_jpg_file(save_dir + '/train/hemorrhagic/red_spot' )
num_of_jpg_file(save_dir + '/train/hemorrhagic/angioectasia' )
num_of_jpg_file(save_dir + '/train/hemorrhagic/active_bleeding' )
num_of_jpg_file(save_dir + '/train/depressed/erosion' )
num_of_jpg_file(save_dir + '/train/depressed/ulcer' )
num_of_jpg_file(save_dir + '/train/depressed/stricture' )

50856
92960
3200
66240
128160
123840
15680


In [1396]:
(len(neg_1_train_filename) + len(neg_2_train_filename))*8, len(redspot_train_filename)*160, len(angio_train_filename)*160, len(active_train_filename)*160, len(ero_train_filename)*160, len(ulcer_train_filename)*160, len(str_train_filename)*160

(50856, 92960, 3200, 66240, 128160, 123840, 15680)

In [1515]:
num_of_jpg_file(save_dir + '/test/negative/negative' )
num_of_jpg_file(save_dir + '/test/hemorrhagic/red_spot' )
num_of_jpg_file(save_dir + '/test/hemorrhagic/angioectasia' )
num_of_jpg_file(save_dir + '/test/hemorrhagic/active_bleeding' )
num_of_jpg_file(save_dir + '/test/depressed/erosion' )
num_of_jpg_file(save_dir + '/test/depressed/ulcer' )
num_of_jpg_file(save_dir + '/test/depressed/stricture' )

11592
18240
480
19520
19360
22560
4960


In [1395]:
(len(neg_1_test_filename) + len(neg_2_test_filename))*8, len(redspot_test_filename)*160, len(angio_test_filename)*160, len(active_test_filename)*160, len(ero_test_filename)*160, len(ulcer_test_filename)*160, len(str_test_filename)*160

(11592, 18240, 480, 19520, 19360, 22560, 4960)

# Train / Test with no redundacny of patient

In [3]:
data_dir = '/mnt/disk2/data/private_data/SMhospital/capsule/0 data/labeled'

neg_1_filename = load_filename('p3_2', 'n', 'neg', data = '190520 p3_2', data_dir = data_dir)
neg_2_filename = load_filename('raw', 'n', 'neg', data = '190814 negative', data_dir = data_dir)
redspot_filename = load_filename('p3_2', 'h', 'redspot', data = '190520 p3_2', data_dir = data_dir)
angio_filename = load_filename('p3_2', 'h', 'angio', data = '190520 p3_2', data_dir = data_dir)
active_filename = load_filename('p3_2', 'h', 'active', data = '190520 p3_2', data_dir = data_dir)
ero_filename = load_filename('p3_2', 'd', 'ero', data = '190520 p3_2', data_dir = data_dir)
ulcer_filename = load_filename('p3_2', 'd', 'ulcer', data = '190520 p3_2', data_dir = data_dir)
str_filename = load_filename('p3_2', 'd', 'str', data = '190520 p3_2', data_dir = data_dir)

In [5]:
def train_test_split(filenames, train_rate = 0.8):
    n = len(filenames)
    train_idx = np.random.choice(n, int(n*train_rate), replace = False)
    test_idx = np.setdiff1d(np.arange(n), train_idx)
    return filenames[train_idx], filenames[test_idx]

In [8]:
neg_1_train_filename, neg_1_test_filename = train_test_split(neg_1_filename)
neg_2_train_filename, neg_2_test_filename = train_test_split(neg_2_filename)
redspot_train_filename, redspot_test_filename = train_test_split(redspot_filename)
angio_train_filename, angio_test_filename = train_test_split(angio_filename)
active_train_filename, active_test_filename = train_test_split(active_filename)
ulcer_train_filename, ulcer_test_filename = train_test_split(ulcer_filename)
ero_train_filename, ero_test_filename = train_test_split(ero_filename)
str_train_filename, str_test_filename = train_test_split(str_filename)

In [4]:
def printProgress(iteration, total, prefix = '', suffix = '', decimals = 1, barLength = 100):
    formatStr = "{0:." + str(decimals) + "f}"
    percent = formatStr.format(100 * (iteration / float(total)))
    filledLength = int(round(barLength * iteration / float(total)))
    bar = '#' * filledLength + '-' * (barLength - filledLength)
    sys.stdout.write('\r{} |{} | {}{} {}'.format(prefix, bar, percent, '%', suffix)),
    if iteration == total:
        sys.stdout.write('\n')
    sys.stdout.flush()

def target_preprocessings(phase_a_switch = [1, 1, 1], phase_b_switch = True, mode = 'load'):
    """
    phase_a_switch = [1, 1, 1], [0, 0 ,1], [1, 1, 0].... 
    that means [flip, rotate, blur_sharp]
    """
    phase0 = ['_c']
    phase1 = {1: ['-', 'f'], 0: ['-']}
    phase2 = {1: ['-', 'r1', 'r2', 'r3'], 0: ['-']}
    phase3 = {1: ['-', 'ab', 'mb', 'eh'], 0: ['-']}
    phase4 = ['s_-30_v_30', 's_-30_v_-30', 's_30_v_-30', 's_30_v_30']

    if mode == 'load':
        phase_a_items = [phase1[phase_a_switch[0]], phase2[phase_a_switch[1]], phase3[phase_a_switch[2]]]
    elif mode == 'preprocessing':
        phase_a_items = [phase0, phase1[phase_a_switch[0]], phase2[phase_a_switch[1]], phase3[phase_a_switch[2]]]

    phase_a = []
    for i in list(product(*phase_a_items)):
        phase_a.append('_'.join(i))

    if not phase_b_switch != True:
        phase_b = []
        for i in list(product(*[phase_a, phase4])):
            phase_b.append('_'.join(i))
        return list(np.hstack([phase_a, phase_b]))
    else:
        return phase_a 

In [5]:
class ce_preprocessing:
    def __init__(self, data_dir, save_dir):
        self.data_dir = data_dir
        self.save_dir = save_dir

    def cropping(self, img):
        img = np.array(img, dtype = 'f4')
        img_pre = img[32:544, 32:544, :]
        for i in range(100):
            for j in range(100):
                if i + j > 99:
                    pass
                else :
                    img_pre[i, j, :] = 0
                    img_pre[i, -j, :] = 0
        return img_pre.astype('uint8')

    def rotate(self, img, degree):
        rows, cols = img.shape[:2]
        M = cv2.getRotationMatrix2D(center = (cols/2, rows/2), angle = degree, scale = 1)
        img_rotated = cv2.warpAffine(img, M, dsize = (rows, cols))
        return img_rotated
    
    def blur_and_sharp(self, img):
        img_avg_blur = cv2.blur(img, (5,5)).astype('uint8')
        
        kernel_size = 15
        
        kernel_motion_blur = np.zeros((kernel_size, kernel_size))
        kernel_motion_blur[int((kernel_size-1)/2), :] = np.ones(kernel_size)
        kernel_motion_blur = kernel_motion_blur / kernel_size
        img_mb = cv2.filter2D(img, -1, kernel_motion_blur).astype('uint8')
        
        kernel_edge_enhancement = np.array([[-1,-1,-1,-1,-1],[-1,2,2,2,-1],[-1,2,8,2,-1],[-1,2,2,2,-1],[-1,-1,-1,-1,-1]])/8.0
        img_eh = cv2.filter2D(img, -1, kernel_edge_enhancement).astype('uint8')    
        return img_avg_blur, img_mb, img_eh
    
    def bgr2_h_s_v(self, img):
        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
        h, s, v = cv2.split(hsv)
        return h, s, v

    def hsv_control(self, ch_data, ctr_value, ch_name):
        """
        ch_data: data of channel (h, s, or v) which you want to revise by ctr_value / shape: image.shape[0:2]
        ctr_value: the value that will be added to corresponding channel.
        ch_name: 'h', 's', or 'v'
        """
        ch_data_rev = ch_data.copy()
        if ctr_value > 0:
            ch_data_rev[np.where(ch_data <= 255 - ctr_value)] += ctr_value
        else:
            ch_data_rev[np.where(ch_data + ctr_value >= 0)] -= abs(ctr_value)
        return ch_data_rev
    
    def pre_aug(self, img, phase = 'x160'):  
        """
        The image will be preprocessed and augmented at one go 
        by an entire process consisting of  the repetitive statement (for loop) per the processing phase 
        """
        preprocessed_imgs = []
        preprocessed_nots = []
        
        crop = self.cropping(img)
        if phase == 'crop':
            return [crop], ['_c_-_-_-']
        else:
            sv_ctr_values = [-30, 30]
            c_r1, c_r2, c_r3 = self.rotate(crop, 90), self.rotate(crop, 180), self.rotate(crop, 270)
            for r, r_n in zip([crop, c_r1, c_r2, c_r3], ['-', 'r1', 'r2', 'r3']):
                r_f = np.flipud(r)
                for f,  f_n in zip([r, r_f], ['-', 'f']): 
                    f_ab, f_mb, f_edge = self.blur_and_sharp(f)
                    for b, b_n in zip([f, f_ab, f_mb, f_edge], ['-', 'ab', 'mb', 'eh']):                    
                        preprocessed_imgs.append(b)
                        not_ = '_c_{}_{}_{}'.format(f_n, r_n, b_n)
                        preprocessed_nots.append(not_)
                        h, s, v = self.bgr2_h_s_v(b)
                        for s_value in sv_ctr_values:
                            s_rev = self.hsv_control(s, s_value, ch_name = 's')
                            for v_value in sv_ctr_values:
                                v_rev = self.hsv_control(v, v_value, ch_name = 'v')
                                v_rev[np.where(v <= 7)] = 0
                                b_sv = cv2.merge((h, s_rev, v_rev))
                                b_sv = cv2.cvtColor(b_sv, cv2.COLOR_HSV2BGR)
                                preprocessed_imgs.append(b_sv)
                                not_ = '_c_{}_{}_{}_s_{}_v_{}'.format(f_n, r_n, b_n, s_value, v_value)
                                preprocessed_nots.append(not_) 
                if not phase != 'before_rotation':
                    break
            return preprocessed_imgs, preprocessed_nots
    
    def avg_blur(self, img):
        return cv2.blur(img, (5,5)).astype('uint8')
    
    def motion_blur(self, img):
        kernel_size = 15
        kernel_motion_blur = np.zeros((kernel_size, kernel_size))
        kernel_motion_blur[int((kernel_size-1)/2), :] = np.ones(kernel_size)
        kernel_motion_blur = kernel_motion_blur / kernel_size
        return cv2.filter2D(img, -1, kernel_motion_blur).astype('uint8')
    
    def edge_enhancement(self, img):
        kernel_edge = np.array([[-1,-1,-1,-1,-1],[-1,2,2,2,-1],[-1,2,8,2,-1],[-1,2,2,2,-1],[-1,-1,-1,-1,-1]])/8.0
        return cv2.filter2D(img, -1, kernel_edge).astype('uint8') 
    
    def s_rev(self, img, s_value):
        h, s, v = self.bgr2_h_s_v(img)
        s_rev = self.hsv_control(s, s_value, ch_name = 's')
        return [h, s_rev, v]
    
    def v_rev_after_s_rev(self, s_rev_outputs, v_value):
        h, s_rev, v = s_rev_outputs
        v_rev = self.hsv_control(v, v_value, ch_name = 'v')
        v_rev[np.where(v <= 7)] = 0
        img_sv = cv2.merge((h, s_rev, v_rev))
        return cv2.cvtColor(img_sv, cv2.COLOR_HSV2BGR)
            
    def pre_aug_target_phase(self, img, phase = 'c'): 
        
        """
        phase, ex) 'c_f_-_mb_s_-30_v_30' -> 'c_f_-_mb_s-30_v30' -> ['c', 'f', '-', 's-30','v30']
        It allows to preprocess the image in specific phase, but slower it is fit to check preprocessing with small data
        """
        function = {'': (lambda x: x), '-': (lambda x: x),
                    'c': (lambda x: self.cropping(x)),
                    'f': (lambda x: np.flipud(x)), 
                    'r1': (lambda x: self.rotate(x, 90)), 
                    'r2': (lambda x: self.rotate(x, 180)), 
                    'r3': (lambda x: self.rotate(x, 270)),
                    'ab': (lambda x: self.avg_blur(x)),
                    'mb': (lambda x: self.motion_blur(x)),
                    'eh': (lambda x: self.edge_enhancement(x)),
                    's-30': (lambda x: self.s_rev(x, -30)),
                    's30': (lambda x: self.s_rev(x, 30)),
                    'v-30': (lambda x: self.v_rev_after_s_rev(x, -30)),
                    'v30': (lambda x: self.v_rev_after_s_rev(x, 30))}
        values = ['-30', '30']
        for i in values:
            if i in phase:
                phase = phase.replace('_{}'.format(i), str(i))
        phase_seg = phase.split('_')  
        for i, p in zip(range(len(phase_seg)), phase_seg):
            if i == 0:
                p_img = function[p](img)
            else:
                p_img = function[p](p_img)
        return p_img
    
    def pre_aug_and_save(self, phase, cls, les, filename, preprocessing_phase = 'x160', pre_aug_type = 'for_loop',
                         phase_a = [1, 1, 1], phase_b = True):
        
        """
        phase = 'train', 'test'
        cls: [les]  
          'n': ['neg']
          'h': ['redspot', 'angio', 'active'], 
          'd': ['ero', 'ulc', 'str'],
          'p': ['amp', 'lym', 'tum']}
        preprocessing_phase = 'x160', 'crop', 'before_rotation' for pre_aug
        phase_a = [1, 1, 1], [1, 0, 1], [1, 1, 0] .... [flip, rotate, blur_sharp]
        phase_b = True -> phase_a (max. x32) + phase_a * sv_control (max. x32x4) => max, 32 x 5
        """
        lesions = dict(neg = 'negative', 
                       redspot = 'red_spot', angio = 'angioectasia', active = 'active_bleeding', 
                       ero = 'erosion', ulcer = 'ulcer', str = 'stricture', 
                       amp = 'ampulla_of_vater', lym = 'lymphoid_follicles', tum = 'small_bowel_tumor')
        classes = dict(n = 'negative', h = 'hemorrhagic', d = 'depressed', p = 'protruded')
        
        save_path = os.path.join(self.save_dir, phase, classes[cls], lesions[les])
        import_path = os.path.join(self.data_dir, classes[cls], lesions[les])

        if not(os.path.isdir(save_path)):
            os.makedirs(save_path)
        
        for i, f in zip(range(1, len(filename)+1), filename) :
            img = cv2.imread(import_path + '/' + f)
            if pre_aug_type == 'for_loop':
                p_imgs, p_nots = self.pre_aug(img, phase = preprocessing_phase)  
                for img_, not_ in zip(p_imgs, p_nots):
                    save_filename = os.path.join(save_path, '{}_{}{}'.format(f[:-4], not_, f[-4:]))
                    if not(os.path.isfile(save_filename)):
                        cv2.imwrite(save_filename, img_)
            elif pre_aug_type == 'target_phase':
                for not_ in target_preprocessings(phase_a, phase_b, mode = 'preprocessing'):
                    save_filename = os.path.join(save_path, '{}_{}{}'.format(f[:-4], not_, f[-4:]))
                    if not(os.path.isfile(save_filename)):
                        p_img = self.pre_aug_target_phase(img, phase = not_)
                        cv2.imwrite(save_filename, p_img)
            printProgress(i, len(filename), prefix = '{:05d}'.format(len(filename)))

In [49]:
data_dir = '/mnt/disk2/data/private_data/SMhospital/capsule/0 data/labeled/190520 p3_2/p3_2'
save_dir =  '/mnt/disk2/data/private_data/SMhospital/capsule/1 preprocessed/sm_x160_v2'

ce_pre = ce_preprocessing(data_dir, save_dir)

In [14]:
ce_pre.pre_aug_and_save(phase = 'test', cls = 'h', les = 'redspot', filename = redspot_test_filename)
ce_pre.pre_aug_and_save(phase = 'test', cls = 'h', les = 'angio', filename = angio_test_filename)
ce_pre.pre_aug_and_save(phase = 'test', cls = 'h', les = 'active', filename = active_test_filename)
ce_pre.pre_aug_and_save(phase = 'test', cls = 'd', les = 'ero', filename = ero_test_filename)
ce_pre.pre_aug_and_save(phase = 'test', cls = 'd', les = 'ulcer', filename = ulcer_test_filename)
ce_pre.pre_aug_and_save(phase = 'test', cls = 'd', les = 'str', filename = str_test_filename)
ce_pre.pre_aug_and_save(phase = 'test', cls = 'n', les = 'neg', filename = neg_1_test_filename)

00139 |#################################################################################################### | 100.0% 
00005 |#################################################################################################### | 100.0% 
00108 |#################################################################################################### | 100.0% 
00185 |#################################################################################################### | 100.0% 
00183 |#################################################################################################### | 100.0% 
00026 |#################################################################################################### | 100.0% 
00508 |#################################################################################################### | 100.0% 


In [15]:
ce_pre.pre_aug_and_save(phase = 'train', cls = 'h', les = 'redspot', filename = redspot_train_filename)
ce_pre.pre_aug_and_save(phase = 'train', cls = 'h', les = 'angio', filename = angio_train_filename)
ce_pre.pre_aug_and_save(phase = 'train', cls = 'h', les = 'active', filename = active_train_filename)
ce_pre.pre_aug_and_save(phase = 'train', cls = 'd', les = 'ero', filename = ero_train_filename)
ce_pre.pre_aug_and_save(phase = 'train', cls = 'd', les = 'ulcer', filename = ulcer_train_filename)
ce_pre.pre_aug_and_save(phase = 'train', cls = 'd', les = 'str', filename = str_train_filename)
ce_pre.pre_aug_and_save(phase = 'train', cls = 'n', les = 'neg', filename = neg_1_train_filename)

00556 |#################################################################################################### | 100.0% 
00018 |#################################################################################################### | 100.0% 
00428 |#################################################################################################### | 100.0% 
00737 |#################################################################################################### | 100.0% 
00732 |#################################################################################################### | 100.0% 
00103 |#################################################################################################### | 100.0% 
02028 |####################################################------------------------------------------------ | 52.2% 

KeyboardInterrupt: 

In [50]:
ce_pre.pre_aug_and_save(phase = 'train', cls = 'n', les = 'neg', filename = neg_1_train_filename, pre_aug_type = 'target_phase')

02028 |#################################################################################################### | 100.0% 


In [51]:
neg_2_dir = '/mnt/disk2/data/private_data/SMhospital/capsule/0 data/labeled/190814 negative/raw'

ce_pre = ce_preprocessing(neg_2_dir, save_dir)
ce_pre.pre_aug_and_save(phase = 'train', cls = 'n', les = 'neg', filename = neg_2_train_filename) 
ce_pre.pre_aug_and_save(phase = 'test', cls = 'n', les = 'neg', filename = neg_2_test_filename, phase_a = [0, 0, 0], phase_b = False, 
                        pre_aug_type = 'target_phase') 

04216 |#################################################################################################### | 100.0% 


TypeError: pre_aug_and_save() got an unexpected keyword argument 'prepre_aug_type'

In [52]:
ce_pre.pre_aug_and_save(phase = 'test', cls = 'n', les = 'neg', filename = neg_2_test_filename, phase_a = [0, 0, 0], phase_b = False, 
                        pre_aug_type = 'target_phase') 

01054 |#################################################################################################### | 100.0% 


In [8]:
test_names = []
for i in glob.glob('/mnt/disk2/data/private_data/SMhospital/capsule/1 preprocessed/sm_x160_v2/test/negative/negative/*.jpg'):
    filename = os.path.basename(i)
    origin_fn = filename.split('__c_')[0] + '.jpg'
    
    test_names.append(origin_fn)

In [11]:
test_names = np.unique(test_names)

In [12]:
len(test_names)

1562

In [14]:
test_names

array(['AJ___02-08-16___1015395.jpg', 'AJ___02-09-57___1015598.jpg',
       'AJ___02-10-14___1015632.jpg', ..., '희정___04-45-38___2047831.jpg',
       '희정___05-17-56___2052919.jpg', '희정___05-19-44___2053248.jpg'],
      dtype='<U28')

In [19]:
neg_2_names = os.listdir('/mnt/disk2/data/private_data/SMhospital/capsule/0 data/labeled/190814 negative/raw/negative/negative')

In [20]:
len(neg_2_names)

5270

In [22]:
len(np.intersect1d(test_names, neg_2_names))

1054

In [23]:
neg_2_test_filename = np.intersect1d(test_names, neg_2_names)

In [24]:
neg_2_dir = '/mnt/disk2/data/private_data/SMhospital/capsule/0 data/labeled/190814 negative/raw'
save_dir =  '/mnt/disk2/data/private_data/SMhospital/capsule/1 preprocessed/sm_x160_v2'

ce_pre = ce_preprocessing(neg_2_dir, save_dir)
ce_pre.pre_aug_and_save(phase = 'test', cls = 'n', les = 'neg', filename = neg_2_test_filename, phase_a = [1, 1, 1], phase_b = True, 
                        pre_aug_type = 'target_phase') 

01054 |#################################################################################################### | 100.0% 


In [25]:
def num_of_jpg_file(dir_):
    print(len(glob.glob(dir_ + '/*.jpg')))

In [26]:
num_of_jpg_file(save_dir + '/test/negative/negative' )

249920


In [27]:
249920/160

1562.0

# Check size between original and augmented data

In [20]:
data_dir = '/mnt/disk2/data/private_data/SMhospital/capsule/0 data/labeled'

neg_1_filename = load_filename('p3_2', 'n', 'neg', data = '190520 p3_2', data_dir = data_dir)
neg_2_filename = load_filename('raw', 'n', 'neg', data = '190814 negative', data_dir = data_dir)
redspot_filename = load_filename('p3_2', 'h', 'redspot', data = '190520 p3_2', data_dir = data_dir)
angio_filename = load_filename('p3_2', 'h', 'angio', data = '190520 p3_2', data_dir = data_dir)
active_filename = load_filename('p3_2', 'h', 'active', data = '190520 p3_2', data_dir = data_dir)
ero_filename = load_filename('p3_2', 'd', 'ero', data = '190520 p3_2', data_dir = data_dir)
ulcer_filename = load_filename('p3_2', 'd', 'ulcer', data = '190520 p3_2', data_dir = data_dir)
str_filename = load_filename('p3_2', 'd', 'str', data = '190520 p3_2', data_dir = data_dir)

In [8]:
160*np.array([len(neg_1_filename)+ len(neg_2_filename), len(redspot_filename), len(angio_filename), len(active_filename), len(ero_filename), len(ulcer_filename), len(str_filename)])

array([1248960,  111200,    3680,   85760,  147520,  146400,   20640])

In [13]:
save_dir =  '/mnt/disk2/data/private_data/SMhospital/capsule/1 preprocessed/'

neg_1_filename = load_filename('train', 'n', 'neg', data = 'sm_x160_v2', data_dir = save_dir)
# neg_2_filename = load_filename('train', 'n', 'neg', data = 'sm_x160_v2', data_dir = save_dir)
redspot_filename = load_filename('train', 'h', 'redspot', data = 'sm_x160_v2', data_dir = save_dir)
angio_filename = load_filename('train', 'h', 'angio', data = 'sm_x160_v2', data_dir = save_dir)
active_filename = load_filename('train', 'h', 'active', data = 'sm_x160_v2', data_dir = save_dir)
ero_filename = load_filename('train', 'd', 'ero', data = 'sm_x160_v2', data_dir = save_dir)
ulcer_filename = load_filename('train', 'd', 'ulcer', data = 'sm_x160_v2', data_dir = save_dir)
str_filename = load_filename('train', 'd', 'str', data = 'sm_x160_v2', data_dir = save_dir)

In [15]:
n_train = np.array([len(neg_1_filename), len(redspot_filename), len(angio_filename), len(active_filename), len(ero_filename), len(ulcer_filename), len(str_filename)])
n_train

array([999040,  88960,   2880,  68480, 117920, 117120,  16480])

In [17]:
neg_1_filename = load_filename('test', 'n', 'neg', data = 'sm_x160_v2', data_dir = save_dir)
# neg_2_filename = load_filename('test', 'n', 'neg', data = 'sm_x160_v2', data_dir = save_dir)
redspot_filename = load_filename('test', 'h', 'redspot', data = 'sm_x160_v2', data_dir = save_dir)
angio_filename = load_filename('test', 'h', 'angio', data = 'sm_x160_v2', data_dir = save_dir)
active_filename = load_filename('test', 'h', 'active', data = 'sm_x160_v2', data_dir = save_dir)
ero_filename = load_filename('test', 'd', 'ero', data = 'sm_x160_v2', data_dir = save_dir)
ulcer_filename = load_filename('test', 'd', 'ulcer', data = 'sm_x160_v2', data_dir = save_dir)
str_filename = load_filename('test', 'd', 'str', data = 'sm_x160_v2', data_dir = save_dir)

In [18]:
n_test = np.array([len(neg_1_filename), len(redspot_filename), len(angio_filename), len(active_filename), len(ero_filename), len(ulcer_filename), len(str_filename)])
n_test

array([249920,  22240,    800,  17280,  29600,  29280,   4160])

In [16]:
n_train/160

array([6244.,  556.,   18.,  428.,  737.,  732.,  103.])

In [19]:
n_test/160

array([1562.,  139.,    5.,  108.,  185.,  183.,   26.])

In [22]:
(n_train+n_test)/160

array([7806.,  695.,   23.,  536.,  922.,  915.,  129.])

In [21]:
np.array([len(neg_1_filename)+ len(neg_2_filename), len(redspot_filename), len(angio_filename), len(active_filename), len(ero_filename), len(ulcer_filename), len(str_filename)])

array([7806,  695,   23,  536,  922,  915,  129])