In [1]:
import pandas as pd
import numpy as np
import math

import os
from tqdm import tqdm
import json
import shutil

# image data
from PIL import Image
# import cv2

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from glob import glob
import matplotlib.pyplot as plt

### 1. GPU setting

In [2]:
# Ask TensorFlow for the list of physical GPU devices it can see.
gpus =tf.config.experimental.list_physical_devices('GPU')
if gpus: 
    # Restrict TensorFlow to using only the first GPU.
    try: 
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
    except RuntimeError as e: 
        # Report the failure and keep going instead of aborting the notebook.
        print(e)

In [3]:
# NOTE(review): `os` is already imported in the first cell; this re-import is redundant.
import os
# Expose only CUDA device "0" to this process.
# NOTE(review): this runs after TensorFlow was imported and queried for GPUs
# in the cell above — confirm the setting still takes effect at this point.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [4]:
# Check whether this TensorFlow build was compiled with GPU support.
# The two commented-out calls are alternative checks that were tried earlier.
#tf.test.is_gpu_available()
#tf.test.is_built_with_cuda()
tf.test.is_built_with_gpu_support()

True

In [5]:
# Open a TF1-compat interactive session configured with gpu_options.allow_growth.
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

config = ConfigProto()
# Enable the allow_growth GPU option on the session config.
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

### 2. Mass extraction

In [43]:
# Root folder of the training labelling data; each entry in it is used below as a case id.
LABEL_path = '/home/ncp/workspace/202002n035/035.신장암 진단을 위한 의료 영상 데이터/01.데이터/신장암2/1.Training/라벨링데이터/'
people = os.listdir(LABEL_path)

In [None]:
# Load and display the ART annotation JSON of a single case (exploration).
# 02_01_0001
ppl = '02_01_0010'
# Annotation files live at <LABEL_path>/<case id>/<case id>_ART.json.
ppl_ARTlabel_path = LABEL_path + ppl + '/' + '{}_ART.json'.format(ppl)
with open(ppl_ARTlabel_path) as f: 
    ART_json = json.load(f)
    
ART_json

In [None]:
# create unique list of mass z 
# Exploratory version for the case currently held in `ppl`.
# NOTE(review): unlike extract_z_of_mass, this cell does not add 1 to each z value.

ppl_ARTlabel_path = LABEL_path + ppl + '/' + '{}_ART.json'.format(ppl)
with open(ppl_ARTlabel_path) as f: 
    ART_json = json.load(f)
    
# A bare expression in the middle of a cell is not displayed; only the last one is.
ART_json

length_left = len(ART_json['roi_list']['left mass'])
length_right = len(ART_json['roi_list']['right mass'])

# Use the right mass only when the left mass has no ROI entries; otherwise use the left one.
if length_left == 0:
    length = length_right
    where = 'right'
else: 
    length = length_left
    where = 'left'
    
# Collect the 'z' value of every ROI entry of the chosen mass.
z_list = []
for i in range(length): 
    z_list.append(ART_json['roi_list']['{} mass'.format(where)][i]['z'])
    
# Deduplicate, then format each z as a zero-padded 4-digit string.
z_list = set(z_list)
z_list_unique = list(z_list)
z_list_unique = ['%04d' % x for x in z_list_unique]
z_list_unique

In [32]:
# Number of ROI entries stored under 'left mass' for the case loaded above.
length_left

17165

In [90]:
# Print each z string on its own line.
# The items are already 4-character strings; with %s the `0` flag does not
# zero-pad (it only applies to numeric conversions), so '%04s' pads with spaces.
for item in z_list_unique: 
    print('%04s' % item)

0019
0020
0021


In [None]:
# List, in sorted order, the ART training images whose path matches '<case id>_*'.
ppl = '02_01_0001'
lst = glob('/home/ncp/workspace/blocks1/kidneyData_windowing/Train_data/ART/{}_*'.format(ppl))
lst.sort()
lst

In [33]:
# create ART_train_label dataframe
# One row per training case: its 10-character case id and its histologic type,
# both read from the case's ART annotation JSON.

LABEL_path = '/home/ncp/workspace/202002n035/035.신장암 진단을 위한 의료 영상 데이터/01.데이터/신장암2/1.Training/라벨링데이터/'
people = os.listdir(LABEL_path)

# Collect plain records and build the DataFrame once at the end; concatenating
# inside the loop re-copies the whole frame on every iteration.
label_records = []
for ppl in tqdm(people):
    ppl_ARTlabel_path = LABEL_path + ppl + '/' + '{}_ART.json'.format(ppl)
    with open(ppl_ARTlabel_path) as f:
        ART_json = json.load(f)
    label_records.append({
        # Only the first 10 characters of the stored case_id are kept.
        'case_id': ART_json['case_id'][0:10],
        # Stored directly in the record so the builtin `type` is not shadowed.
        'histologic type': ART_json['Clinical Information (Global)']['histologic type'],
    })

# Explicit columns keep the schema stable even when no case was found.
ART_train_label = pd.DataFrame(label_records, columns=['case_id', 'histologic type'])

100%|██████████| 320/320 [00:28<00:00, 11.15it/s]


In [34]:
# create ART_val_label dataframe
# One row per validation case: its 10-character case id and its histologic type,
# both read from the case's ART annotation JSON.

LABEL_path = '/home/ncp/workspace/202002n035/035.신장암 진단을 위한 의료 영상 데이터/01.데이터/신장암2/2.Validation/라벨링데이터/'
people = os.listdir(LABEL_path)

# Collect plain records and build the DataFrame once at the end; concatenating
# inside the loop re-copies the whole frame on every iteration.
label_records = []
for ppl in tqdm(people):
    ppl_ARTlabel_path = LABEL_path + ppl + '/' + '{}_ART.json'.format(ppl)
    with open(ppl_ARTlabel_path) as f:
        ART_json = json.load(f)
    label_records.append({
        # Only the first 10 characters of the stored case_id are kept.
        'case_id': ART_json['case_id'][0:10],
        # Stored directly in the record so the builtin `type` is not shadowed.
        'histologic type': ART_json['Clinical Information (Global)']['histologic type'],
    })

# Explicit columns keep the schema stable even when no case was found.
ART_val_label = pd.DataFrame(label_records, columns=['case_id', 'histologic type'])

100%|██████████| 40/40 [00:03<00:00, 10.44it/s]


In [35]:
# create PRE_train_label dataframe
# One row per training case: its 10-character case id and its histologic type,
# both read from the case's PRE annotation JSON.

LABEL_path = '/home/ncp/workspace/202002n035/035.신장암 진단을 위한 의료 영상 데이터/01.데이터/신장암2/1.Training/라벨링데이터/'
people = os.listdir(LABEL_path)

# Collect plain records and build the DataFrame once at the end; concatenating
# inside the loop re-copies the whole frame on every iteration.
label_records = []
for ppl in tqdm(people):
    ppl_PRElabel_path = LABEL_path + ppl + '/' + '{}_PRE.json'.format(ppl)
    with open(ppl_PRElabel_path) as f:
        PRE_json = json.load(f)
    label_records.append({
        # Only the first 10 characters of the stored case_id are kept.
        'case_id': PRE_json['case_id'][0:10],
        # Stored directly in the record so the builtin `type` is not shadowed.
        'histologic type': PRE_json['Clinical Information (Global)']['histologic type'],
    })

# Explicit columns keep the schema stable even when no case was found.
PRE_train_label = pd.DataFrame(label_records, columns=['case_id', 'histologic type'])

100%|██████████| 320/320 [00:27<00:00, 11.69it/s]


In [36]:
# create PRE_val_label dataframe
# One row per validation case: its 10-character case id and its histologic type,
# both read from the case's PRE annotation JSON.

LABEL_path = '/home/ncp/workspace/202002n035/035.신장암 진단을 위한 의료 영상 데이터/01.데이터/신장암2/2.Validation/라벨링데이터/'
people = os.listdir(LABEL_path)

# Collect plain records and build the DataFrame once at the end; concatenating
# inside the loop re-copies the whole frame on every iteration.
label_records = []
for ppl in tqdm(people):
    ppl_PRElabel_path = LABEL_path + ppl + '/' + '{}_PRE.json'.format(ppl)
    with open(ppl_PRElabel_path) as f:
        PRE_json = json.load(f)
    label_records.append({
        # Only the first 10 characters of the stored case_id are kept.
        'case_id': PRE_json['case_id'][0:10],
        # Stored directly in the record so the builtin `type` is not shadowed.
        'histologic type': PRE_json['Clinical Information (Global)']['histologic type'],
    })

# Explicit columns keep the schema stable even when no case was found.
PRE_val_label = pd.DataFrame(label_records, columns=['case_id', 'histologic type'])

100%|██████████| 40/40 [00:03<00:00, 10.53it/s]


In [39]:
# Derive the training label from the histologic type:
#     cc, chr, pp -> RCC
#     AML, onco   -> AML + onco
# (reflects the 07.28 revision) binary classification: RCC vs. AML + onco

def new_label(df):
    """Add a 'label' column to ``df`` in place and return the same frame.

    The column starts as a copy of 'histologic type'; the values 'cc', 'chr'
    and 'pp' become 'RCC', the values 'AML' and 'onco' become 'AML + onco',
    and every other value is left unchanged.
    """
    df['label'] = df['histologic type']

    is_rcc = df['label'].isin(['cc', 'chr', 'pp'])
    df.loc[is_rcc, 'label'] = 'RCC'

    is_benign = df['label'].isin(['AML', 'onco'])
    df.loc[is_benign, 'label'] = 'AML + onco'

    return df

# Add the derived 'label' column to all four label tables.
# new_label modifies the frame it receives and returns that same frame.
ART_train_label = new_label(ART_train_label)
PRE_train_label = new_label(PRE_train_label)
ART_val_label = new_label(ART_val_label)
PRE_val_label = new_label(PRE_val_label)

In [104]:
# Helper returning the distinct z positions annotated for one mass of one case.

def extract_z_of_mass(LABEL_path, ppl, mass_name):
    """Return the distinct z values of ``mass_name`` for case ``ppl``.

    Reads ``<LABEL_path><ppl>/<ppl>_ART.json`` (``LABEL_path`` is joined by
    plain string concatenation, so it must already end with a separator),
    takes the 'z' field of every entry under ``roi_list[mass_name]``, removes
    duplicates, adds 1 to each value and formats it as a zero-padded 4-digit
    string. The order of the returned list is unspecified.
    """
    annotation_file = '{0}{1}/{1}_ART.json'.format(LABEL_path, ppl)
    with open(annotation_file) as handle:
        annotation = json.load(handle)

    distinct_z = {roi['z'] for roi in annotation['roi_list'][mass_name]}

    return ['%04d' % (z_value + 1) for z_value in distinct_z]

In [129]:
# Quick check: does the most recently loaded annotation have a 'left mass' ROI key?
if 'left mass' in ART_json['roi_list'].keys(): 
    print(1)

1


In [106]:
## Find which mass ROIs (possibly several) are annotated for a patient ##

def check_if_multiple_mass(train_test, ppl, label_root=None):
    """Return the names of the mass ROIs annotated for case ``ppl``.

    The names are read from the ``roi_list`` of the case's ART annotation
    JSON, so no hand-maintained per-patient lists are needed. A key counts as
    a mass when it starts with 'left mass' or 'right mass' (this covers
    'left mass', 'left mass 1', 'right mass 3', ...) and has at least one ROI
    entry. Every returned name is a valid ``roi_list`` key.

    Parameters
    ----------
    train_test : str
        'train' reads from the 1.Training labelling folder, 'test' from the
        2.Validation labelling folder.
    ppl : str
        Case id, e.g. '02_01_0041'; also the name of the case's folder.
    label_root : str, optional
        Overrides the labelling folder selected by ``train_test``.

    Returns
    -------
    list of str
        Mass ROI names in the order they appear in the JSON; empty when the
        case has no annotated mass.

    Raises
    ------
    ValueError
        If ``train_test`` is neither 'train' nor 'test'.
    """
    data_root = '/home/ncp/workspace/202002n035/035.신장암 진단을 위한 의료 영상 데이터/01.데이터/신장암2/'
    split_dirs = {
        'train': '1.Training/라벨링데이터/',
        'test': '2.Validation/라벨링데이터/',
    }
    if train_test not in split_dirs:
        raise ValueError("train_test must be 'train' or 'test', got {!r}".format(train_test))
    if label_root is None:
        label_root = data_root + split_dirs[train_test]

    annotation_file = os.path.join(label_root, ppl, '{}_ART.json'.format(ppl))
    with open(annotation_file) as f:
        roi_list = json.load(f)['roi_list']

    # A mass key can be present with an empty ROI list; skip those.
    return [
        name
        for name, rois in roi_list.items()
        if name.startswith(('left mass', 'right mass')) and len(rois) > 0
    ]

In [120]:
# Case numbers formatted as zero-padded 4-digit strings, then displayed.
tr_right_mass3_list = ['%04d' % case_number for case_number in (41, 42, 43)]
tr_right_mass3_list

['0041', '0042', '0043']

In [122]:
# Everything from index 6 onward of a case id string, i.e. its last four characters here.
'02_01_0041'[6:]

'0041'

In [103]:
# Define the function that checks each image and copies the matching ones

def copy_image(img_list, z_list_unique, label_df, old_path, new_path):
    """Copy the images at the given z positions into their label folder.

    For every file name in ``img_list`` whose z field (characters 11-14) is in
    ``z_list_unique``, the case's label is looked up in ``label_df`` and the
    file is copied from ``old_path`` to ``new_path + '<label>/'``. Only cases
    whose single label is 'RCC' or 'AML + onco' are copied.

    File names are assumed to look like '<case id>_<zzzz>...', where the case
    id is 10 characters long — the same layout the notebook relies on when it
    globs '<case id>_*' in the image folder. The case id is therefore taken
    from characters 0-9; the previous slice [4:14] overlapped the z field
    [11:15] and could not match any 'case_id' value.

    Parameters
    ----------
    img_list : list of str
        Image file names (not full paths) found in ``old_path``.
    z_list_unique : iterable of str
        Zero-padded 4-digit z strings to keep.
    label_df : pandas.DataFrame
        Must have 'case_id' and 'label' columns.
    old_path, new_path : str
        Source folder and destination root; both are joined by plain string
        concatenation, so they must end with a path separator.
    """
    wanted_z = set(z_list_unique)  # constant-time membership test inside the loop
    for img in tqdm(img_list):
        if img[11:15] not in wanted_z:
            continue
        case_id = img[0:10]
        label = label_df.loc[label_df['case_id'] == case_id, 'label'].unique().tolist()
        # Copy only when the case maps to exactly one of the two known labels.
        if label in (['RCC'], ['AML + onco']):
            shutil.copy(old_path + img, new_path + label[0] + '/' + img)

In [108]:
# Remove previously generated output folders so the extraction starts clean.
# Guarded with isdir so the cell also runs on a fresh setup where the folders
# do not exist yet (an unguarded shutil.rmtree raises FileNotFoundError then).
for stale_dir in (
    '/home/ncp/workspace/blocks3/kidneyData_windowing_MASS/TRAIN/',
    '/home/ncp/workspace/blocks3/kidneyData_windowing_MASS/TEST/',
):
    if os.path.isdir(stale_dir):
        shutil.rmtree(stale_dir)

In [None]:
# Save the mass-only data to a new location and, at the same time, sort it into per-LABEL folders

# Source folders holding the windowed images, one per phase (ART / PRE / GCCT) and split.
ART_train_path = '/home/ncp/workspace/blocks1/kidneyData_windowing/Train_data/ART/'
PRE_train_path = '/home/ncp/workspace/blocks1/kidneyData_windowing/Train_data/PRE/'
GCCT_train_path = '/home/ncp/workspace/blocks1/kidneyData_windowing/Train_data/GCCT/'
ART_test_path = '/home/ncp/workspace/blocks1/kidneyData_windowing/Test_data/ART/'
PRE_test_path = '/home/ncp/workspace/blocks1/kidneyData_windowing/Test_data/PRE/'
GCCT_test_path = '/home/ncp/workspace/blocks1/kidneyData_windowing/Test_data/GCCT/'

# Destination roots for the copied images; the label sub-folders are created below.
new_ART_train_path = '/home/ncp/workspace/blocks3/kidneyData_windowing_MASS/TRAIN/ART/'
new_PRE_train_path = '/home/ncp/workspace/blocks3/kidneyData_windowing_MASS/TRAIN/PRE/'
new_GCCT_train_path = '/home/ncp/workspace/blocks3/kidneyData_windowing_MASS/TRAIN/GCCT/'
new_ART_test_path = '/home/ncp/workspace/blocks3/kidneyData_windowing_MASS/TEST/ART/'
new_PRE_test_path = '/home/ncp/workspace/blocks3/kidneyData_windowing_MASS/TEST/PRE/'
new_GCCT_test_path = '/home/ncp/workspace/blocks3/kidneyData_windowing_MASS/TEST/GCCT/'

# make dir
def makedir(path):
    """Create directory ``path`` if nothing exists there yet.

    Missing parent directories are created as well. Calling it again for an
    existing path is a no-op, and ``exist_ok=True`` keeps the call safe when
    the directory appears between the existence check and the creation.
    """
    if not os.path.exists(path):
        os.makedirs(path, exist_ok=True)
        
# Root of the output tree: <dir>/<TRAIN|TEST>/<ART|PRE|GCCT>/<RCC|AML + onco>/
dir = '/home/ncp/workspace/blocks3/kidneyData_windowing_MASS/'

# Split folders first, so every nested folder below already has its parent.
for split_name in ("TRAIN", "TEST"):
    makedir(os.path.join(dir, split_name))

# Then one folder per phase and split, each holding one sub-folder per label.
for phase_name in ("ART", "PRE", "GCCT"):
    for split_name in ("TRAIN", "TEST"):
        phase_dir = os.path.join(dir, split_name, phase_name)
        makedir(phase_dir)
        for class_name in ("RCC", "AML + onco"):
            makedir(os.path.join(phase_dir, class_name))

# create img name list
# File names (not full paths) found in each source image folder.
art_train_list = os.listdir(ART_train_path)
pre_train_list = os.listdir(PRE_train_path)
art_test_list = os.listdir(ART_test_path)
pre_test_list = os.listdir(PRE_test_path)
gcct_train_list = os.listdir(GCCT_train_path)
gcct_test_list = os.listdir(GCCT_test_path)


# Copy the mass slices of every case, for each phase (ART / PRE / GCCT) and split.
train_LABEL_path = '/home/ncp/workspace/202002n035/035.신장암 진단을 위한 의료 영상 데이터/01.데이터/신장암2/1.Training/라벨링데이터/'
val_LABEL_path = '/home/ncp/workspace/202002n035/035.신장암 진단을 위한 의료 영상 데이터/01.데이터/신장암2/2.Validation/라벨링데이터/'

# (split, labelling folder, image names, label table, source folder, destination root)
# NOTE(review): the GCCT jobs use the PRE label tables, as the original cell did —
# confirm that this is intended.
copy_jobs = [
    ('train', train_LABEL_path, art_train_list, ART_train_label, ART_train_path, new_ART_train_path),    # ART train
    ('train', train_LABEL_path, pre_train_list, PRE_train_label, PRE_train_path, new_PRE_train_path),    # PRE train
    ('train', train_LABEL_path, gcct_train_list, PRE_train_label, GCCT_train_path, new_GCCT_train_path), # GCCT train
    ('test', val_LABEL_path, art_test_list, ART_val_label, ART_test_path, new_ART_test_path),            # ART test
    ('test', val_LABEL_path, pre_test_list, PRE_val_label, PRE_test_path, new_PRE_test_path),            # PRE test
    ('test', val_LABEL_path, gcct_test_list, PRE_val_label, GCCT_test_path, new_GCCT_test_path),         # GCCT test
]

for train_test, LABEL_path, img_list, label_df, old_path, new_path in copy_jobs:
    people = os.listdir(LABEL_path)
    for ppl in people:
        # Treat a missing result as "no mass annotated" instead of failing on iteration.
        mass_list = check_if_multiple_mass(train_test, ppl) or []

        # A case's z positions must only select that case's own images. Passing
        # the whole folder listing would also copy slices of every other case
        # that happen to share the same z index. Image names start with
        # '<case id>_' (the layout the notebook globs for in the image folder).
        ppl_img_list = [img for img in img_list if img.startswith(ppl + '_')]

        for mass_name in mass_list:
            z_list_unique = extract_z_of_mass(LABEL_path, ppl, mass_name)
            copy_image(ppl_img_list, z_list_unique, label_df, old_path, new_path)

In [111]:
# Display the case id currently held in `ppl` (used to see where a loop stopped).
ppl

'02_01_0108'

In [123]:
# Spot-check the mass lookup for one training case and display the result.
mass_list = check_if_multiple_mass('train', '02_01_0041')
mass_list

['left mass']

In [63]:
# Re-run the ART training copy for the single case currently held in `ppl`.
# The loop sits at top level: indenting it under the assignment is an IndentationError.
mass_list = check_if_multiple_mass('train', ppl) or []
for mass_name in mass_list:
    z_list_unique = extract_z_of_mass(LABEL_path, ppl, mass_name)
    copy_image(art_train_list, z_list_unique, ART_train_label, ART_train_path, new_ART_train_path)

1


In [81]:
# Display the case id currently held in `ppl`.
ppl

'02_01_0041'

In [None]:
# Print the 'Clinical Information (Local)' entry of every training case's ART annotation.
LABEL_path = '/home/ncp/workspace/202002n035/035.신장암 진단을 위한 의료 영상 데이터/01.데이터/신장암2/1.Training/라벨링데이터/'
people = os.listdir(LABEL_path) 

for ppl in people: 
    # Annotation files live at <LABEL_path>/<case id>/<case id>_ART.json.
    ppl_ARTlabel_path = LABEL_path + ppl + '/' + '{}_ART.json'.format(ppl)
    with open(ppl_ARTlabel_path) as f: 
        ART_json = json.load(f)
        
    print(ART_json['Clinical Information (Local)'])

In [85]:
# Display the case id currently held in `ppl` (the last one the loop above reached).
ppl

'02_01_0303'