In [None]:
import pandas as pd
import numpy as np
import math

import os
from tqdm import tqdm
import json
import shutil

# image data
from PIL import Image
# import cv2

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from glob import glob
import matplotlib.pyplot as plt

### 1. GPU setting

In [None]:
# Limit TensorFlow to the first physical GPU when at least one is reported.
gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus) > 0:
    first_gpu = gpus[0]
    try:
        tf.config.experimental.set_visible_devices(first_gpu, 'GPU')
    except RuntimeError as err:
        # Print the message instead of aborting the cell.
        print(err)

In [None]:
import os

# Expose only CUDA device "0" to this process.
# NOTE(review): TensorFlow has already been imported and queried for GPUs in
# earlier cells; presumably this variable only takes effect when set before
# CUDA is initialised - confirm, and consider moving it to the top.
os.environ.update(CUDA_VISIBLE_DEVICES="0")

In [None]:
# Alternative checks, left disabled:
#tf.test.is_gpu_available()
#tf.test.is_built_with_cuda()
# Cell output: the value returned by tf.test.is_built_with_gpu_support().
tf.test.is_built_with_gpu_support()

In [None]:
from tensorflow.compat.v1 import ConfigProto, InteractiveSession

# Create a TF1-compat interactive session with gpu_options.allow_growth
# switched on.
config = ConfigProto()
config.gpu_options.allow_growth = True

session = InteractiveSession(config = config)

### Visualization

In [None]:
# Visualization (mass only): all images of patient 02_01_0001 side by side.
# glob() returns paths in arbitrary order, so sort them to get a stable,
# filename-ordered strip (assumes filename order matches slice order - TODO confirm).
lst = sorted(glob('/home/ncp/workspace/blocks1/kidneyData_windowing_mass/Train_data/ART/02_01_0001_*'))

if not lst:
    # Nothing matched: report it instead of failing with IndexError on lst[0].
    print("no images found for 02_01_0001")
else:
    # Concatenate every image along the width axis in a single call.
    test = np.concatenate([np.array(Image.open(p)) for p in lst], axis = 1)
    plt.figure(figsize = (10, 10))
    plt.imshow(test)

In [None]:
# Visualization (mass only): all images of patient 02_01_0002 side by side.
# glob() returns paths in arbitrary order, so sort them to get a stable,
# filename-ordered strip (assumes filename order matches slice order - TODO confirm).
lst = sorted(glob('/home/ncp/workspace/blocks1/kidneyData_windowing_mass/Train_data/ART/02_01_0002_*'))

if not lst:
    # Nothing matched: report it instead of failing with IndexError on lst[0].
    print("no images found for 02_01_0002")
else:
    # Concatenate every image along the width axis in a single call.
    test = np.concatenate([np.array(Image.open(p)) for p in lst], axis = 1)
    plt.figure(figsize = (10, 10))
    plt.imshow(test)

In [None]:
# Visualization (mass only): all images of patient 02_01_0003 side by side.
# glob() returns paths in arbitrary order, so sort them to get a stable,
# filename-ordered strip (assumes filename order matches slice order - TODO confirm).
lst = sorted(glob('/home/ncp/workspace/blocks1/kidneyData_windowing_mass/Train_data/ART/02_01_0003_*'))

if not lst:
    # Nothing matched: report it instead of failing with IndexError on lst[0].
    print("no images found for 02_01_0003")
else:
    # Concatenate every image along the width axis in a single call.
    test = np.concatenate([np.array(Image.open(p)) for p in lst], axis = 1)
    plt.figure(figsize = (20, 20))
    plt.imshow(test)

In [None]:
# Visualization (mass only): patients 1 to 9, one figure per patient.

for idx in range(1, 10):
    # Sort because glob() order is arbitrary
    # (assumes filename order matches slice order - TODO confirm).
    lst = sorted(glob('/home/ncp/workspace/blocks1/kidneyData_windowing_mass/Train_data/ART/02_01_000{}_*'.format(idx)))

    if not lst:
        # A patient without files must not abort the remaining patients.
        print("patient index: ", idx, "- no images found")
        continue

    # Concatenate every image of this patient along the width axis.
    test = np.concatenate([np.array(Image.open(p)) for p in lst], axis = 1)

    plt.figure(figsize = (20, 20))
    print("patient index: ", idx)
    plt.imshow(test)

In [None]:
# Visualization (mass only): patients 10 to 30, one figure per patient.

for idx in range(10, 31):
    # Sort because glob() order is arbitrary
    # (assumes filename order matches slice order - TODO confirm).
    lst = sorted(glob('/home/ncp/workspace/blocks1/kidneyData_windowing_mass/Train_data/ART/02_01_00{}_*'.format(idx)))

    if not lst:
        # A patient without files must not abort the remaining patients.
        print("patient index: ", idx, "- no images found")
        continue

    # Concatenate every image of this patient along the width axis.
    test = np.concatenate([np.array(Image.open(p)) for p in lst], axis = 1)

    plt.figure(figsize = (20, 20))
    print("patient index: ", idx)
    plt.imshow(test)

### count min max

In [None]:
# Files per patient in Train_data/ART; the first 10 characters of each
# file name are used as the patient id.
train_art_files = os.listdir('/home/ncp/workspace/blocks1/kidneyData_windowing_mass/Train_data/ART/')
patient = pd.DataFrame(train_art_files)
patient['id'] = patient[0].str.slice(0, 10)
counts_per_id = patient.groupby(['id']).count()
counts_per_id.sort_values([0], ascending = False)
# recorded result: min 1
# recorded result: max 22

In [None]:
# Files per patient in Train_data/PRE; the first 10 characters of each
# file name are used as the patient id.
train_pre_files = os.listdir('/home/ncp/workspace/blocks1/kidneyData_windowing_mass/Train_data/PRE/')
patient = pd.DataFrame(train_pre_files)
patient['id'] = patient[0].str.slice(0, 10)
counts_per_id = patient.groupby(['id']).count()
counts_per_id.sort_values([0], ascending = False)
# recorded result: min 1
# recorded result: max 21

In [None]:
# Files per patient in Test_data/ART; the first 10 characters of each
# file name are used as the patient id.
test_art_files = os.listdir('/home/ncp/workspace/blocks1/kidneyData_windowing_mass/Test_data/ART/')
patient = pd.DataFrame(test_art_files)
patient['id'] = patient[0].str.slice(0, 10)
counts_per_id = patient.groupby(['id']).count()
counts_per_id.sort_values([0], ascending = False)
# recorded result: min 2
# recorded result: max 19

In [None]:
# Files per patient in Test_data/PRE; the first 10 characters of each
# file name are used as the patient id.
test_pre_files = os.listdir('/home/ncp/workspace/blocks1/kidneyData_windowing_mass/Test_data/PRE/')
patient = pd.DataFrame(test_pre_files)
patient['id'] = patient[0].str.slice(0, 10)
counts_per_id = patient.groupby(['id']).count()
counts_per_id.sort_values([0], ascending = False)
# recorded result: min 2
# recorded result: max 18

### DCM header check

In [None]:
# DICOM header check: read one file's header and display it.
import pydicom

# Path of the single DICOM file inspected here.
path__ = '/home/ncp/workspace/202002n035/035.신장암 진단을 위한 의료 영상 데이터/01.데이터/신장암2/1.Training/원천데이터/02_01_0001/02_01_0001_ART/0001.dcm'
# stop_before_pixels = True is passed so that only the header is requested.
header_01 = pydicom.dcmread(path__, stop_before_pixels = True)
# Last expression of the cell: show the header.
header_01

In [None]:
# Build a one-row frame from element (0x0008, 0x1030) of header_01.
# NOTE(review): in the DICOM data dictionary (0008,1030) is Study Description,
# not Modality (0008,0060) - confirm the column name is intended.
df = pd.DataFrame(columns = ['name', 'modality'])
element_01 = header_01[0x0008, 0x0001030]
data = [{'name' : '0', 'modality': str(element_01[0:])}]
tmp = pd.DataFrame(data)
tmp

In [None]:
# Collect element (0x0008, 0x1030) from the header of every ART file of every patient.
# NOTE(review): in the DICOM data dictionary (0008,1030) is Study Description,
# not Modality (0008,0060); the 'modality' column name is kept because later
# cells read it - confirm the intended tag.
path = '/home/ncp/workspace/202002n035/035.신장암 진단을 위한 의료 영상 데이터/01.데이터/신장암2/1.Training/원천데이터/'
patient = os.listdir(path)
records = []  # one {'name', 'modality'} dict per successfully read file
lst = []      # patient ids with a missing ART folder or an unreadable file

for ppl in patient:
    deep_path = path + ppl + '/' + '{}_ART'.format(ppl) + '/'
    if not os.path.isdir(deep_path):
        # No ART folder for this entry: record it and keep scanning.
        lst.append(ppl)
        continue
    for file in os.listdir(deep_path):
        try:
            header = pydicom.dcmread(deep_path + file, stop_before_pixels = True)
            records.append({'name' : ppl, 'modality': str(header[0x0008, 0x0001030][0:])})
        except Exception as err:
            # Best effort: keep going, but say what was skipped and why
            # instead of swallowing the error silently.
            print("skipped", deep_path + file, "-", type(err).__name__)
            lst.append(ppl)

# Build the frame once; concatenating inside the loop copies the whole frame
# on every file.
df = pd.DataFrame(records, columns = ['name', 'modality'])

In [None]:
# Display the collected name / modality table.
df

In [None]:
# Distinct values found in the 'modality' column.
df['modality'].unique()

In [None]:
# Number of rows (one row per file read) for each 'modality' value.
df_groupby = df.groupby(['modality']).count().reset_index()
df_groupby

In [None]:
# Distinct entries of lst.
# NOTE(review): lst is reassigned by several other cells (glob results), so
# the result depends on which cell ran last - presumably meant to follow the
# header scan.
set(lst)

In [None]:
# Patients having at least one file whose 'modality' value is one of the two
# "CT Kidney (3P) + 3D" variants.
kidney_values = ['CT Kidney (3P) + 3D', 'CT Kidney (3P) + 3D (contrast)']
is_kidney = df['modality'].isin(kidney_values)
modality_kidney_list = df.loc[is_kidney, 'name'].unique()
len(modality_kidney_list)

In [None]:
# Show the selected patient ids.
modality_kidney_list

In [None]:
# Patients having at least one file whose 'modality' value is the
# 'Pelvis^00_Kidney_3D (Adult)' entry.
pelvis_values = ['Pelvis^00_Kidney_3D (Adult)']
is_pelvis = df['modality'].isin(pelvis_values)
modality_pelvis_list = df.loc[is_pelvis, 'name'].unique()
len(modality_pelvis_list)

In [None]:
# Count the CT images available for each patient in modality_kidney_list.

n_list = []

for ppl in modality_kidney_list:
    pattern = '/home/ncp/workspace/blocks1/kidneyData_windowing/TRAIN/ART/{}_*'.format(ppl)
    lst = sorted(glob(pattern))
    n_images = len(lst)
    n_list.append(n_images)
    print(ppl, "N: ", n_images)

In [None]:
# Count the CT images available for each patient in modality_pelvis_list.
# n_list is re-initialised here, so afterwards it holds only these counts.

n_list = []

for ppl in modality_pelvis_list:
    pattern = '/home/ncp/workspace/blocks1/kidneyData_windowing/TRAIN/ART/{}_*'.format(ppl)
    lst = sorted(glob(pattern))
    n_images = len(lst)
    n_list.append(n_images)
    print(ppl, "N: ", n_images)

In [None]:
# Wrap the per-patient counts in a DataFrame (the counts end up in column 0).
n_list = pd.DataFrame(n_list)

In [None]:
# Summary statistics of the per-patient image counts.
n_list[0].describe()

In [None]:
# Are there patients with 0 images?
# Drop the zero counts, then describe again.

has_images = n_list[0].ne(0)
n_list = n_list[has_images].reset_index(drop = True)
n_list[0].describe()

### 2. Build train and validation datasets

In [None]:
# Run this cell when using the full dataset.
# abnormal : RCC (malignant)
# normal : aml + onco (benign)

ART_train_path = '/home/ncp/workspace/blocks3/zio_code/kidneyData_windowing/TRAIN/ART/'
ART_test_path = '/home/ncp/workspace/blocks3/zio_code/kidneyData_windowing/TEST/ART/'
PRE_train_path = '/home/ncp/workspace/blocks3/zio_code/kidneyData_windowing/TRAIN/PRE/'
PRE_test_path = '/home/ncp/workspace/blocks3/zio_code/kidneyData_windowing/TEST/PRE/'

In [None]:
# Run this cell when using only the mass data - the mass data still has to be
# re-saved after being split into malignant / benign.
# abnormal : RCC (malignant)
# normal : aml + onco (benign)
# These assignments reuse the same four variable names as the full-dataset
# paths, so whichever cell runs last wins.

ART_train_path = '/home/ncp/workspace/blocks1/kidneyData_windowing_mass/Train_data/ART/'
ART_test_path = '/home/ncp/workspace/blocks1/kidneyData_windowing_mass/Test_data/ART/'
PRE_train_path = '/home/ncp/workspace/blocks1/kidneyData_windowing_mass/Train_data/PRE/'
PRE_test_path = '/home/ncp/workspace/blocks1/kidneyData_windowing_mass/Test_data/PRE/'

In [None]:
def process_scan(path):
    """Read one image file and return it as a greyscale NumPy array.

    Parameters
    ----------
    path : str
        Location of an image file that PIL can open.

    Returns
    -------
    numpy.ndarray
        The image converted to PIL mode 'L'.
    """
    # The context manager closes the file once the pixels have been copied;
    # this function is called once per slice, so handles would otherwise pile up.
    with Image.open(path) as image:
        volume = np.array(image.convert('L'))

    return volume

In [None]:
# For the full dataset

def padding_stacking(data_path, cancer_type):
    """Stack each patient's images into a volume and bring it to depth 128.

    Files are read from ``data_path + cancer_type + '/'``. The first 10
    characters of a file name are used as the patient id.

    Parameters
    ----------
    data_path : str
        Directory prefix; it is concatenated directly with ``cancer_type``.
    cancer_type : str
        Sub-directory name holding the images of one class.

    Returns
    -------
    numpy.ndarray
        One volume per patient, stacked along a new first axis. Patients are
        ordered by id and slices by file name. An empty directory yields an
        empty array.

    Notes
    -----
    Every volume starts with one all-zero slice, as the original
    implementation produced, so the depth is ``number of files + 1`` before
    padding or cropping. Images are expected to be 512 x 512 to match that
    zero slice.
    """
    class_dir = data_path + cancer_type + '/'

    # List the directory once and sort it: os.listdir() order is arbitrary,
    # and the slices of a volume must be stacked in a stable order
    # (assumes filename order matches slice order - TODO confirm).
    file_list = sorted(os.listdir(class_dir))

    # Group the file names by patient id.
    files_by_patient = {}
    for file_name in file_list:
        files_by_patient.setdefault(file_name[0:10], []).append(file_name)
    print("total distinct patient N: ", len(files_by_patient))

    ppl_scan_list = []

    # Sorted ids make the sample order reproducible between runs.
    for ppl in sorted(files_by_patient):
        # Gather all slices first and stack once; the leading zero slice is
        # kept for compatibility with the original output.
        slices = [np.zeros((512, 512))]
        slices.extend(process_scan(class_dir + file_name) for file_name in files_by_patient[ppl])
        ppl_scan = np.dstack(slices)

        print("patient id: ", ppl)
        print("ppl_scan shape: ", ppl_scan.shape)

        depth = ppl_scan.shape[2]
        if depth >= 128:
            # Too deep: keep 128 slices, dropping the surplus from both ends
            # (the extra slice of an odd surplus is dropped at the end).
            start = (depth - 128) // 2
            ppl_scan_padded = ppl_scan[:, :, start:start + 128]
        else:
            # Too shallow: zero-pad both ends (the extra slice of an odd
            # deficit goes to the end).
            front = (128 - depth) // 2
            back = 128 - depth - front
            ppl_scan_padded = np.pad(ppl_scan, ((0,0), (0,0), (front, back)), 'constant')
        print("padded ppl_scan shape: ", ppl_scan_padded.shape)

        ppl_scan_list.append(ppl_scan_padded)
        print("---------------------")

    # Stack the per-patient volumes along a new first axis.
    scans_final = np.array(ppl_scan_list)
    print("scans_final shape: ", scans_final.shape)

    return scans_final

In [None]:
# For the mass data - needs modification

def padding_stacking(data_path, cancer_type):
    """Stack each patient's images into a volume and bring it to depth 128.

    This definition reuses the name of the function defined in an earlier
    cell, so it replaces that one when the notebook is run top to bottom.

    Files are read from ``data_path + cancer_type + '/'``. The first 10
    characters of a file name are used as the patient id.

    Parameters
    ----------
    data_path : str
        Directory prefix; it is concatenated directly with ``cancer_type``.
    cancer_type : str
        Sub-directory name holding the images of one class.

    Returns
    -------
    numpy.ndarray
        One volume per patient, stacked along a new first axis. Patients are
        ordered by id and slices by file name. An empty directory yields an
        empty array.

    Notes
    -----
    Every volume starts with one all-zero slice, as the original
    implementation produced, so the depth is ``number of files + 1`` before
    padding or cropping. Images are expected to be 512 x 512 to match that
    zero slice.
    """
    class_dir = data_path + cancer_type + '/'

    # List the directory once and sort it: os.listdir() order is arbitrary,
    # and the slices of a volume must be stacked in a stable order
    # (assumes filename order matches slice order - TODO confirm).
    file_list = sorted(os.listdir(class_dir))

    # Group the file names by patient id.
    files_by_patient = {}
    for file_name in file_list:
        files_by_patient.setdefault(file_name[0:10], []).append(file_name)
    print("total distinct patient N: ", len(files_by_patient))

    ppl_scan_list = []

    # Sorted ids make the sample order reproducible between runs.
    for ppl in sorted(files_by_patient):
        # Gather all slices first and stack once; the leading zero slice is
        # kept for compatibility with the original output.
        slices = [np.zeros((512, 512))]
        slices.extend(process_scan(class_dir + file_name) for file_name in files_by_patient[ppl])
        ppl_scan = np.dstack(slices)

        print("patient id: ", ppl)
        print("ppl_scan shape: ", ppl_scan.shape)

        depth = ppl_scan.shape[2]
        if depth >= 128:
            # Too deep: keep 128 slices, dropping the surplus from both ends
            # (the extra slice of an odd surplus is dropped at the end).
            start = (depth - 128) // 2
            ppl_scan_padded = ppl_scan[:, :, start:start + 128]
        else:
            # Too shallow: zero-pad both ends (the extra slice of an odd
            # deficit goes to the end).
            front = (128 - depth) // 2
            back = 128 - depth - front
            ppl_scan_padded = np.pad(ppl_scan, ((0,0), (0,0), (front, back)), 'constant')
        print("padded ppl_scan shape: ", ppl_scan_padded.shape)

        ppl_scan_list.append(ppl_scan_padded)
        print("---------------------")

    # Stack the per-patient volumes along a new first axis.
    scans_final = np.array(ppl_scan_list)
    print("scans_final shape: ", scans_final.shape)

    return scans_final

ART train : RCC (abnormal)

In [None]:
# Build the ART training volumes from the 'RCC' sub-directory (abnormal class).
ART_train_abnormal_scans = padding_stacking(ART_train_path, 'RCC')

In [None]:
# Shape of the stacked abnormal training volumes.
ART_train_abnormal_scans.shape

ART train: AML + onco (normal)

In [None]:
# Build the ART training volumes from the 'AML + onco' sub-directory (normal class).
ART_train_normal_scans = padding_stacking(ART_train_path, 'AML + onco')

ART test: RCC (abnormal)

In [None]:
# Build the ART test volumes from the 'RCC' sub-directory (abnormal class).
ART_test_abnormal_scans = padding_stacking(ART_test_path, 'RCC')

ART test: AML + onco (normal)

In [None]:
# Build the ART test volumes from the 'AML + onco' sub-directory (normal class).
ART_test_normal_scans = padding_stacking(ART_test_path, 'AML + onco')

PRE train: RCC (abnormal)

In [None]:
# PRE_train_abnormal_scans = padding_stacking(PRE_train_path, 'RCC')

PRE train: AML + onco (normal)

In [None]:
# PRE_train_normal_scans = padding_stacking(PRE_train_path, 'AML + onco')

PRE test: RCC (abnormal)

In [None]:
# PRE_test_abnormal_scans = padding_stacking(PRE_test_path, 'RCC')

PRE test: AML + onco (normal)

In [None]:
# PRE_test_normal_scans = padding_stacking(PRE_test_path, 'AML + onco')

### FINAL DATASETS

In [None]:
# ART: report the shape of each of the four scan arrays.
for caption, scans in [
    ("ART_train_abnormal_scans shape: ", ART_train_abnormal_scans),
    ("ART_train_normal_scans shape: ", ART_train_normal_scans),
    ("ART_test_abnormal_scans shape: ", ART_test_abnormal_scans),
    ("ART_test_normal_scans shape: ", ART_test_normal_scans),
]:
    print(caption, scans.shape)

In [None]:
# save
# np.savetxt only accepts 1-D or 2-D arrays and raises on this multi-dimensional
# scans array, so store it in NumPy's binary .npy format (reload with np.load).

np.save('ART_train_abnormal_scans.npy', ART_train_abnormal_scans)

In [None]:
# # PRE
# print("PRE_train_abnormal_scans shape: ", PRE_train_abnormal_scans.shape)
# print("PRE_train_normal_scans shape: ", PRE_train_normal_scans.shape)
# print("PRE_test_abnormal_scans shape: ", PRE_test_abnormal_scans.shape)
# print("PRE_test_normal_scans shape: ", PRE_test_normal_scans.shape)

In [None]:
# Process ART first and PRE afterwards (otherwise a memory error occurs)

### CREATE LABELS

In [None]:
# Size of the 70 % training share of the normal class.
# Computed from the scans array: the labels array is only created in the
# next cell, so referencing it here raises NameError on a fresh kernel.
round(len(ART_train_normal_scans) * 0.7)

In [None]:
# Labels: 1 for every abnormal (RCC) volume, 0 for every normal volume.
ART_train_abnormal_labels = np.ones(len(ART_train_abnormal_scans), dtype = int)
ART_train_normal_labels = np.zeros(len(ART_train_normal_scans), dtype = int)

# split 7:3 for validation, separately per class
abnormal_train_idx = round(len(ART_train_abnormal_labels) * 0.7)
normal_train_idx = round(len(ART_train_normal_labels) * 0.7)

# Training part: the first 70 % of each class. Scans and labels are sliced
# with the same index so every volume keeps its own label.
x_train = np.concatenate((ART_train_abnormal_scans[:abnormal_train_idx], ART_train_normal_scans[:normal_train_idx]), axis = 0)
y_train = np.concatenate((ART_train_abnormal_labels[:abnormal_train_idx], ART_train_normal_labels[:normal_train_idx]), axis = 0)

# Validation part: the remaining 30 % of each class, disjoint from training.
x_val = np.concatenate((ART_train_abnormal_scans[abnormal_train_idx:], ART_train_normal_scans[normal_train_idx:]), axis = 0)
y_val = np.concatenate((ART_train_abnormal_labels[abnormal_train_idx:], ART_train_normal_labels[normal_train_idx:]), axis = 0)

# Guard against scans and labels drifting apart again.
assert len(x_train) == len(y_train), "train scans/labels length mismatch"
assert len(x_val) == len(y_val), "validation scans/labels length mismatch"

print("Number of samples in train and validation are %d and %d" %(x_train.shape[0], x_val.shape[0]))

### MODEL

In [None]:
def get_model(width = 512, height = 512, depth = 128):
    """Assemble the 3D CNN binary classifier named "3dcnn".

    Parameters
    ----------
    width, height, depth : int
        Spatial extent of one input volume; the input tensor has shape
        ``(width, height, depth, 1)``.

    Returns
    -------
    keras.Model
        Six Conv3D -> MaxPool3D -> BatchNormalization stages, followed by
        global average pooling, a 512-unit dense layer, dropout of 0.3 and a
        single sigmoid output unit. The model is returned uncompiled.
    """
    # (filters, kernel_size) of the Conv3D layer in each stage, in order.
    stages = [(64, 3), (64, 3), (128, 3), (128, 3), (256, 3), (256, 1)]

    inputs = keras.Input((width, height, depth, 1))

    x = inputs
    for n_filters, kernel in stages:
        x = layers.Conv3D(filters = n_filters, kernel_size = kernel, activation = "relu")(x)
        x = layers.MaxPool3D(pool_size = 2)(x)
        x = layers.BatchNormalization()(x)

    # Classification head.
    x = layers.GlobalAveragePooling3D()(x)
    x = layers.Dense(units = 512, activation = "relu")(x)
    x = layers.Dropout(0.3)(x)
    outputs = layers.Dense(units = 1, activation = "sigmoid")(x)

    return keras.Model(inputs, outputs, name = "3dcnn")

In [None]:
# Instantiate the network for 512 x 512 x 128 volumes and print its layer summary.
model = get_model(width = 512, height = 512, depth = 128)
model.summary()

In [None]:
# Adam driven by a staircase exponential-decay learning-rate schedule.
initial_learning_rate = 0.0001
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps = 100000,
    decay_rate = 0.96,
    staircase = True,
)
adam = keras.optimizers.Adam(learning_rate = lr_schedule)

# Binary cross-entropy matches the single sigmoid output; "acc" is the metric
# name the early-stopping callback refers to as "val_acc".
model.compile(optimizer = adam, loss = "binary_crossentropy", metrics = ["acc"])

In [None]:
# Keep only the best model seen so far on disk.
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    "ART_abnormal_3d_image_classification.hdf5", save_best_only = True
)
# Stop when validation accuracy has not improved for 15 epochs.
early_stopping_cb = keras.callbacks.EarlyStopping(monitor = "val_acc", patience = 15)

epochs = 100
# NOTE(review): Keras batches 32 samples by default for array inputs, which is
# presumably far too much memory for 512 x 512 x 128 volumes - tune to the GPU.
batch_size = 2

# Train on the labelled split (x_train / y_train) and validate on (x_val, y_val).
# The previous call passed only the abnormal scans, without any labels, and
# gave validation_data a bare array instead of an (inputs, targets) tuple.
# The trailing np.newaxis makes the channel axis of the model input explicit.
history = model.fit(
        x_train[..., np.newaxis],
        y_train,
        validation_data = (x_val[..., np.newaxis], y_val),
        batch_size = batch_size,
        epochs = epochs,
        shuffle = True,
        verbose = 2,
        callbacks = [checkpoint_cb, early_stopping_cb]
    )