In [2]:
import sys
sys.path.append('C:\\Program Files\\ASAP 2.2\\bin')

import multiresolutionimageinterface as mir
import cv2
from tqdm import tqdm_notebook
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd


In [14]:
def getTissueMask(tifPath):
    maskPath = tifPath.replace('.tif', '_tissue_mask_ds16.npy')
    if (not os.path.isfile(maskPath)): return None
    return np.load(maskPath)

In [3]:
# 샘플(패치) 중심위치 생성 함수
def sample_centers(tissue_mask, mask_downscale=16, sample_side=256, focus_width_percentage=0.25, padding_percentage=0.01):
    # 조직 마스크의 가로 ,세로
    mask_width, mask_height = tissue_mask.shape[:2]

    # 샘플 사이즈
    side = sample_side / mask_downscale

    # 패딩 크기
    padding_width = mask_width*padding_percentage
    padding_height = mask_height*padding_percentage

    # 포커스 영역의 반 너비
    half_focus = int(sample_side*focus_width_percentage / mask_downscale)
    
    # 샘플 중심 좌표를 저장할 리스트
    sample_centers = []
    
    # 조직이 존재하는 영역 기반으로 샘플 중심을 결정
    for i in range(int(mask_width // side)):
        for j in range(int(mask_height // side)):
            for sub_shift in [0, 0.5]:
                x = int((i+sub_shift) * side)
                y = int((j+sub_shift) * side)
                min_x = int(max(0, x - half_focus))
                max_x = int(min(x + half_focus, mask_width - 1))
                min_y = int(max(0, y - half_focus))
                max_y = int(min(y + half_focus, mask_height - 1))
                
                # 패딩 영역의 샘플은 건너뜀
                if(min_x < padding_width or max_x > mask_width-padding_width): continue
                if(min_y < padding_height or max_y > mask_height-padding_height): continue
                
                # 조직이 존재하는 영역만 샘플로 추가
                if(tissue_mask[min_x:max_x, min_y:max_y].sum() > 0):
                    sample_centers.append(np.array([x, y]))
                    
    # 마스크 다운스케일을 복원하여 좌표 계산
    sample_centers = np.array(sample_centers) * mask_downscale
    return sample_centers

In [4]:
# tissue mask 가져오기
def getTissueMask(tifPath):
    maskPath = tifPath.replace('.tif', '_tissue_mask_ds16.npy')
    if (not os.path.isfile(maskPath)): return None
    return np.load(maskPath)

In [5]:
import colorsys

# 패치 내 종양이 있는지 
def isTumor(mask_level_0):
    return (mask_level_0.max() > 0)

# 패치 내 종양의 비율
def tumorPercentage(mask_level_0):
    area = mask_level_0.shape[0] * mask_level_0.shape[1]
    tumorPixels = np.count_nonzero(mask_level_0)
    channels = 3
    return tumorPixels / (area * channels)

# 패치 내 조직의 비율
def tissuePercentage(tissueMask):
    area = tissueMask.shape[0] * tissueMask.shape[1]
    tissuePixels = np.count_nonzero(tissueMask)
    return tissuePixels / area

# 색 평균
def colorMean(tissue):
    (b,g,r,a) = cv2.mean(tissue)
    return np.array([r,g,b])/255

# rgb를 hsv로
def rgb2hsv(rgb):
    return colorsys.rgb_to_hsv(rgb[0], rgb[1], rgb[2])

In [6]:
reader = mir.MultiResolutionImageReader()

# 이미지 불러오기
def getImage(tifPath):
    if (not os.path.isfile(tifPath)): return None
    return reader.open(tifPath)

# mask 파일(종양인 것만) 불러오기
def getAnnoMask(tifPath):
    maskPath = tifPath.replace('.tif', '_mask.tif')
    if (not os.path.isfile(maskPath)): return None
    return reader.open(maskPath)

In [7]:
# x, y center 입력을 받아서 x1, x2, y1, y2로 변환
def center2Bounds(center,ds=1, side=256):
    ''' Get an array of [x1,x2,y1,y2] from a center point in a set downsampling scale
    Args:
        center (int array of x,y): center point coordinate
        ds (int): downsampling scale
        side (int): size of the box side (default 256)
    '''
    assert center.shape[0] == 2, "Invalid center point shape. Got {0} but expected (2,)".format(center.shape)
    half_side = int((side / ds) // 2)
    return np.array([center[1]//ds-half_side,
                    center[1]//ds+half_side,
                    center[0]//ds-half_side,
                    center[0]//ds+half_side], dtype=np.int32)

In [8]:
# 중심 값을 매개변수로 받아 이미지 패치, 조직 마스크, annotation mask를 반환
def getPatchAndMasks(mr_image, mr_mask, tissue_mask,center, side=256):
    patch_bounds = center2Bounds(center)
    mask_bounds = center2Bounds(center, ds=16)
    
    channels = 3
    annoMask = np.zeros((side, side, channels), dtype=np.uint8)

    img = mr_image.getUCharPatch(int(patch_bounds[0]),
                                 int(patch_bounds[2]),
                                 side,
                                 side,
                                 0)
    
    tissueMask = tissue_mask[mask_bounds[2]:mask_bounds[3],mask_bounds[0]:mask_bounds[1]]
    if mr_mask is not None:
        annoMask = mr_mask.getUCharPatch(int(patch_bounds[0]),
                                     int(patch_bounds[2]),
                                     side,
                                     side,
                                     0)
    return img, tissueMask, np.array(annoMask)

In [9]:
dirData = '../data/training/'
ImageFiles = []
# r=root, d=directories, f = files
for r, d, f in os.walk(dirData):
    for file in f:
        if '.tif' in file and 'mask' not in file:
            ImageFiles.append(r + '/' + file)

In [10]:
ImageFiles

['../data/training/center_0/patient_001_node_1.tif',
 '../data/training/center_0/patient_004_node_0.tif',
 '../data/training/center_0/patient_004_node_1.tif',
 '../data/training/center_0/patient_004_node_2.tif',
 '../data/training/center_0/patient_004_node_3.tif',
 '../data/training/center_0/patient_004_node_4.tif']

In [11]:
# 한 wsi 파일을 패치로 쪼개고, 패치 별 annotation을 저장하는 csv파일 생성
def CreateDF(tifPath, overrideExisting=False):
    
    # 파일 명만 저장
    fileNamePart = tifPath.replace('.tif','').replace(dirData, "")
    df_path = '../data/training/dataframes/' + fileNamePart.split('/')[1] + '.csv'
    
    if (os.path.isfile(df_path) and overrideExisting == False):
        print('Info - Dataframe file of {0} already exists - skipping'.format(tifPath))
        return
    
    tissue_mask = getTissueMask(tifPath)
    patch_centers = sample_centers(tissue_mask)

    print("Sliced WSI {1} to {0} pathes.".format(len(patch_centers), tifPath))
    
    # 현재 이미지 파일/ 마스크 파일 불러오기
    mr_image = getImage(tifPath)
    mr_mask = getAnnoMask(tifPath)
    
    df = pd.DataFrame(columns=['patchId',
                               'fileName',
                               'center',
                               'patient',
                               'node',
                               'centerX',
                               'centerY',
                               'isTumor',
                               'tumorPercentage',
                               'tissuePercentage',
                               'meanHue',
                               'meanSaturation',
                               'meanValue'])
    
    # 디렉토리가 다르다면 바꿔야 합니다.
    split = tifPath.split('/')
    cnt = int(split[3].strip('center_'))
    splitpatient = split[4].split('_')
    patient = int(splitpatient[1])
    node = int(splitpatient[3].strip('.tif'))
    
    for c in tqdm_notebook(patch_centers, 'Patches...'):
        img,tissue,anno = getPatchAndMasks(mr_image, mr_mask, tissue_mask, c)
        isTumor_attr = isTumor(anno)
        tumorPrc_attr = tumorPercentage(anno)
        tissuePrc_attr = tissuePercentage(tissue)
        colorMean_attr = colorMean(img)
        (mean_h, mean_s, mean_v) = rgb2hsv(colorMean_attr)
        
        df = df._append({'patchId': str(patient)+str(0)+str(c[0]).zfill(7)+str(c[1]).zfill(7),
                       'fileName': tifPath,
                       'center': cnt,
                      'patient': patient,
                      'node': node,
                      'centerX':c[0],
                      'centerY':c[1],
                      'isTumor':isTumor_attr,
                      'tumorPercentage': int(tumorPrc_attr * 1000)/10,
                      'tissuePercentage': int(tissuePrc_attr * 1000)/10,
                      'meanHue': int(mean_h * 100)/100,
                      'meanSaturation': int(mean_s * 100)/100,
                      'meanValue': int(mean_v * 100)/100}, ignore_index=True)
        
    df.to_csv(df_path)

In [12]:
for f in tqdm_notebook(ImageFiles, 'Creating dataframes...'):
    CreateDF(f)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for f in tqdm_notebook(ImageFiles, 'Creating dataframes...'):


Creating dataframes...:   0%|          | 0/6 [00:00<?, ?it/s]

Info - Dataframe file of ../data/training/center_0/patient_001_node_1.tif already exists - skipping
Sliced WSI ../data/training/center_0/patient_004_node_0.tif to 38723 pathes.


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for c in tqdm_notebook(patch_centers, 'Patches...'):


Patches...:   0%|          | 0/38723 [00:00<?, ?it/s]

  df = df._append({'patchId': str(patient)+str(0)+str(c[0]).zfill(7)+str(c[1]).zfill(7),


Sliced WSI ../data/training/center_0/patient_004_node_1.tif to 12270 pathes.


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for c in tqdm_notebook(patch_centers, 'Patches...'):


Patches...:   0%|          | 0/12270 [00:00<?, ?it/s]

  df = df._append({'patchId': str(patient)+str(0)+str(c[0]).zfill(7)+str(c[1]).zfill(7),


Sliced WSI ../data/training/center_0/patient_004_node_2.tif to 52337 pathes.


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for c in tqdm_notebook(patch_centers, 'Patches...'):


Patches...:   0%|          | 0/52337 [00:00<?, ?it/s]

  df = df._append({'patchId': str(patient)+str(0)+str(c[0]).zfill(7)+str(c[1]).zfill(7),


Sliced WSI ../data/training/center_0/patient_004_node_3.tif to 56626 pathes.


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for c in tqdm_notebook(patch_centers, 'Patches...'):


Patches...:   0%|          | 0/56626 [00:00<?, ?it/s]

  df = df._append({'patchId': str(patient)+str(0)+str(c[0]).zfill(7)+str(c[1]).zfill(7),


Sliced WSI ../data/training/center_0/patient_004_node_4.tif to 60454 pathes.


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for c in tqdm_notebook(patch_centers, 'Patches...'):


Patches...:   0%|          | 0/60454 [00:00<?, ?it/s]

  df = df._append({'patchId': str(patient)+str(0)+str(c[0]).zfill(7)+str(c[1]).zfill(7),


In [13]:
dftest = pd.read_csv('../data/training/dataframes/patient_001_node_1.csv')
dftest.head()

Unnamed: 0.1,Unnamed: 0,patchId,fileName,center,patient,node,centerX,centerY,isTumor,tumorPercentage,tissuePercentage,meanHue,meanSaturation,meanValue
0,0,1001207040038016,../data/training/center_0/patient_001_node_1.tif,0,1,1,120704,38016,False,0.0,32.4,0.73,0.02,0.95
1,1,1001207040038272,../data/training/center_0/patient_001_node_1.tif,0,1,1,120704,38272,False,0.0,42.1,0.71,0.02,0.96
2,2,1001209600037504,../data/training/center_0/patient_001_node_1.tif,0,1,1,120960,37504,False,0.0,45.7,0.71,0.03,0.95
3,3,1001208320037632,../data/training/center_0/patient_001_node_1.tif,0,1,1,120832,37632,False,0.0,26.5,0.7,0.01,0.97
4,4,1001209600037760,../data/training/center_0/patient_001_node_1.tif,0,1,1,120960,37760,False,0.0,94.1,0.73,0.09,0.92
