In [1]:
# IPython automagic `cd`: change the kernel's working directory.
cd /home/watts/lal/Kaggle/lung_cancer

/home/watts/lal/medicine-ai/lung_cancer


In [2]:
#from __future__ import print_function, division
import os
import SimpleITK as sitk
import numpy as np
import csv
from glob import glob
import pandas as pd
# tqdm is an optional nicety: fall back to a transparent pass-through when it
# is not installed so the rest of the notebook keeps working.
try:
    from tqdm import tqdm # long waits are not fun
except ImportError:
    # Only a missing package should trigger the fallback; any other error
    # (or a KeyboardInterrupt) must not be silently swallowed.
    print('TQDM does make much nicer wait bars...')

    def tqdm(iterable, *args, **kwargs):
        '''Stand-in for tqdm.tqdm: return the iterable unchanged, ignoring options.'''
        return iterable

In [3]:
# Input / output locations for the LUNA16 subset processed by this notebook.
luna_path = "/home/watts/lal/Kaggle/lung_cancer/data_luna16/"
luna_subset_path = luna_path+"subset0/"
output_path = "/home/watts/lal/Kaggle/lung_cancer/cache/luna16/"
# glob returns files in arbitrary (filesystem) order. The position of a file
# in this list is later used in the saved .npy file names, so sort the list
# to make those names reproducible from run to run.
file_list = sorted(glob(luna_subset_path+"*.mhd"))

In [4]:
def make_mask(center,diam,z,width,height,spacing,origin):
    '''
    Build a 2D binary mask of one nodule for a single axial slice.

    center  : np.array (x, y, z), nodule centre in the same coordinate
              system as `origin` (world coordinates)
    diam    : nodule diameter, in the same units as `spacing`
    z       : z position of the slice in world coordinates
    width, height : pixel dimensions of the slice; the mask is [height, width]
    spacing : np.array (x, y, z), world units per voxel
    origin  : np.array (x, y, z), world coordinates of voxel (0, 0, 0)

    Returns a float array of shape [height, width] holding 1.0 at every pixel
    whose world position (at height z) lies within `diam` of `center`, and
    0.0 elsewhere.
    '''
    mask = np.zeros([height,width]) # 0's everywhere except nodule; [y, x] order to match img

    # Bounding window (in voxel indices) that is guaranteed to contain the
    # nodule, padded generously and clamped to the image.
    v_center = (center-origin)/spacing
    v_diam = int(diam/spacing[0]+5)
    v_xmin = np.max([0,int(v_center[0]-v_diam)-5])
    v_xmax = np.min([width-1,int(v_center[0]+v_diam)+5])
    v_ymin = np.max([0,int(v_center[1]-v_diam)-5])
    v_ymax = np.min([height-1,int(v_center[1]+v_diam)+5])

    # Fill in 1 within sphere around nodule
    for v_y in range(v_ymin,v_ymax+1):
        p_y = spacing[1]*v_y + origin[1]
        for v_x in range(v_xmin,v_xmax+1):
            p_x = spacing[0]*v_x + origin[0]
            # NOTE(review): the threshold is `diam`, i.e. the diameter is used
            # as the sphere radius, giving a mask wider than the nodule.
            # Kept as-is; confirm whether diam/2 was intended.
            if np.linalg.norm(center-np.array([p_x,p_y,z]))<=diam:
                # Index with the voxel indices directly: recomputing them as
                # int((p - origin)/spacing) can truncate to the previous
                # pixel because of floating-point round-off.
                mask[v_y,v_x] = 1.0
    return mask

def matrix2int16(matrix):
    '''
    Linearly rescale a numpy array to the full uint16 range.

    The minimum of `matrix` maps to 0 and the maximum to 65535; values in
    between are scaled proportionally and rounded to the nearest integer.
    A constant matrix (max == min) has no range to stretch and is returned
    as all zeros instead of dividing by zero.

    Returns a np.uint16 array with the same shape as `matrix`.
    '''
    m_min = np.min(matrix)
    m_max = np.max(matrix)
    span = float(m_max-m_min)
    if span == 0:
        return np.zeros(np.shape(matrix), dtype=np.uint16)
    # Subtract the minimum exactly once; subtracting it twice pushed values
    # below zero whenever the minimum was non-zero.
    scaled = (matrix-m_min)/span
    return np.array(np.rint(scaled * 65535.0), dtype=np.uint16)

#####################
#
# Helper used to attach a file path to each annotation row:
# finds the entry of file_list whose path contains the case id.
def get_filename(file_list, case):
    '''
    Return the first element of file_list that contains `case` as a
    substring, or None when no element matches.
    '''
    return next((path for path in file_list if case in path), None)

In [5]:
#
# Nodule annotations: read the CSV, attach to every row the path of the scan
# file whose name contains its seriesuid, then drop rows containing missing
# values (which removes nodules whose scan is not in file_list).
df_node = (
    pd.read_csv(luna_path+"annotations.csv")
    .assign(file=lambda frame: frame["seriesuid"].map(
        lambda series_uid: get_filename(file_list, series_uid)))
    .dropna()
)

In [None]:
# For every scan that has annotated nodules, save a stack of `num_slices`
# axial slices around each nodule together with the matching binary masks.
num_slices = 8  # number of consecutive axial slices kept per nodule

# np.save does not create missing directories, so make sure the cache exists.
if not os.path.isdir(output_path):
    os.makedirs(output_path)

for fcount, img_file in enumerate(tqdm(file_list)):
    mini_df = df_node[df_node["file"]==img_file] #get all nodules associate with file
    if mini_df.shape[0] == 0: # some files may not have a nodule--skipping those
        continue
    # load the data once
    itk_img = sitk.ReadImage(img_file)
    img_array = sitk.GetArrayFromImage(itk_img) # indexes are z,y,x (notice the ordering)
    num_z, height, width = img_array.shape        #heightXwidth constitute the transverse plane
    origin = np.array(itk_img.GetOrigin())      # x,y,z  Origin in world coordinates (mm)
    spacing = np.array(itk_img.GetSpacing())    # spacing of voxels in world coor. (mm)
    # go through all nodules of this scan
    for node_idx, cur_row in mini_df.iterrows():
        node_x = cur_row["coordX"]
        node_y = cur_row["coordY"]
        node_z = cur_row["coordZ"]
        diam = cur_row["diameter_mm"]
        # buffers for the num_slices slices kept for this nodule
        imgs = np.zeros([num_slices,height,width],dtype=np.float32)
        masks = np.zeros([num_slices,height,width],dtype=np.uint8)
        center = np.array([node_x, node_y, node_z])   # nodule center
        v_center = np.rint((center-origin)/spacing)  # nodule center in voxel space (still x,y,z ordering)
        # Slice indices run from (center_z - 1) to (center_z + num_slices - 2).
        # clip prevents going out of bounds in Z; near the first/last slice
        # of the volume this repeats the boundary slice.
        z_start = int(v_center[2])-1
        z_indices = np.arange(z_start, z_start+num_slices).clip(0, num_z-1)
        for i, i_z in enumerate(z_indices):
            mask = make_mask(center, diam, i_z*spacing[2]+origin[2],
                             width, height, spacing, origin)
            masks[i] = mask
            imgs[i] = img_array[i_z]
        # file names: <position of scan in file_list>_<annotation row index>
        np.save(os.path.join(output_path,"images_%04d_%04d.npy" % (fcount, node_idx)),imgs)
        np.save(os.path.join(output_path,"masks_%04d_%04d.npy" % (fcount, node_idx)),masks)

  1%|          | 1/89 [00:00<00:18,  4.68it/s]

(177, 512, 512)
(0, 70)
(1, 71)
(2, 72)
(3, 73)
(4, 74)
(5, 75)
(0, 73)
(1, 74)
(2, 75)
(3, 76)
(4, 77)
(5, 78)
(127, 512, 512)
(0, 108)


  2%|▏         | 2/89 [00:00<00:17,  5.08it/s]

(1, 109)
(2, 110)
(3, 111)
(4, 112)
(5, 113)


  3%|▎         | 3/89 [00:00<00:16,  5.07it/s]

(280, 512, 512)
(0, 213)
(1, 214)
(2, 215)
(3, 216)
(4, 217)
(5, 218)
(157, 512, 512)
(0, 83)
(1, 84)
(2, 85)
(3, 86)
(4, 87)


  6%|▌         | 5/89 [00:00<00:14,  5.62it/s]

(5, 88)
(133, 512, 512)
(0, 80)
(1, 81)
(2, 82)
(3, 83)
(4, 84)
(5, 85)
(250, 512, 512)
(0, 51)
(1, 52)
(2, 53)
(3, 54)
(4, 55)
(5, 56)
(0, 51)
(1, 52)
(2, 53)
(3, 54)
(4, 55)
(5, 56)


  7%|▋         | 6/89 [00:01<00:19,  4.28it/s]

(0, 71)
(1, 72)
(2, 73)
(3, 74)
(4, 75)
(5, 76)
(246, 512, 512)
(0, 91)
(1, 92)
(2, 93)
(3, 94)
(4, 95)
(5, 96)
(0, 89)
(1, 90)
(2, 91)
(3, 92)
(4, 93)
(5, 94)


  9%|▉         | 8/89 [00:03<00:57,  1.40it/s]

(474, 512, 512)
(0, 214)
(1, 215)
(2, 216)
(3, 217)
(4, 218)
(5, 219)
(276, 512, 512)
(0, 198)
(1, 199)
(2, 200)
(3, 201)
(4, 202)
(5, 203)
(0, 102)
(1, 103)
(2, 104)
(3, 105)


 11%|█         | 10/89 [00:04<00:58,  1.35it/s]

(4, 106)
(5, 107)
(0, 211)
(1, 212)
(2, 213)
(3, 214)
(4, 215)
(5, 216)
(125, 512, 512)
(0, 44)
(1, 45)
(2, 46)
(3, 47)
(4, 48)
(5, 49)
(0, 61)
(1, 62)
(2, 63)
(3, 64)
(4, 65)
(5, 66)


 13%|█▎        | 12/89 [00:08<01:32,  1.20s/it]

(483, 512, 512)
(0, 369)
(1, 370)
(2, 371)
(3, 372)
(4, 373)
(5, 374)
(733, 512, 512)
(0, 583)
(1, 584)
(2, 585)
(3, 586)
(4, 587)
(5, 588)
(0, 448)
(1, 449)


 15%|█▍        | 13/89 [00:11<02:12,  1.74s/it]

(2, 450)
(3, 451)
(4, 452)
(5, 453)
(0, 662)
(1, 663)
(2, 664)
(3, 665)
(4, 666)
(5, 667)


 16%|█▌        | 14/89 [00:11<01:46,  1.42s/it]

(139, 512, 512)
(0, 60)
(1, 61)
(2, 62)
(3, 63)
(4, 64)
(5, 65)
(209, 512, 512)
(0, 178)
(1, 179)
(2, 180)
(3, 181)
(4, 182)


 17%|█▋        | 15/89 [00:12<01:36,  1.30s/it]

(5, 183)


 18%|█▊        | 16/89 [00:13<01:22,  1.13s/it]

(176, 512, 512)
(0, 68)
(1, 69)
(2, 70)
(3, 71)
(4, 72)
(5, 73)
(127, 512, 512)
(0, 107)
(1, 108)
(2, 109)
(3, 110)


 19%|█▉        | 17/89 [00:14<01:14,  1.03s/it]

(4, 111)
(5, 112)
(280, 512, 512)
(0, 212)
(1, 213)
(2, 214)
(3, 215)
(4, 216)
(5, 217)
(0, 47)
(1, 48)


 20%|██        | 18/89 [00:15<01:19,  1.12s/it]

(2, 49)
(3, 50)
(4, 51)
(5, 52)


 25%|██▍       | 22/89 [00:17<01:01,  1.08it/s]

(471, 512, 512)
(0, 54)
(1, 55)
(2, 56)
(3, 57)
(4, 58)
(5, 59)


 26%|██▌       | 23/89 [00:18<01:02,  1.05it/s]

(250, 512, 512)
(0, 199)
(1, 200)
(2, 201)
(3, 202)
(4, 203)
(5, 204)
(0, 176)
(1, 177)
(2, 178)
(3, 179)
(4, 180)
(5, 181)


 27%|██▋       | 24/89 [00:19<01:07,  1.04s/it]

(297, 512, 512)
(0, 188)
(1, 189)
(2, 190)
(3, 191)
(4, 192)
(5, 193)


 28%|██▊       | 25/89 [00:20<00:57,  1.11it/s]

(119, 512, 512)
(0, 32)
(1, 33)
(2, 34)
(3, 35)
(4, 36)
(5, 37)


 29%|██▉       | 26/89 [00:21<00:51,  1.22it/s]

(140, 512, 512)
(0, 55)
(1, 56)
(2, 57)
(3, 58)
(4, 59)
(5, 60)
