In [1]:
import numpy as np
import pickle
import pandas as pd
from skimage import measure
import datetime
import os
from tqdm import tqdm
from time import strftime
# from sklearn import cross_validation
# from sklearn.cross_validation import StratifiedKFold as KFold
# from sklearn.metrics import classification_report
# from sklearn.ensemble import RandomForestClassifier as RF
# import xgboost as xgb


In [2]:
nodule_path = "/home/watts/lal/Kaggle/lung_cancer/cache/predictions/"
working_path = "/home/watts/lal/Kaggle/lung_cancer/"

In [3]:
def check_if_image_exists(fname):
    fname = os.path.join(working_path+'data/stage1/stage1/', fname)
    return os.path.exists(fname)

def check_if_scan_exists(folder):
    folder = os.path.join(working_path+'data/stage1/stage1/', folder)
    return os.path.isdir(folder)

def check_if_scan_exists2(folder):
    folder = os.path.join(working_path+'data/stage2/', folder)
    return os.path.isdir(folder)

def get_current_date():
    return strftime('%Y%m%d')

In [4]:
num_slices = 16
img_width = 128
img_height = 128

In [5]:
def getRegionFromMap(slice_npy):
    thr = np.where(slice_npy > np.mean(slice_npy),0.,1.0)
    # print thr
    label_image = measure.label(thr)
    labels = label_image.astype(int)
    regions = measure.regionprops(labels)
    return regions

def getRegionMetricRow(fname = "nodules.npy"):
    # fname, numpy array of dimension [#slices, 1, 512, 512] containing the images
    seg = np.load(fname)
    nslices = seg.shape[0]

    #metrics
    totalArea = 0.
    avgArea = 0.
    maxArea = 0.
    avgEcc = 0.
    avgEquivlentDiameter = 0.
    stdEquivlentDiameter = 0.
    weightedX = 0.
    weightedY = 0.
    numNodes = 0.
    numNodesperSlice = 0.
    # crude hueristic to filter some bad segmentaitons
    # do not allow any nodes to be larger than 10% of the pixels to eliminate background regions
    maxAllowedArea = 0.10 * 512 * 512

    areas = []

    eqDiameters = []
    for slicen in range(nslices):
        regions = getRegionFromMap(seg[slicen,0,:,:])
        for region in regions:
            if region.area > maxAllowedArea:
                continue
            totalArea += region.area
            areas.append(region.area)
            avgEcc += region.eccentricity
            avgEquivlentDiameter += region.equivalent_diameter
            eqDiameters.append(region.equivalent_diameter)
            weightedX += region.centroid[0]*region.area
            weightedY += region.centroid[1]*region.area
            numNodes += 1

    weightedX = weightedX / totalArea
    weightedY = weightedY / totalArea
    avgArea = totalArea / numNodes

    avgEcc = avgEcc / numNodes
    avgEquivlentDiameter = avgEquivlentDiameter / numNodes
    stdEquivlentDiameter = np.std(eqDiameters)

    maxArea = max(areas)


    numNodesperSlice = numNodes*1. / nslices


    return np.array([avgArea,maxArea,avgEcc,avgEquivlentDiameter,\
                     stdEquivlentDiameter, weightedX, weightedY, numNodes, numNodesperSlice])

def getRegionMetricRow2(fname = "nodules.npy"):
    # fname, numpy array of dimension [1, #slices, 128, 128] containing the images
    seg = np.load(fname)
    nslices = seg.shape[1]

    #print nslices
    #metrics
    totalArea = 0.
    avgArea = 0.
    maxArea = 0.
    avgEcc = 0.
    avgEquivlentDiameter = 0.
    stdEquivlentDiameter = 0.
    weightedX = 0.
    weightedY = 0.
    numNodes = 0.
    numNodesperSlice = 0.
    # crude hueristic to filter some bad segmentaitons
    # do not allow any nodes to be larger than 10% of the pixels to eliminate background regions
    #maxAllowedArea = 0.10 * 128 * 128
    maxAllowedArea = img_width * img_height

    areas = []

    eqDiameters = []
    for slicen in range(nslices):
        #regions = getRegionFromMap(seg[slicen,0,:,:])
        regions = getRegionFromMap(seg[0,slicen,:,:])
        for region in regions:
            if region.area > maxAllowedArea:
                #print region.area, maxAllowedArea
                continue
            
            
            totalArea += region.area
            areas.append(region.area)
            avgEcc += region.eccentricity
            avgEquivlentDiameter += region.equivalent_diameter
            eqDiameters.append(region.equivalent_diameter)
            weightedX += region.centroid[0]*region.area
            weightedY += region.centroid[1]*region.area
            numNodes += 1

    weightedX = weightedX / totalArea
    weightedY = weightedY / totalArea
    avgArea = totalArea / numNodes

    avgEcc = avgEcc / numNodes
    avgEquivlentDiameter = avgEquivlentDiameter / numNodes
    stdEquivlentDiameter = np.std(eqDiameters)

    maxArea = max(areas)


    numNodesperSlice = numNodes*1. / nslices


    return avgArea,maxArea,avgEcc,avgEquivlentDiameter,\
                     stdEquivlentDiameter, weightedX, weightedY, numNodes, numNodesperSlice



In [6]:
def createFeatureDataset(nodfiles=None):
    if nodfiles == None:
        # directory of numpy arrays containing masks for nodules
        # found via unet segmentation
        noddir = "/training_set/"
        nodfiles = glob(noddir +"*npy")
    # dict with mapping between training examples and true labels
    # the training set is the output masks from the unet segmentation
    truthdata = pickle.load(open("truthdict.pkl",'r'))
    numfeatures = 9
    feature_array = np.zeros((len(nodfiles),numfeatures))
    truth_metric = np.zeros((len(nodfiles)))

    for i,nodfile in enumerate(nodfiles):
        patID = nodfile.split("_")[2]
        truth_metric[i] = truthdata[int(patID)]
        feature_array[i] = getRegionMetricRow(nodfile)

    np.save("dataY.npy", truth_metric)
    np.save("dataX.npy", feature_array)

def createFeatureDataset2(nodfiles=None):
    if nodfiles == None:
        # directory of numpy arrays containing masks for nodules
        # found via unet segmentation
        noddir = "/training_set/"
        nodfiles = glob(noddir +"*npy")
    # dict with mapping between training examples and true labels
    # the training set is the output masks from the unet segmentation
    truthdata = pickle.load(open("truthdict.pkl",'r'))
    numfeatures = 9
    feature_array = np.zeros((len(nodfiles),numfeatures))
    truth_metric = np.zeros((len(nodfiles)))

    for i,nodfile in enumerate(nodfiles):
        patID = nodfile.split("_")[2]
        truth_metric[i] = truthdata[int(patID)]
        feature_array[i] = getRegionMetricRow(nodfile)

    np.save("dataY.npy", truth_metric)
    np.save("dataX.npy", feature_array)


In [9]:
df = pd.read_csv(working_path+'data/stage1/stage1_labels_all.csv')

df['scan_folder'] = df['id']

df['exist'] = df['scan_folder'].apply(check_if_scan_exists)

print '%i does not exists' % (len(df) - df['exist'].sum())
print df[~df['exist']]

df = df[df['exist']]
df = df.reset_index(drop=True)

0 does not exists
Empty DataFrame
Columns: [id, cancer, scan_folder, exist]
Index: []


In [10]:
data = []
IMG_PX_SIZE = img_width
IMG_PX_SIZE_ORG = 512
HM_SLICES = num_slices
for i, row in tqdm(df.iterrows(), total=len(df)):
#     if i != 0:
#         continue
    scan_folder = row['scan_folder']
    # X_nodule_fname = nodule_path+'X_nodule_%s_%s_%s_%s.npy' % (scan_folder, HM_SLICES, IMG_PX_SIZE, IMG_PX_SIZE)
    X_nodule_fname = nodule_path+'stage1/%s.npy' % scan_folder
    avgArea,maxArea,avgEcc,avgEquivlentDiameter,\
                     stdEquivlentDiameter, weightedX, weightedY, numNodes, numNodesperSlice \
    = getRegionMetricRow2(X_nodule_fname)

    cancer = row['cancer']
    t = {'scan_folder': scan_folder,
         'avgArea': avgArea, 
         'maxArea':maxArea, 
         'avgEcc': avgEcc, 
         'avgEquivlentDiameter': avgEquivlentDiameter,
         'stdEquivlentDiameter': stdEquivlentDiameter,
         'weightedX': weightedX,
         'weightedY': weightedY,
         'numNodes': numNodes,
         'numNodesperSlice': numNodesperSlice,
         'output': cancer
        }
    data.append(t)
df = pd.DataFrame(data)
train_fname = working_path+'cache/my_train_%d_%d_%d_%s.csv' % (num_slices, img_width, img_height, get_current_date())
df.to_csv(train_fname, sep=',', index_label = 'id')
print 'Done'
now = datetime.datetime.now()
print now

  0%|          | 0/1595 [00:00<?, ?it/s]


IOError: [Errno 2] No such file or directory: '/home/watts/lal/Kaggle/lung_cancer/cache/predictionsstage1/0015ceb851d7251b8f399e39779d1e7d.npy'

In [48]:
df.head(10)

Unnamed: 0,avgArea,avgEcc,avgEquivlentDiameter,maxArea,numNodes,numNodesperSlice,output,scan_folder,stdEquivlentDiameter,weightedX,weightedY
0,16380.0,0.010361,144.414901,16382,16.0,1.0,1,0015ceb851d7251b8f399e39779d1e7d,0.010221,63.498569,63.490381
1,16380.0625,0.020174,144.415177,16382,16.0,1.0,0,0030a160d58723ff36d73f41b170ec21,0.006698,63.49155,63.496369
2,16379.0625,0.020743,144.410768,16381,16.0,1.0,0,003f41c78e6acfa92430a057ac0b306e,0.008744,63.488169,63.496865
3,16378.3125,0.017675,144.407462,16381,16.0,1.0,1,006b96310a37b36cccb2ab48d10b49a3,0.011424,63.485287,63.494484
4,16380.125,0.017509,144.415452,16381,16.0,1.0,1,008464bb8521d09a42985dd8add3d0d2,0.003068,63.493223,63.499016
5,16377.4375,0.021087,144.403604,16380,16.0,1.0,0,0092c13f9e00a3717fdc940641f00015,0.010453,63.487198,63.497811
6,16379.5,0.02511,144.412697,16380,16.0,1.0,0,00986bebc45e12038ef0ce3e9962b51a,0.003117,63.488877,63.50042
7,16377.3125,0.018855,144.403052,16381,16.0,1.0,0,00cba091fa4ad62cc3200a657aeb957e,0.020128,63.476666,63.492133
8,16379.5625,0.015926,144.412972,16382,16.0,1.0,1,00edff4f51a893d80dae2d42a7f45ad1,0.009217,63.484777,63.4988
9,16379.25,0.017188,144.411595,16382,16.0,1.0,0,0121c2845f2b7df060945b072b2515d7,0.009671,63.485504,63.498069


In [49]:
df = pd.read_csv(working_path+'data/stage2_sample_submission.csv')

df['scan_folder'] = df['id']

df['exist'] = df['scan_folder'].apply(check_if_scan_exists2)

print '%i does not exists' % (len(df) - df['exist'].sum())
print df[~df['exist']]

df = df[df['exist']]
df = df.reset_index(drop=True)

0 does not exists
Empty DataFrame
Columns: [id, cancer, scan_folder, exist]
Index: []


In [50]:
data = []
IMG_PX_SIZE = img_width
IMG_PX_SIZE_ORG = 512
HM_SLICES = num_slices
for i, row in tqdm(df.iterrows(), total=len(df)):
#     if i != 0:
#         continue
    scan_folder = row['scan_folder']
    #X_nodule_fname = nodule_path+'X_test_nodule_%s_%s_%s_%s.npy' % (scan_folder, HM_SLICES, IMG_PX_SIZE, IMG_PX_SIZE)
    X_nodule_fname = nodule_path+'stage2/%s.npy' % scan_folder
    avgArea,maxArea,avgEcc,avgEquivlentDiameter,\
                     stdEquivlentDiameter, weightedX, weightedY, numNodes, numNodesperSlice \
    = getRegionMetricRow2(X_nodule_fname)

    #cancer = row['cancer']
    t = {'scan_folder': scan_folder,
         'avgArea': avgArea, 
         'maxArea':maxArea, 
         'avgEcc': avgEcc, 
         'avgEquivlentDiameter': avgEquivlentDiameter,
         'stdEquivlentDiameter': stdEquivlentDiameter,
         'weightedX': weightedX,
         'weightedY': weightedY,
         'numNodes': numNodes,
         'numNodesperSlice': numNodesperSlice
        }
    data.append(t)
df = pd.DataFrame(data)
test_fname = working_path+'cache/my_test_%d_%d_%d_%s.csv' % (num_slices, img_width, img_height, get_current_date())
df.to_csv(test_fname, sep=',', index_label = 'id')
print 'Done'
now = datetime.datetime.now()
print now

  from ipykernel import kernelapp as app
100%|██████████| 198/198 [00:05<00:00, 38.08it/s]

Done
2017-04-08 05:47:44.369109





In [51]:
df.head(20)

Unnamed: 0,avgArea,avgEcc,avgEquivlentDiameter,maxArea,numNodes,numNodesperSlice,scan_folder,stdEquivlentDiameter,weightedX,weightedY
0,16377.1875,0.030918,144.402502,16380,16.0,1.0,026470d51482c93efc18b9803159c960,0.009762,63.480962,63.503078
1,16377.0625,0.030894,144.401951,16380,16.0,1.0,031b7ec4fe96a3b035a8196264a8c8c3,0.010277,63.480001,63.503089
2,16377.8125,0.023824,144.405258,16380,16.0,1.0,03bd22ed5858039af223c04993e9eb22,0.009636,63.489134,63.498296
3,16379.6875,0.023099,144.413524,16381,16.0,1.0,06a90409e4fcea3e634748b967993531,0.007095,63.484296,63.503558
4,16367.8125,0.013923,144.361039,16384,16.0,1.0,07b1defcfae5873ee1f03c90255eb170,0.190909,63.536327,63.489085
5,16377.125,0.030578,144.402227,16379,16.0,1.0,0b20184e0cd497028bdd155d9fb42dc9,0.00823,63.495653,63.498058
6,16377.0625,0.030004,144.401951,16380,16.0,1.0,12db1ea8336eafaf7f9e3eda2b4e4fef,0.010277,63.489637,63.500204
7,16377.3125,0.019936,144.403053,16380,16.0,1.0,159bc8821a2dc39a1e770cb3559e098d,0.014091,63.477585,63.500719
8,16377.75,0.024155,144.404981,16382,16.0,1.0,174c5f7c33ca31443208ef873b9477e5,0.018543,63.478351,63.489525
9,16380.1875,0.01061,144.415728,16382,16.0,1.0,1753250dab5fc81bab8280df13309733,0.009117,63.486224,63.499758
