# The nucleus challenge

## Read the data

In [None]:
import os.path

import pathlib
import imageio
import numpy as np

In [None]:
# Switch for the final run: True processes all test images, False only the first four.
FULL_ANALYSIS = False

In [None]:
# Collect the training image paths and keep them in sorted order.
training_paths = pathlib.Path('data/data-science-bowl-2018/stage1_train').glob('*/images/*.png')
training_sorted = sorted(training_paths)
print('Found', len(training_sorted), 'training images')
print(*training_sorted[:2], sep='\n')

## Plot the data

In [None]:
import matplotlib.pyplot as plt


def show_images(ims, cmap=None, labels=None):
    """Draw the given images next to each other in a single row.

    ims: sequence of image arrays that ``plt.imshow`` can display.
    cmap: colormap forwarded to ``plt.imshow``.
    labels: optional titles, exactly one per image; without them each
        panel is titled 'Image <index>'.
    """
    n_images = len(ims)
    plt.figure(figsize=(3 * n_images, 10))
    if labels is not None:
        assert len(labels) == n_images, 'provide exactly one label per image'
    for position, image in enumerate(ims, start=1):
        plt.subplot(1, n_images, position)
        plt.imshow(image, cmap=cmap)
        if labels is not None:
            plt.title(labels[position - 1])
        else:
            plt.title('Image ' + str(position - 1))

    plt.tight_layout()
    plt.show()

In [None]:
# Load the first four training images and show them unprocessed.
ims = [imageio.imread(str(p)) for p in training_sorted[:4]]
show_images(ims)

## Clean the data

### Coerce to gray scale

In [None]:
from skimage.color import rgb2gray

# Two sample images kept around for testing the following steps
im = imageio.imread(str(training_sorted[0]))
im2 = imageio.imread(str(training_sorted[2]))

# Compare the shape before and after coercing to grayscale
print('Original image shape: {}'.format(im.shape))
print('New image shape: {}'.format(rgb2gray(im).shape))

# Show the first four training images in grayscale
ims = [rgb2gray(imageio.imread(str(p))) for p in training_sorted[:4]]
show_images(ims, cmap='gray')

### Remove background

#### Analyze distribution of pixel intensity

In [None]:
def show_pixel_values(ims):
    """Plot one pixel-intensity histogram per image, side by side.

    ims: sequence of grayscale image arrays. The x-axis is fixed to
        [0, 1] and the y-axis (pixel count) is logarithmic.
    """
    n_images = len(ims)
    plt.figure(figsize=(2 * n_images, 2))
    for idx, im in enumerate(ims):
        plt.subplot(1, n_images, idx + 1)
        plt.hist(im.ravel(), bins=50)
        plt.xlim([0, 1])
        # Matplotlib renamed 'nonposy' to 'nonpositive' (3.3) and later
        # removed the old keyword; try the new name first and fall back so
        # the notebook keeps working on older installations.
        try:
            plt.yscale('log', nonpositive='clip')
        except (TypeError, ValueError):
            plt.yscale('log', nonposy='clip')
        plt.xlabel('grayscale')
        plt.title('Image ' + str(idx))

    plt.tight_layout()
    plt.show()
    
# Histogram the grayscale intensities of the first four training images.
ims = [rgb2gray(imageio.imread(str(p))) for p in training_sorted[:4]]
show_pixel_values(ims)

#### Mask all values above/below a certain threshold

In [None]:
from skimage.filters import threshold_otsu, threshold_yen, threshold_isodata, threshold_triangle, threshold_sauvola, threshold_mean


def get_image_mask(im, strategy='otsu'):
    """Return a 0/1 mask that separates foreground from background.

    im: 2-dimensional (grayscale) image array.
    strategy: name of the thresholding method: 'mean', 'yen', 'isodata',
        'otsu', 'triangle' or 'sauvola'. Any other value falls back to a
        fixed threshold of 0.3.

    Returns an integer array of the same shape as ``im`` in which the
    smaller of the two pixel populations is marked with 1.

    Raises ValueError if ``im`` has more than two dimensions.
    """
    if len(im.shape) > 2:
        raise ValueError('need 2-dimensional image data, you provided ' + str(im.shape))

    # The first branch used to test for 'yen' as well, which made the real
    # Yen branch unreachable and the mean threshold impossible to select.
    if strategy == 'mean':
        thresh_val = threshold_mean(im)
    elif strategy == 'yen':
        thresh_val = threshold_yen(im)
    elif strategy == 'isodata':
        thresh_val = threshold_isodata(im)
    elif strategy == 'otsu':
        thresh_val = threshold_otsu(im)
    elif strategy == 'triangle':
        thresh_val = threshold_triangle(im)
    elif strategy == 'sauvola':
        thresh_val = threshold_sauvola(im)
    else:
        # Unknown strategy names deliberately use a fixed threshold
        thresh_val = 0.3
    mask = np.where(im > thresh_val, 1, 0)

    # Make sure the larger portion of the mask is considered background
    if np.sum(mask == 0) < np.sum(mask == 1):
        mask = np.where(mask, 0, 1)

    return mask


def mask_image(im, **kwargs):
    """Return ``im`` with every pixel outside its mask set to 0.

    Keyword arguments are passed on to ``get_image_mask``.
    """
    foreground = get_image_mask(im, **kwargs)
    return np.where(foreground, im, 0)

Investigate the different scikit-image thresholding filters

In [None]:
showOnlyOne = False

if not showOnlyOne:
    # Run every thresholding strategy on the same four images for comparison
    strategies = ['yen', 'isodata', 'otsu', 'triangle', 'sauvola', 'else']
    for strat in strategies:
        ims = [mask_image(rgb2gray(imageio.imread(str(p))), strategy=strat) for p in training_sorted[:4]]
        show_images(ims, cmap='gray', labels=['Image ' + str(i) + ': ' + strat for i in range(len(ims))])
else:
    # Only the default strategy
    ims = [mask_image(rgb2gray(imageio.imread(str(p)))) for p in training_sorted[:4]]
    show_images(ims, cmap='gray')

In [None]:
# Pixel-intensity histograms before and after masking the same four images
print('Unmasked:')
ims = [rgb2gray(imageio.imread(str(p))) for p in training_sorted[:4]]
show_pixel_values(ims)
print('Masked:')
ims = [mask_image(rgb2gray(imageio.imread(str(p)))) for p in training_sorted[:4]]
show_pixel_values(ims)

### Create separate mask for each blob

In [None]:
from scipy import ndimage

# Threshold the sample image, then label its connected regions
mask = get_image_mask(rgb2gray(im))
labels, nlabels = ndimage.label(mask)

show_images([mask])
# Print the label ids found along row 200 of the labelled image
print(labels[200, :])

In [None]:
MIN_PIXELS_IN_BLOB = 30


def get_separate_masks(im, min_pixels=None):
    """Split an image into one 0/1 mask per connected blob.

    im: image array. Inputs with a channel axis are converted to
        grayscale first; 2-dimensional input is used as is.
    min_pixels: a blob is kept only if it has more than this many
        pixels. Defaults to ``MIN_PIXELS_IN_BLOB``.

    Returns a list of integer arrays, each shaped like the grayscale
    image and marking a single blob with 1.
    """
    if min_pixels is None:
        min_pixels = MIN_PIXELS_IN_BLOB

    # The callers in this notebook already pass grayscale images. Converting
    # a second time is at best a no-op, and recent scikit-image releases
    # reject 2-D input to rgb2gray, so only convert when there is a channel axis.
    gray = rgb2gray(im) if np.ndim(im) > 2 else im

    mask = get_image_mask(gray)
    labels, nlabels = ndimage.label(mask)
    label_arrays = []
    # Label 0 is the background, blobs are numbered from 1
    for label_num in range(1, nlabels + 1):
        label_mask = np.where(labels == label_num, 1, 0)
        if label_mask.sum() > min_pixels:
            label_arrays.append(label_mask)
    return label_arrays

# Split the sample image into blobs; show the first blob in full and zoomed in
masks = get_separate_masks(rgb2gray(im))
print(len(masks))
show_images([masks[0], masks[0][:7, 187:210]], labels=['Full', 'Zoom'])

Plot different masks in different colors

In [None]:
from matplotlib.colors import ListedColormap


def show_masks(ims, labels=None):
    """Plot the blobs found in each image, every blob in a random color.

    ims: sequence of images handed to ``get_separate_masks``.
    labels: optional titles, exactly one per image; without them each
        panel is titled 'Image <index>'.
    """
    n_images = len(ims)
    plt.figure(figsize=(3 * n_images, 10))
    if labels is not None:
        assert len(labels) == n_images, 'provide exactly one label per image'
    for idx, im in enumerate(ims):
        plt.subplot(1, n_images, idx + 1)
        for blob in get_separate_masks(im):
            rand_cmap = ListedColormap(np.random.rand(256, 3))
            # NaN outside the blob so only the blob itself is drawn
            plt.imshow(np.where(blob, 1, np.nan), cmap=rand_cmap)
        if labels is not None:
            plt.title(labels[idx])
        else:
            plt.title('Image ' + str(idx))

    plt.tight_layout()
    plt.show()
    
    
# Grayscale versions of the first six training images, then their colored blobs
ims = [rgb2gray(imageio.imread(str(p))) for p in training_sorted[:6]]
show_images(ims, cmap='gray')
show_masks(ims)

### Create string representation of mask

In [None]:
import functools


def create_output_string(mask):
    """Run-length encode a binary mask as 'start length start length ... '.

    Pixels are numbered from 1, going down each column and then on to
    the next column. A run never continues past the bottom of a column;
    the next column starts a new run. Every number, including the last,
    is followed by a single space; an empty mask gives ''.

    mask: 2-dimensional array in which foreground pixels equal 1. Every
        other value is treated as background.
    """
    mask = np.asarray(mask)
    n_rows, n_cols = mask.shape

    # One row of zeros above and below every column guarantees that each
    # run is bracketed by background, so runs cannot spill across columns.
    padded = np.zeros((n_rows + 2, n_cols), dtype=np.int8)
    padded[1:-1, :] = (mask == 1)
    steps = np.diff(padded.ravel(order='F'))

    before_run = np.flatnonzero(steps == 1)   # padded index just before each run
    last_in_run = np.flatnonzero(steps == -1)  # padded index of each run's last pixel
    lengths = last_in_run - before_run

    # Translate the padded column-major position of the first pixel of
    # each run back to the 1-based index in the unpadded mask.
    col, row_plus_one = np.divmod(before_run + 1, n_rows + 2)
    starts = col * n_rows + row_plus_one

    return ''.join(
        '{} {} '.format(start, length)
        for start, length in zip(starts.tolist(), lengths.tolist())
    )
    
# Show the encoded string for the first blob of the sample image
print(create_output_string(masks[0]))

## Analyze test data

### Load test data

In [None]:
# Collect the test image paths and keep them in sorted order.
test_paths = pathlib.Path('data/data-science-bowl-2018/stage1_test').glob('*/images/*.png')
test_sorted = sorted(test_paths)
print(test_sorted[0])

Show some of them with their masks

In [None]:
# First four test images, shown raw and as thresholded masks
paths = test_sorted[:4]
ims = [imageio.imread(str(p)) for p in paths]
masks = [get_image_mask(rgb2gray(imageio.imread(str(p)))) for p in paths]
show_images(ims)
show_images(masks)

### Run the algorithm

This step can still take quite some time to run. Reduce the `MAX_MASK` parameter to speed it up.

In [None]:
MAX_MASK = 100


def analyze_image(paths, fileName=None):
    """Segment every image and write the encoded masks to a CSV file.

    paths: iterable of image file paths. The part of the file name before
        the first '.' is used as the image id.
    fileName: path of the CSV file to write. Required, despite the default.

    The file starts with the header 'ImageId,EncodedPixels', followed by
    one line per blob. At most ``MAX_MASK`` blobs are written per image;
    a warning is printed when an image has more. Progress is printed to
    standard output.

    Raises ValueError if ``fileName`` is None.
    """
    if fileName is None:
        # open(None) would fail with an unhelpful TypeError
        raise ValueError('fileName is required: path of the CSV file to write')

    # Materialize once so generators (e.g. a raw glob) work with len()
    paths = list(paths)
    total = str(len(paths))

    with open(fileName, 'w') as myfile:
        print('ImageId,EncodedPixels', file=myfile)
        for pidx, path in enumerate(paths):
            name = os.path.basename(str(path)).split('.')[0]
            print(str(pidx) + '/' + total + ': processing', name)
            im = rgb2gray(imageio.imread(str(path)))
            masks = get_separate_masks(im)
            if len(masks) > MAX_MASK:
                print('Warning: image', name, 'has more than', MAX_MASK, 'masks! Skip rest.')
            for mask in masks[:MAX_MASK]:
                print(name + ', ' + create_output_string(mask), file=myfile)

# Quick run on the first four test images unless the full analysis is requested
if not FULL_ANALYSIS:
    analyze_image(test_sorted[:4], 'results/result_test.csv')
else:
    analyze_image(test_sorted, 'results/result_analytic.csv')

## Investigate training data

In [None]:
import itertools

# One 'masks' directory per training sample, in sorted order
training_label_paths = pathlib.Path('data/data-science-bowl-2018/stage1_train').glob('*/masks/')
training_label_sorted = sorted(training_label_paths)

# For every mask directory, the sorted list of its PNG files
# (despite the name, maskStrings holds paths, not strings)
maskStrings = []
for maskDir in training_label_sorted:
    mask_paths = sorted(pathlib.Path(maskDir).glob('*.png'))
    maskStrings.append(mask_paths)

print('Found', len(maskStrings), 'mask dirs')
print('Found', len(maskStrings[0]), 'masks in first dir')

In [None]:
# Path of the first mask file in the first mask directory
# NOTE(review): not referenced again in the visible code — confirm whether it is still needed
mask = maskStrings[0][0]


def show_superimposed(maskLists, labels=None):
    """Plot several lists of masks, each list overlaid in one panel.

    maskLists: sequence of mask lists; every mask of a list is drawn into
        the same panel with its own random colormap.
    labels: optional titles, exactly one per mask list; without them each
        panel is titled 'Image <index>'.
    """
    n_panels = len(maskLists)
    plt.figure(figsize=(3 * n_panels, 10))
    if labels is not None:
        assert len(labels) == n_panels, 'provide exactly one label per image'
    for idx, panel_masks in enumerate(maskLists):
        plt.subplot(1, n_panels, idx + 1)
        for single_mask in panel_masks:
            rand_cmap = ListedColormap(np.random.rand(256, 3))
            # NaN outside the mask so only the mask itself is drawn
            plt.imshow(np.where(single_mask, 1, np.nan), cmap=rand_cmap)
        if labels is not None:
            plt.title(labels[idx])
        else:
            plt.title('Image ' + str(idx))

    plt.tight_layout()
    plt.show()
    
# Masks computed by this notebook for the first six training images ...
print('Reconstructed masks:')
recoMasks = [get_separate_masks(rgb2gray(imageio.imread(str(p)))) for p in training_sorted[:6]]
show_superimposed(recoMasks)
# ... next to the mask files shipped with the training data
print('True masks:')
labelMasks = [[rgb2gray(imageio.imread(str(m))) for m in p] for p in maskStrings[:6]]
show_superimposed(labelMasks)