# Nucleus challenge using a CNN

In [None]:
import glob
import os.path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Load data

In [None]:
# Root data directory; the label CSV and the stage1_* image folders used below
# are resolved relative to this path.
dataDir = 'data/data-science-bowl-2018/'

### Load labels

In [None]:
# Read the label table, then turn every space-separated 'EncodedPixels' string
# into a list of ints.
labels_csv = os.path.join(dataDir, 'stage1_train_labels.csv/stage1_train_labels.csv')
train_labels = pd.read_csv(labels_csv)
train_labels['EncodedPixels'] = train_labels['EncodedPixels'].map(
    lambda encoded: list(map(int, encoded.split(' ')))
)
train_labels.head()

### Load training images

In [None]:
# Index every PNG matched by <dataDir>/stage1_*/<dir>/<dir>/<file>.png.
all_images = glob.glob(os.path.join(dataDir, 'stage1_*', '*', '*', '*.png'))
img_df = pd.DataFrame({'path': all_images})


def _path_parts(in_path):
    """Split a path into its components, accepting '/' and the OS separator."""
    # glob/os.path.join emit os.sep (a backslash on Windows); normalise first
    # so the negative indices used below mean the same thing everywhere.
    return in_path.replace(os.sep, '/').split('/')


def img_id(x):
    """Return the third path component from the end (the per-image folder)."""
    return _path_parts(x)[-3]


def img_type(in_path):
    """Return the name of the folder that directly contains the file."""
    return _path_parts(in_path)[-2]


def img_group(in_path):
    """Return the second '_'-separated token of the fourth component from the end."""
    return _path_parts(in_path)[-4].split('_')[1]


def img_stage(in_path):
    """Return the first '_'-separated token of the fourth component from the end."""
    return _path_parts(in_path)[-4].split('_')[0]


if all_images:
    print('An exemplary data path with indices of split:')
    example_parts = _path_parts(all_images[0])
    # Derive the negative indices from the real depth of the path instead of
    # assuming exactly six components, so they stay correct if dataDir changes.
    print(*((pos - len(example_parts), part) for pos, part in enumerate(example_parts)),
          sep='\n', end='\n\n')
else:
    # Nothing matched: report it instead of failing on an out-of-range lookup.
    print('No PNG files found under', dataDir)

img_df['ImageId'] = img_df['path'].map(img_id)
img_df['ImageType'] = img_df['path'].map(img_type)
img_df['TrainingSplit'] = img_df['path'].map(img_group)
img_df['Stage'] = img_df['path'].map(img_stage)

# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print().
img_df.info()
img_df.head()

### Create dataframe with training data (images and masks)

In [None]:
%%time

# Collapse the per-file index into one record per (Stage, ImageId) pair; each
# record carries the list of 'images' paths and the list of 'masks' paths.
train_df = img_df.query('TrainingSplit=="train"')
group_cols = ['Stage', 'ImageId']
train_rows = []

for key, files in train_df.groupby(group_cols):
    record = dict(zip(group_cols, key))
    for kind in ('images', 'masks'):
        record[kind] = files.loc[files['ImageType'] == kind, 'path'].tolist()
    train_rows.append(record)

train_img_df = pd.DataFrame(train_rows)

In [None]:
train_img_df.head()

## Analysis using a single combined mask

### Load the images and save them in dataframe

In [None]:
from skimage.io import imread

# Number of leading entries kept along the last axis when the image arrays
# are sliced after loading.
IMG_CHANNELS = 3


def read_and_stack(in_img_list, reader=None):
    """Load every entry of *in_img_list* and return their element-wise sum / 255.

    Parameters
    ----------
    in_img_list : iterable
        Items handed one by one to ``reader`` (file paths by default).
    reader : callable, optional
        Function turning one item into an array. Defaults to the ``imread``
        imported by this notebook.

    Returns
    -------
    numpy.ndarray
        The arrays stacked along a new first axis, summed over that axis and
        divided by 255.0.

    Raises
    ------
    ValueError
        If ``in_img_list`` is empty, or (raised by ``np.stack``) if the loaded
        arrays do not all share the same shape.
    """
    if reader is None:
        reader = imread
    frames = [reader(c_img) for c_img in in_img_list]
    if not frames:
        # np.stack would fail here anyway; say clearly what went wrong.
        raise ValueError('read_and_stack() needs at least one image to load')
    return np.sum(np.stack(frames, 0), 0) / 255.0

In [None]:
%%time

# Swap the path lists for pixel data: images keep only the first IMG_CHANNELS
# entries of their last axis, masks are cast to int.
train_img_df['images'] = train_img_df['images'].map(
    lambda paths: read_and_stack(paths)[:, :, :IMG_CHANNELS]
)
train_img_df['masks'] = train_img_df['masks'].map(
    lambda paths: read_and_stack(paths).astype(int)
)

In [None]:
train_img_df.head()

### Show some of the pictures with their labels

In [None]:
# Preview n_img randomly sampled rows: the image on the top row of the grid
# and its mask directly underneath.
n_img = 6
fig, m_axs = plt.subplots(2, n_img, figsize=(12, 4))
sampled_rows = train_img_df.sample(n_img)
for (row_idx, row), (img_ax, mask_ax) in zip(sampled_rows.iterrows(), m_axs.T):
    panels = ((img_ax, 'images', 'Microscope '), (mask_ax, 'masks', 'Labeled '))
    for ax, column, prefix in panels:
        ax.imshow(row[column])
        ax.axis('off')
        ax.set_title(prefix + str(row_idx))

## Analysis using separate masks