In [1]:
import skimage.io
import numpy as np
import pandas as pd
import sys
from pathlib import Path
import matplotlib.pyplot as plt
import pickle

In [2]:
class TileMaker:
    """Cut an image and its label mask into square tiles and keep the darkest ones.

    Parameters
    ----------
    size : int
        Side length, in pixels, of every square tile.
    number : int
        Number of tiles returned by :meth:`make`.
    """

    def __init__(self, size, number):
        self.size = size
        self.number = number

    def _split(self, array, vertical_pad, horizontal_pad, fill):
        """Pad the two leading axes of ``array`` and cut it into ``size`` x ``size`` tiles.

        Axes after the first two (e.g. channels) are carried through untouched, so
        ``(h, w)`` and ``(h, w, c)`` arrays are both handled. Tiles come out in
        row-major order with shape ``(n_tiles, size, size, *trailing_axes)``.
        """
        trailing = array.shape[2:]
        # Split the padding as evenly as possible between the two sides of each axis.
        pad_width = [(vertical_pad // 2, vertical_pad - vertical_pad // 2),
                     (horizontal_pad // 2, horizontal_pad - horizontal_pad // 2)]
        pad_width += [(0, 0)] * len(trailing)
        array = np.pad(array, pad_width=pad_width, mode='constant', constant_values=fill)

        h, w = array.shape[:2]
        array = array.reshape(h // self.size, self.size, w // self.size, self.size, *trailing)
        return array.swapaxes(1, 2).reshape(-1, self.size, self.size, *trailing)

    def _ensure_count(self, tiles, fill):
        """Append constant ``fill`` tiles until there are at least ``self.number`` of them."""
        missing = self.number - tiles.shape[0]
        if missing <= 0:
            return tiles
        pad_width = [(0, missing)] + [(0, 0)] * (tiles.ndim - 1)
        return np.pad(tiles, pad_width=pad_width, mode='constant', constant_values=fill)

    def make(self, image, mask):
        """Tile ``image`` and ``mask`` identically and return the ``number`` darkest tiles.

        Parameters
        ----------
        image : numpy.ndarray
            Array whose first two axes are height and width (e.g. ``(h, w, c)``).
            It is padded with 255 because empty background is white in this data.
        mask : numpy.ndarray
            Array with the same height and width as ``image``; it may be 2-D or
            have any number of trailing channels. It is padded with 0 because
            empty is black in the masks.

        Returns
        -------
        tuple of numpy.ndarray
            ``(image_tiles, mask_tiles)``, each with ``number`` tiles along axis 0,
            ordered from the smallest to the largest pixel sum of the image tile.

        Raises
        ------
        ValueError
            If ``mask`` and ``image`` do not share the same height and width.
        """
        h, w = image.shape[:2]
        if tuple(mask.shape[:2]) != (h, w):
            raise ValueError(
                f'mask spatial shape {tuple(mask.shape[:2])} does not match '
                f'image spatial shape {(h, w)}'
            )

        # Amount of padding needed to reach the next multiple of the tile size.
        horizontal_pad = (-w) % self.size
        vertical_pad = (-h) % self.size

        image_tiles = self._split(image, vertical_pad, horizontal_pad, fill=255)
        mask_tiles = self._split(mask, vertical_pad, horizontal_pad, fill=0)

        # Small slides may not yield enough tiles: top up with empty ones.
        image_tiles = self._ensure_count(image_tiles, fill=255)
        mask_tiles = self._ensure_count(mask_tiles, fill=0)

        # Find the tiles with the most stuff (the less white => the smaller the sum).
        # A stable sort keeps the original tile order among ties, so the selection
        # is reproducible.
        darkness = np.sum(image_tiles, axis=tuple(range(1, image_tiles.ndim)))
        selected = np.argsort(darkness, kind='stable')[:self.number]

        return image_tiles[selected], mask_tiles[selected]

In [3]:
# Root of the dataset on disk: the only line to edit when the data lives elsewhere.
DATA_ROOT = Path('G:/Datasets/panda')

# Inputs: slide images, their label masks and the metadata table.
TRAIN_PATH = DATA_ROOT / 'train_images'
MASKS_TRAIN_PATH = DATA_ROOT / 'train_label_masks'
CSV_PATH = DATA_ROOT / 'train.csv'

# Outputs: where the image tiles and mask tiles are written as PNG files.
OUTPUT_IMG_PATH = DATA_ROOT / 'train_tiles' / 'imgs'
OUTPUT_MASK_PATH = DATA_ROOT / 'train_tiles' / 'masks'

# True  -> save one stitched PNG per slide (all selected tiles in a square grid).
# False -> save every selected tile as its own PNG.
SAVE_FULL = False

In [4]:
# Create both output folders (and any missing parents); a no-op when they already exist.
for output_dir in (OUTPUT_IMG_PATH, OUTPUT_MASK_PATH):
    output_dir.mkdir(parents=True, exist_ok=True)

In [5]:
# Keep 16 tiles of 128 x 128 pixels per slide.
tile_maker = TileMaker(size=128, number=16)

In [6]:
def _stitch_tiles(tiles, grid_side):
    """Lay ``grid_side**2`` equally sized square tiles out as one square mosaic (row-major)."""
    tile_size = tiles.shape[1]
    trailing = tiles.shape[3:]
    grid = tiles.reshape(grid_side, grid_side, tile_size, tile_size, *trailing).swapaxes(1, 2)
    return grid.reshape(grid_side * tile_size, grid_side * tile_size, *trailing)


img_list = list(TRAIN_PATH.glob('**/*.tiff'))
# NOTE(review): one slide is excluded by its position in the glob result. That position
# depends on the directory listing order, so confirm which slide this is meant to
# remove and ideally exclude it by image id instead.
img_list.pop(5765)
bad_images = []   # ids whose image file could not be read (slide skipped)
bad_masks = []    # ids whose mask file exists but could not be read (all-zero mask used)
image_stats = []  # one record per slide: per-channel mean and mean of squares, in [0, 1]

# Side of the square mosaic written when SAVE_FULL is set (4 for 16 tiles).
grid_side = int(round(np.sqrt(tile_maker.number)))
if SAVE_FULL and grid_side ** 2 != tile_maker.number:
    raise ValueError(
        f'SAVE_FULL needs a square number of tiles, got {tile_maker.number}'
    )

for i, img_fn in enumerate(img_list):

    img_id = img_fn.stem
    mask_fn = MASKS_TRAIN_PATH/(img_id + '_mask.tiff')

    # Only the last entry of the multi-image file is used
    # (presumably the lowest-resolution level; confirm against the data).
    try:
        image = skimage.io.MultiImage(str(img_fn))[-1]
    except Exception:
        # Best effort: remember the failure and move on. `Exception` (rather than a
        # bare `except`) lets Ctrl-C / kernel interrupts still stop the run.
        bad_images.append(img_id)
        continue

    if mask_fn.exists():

        try:
            mask = skimage.io.MultiImage(str(mask_fn))[-1]
        except Exception:
            bad_masks.append(img_id)
            mask = np.zeros_like(image)

    else:
        # No annotation available for this slide: fall back to an empty mask.
        mask = np.zeros_like(image)

    image_tiles, mask_tiles = tile_maker.make(image, mask)
    sys.stdout.write(f'\r{i + 1}/{len(img_list)}')

    # Per-channel statistics over all selected tiles, with pixel values scaled to [0, 1].
    image_stats.append({'image_id': img_id, 'mean': image_tiles.mean(axis=(0, 1, 2))/255,
                        'mean_square': ((image_tiles/255) ** 2).mean(axis=(0, 1, 2))})

    if SAVE_FULL:
        full_image = _stitch_tiles(image_tiles, grid_side)
        full_mask = _stitch_tiles(mask_tiles, grid_side)
        skimage.io.imsave(OUTPUT_IMG_PATH/(img_id +'.png'), full_image, check_contrast=False)
        skimage.io.imsave(OUTPUT_MASK_PATH/(img_id +'.png'), full_mask, check_contrast=False)
    else:
        # A dedicated index name avoids shadowing the outer progress counter `i`.
        for tile_idx, (tile_image, tile_mask) in enumerate(zip(image_tiles, mask_tiles)):
            tile_name = img_id + '_' + str(tile_idx) + '.png'
            skimage.io.imsave(OUTPUT_IMG_PATH/tile_name, tile_image, check_contrast=False)
            skimage.io.imsave(OUTPUT_MASK_PATH/tile_name, tile_mask, check_contrast=False)

5763/10615



5848/10615



6022/10615



6029/10615



6099/10615



6132/10615



6136/10615



6183/10615



6282/10615



10615/10615

In [7]:
# Turn the per-slide records collected by the tiling loop into a table
# (columns: image_id, mean, mean_square).
image_stats = pd.DataFrame(data=image_stats)

In [8]:
# Attach the tile statistics to the metadata table. The left join keeps every
# row of the CSV, so slides without statistics end up with missing values.
df = pd.read_csv(CSV_PATH).merge(image_stats, on='image_id', how='left')

In [9]:
def _mean_and_std(rows):
    """Return (mean, std) from the 'mean' and 'mean_square' columns of ``rows``.

    The standard deviation is derived as sqrt(E[x^2] - E[x]^2).
    """
    first_moment = rows['mean'].mean()
    second_moment = rows['mean_square'].mean()
    return first_moment, np.sqrt(second_moment - first_moment ** 2)


# One (mean, std) pair per data provider, in order of first appearance in the table.
provider_stats = {
    provider: _mean_and_std(df[df['data_provider'] == provider])
    for provider in df['data_provider'].unique()
}

In [10]:
# Persist the per-provider (mean, std) pairs next to the notebook for later reuse.
with open('./stats.pkl', 'wb') as stats_file:
    pickle.dump(provider_stats, stats_file)

In [11]:
# Ids of slides whose image file could not be read by the tiling loop (they were skipped).
print(bad_images)

[]


In [12]:
# Ids of slides whose mask file exists but could not be read; the tiling loop
# substituted an all-zero mask for them.
print(bad_masks)

['8d6d8329cd0bbf6d0356b300f83d6b6a', '8d9bf04e714c959d4c571030c51ee9f5', '8fa7f6ad508c78ef7dbd621d579350b8', '93e2ce38e743146a128afb9ff3a61383', '93f366029ac746d84ea2aea80cc998e8', '9403cc77efc7d9414204d12e31dadac1', '95deb0caf878d715c1a497a77abbd6d9', '95df6544155901a142acce3866e316c2', '96aac9c3672cadf72e9e60d634bb145a', '96c62d8799651597247be49bf790d585', '97a83309575b765469e6a09250d92170', '97bed3de5c372f3dc9ba818cd5e97369', '99bc60c2d554be9304993efa0a5ef740']


In [13]:
# Per-provider (mean, std) pairs, computed on pixel values scaled to [0, 1].
print(provider_stats)

{'karolinska': (array([0.8935662 , 0.78511786, 0.87020329]), array([0.14768412, 0.28661991, 0.17399626])), 'radboud': (array([0.92652057, 0.85519609, 0.88622817]), array([0.11088003, 0.19650294, 0.1517879 ]))}
