In [1]:
!nvidia-smi

Sun Jun 28 10:35:04 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.100      Driver Version: 440.100      CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 2060    Off  | 00000000:01:00.0 Off |                  N/A |
| N/A   78C    P2    73W /  N/A |   3450MiB /  5934MiB |     90%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

# Tiles v0
- what makes no sense: tiling on too small images
- otherwise: tile with basic lafoss approach for baseline
- we don't use otsu stuff anymore

In [1]:
import os
import cv2
import skimage.io
from tqdm.notebook import tqdm
import zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from functools import partial


In [26]:
benjamin_path = '/media/benjamin/Seagate Expansion Drive/content/prostate-cancer-grade-assessment/'

In [27]:
TRAIN = benjamin_path + '/train_images/'
MASKS = benjamin_path + '/train_label_masks/'
sz = 128
N = 12

wanted_size = sz*sz*N  # CHANGE THIS ACCORDING TO YOUR NEEDS

# add a margin, which prevent getting to level1 image if level2 is just slightly not enough
# bigger margin = more white tiles on small pictures, more info used on big ones
# smaller margin = we get to level1 as soon as level2 is not enough. But getting to level1 too soon may result
# in modelling with only 6% of the available info
# Maybe experiment with margin of 1, 1.5 and 2 and see if model performs better with white tiles but all the info 
# or with low info but no white tiles ???
margin = 1.3


OUT_TRAIN = '../data/processed/train_tiles_v0/'
OUT_MASKS = '../data/processed/masks_tiles_v0/'
os.makedirs(OUT_TRAIN, exist_ok=True)
os.makedirs(OUT_MASKS, exist_ok=True)

In [28]:
os.listdir('../data/')

['interim',
 'thousand_dups.csv',
 'raw',
 'train.csv',
 'images_wo_duplicates.csv',
 'five_hundred_dups.csv',
 'n_non_white_pixels.csv',
 'augmented_df.csv',
 'processed',
 'folds.csv']

In [29]:
# fully white pic: 3790f55cad63053e956fb73027179707
df = pd.read_csv('../data/../data/n_non_white_pixels.csv')
df

Unnamed: 0,name,nup_lvl0,nup_lvl1,nup_lvl2
0,7e72dea4ae66b5bf583013ec985e5209.tiff,35034032,2189627,144437
1,0005f7aaab2800f6170c399693a96917.tiff,28117072,1757317,114954
2,000920ad0b612851f8e01bcc880d9b3d.tiff,11694016,730876,48557
3,0018ae58b01bdadc8e347995b69f99aa.tiff,85130816,5320676,337150
4,001c62abd11fa4b57bf7a6c603a11bb9.tiff,35243200,2202700,144804
...,...,...,...,...
10510,878be7ee5bdda3b29e2417f6dc93af64,42199616,2637476,167727
10511,878e0b1bbfa299387e3afed9d7c372bf,50417776,3151111,201905
10512,8794d36a3038f4537f7d261d3af4eebd,31610800,1975675,126389
10513,8796432c343bcf3bcca556dc3375702e,39946144,2496634,166341


In [30]:
lvl2_enough = (wanted_size < df['nup_lvl2'] * margin).astype(int)
lvl1_enough = (wanted_size < df['nup_lvl1'] * margin).astype(int)
lvl0_enough = (wanted_size < df['nup_lvl0'] * margin).astype(int)
df['level_for_wanted_size'] = np.clip(lvl2_enough + lvl1_enough + lvl0_enough - 1, 0, 2)
df['level_for_wanted_size'].value_counts()

2    5373
1    5136
0       6
Name: level_for_wanted_size, dtype: int64

In [31]:
# iterate only on pictures where there are masks
names = [name[:-10] for name in os.listdir(MASKS)]
df['name'] = df['name'].apply(lambda x: x.split('.')[0])
df_masks_only = df[ df['name'].isin(names)]

In [32]:
df

Unnamed: 0,name,nup_lvl0,nup_lvl1,nup_lvl2,level_for_wanted_size
0,7e72dea4ae66b5bf583013ec985e5209,35034032,2189627,144437,1
1,0005f7aaab2800f6170c399693a96917,28117072,1757317,114954,1
2,000920ad0b612851f8e01bcc880d9b3d,11694016,730876,48557,1
3,0018ae58b01bdadc8e347995b69f99aa,85130816,5320676,337150,2
4,001c62abd11fa4b57bf7a6c603a11bb9,35243200,2202700,144804,1
...,...,...,...,...,...
10510,878be7ee5bdda3b29e2417f6dc93af64,42199616,2637476,167727,2
10511,878e0b1bbfa299387e3afed9d7c372bf,50417776,3151111,201905,2
10512,8794d36a3038f4537f7d261d3af4eebd,31610800,1975675,126389,1
10513,8796432c343bcf3bcca556dc3375702e,39946144,2496634,166341,2


In [33]:
def tile(img):  # , mask
    result = []
    shape = img.shape
    # get perfect multiples of sz to cut the image
    pad0,pad1 = (sz - shape[0]%sz)%sz, (sz - shape[1]%sz)%sz
    img = np.pad(img,[[pad0//2,pad0-pad0//2],[pad1//2,pad1-pad1//2],[0,0]],
                constant_values=255)
#     mask = np.pad(mask,[[pad0//2,pad0-pad0//2],[pad1//2,pad1-pad1//2],[0,0]],
#                 constant_values=0)
    
    # all_patches x sz x sz x n_channels
    img = img.reshape(img.shape[0]//sz,sz,img.shape[1]//sz,sz,3)
    img = img.transpose(0,2,1,3,4).reshape(-1,sz,sz,3)
#     mask = mask.reshape(mask.shape[0]//sz,sz,mask.shape[1]//sz,sz,3)
#     mask = mask.transpose(0,2,1,3,4).reshape(-1,sz,sz,3)
    
    # if not enough patches, create additionnal with all_zeros (masks) or all_white (images)
    if len(img) < N:
        #mask = np.pad(mask,[[0,N-len(img)],[0,0],[0,0],[0,0]],constant_values=0)
        img = np.pad(img,[[0,N-len(img)],[0,0],[0,0],[0,0]],constant_values=255)
        
    # I changed the following line to focus on having masks with cancer instead of getting rid of white
    idxs = np.argsort(img.reshape(img.shape[0],-1).sum(-1))[:N]  # [::-1]
    img = img[idxs]
#     mask = mask[idxs]
    for i in range(len(img)):
        result.append({'img':img[i], 'idx':i})  # 'mask':mask[i], 
    return result

In [34]:
x_tot,x2_tot = [],[]
for row in tqdm(df_masks_only.iterrows(), total=len(df_masks_only)):
    name = row[1]['name']
    level = row[1]['level_for_wanted_size']
    img = skimage.io.MultiImage(os.path.join(TRAIN,name+'.tiff'))[level]
    #mask = skimage.io.MultiImage(os.path.join(MASKS,name+'_mask.tiff'))[level]
    tiles = tile(img)
    for t in tiles:
        img,idx = t['img'], t['idx']  #,mask, = t['mask'],
        x_tot.append((img/255.0).reshape(-1,3).mean(0))
        x2_tot.append(((img/255.0)**2).reshape(-1,3).mean(0)) 
        #if read with PIL RGB turns into BGR
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        cv2.imwrite(OUT_TRAIN+f'{name}_{idx}.png', img)
#         mask = mask[:,:,0]
#         cv2.imwrite(OUT_MASKS+f'{name}_{idx}.png', mask)

HBox(children=(FloatProgress(value=0.0, max=10515.0), HTML(value='')))




In [35]:
img_avr =  np.array(x_tot).mean(0)
img_std =  np.sqrt(np.array(x2_tot).mean(0) - img_avr**2)
print('mean:',img_avr, ', std:', np.sqrt(img_std))

mean: [0.80822448 0.6134578  0.74502575] , std: [0.41603894 0.53319771 0.43256397]
