***
Please note that these cells do not need to be run: the pre-processed image files are already present in the folder, and running the pre-processing takes quite a long time. This does not affect the run time of the final system, because most of the pre-processing time is spent cleaning the provided manual annotations.
***

In [1]:
# INSTALLATIONS
# Shell commands (leading "!") that install the packages used by this notebook.

# imagecodecs is imported below and used to read the manual annotation files
!pip install imagecodecs
# histomicstk is imported below as `htk`; --find-links gives pip an extra
# location to look for packages
!pip install histomicstk --find-links https://girder.github.io/large_image_wheels
# exact version pins; the reason for pinning is not stated in this notebook
# NOTE(review): presumably for compatibility with the packages above — confirm
!pip install opencv-python-headless==4.1.2.30
!pip install pyyaml==5.4.1




In [5]:
# LINKAGE TO GOOGLE DRIVE AND LIBRARIES IMPORTING

# mount Google Drive at /content/drive; the dataset archives are read from it
# and the pre-processed arrays are saved to it
from google.colab import drive
drive.mount('/content/drive')

import os
import numpy as np
import imagecodecs
import histomicstk as htk  # note: this line raises an error the first time
                           # the cell is run; just re-run the cell and the
                           # problem goes away (cause not identified,
                           # probably some kind of bug)

from tqdm import tqdm
from skimage.io import imread
from skimage.transform import resize
from skimage.util import img_as_ubyte
from skimage import morphology

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# DATASET UNRAR: LOADING DATASET IN COLAB
# Extracts the three dataset archives stored on the mounted Drive. The archive
# paths are relative, so they resolve against the current working directory.

# NOTE(review): this installs the Python package named "unrar"; the `unrar`
# command used below looks like a separate system program (the recorded output
# prints "UNRAR 5.50 freeware") — confirm whether this install is needed.
!pip install unrar
# the recorded output shows train.rar creating train/ and train/images/ in the
# working directory; the other two archives are extracted the same way
!unrar x "drive/MyDrive/cytology challenge/00_DATASET/train.rar"        # unraring training set
!unrar x "drive/MyDrive/cytology challenge/00_DATASET/validation.rar"   # unraring validation set
!unrar x "drive/MyDrive/cytology challenge/00_DATASET/test.rar"         # unraring test set

Collecting unrar
  Downloading unrar-0.4-py3-none-any.whl (25 kB)
Installing collected packages: unrar
Successfully installed unrar-0.4

UNRAR 5.50 freeware      Copyright (c) 1993-2017 Alexander Roshal


Extracting from drive/MyDrive/cytology challenge/00_DATASET/train.rar

Creating    train                                                     OK
Creating    train/images                                              OK
Extracting  train/images/102.tif                                           0%  OK 
Extracting  train/images/104.tif                                           0%  1%  OK 
Extracting  train/images/106.tif                                           1%  OK 
Extracting  train/images/108.tif                                           1%  2%  OK 
Extracting  train/images/109.tif                                           2%  OK 
Extracting  train/images/111.tif                                           2%  3%  OK 
Ex

In [8]:
# SETTINGS OF CURRENT PRE-PROCESSING

# base file names (np.savez appends ".npz") of the two archives saved at the
# end of the notebook: one for the images, one for the annotation masks
pre_proc_name, pre_proc_annotations_name = 'pre_proc', 'manual_masks'

# side length in pixels: every image and mask is resized to rsz x rsz
rsz = 512

# number of classes chosen to model the problem
NUM_CLASSES = 3

In [1]:
# STORING TRAINING AND VALIDATION SET IMAGES AND MASKS IN PROPER NDARRAY
# The same pipeline is applied to both splits, so it lives in helpers that are
# called once per split.

def _clean_layer(layer):
    """Clean one 2-D annotation layer.

    Applies an area opening followed by an area closing, both with an area
    threshold of 0.1% of the layer's pixel count. This removes small objects
    erroneously annotated (single pixels or little spots) and fills small
    holes erroneously left un-annotated.
    """
    area = 0.001*layer.shape[0]*layer.shape[1]
    return morphology.area_closing(morphology.area_opening(layer, area), area)


def _flatten_annotation(manu0):
    """Collapse a manual annotation onto a single 2-D layer.

    manu0 is either 2-D (single annotated cell in the patch) or 3-D with the
    layers along the last axis (one annotated cell per layer). In the 3-D
    case each layer is cleaned and its pixels equal to 255 or 128 are copied
    onto the merged mask; later layers overwrite earlier ones.
    """
    if len(manu0.shape) == 2:
        return _clean_layer(manu0)

    # uint8 is enough for the 0/128/255 codes and is 8x smaller than the
    # platform default int
    manu = np.zeros((manu0.shape[0], manu0.shape[1]), dtype=np.uint8)
    for j in range(manu0.shape[2]):
        curr = _clean_layer(manu0[:, :, j])
        manu[curr == 255] = 255
        manu[curr == 128] = 128
    return manu


def _to_class_map(manu, size):
    """Resize a flattened annotation to size x size and map it to class ids.

    Values below 80 become class 0, values in [80, 175] class 1 and values
    above 175 class 2.

    The resize is nearest-neighbour on the raw values:
    - preserve_range=True stops resize() from rescaling by the dtype range
      (with an int64 mask that rescaling maps 128/255 to ~0, i.e. an
      all-background mask);
    - order=0 and anti_aliasing=False stop neighbouring labels from being
      blended (a 0/255 border would otherwise produce values that fall in
      the class-1 band).
    Assumes annotation values use the 0/128/255 coding that the multi-layer
    merge tests for — TODO confirm for single-layer files.
    """
    manu = resize(manu, [size, size], order=0, preserve_range=True,
                  anti_aliasing=False)
    labels = np.zeros(manu.shape, dtype=np.uint8)
    labels[(manu >= 80) & (manu <= 175)] = 1
    labels[manu > 175] = 2
    return labels


def _load_split(imgs_path, manu_path, image_ids, size, mean_ref, std_ref):
    """Load, resize and normalise every image/annotation pair of one split.

    Parameters
    ----------
    imgs_path, manu_path : folders holding the stained images and the manual
        annotations; an annotation has the same file name as its image.
    image_ids : file names to process, in output order.
    size : side length the images and masks are resized to.
    mean_ref, std_ref : reference statistics passed to Reinhard normalisation.

    Returns
    -------
    X : uint8 array (len(image_ids), size, size, 3) of normalised images.
    Y : uint8 array (len(image_ids), size, size) of class ids (0, 1, 2).
    """
    X = np.zeros([len(image_ids), size, size, 3], dtype=np.uint8)
    Y = np.zeros([len(image_ids), size, size], dtype=np.uint8)
    for n, id_ in tqdm(enumerate(image_ids), total=len(image_ids)):

        # loading
        img = imread(os.path.join(imgs_path, id_))  # stained image
        manu0 = imagecodecs.imread(os.path.join(manu_path, id_))  # manual segmentation, possibly N layers

        # resizing to the network feed size, then color normalization
        img = img_as_ubyte(resize(img, [size, size]))
        img = htk.preprocessing.color_normalization.reinhard(img, mean_ref, std_ref)

        # actual storage (assignment copies into the pre-allocated arrays)
        X[n] = img
        Y[n] = _to_class_map(_flatten_annotation(manu0), size)
    return X, Y


# ---- training set ----

# path
tr_IMGS_path = os.path.join('train','images')
tr_MANU_path = os.path.join('train','manual')

# extracting list of images
tr_images = os.listdir(tr_IMGS_path)

# reference image for Reinhard normalization
img_ref = imread(os.path.join(tr_IMGS_path, '104.tif'))
img_ref = img_as_ubyte(resize(img_ref,[rsz,rsz]))  # mean and std for color normalization are calculated after resizing
mean_ref, std_ref = htk.preprocessing.color_conversion.lab_mean_std(img_ref)

# body
X_tr, Y_tr = _load_split(tr_IMGS_path, tr_MANU_path, tr_images, rsz, mean_ref, std_ref)


# ---- validation set ----

# path
vl_IMGS_path = os.path.join('validation','images')
vl_MANU_path = os.path.join('validation','manual')

# extracting list of images
vl_images = os.listdir(vl_IMGS_path)

# body (normalised against the same training reference image)
X_vl, Y_vl = _load_split(vl_IMGS_path, vl_MANU_path, vl_images, rsz, mean_ref, std_ref)




In [2]:
# SAVING PRE-PROCESSED

# Images and masks are written to the same Drive folder, next to the
# 00_DATASET folder the archives were read from. (The images were previously
# written under a different top-level folder than the masks and the dataset.)
pre_proc_dir = 'drive/MyDrive/cytology challenge/01_PRE-PROCESSED FOR TRAINING'
os.makedirs(pre_proc_dir, exist_ok=True)  # np.savez does not create missing folders

# np.savez appends ".npz" to the given file name
np.savez(os.path.join(pre_proc_dir, pre_proc_name), X_tr=X_tr, X_vl=X_vl)
np.savez(os.path.join(pre_proc_dir, pre_proc_annotations_name), Y_tr=Y_tr, Y_vl=Y_vl)


