In [1]:
dataname="tissue"

patch_size=256 #size of the tiles to extract and save in the database, must be >= to training size
stride_size=256 #distance to skip between patches, 1 indicated pixel wise extraction, patch_size would result in non-overlapping tiles
mirror_pad_size=128 # number of pixels to pad *after* resize to image with by mirroring (edge's of patches tend not to be analyzed well, so padding allows them to appear more centered in the patch)
test_set_size=.1 # what percentage of the dataset should be used as a held out validation/testing set
resize_scale=1 #resize input images
class_names=["negative", "positive"]#what classes we expect to have in the data, here we have only 2 classes but we could add additional classes

#-----Note---
#One should likely make sure that  (nrow+mirror_pad_size) mod patch_size == 0, where nrow is the number of rows after resizing
#so that no pixels are lost (any remainer is ignored)


In [2]:
import torch
import tables

import os,sys
import glob

import PIL
import numpy as np

import cv2
import matplotlib.pyplot as plt

from sklearn import model_selection
from patchify import patchify
import random


seed = random.randrange(sys.maxsize) #get a random seed so that we can reproducibly do the cross validation setup
random.seed(seed) # set the seed
print(f"random seed (note down for reproducibility): {seed}")

random seed (note down for reproducibility): 6968726797688606138


In [3]:
img_dtype = tables.UInt8Atom()  # dtype in which the images will be saved, this indicates that images will be saved as unsigned int 8 bit, i.e., [0,255]
filenameAtom = tables.StringAtom(itemsize=255) #create an atom to store the filename of the image, just incase we need it later, 

In [4]:
files=glob.glob('./data/**/*.jpg') # create a list of the files, in this case we're only interested in files which have masks so we can use supervised learning
sp = model_selection.ShuffleSplit(n_splits=1,test_size=test_set_size) # define split parameters

#create training and validation stages and split the files appropriately between them
phases={}
phases["train"],phases["val"]=next(iter(sp.split(files))) # with n_splits=1 sp.split will generate only 1 iteration

In [5]:
#--subset for rapid testing
phases["train"]=phases["train"][0:100]
phases["val"]=phases["val"][0:20]

In [5]:
storage={} #holder for future pytables

block_shape=np.array((patch_size,patch_size,3)) #block shape specifies what we'll be saving into the pytable array, here we assume that masks are 1d and images are 3d

filters=tables.Filters(complevel=6, complib='zlib') #we can also specify filters, such as compression, to improve storage speed


for phase in phases.keys(): #now for each of the phases, we'll loop through the files
    print(phase)
    
    totals=np.zeros(len(class_names)) # we can to keep counts of all the classes in for in particular training, since we 
    
    hdf5_file = tables.open_file(f"./{dataname}_{phase}.pytable", mode='w') #open the respective pytable
    storage["filenames"] = hdf5_file.create_earray(hdf5_file.root, 'filenames', filenameAtom, (0,)) #create the array for storage
    
    storage["imgs"]= hdf5_file.create_earray(hdf5_file.root, "imgs", img_dtype,  
                                              shape=np.append([0],block_shape), 
                                              chunkshape=np.append([1],block_shape),
                                              filters=filters)
    storage["labels"]= hdf5_file.create_earray(hdf5_file.root, "labels", img_dtype,  
                                              shape=[0], 
                                              chunkshape=[1],
                                              filters=filters)

    
    for filei in phases[phase]: #now for each of the files
        fname=files[filei] 
        print(fname)
        
        classid=[idx for idx in range(len(class_names)) if class_names[idx] in fname][0]
        totals[classid]+=1

        io=cv2.cvtColor(cv2.imread(fname),cv2.COLOR_BGR2RGB)
        interp_method=PIL.Image.BICUBIC

        io = cv2.resize(io,(0,0),fx=resize_scale,fy=resize_scale, interpolation=interp_method) #resize it as specified above
        io = np.pad(io, [(mirror_pad_size, mirror_pad_size), (mirror_pad_size, mirror_pad_size), (0, 0)], mode="reflect")

        #convert input image into overlapping tiles, size is ntiler x ntilec x 1 x patch_size x patch_size x3
        io_arr_out=patchify(io, (patch_size, patch_size, 3), step=stride_size)

        #resize it into a ntile x patch_size x patch_size x 3
        io_arr_out=io_arr_out.reshape(-1,patch_size,patch_size,3)

        storage["imgs"].append(io_arr_out)
        storage["labels"].append([classid for x in range(io_arr_out.shape[0])]) #add the filename to the storage array
        storage["filenames"].append([fname for x in range(io_arr_out.shape[0])]) #add the filename to the storage array
        
    #lastly, we should store the number of each class instances
    classsizes=hdf5_file.create_carray(hdf5_file.root, 'classsizes', tables.Atom.from_dtype(totals.dtype), totals.shape)
    classsizes[:]=totals
    hdf5_file.close()

train
./data\positive\21 (11).jpg
./data\positive\16 (43).jpg
./data\positive\29 (22).jpg
./data\negative\10 (11).jpg
./data\positive\16 (86).jpg
./data\positive\20 (39).jpg
./data\negative\41 (24).jpg
./data\positive\8 (24).jpg
./data\positive\22 (21).jpg
./data\positive\8 (12).jpg
./data\negative\16 (31).jpg
./data\positive\16 (44).jpg
./data\positive\11 (13).jpg
./data\positive\21 (2).jpg
./data\negative\38 (1).jpg
./data\positive\29 (16).jpg
./data\positive\20 (9).jpg
./data\negative\29 (11).jpg
./data\positive\20 (4).jpg
./data\negative\46 (1).jpg
./data\positive\6 (6).jpg
./data\positive\29 (28).jpg
./data\positive\4 (7).jpg
./data\positive\29 (26).jpg
./data\negative\52 (3).jpg
./data\negative\39 (4).jpg
./data\negative\41 (19).jpg
./data\negative\43 (4).jpg
./data\negative\24 (1).jpg
./data\positive\25 (8).jpg
./data\positive\15 (12).jpg
./data\positive\2 (8).jpg
./data\negative\54 (4).jpg
./data\positive\2 (21).jpg
./data\positive\33 (1).jpg
./data\positive\22 (24).jpg
./data\

./data\negative\8 (22).jpg
./data\positive\22 (5).jpg
./data\positive\17 (4).jpg
./data\negative\56 (5).jpg
./data\negative\41 (10).jpg
./data\positive\9 (21).jpg
./data\negative\57 (9).jpg
./data\positive\14 (4).jpg
./data\positive\16 (5).jpg
./data\negative\41 (3).jpg
./data\positive\26 (2).jpg
./data\negative\15 (10).jpg
./data\negative\49 (10).jpg
./data\negative\40 (5).jpg
./data\positive\16 (61).jpg
./data\positive\25 (6).jpg
./data\positive\7 (19).jpg
./data\negative\56 (2).jpg
./data\negative\52 (2).jpg
./data\negative\30 (13).jpg
./data\positive\28 (9).jpg
./data\negative\49 (17).jpg
./data\negative\4 (8).jpg
./data\negative\15 (9).jpg
./data\positive\31 (11).jpg
./data\positive\11 (20).jpg
./data\negative\57 (2).jpg
./data\positive\20 (38).jpg
./data\negative\46 (7).jpg
./data\positive\6 (5).jpg
./data\negative\48 (7).jpg
./data\positive\16 (17).jpg
./data\negative\43 (2).jpg
./data\positive\21 (10).jpg
./data\positive\25 (3).jpg
./data\positive\6 (2).jpg
./data\negative\18 (

./data\positive\16 (72).jpg
./data\positive\16 (94).jpg
./data\positive\20 (10).jpg
./data\negative\24 (14).jpg
./data\positive\14 (31).jpg
./data\negative\35 (5).jpg
./data\positive\16 (85).jpg
./data\negative\57 (27).jpg
./data\negative\39 (6).jpg
./data\negative\7 (8).jpg
./data\negative\1 (25).jpg
./data\negative\39 (17).jpg
./data\negative\57 (7).jpg
./data\negative\9 (9).jpg
./data\positive\17 (1).jpg
./data\negative\49 (12).jpg
./data\positive\32 (10).jpg
./data\negative\57 (24).jpg
./data\positive\13 (37).jpg
./data\positive\9 (17).jpg
./data\negative\41 (16).jpg
./data\positive\30 (9).jpg
./data\positive\27 (2).jpg
./data\positive\29 (29).jpg
./data\negative\13 (18).jpg
./data\positive\18 (31).jpg
./data\positive\13 (44).jpg
./data\negative\53 (5).jpg
./data\positive\16 (91).jpg
./data\negative\49 (15).jpg
./data\positive\28 (1).jpg
./data\positive\16 (4).jpg
./data\negative\35 (7).jpg
./data\positive\16 (18).jpg
./data\positive\17 (9).jpg
./data\negative\45 (7).jpg
./data\neg

./data\negative\8 (2).jpg
./data\positive\13 (30).jpg
./data\positive\18 (48).jpg
./data\negative\53 (6).jpg
./data\positive\19 (10).jpg
./data\negative\50 (2).jpg
./data\positive\16 (29).jpg
./data\positive\5 (3).jpg
./data\positive\7 (15).jpg
./data\positive\16 (58).jpg
./data\positive\15 (14).jpg
./data\positive\22 (28).jpg
./data\positive\17 (3).jpg
./data\negative\46 (6).jpg
./data\positive\29 (41).jpg
./data\positive\3 (9).jpg
./data\positive\7 (12).jpg
./data\positive\14 (59).jpg
./data\negative\24 (11).jpg
./data\negative\58 (1).jpg
./data\negative\26 (20).jpg
./data\positive\13 (35).jpg
./data\positive\17 (5).jpg
./data\positive\16 (84).jpg
./data\negative\36 (13).jpg
./data\negative\47 (2).jpg
./data\negative\45 (2).jpg
./data\negative\18 (3).jpg
./data\positive\11 (29).jpg
./data\positive\15 (15).jpg
./data\positive\16 (45).jpg
./data\positive\13 (40).jpg
./data\positive\16 (62).jpg
./data\negative\42 (7).jpg
./data\positive\16 (11).jpg
./data\positive\26 (8).jpg
./data\nega