<a href="https://colab.research.google.com/github/aashrithresearch/pytorch_pathology/blob/main/lymphoma_hdf5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only step: attach Google Drive so the dataset under
# /content/drive/MyDrive is reachable from the notebook.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
! cd /content/drive/MyDrive/lymphoma.tar

In [None]:
import torch
import tables

import os,sys
import glob

import PIL
import numpy as np

import cv2
import matplotlib.pyplot as plt

from sklearn import model_selection
import random

In [None]:
# --- Dataset identity -------------------------------------------------------
data = "lymphoma"                 # prefix of the output "<data>_<phase>.pytable" files
classes = ["CLL", "FL", "MCL"]    # class names; the list position is the integer label

# --- Patch geometry (pixels) ------------------------------------------------
patch_size = 256                  # height and width of every stored patch
stride_size = 256                 # step between patch origins (equal to patch_size here)
mirror_pad_size = 128             # reflect-padding added to each image border before tiling

# --- Preprocessing / split --------------------------------------------------
resize = 1                        # scale factor applied to both image axes before padding
test_data = 0.1                   # fraction of files held out as the "val" phase

In [None]:
# Draw a fresh seed in [0, sys.maxsize) and echo it so the value can be
# recorded alongside the run.
seed = random.randint(0, sys.maxsize - 1)
print(seed)

1320398590637321459


In [None]:
# Seed Python's built-in `random` module with the value drawn earlier and log it.
# Note: this call only affects the `random` module; it does not seed NumPy's
# generator.
random.seed(a=seed)
print(f"random seed: {seed}")

random seed: 1320398590637321459


In [None]:
# PyTables atoms describing the element type of the arrays written below.
filenameAtom = tables.StringAtom(itemsize=255)  # string entries with an item size of 255
img_dtype = tables.UInt8Atom()                  # unsigned 8-bit; used for both "imgs" and "labels"

In [None]:
# Collect every .tif one directory below the dataset root.
# glob returns paths in arbitrary, filesystem-dependent order, and the
# train/val split below works on list *indices*, so sort the list to make the
# index -> file mapping stable from run to run.
files = sorted(glob.glob("/content/drive/MyDrive/lymphoma.tar/**/*.tif"))

In [None]:
# Split the file indices into a training set and a validation set.
# `phases[...]` holds integer indices into `files`, not the paths themselves.
#
# The seed must be handed to scikit-learn explicitly: ShuffleSplit shuffles with
# NumPy's RNG, which `random.seed` does not affect. scikit-learn only accepts
# seeds in [0, 2**32 - 1], hence the modulo.
phases = {}
phases["train"], phases["val"] = next(
    model_selection.ShuffleSplit(
        n_splits=1,
        test_size=test_data,
        random_state=seed % (2**32),
    ).split(files)
)

In [None]:
# Keep only the first 100 training indices and the first 20 validation indices.
phases.update(
    train=phases["train"][:100],
    val=phases["val"][:20],
)

In [None]:
# Handles to the PyTables arrays of the file currently being written,
# keyed by array name ("filenames", "imgs", "labels").
storage = dict()

# Shape of one stored patch: patch_size x patch_size x 3 channels.
block_shape = np.array([patch_size, patch_size, 3])

In [None]:
# Compression settings applied to the "imgs" and "labels" arrays: zlib, level 6.
filters = tables.Filters(complib="zlib", complevel=6)

In [None]:
import numbers
import numpy as np
from numpy.lib.stride_tricks import as_strided

In [None]:
def extract_patches(arr, patch_shape=8, extraction_step=1):
    """Return a strided view of `arr` cut into regularly spaced patches.

    Parameters
    ----------
    arr : numpy.ndarray
        Array to tile.
    patch_shape : int or sequence of int
        Size of a patch along each axis of `arr`. A single number is applied
        to every axis.
    extraction_step : int or sequence of int
        Distance between the origins of consecutive patches along each axis.
        A single number is applied to every axis.

    Returns
    -------
    numpy.ndarray
        Array of shape `(n_0, ..., n_k, p_0, ..., p_k)` where
        `n_i = (arr.shape[i] - patch_shape[i]) // extraction_step[i] + 1`
        and `p_i = patch_shape[i]`. It is built with `as_strided`, so it
        shares memory with `arr` rather than copying it.
    """
    ndim = arr.ndim

    # Broadcast scalar arguments to one entry per axis.
    if isinstance(patch_shape, numbers.Number):
        patch_shape = (patch_shape,) * ndim
    if isinstance(extraction_step, numbers.Number):
        extraction_step = (extraction_step,) * ndim

    # Subsampling the array by the step yields exactly the byte strides needed
    # to hop from one patch origin to the next.
    stepped = arr[tuple(slice(None, None, step) for step in extraction_step)]

    # Number of patch origins that fit along each axis.
    n_patches = (
        (np.array(arr.shape) - np.array(patch_shape)) // np.array(extraction_step)
    ) + 1

    return as_strided(
        arr,
        shape=(*n_patches, *patch_shape),
        strides=(*stepped.strides, *arr.strides),
    )

In [None]:
# Build one PyTables file per phase ("train", "val"). Each file contains:
#   filenames  - source image path, one entry per patch
#   imgs       - the patches, shape (n_patches, patch_size, patch_size, 3), uint8
#   labels     - integer class id (index into `classes`), one entry per patch
#   classsizes - number of source images per class in this phase
for phase in phases:
    print(phase)

    # Per-class count of source *images* (incremented once per file, not per patch).
    totals = np.zeros(len(classes))

    hdf5_file = tables.open_file(f"/content/drive/MyDrive/lymphoma.tar/{data}_{phase}.pytable", mode='w')
    try:
        # Extendable arrays: the leading dimension starts at 0 and grows on append.
        storage["filenames"] = hdf5_file.create_earray(hdf5_file.root, 'filenames', filenameAtom, (0,))
        storage["imgs"] = hdf5_file.create_earray(hdf5_file.root, "imgs", img_dtype,
                                                  shape=np.append([0], block_shape),
                                                  chunkshape=np.append([1], block_shape),
                                                  filters=filters)
        storage["labels"] = hdf5_file.create_earray(hdf5_file.root, "labels", img_dtype,
                                                    shape=[0],
                                                    chunkshape=[1],
                                                    filters=filters)

        for filei in phases[phase]:  # phases[...] holds indices into `files`
            fname = files[filei]
            print(fname)

            # The class is the name of the directory that directly contains the
            # image (".../<class>/<image>.tif"). An exact match on that directory
            # avoids false hits from class names appearing elsewhere in the path;
            # an unknown directory raises ValueError.
            classid = classes.index(os.path.basename(os.path.dirname(fname)))
            totals[classid] += 1

            # cv2.imread returns None instead of raising when a file is unreadable.
            bgr = cv2.imread(fname)
            if bgr is None:
                raise IOError(f"could not read image: {fname}")
            io = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

            # Use OpenCV's own flag: PIL.Image.BICUBIC has the integer value 3,
            # which cv2.resize interprets as INTER_AREA, not bicubic.
            interp_method = cv2.INTER_CUBIC
            io = cv2.resize(io, (0, 0), fx=resize, fy=resize, interpolation=interp_method)

            # Reflect-pad the two spatial axes; the channel axis is left untouched.
            io = np.pad(io, [(mirror_pad_size, mirror_pad_size), (mirror_pad_size, mirror_pad_size), (0, 0)], mode="reflect")

            # Tile the image: result is ntile_rows x ntile_cols x 1 x patch_size x patch_size x 3.
            # Tiles overlap only when stride_size < patch_size.
            io_arr_out = extract_patches(io, (patch_size, patch_size, 3), stride_size)

            # Flatten the tile grid into ntile x patch_size x patch_size x 3.
            io_arr_out = io_arr_out.reshape(-1, patch_size, patch_size, 3)
            n_patches = io_arr_out.shape[0]

            storage["imgs"].append(io_arr_out)
            storage["labels"].append([classid] * n_patches)    # same class id for every patch of this image
            storage["filenames"].append([fname] * n_patches)   # same source path for every patch of this image

        # Finally, store the per-class image counts for this phase.
        npixels = hdf5_file.create_carray(hdf5_file.root, 'classsizes', tables.Atom.from_dtype(totals.dtype), totals.shape)
        npixels[:] = totals
    finally:
        # Always close the file, even if an image fails, so the HDF5 handle is
        # released and buffered data is flushed.
        hdf5_file.close()

train
/content/drive/MyDrive/lymphoma.tar/CLL/sj-05-1396-R3_001.tif
/content/drive/MyDrive/lymphoma.tar/CLL/sj-03-4957_002.tif
/content/drive/MyDrive/lymphoma.tar/CLL/sj-05-3874-R2_002.tif
/content/drive/MyDrive/lymphoma.tar/CLL/sj-05-3874-R2_001.tif
/content/drive/MyDrive/lymphoma.tar/MCL/sj-05-768_008.tif
/content/drive/MyDrive/lymphoma.tar/MCL/sj-05-4179-R1_002.tif
/content/drive/MyDrive/lymphoma.tar/CLL/sj-03-476_005.tif
/content/drive/MyDrive/lymphoma.tar/CLL/sj-03-4957_006.tif
/content/drive/MyDrive/lymphoma.tar/FL/sj-05-5389-R1_004.tif
/content/drive/MyDrive/lymphoma.tar/MCL/sj-04-4525-R4_007.tif
/content/drive/MyDrive/lymphoma.tar/CLL/sj-05-3344_009.tif
/content/drive/MyDrive/lymphoma.tar/CLL/sj-03-2810_005.tif
/content/drive/MyDrive/lymphoma.tar/MCL/sj-05-3362-R2_002.tif
/content/drive/MyDrive/lymphoma.tar/FL/sj-05-5389-R1_010.tif
/content/drive/MyDrive/lymphoma.tar/CLL/sj-03-852-R2_002.tif
/content/drive/MyDrive/lymphoma.tar/CLL/sj-05-3165_010.tif
/content/drive/MyDrive/lymph