In [None]:
import glob
import os.path

import bioformats
import javabridge

import deepometry.parse

# Start the Java VM through javabridge, putting the JARs shipped with
# bioformats on its class path and capping the heap at "8G".
javabridge.start_vm(
    class_path=bioformats.JARS,
    max_heap_size="8G",
)

# Parse CIFs

Use the `deepometry.parse` module to transform .CIF files into NumPy arrays. In this example, the .CIF files are stored at `/data/raw/` in subdirectories corresponding to the class labels. Each filename begins with the patient name, and there may be more than one .CIF file per patient per class label.

    /data/raw/
        positive/
            A_foo.cif
            A_bar.cif
            B_foo.cif
            B_bar.cif
        negative/
            A_foo.cif
            B_foo.cif
            B_bar.cif
            B_baz.cif
        ...

Within each .CIF file, the images of the selected channels of each object are parsed into a single NumPy array, i.e. one cell yields one NumPy array that contains multiple channels. The arrays are stored at `/data/parsed/` in subdirectories corresponding to the class labels. Each array filename starts with the patient prefix, followed by a hexadecimal string.

    /data/parsed/
        positive/
            A__32e88e1ac3a8f44bf8f77371155553b9.npy
            A__3dc56a0c446942aa0da170acfa922091.npy
            B__8068ef7dcddd89da4ca9740bd2ccb31e.npy
        negative/
            A__8348deaa70dfc95c46bd02984d28b873.npy
            B__c1ecbca7bd98c01c1d3293b64cd6739a.npy
            B__c56cfb8e7e7121dd822e47c67d07e2d4.npy
        ...

In [None]:
# Root directory that is searched for raw .cif files, one subdirectory per
# class label.
src = "/data/raw/"

# Root directory for the parsed output; a subdirectory per class label is
# created beneath it when the parsing loop runs.
dest = "/data/parsed/"

# Class labels to process; each one names a subdirectory of both `src` and
# `dest`.
labels = [
    "positive",
    "negative",
]

In [None]:
# Channel indices handed to deepometry.parse.parse for every .cif file.
# NOTE(review): the order presumably determines the channel order of the
# saved arrays -- confirm against deepometry.parse.parse.
channels = [
    2, 11, 10, 1,
    3, 6, 5, 0,
]

In [None]:
# Size argument handed to deepometry.parse.parse.
# NOTE(review): presumably the side length, in pixels, of the square image
# stored for each object -- confirm against deepometry.parse.parse.
image_size = 55

In [None]:
# Parse the .cif files of every class label into arrays under the matching
# subdirectory of `dest`.
for label in labels:
    src_dir = os.path.join(src, label)
    print("Parsing directory: {}".format(src_dir))

    dest_dir = os.path.join(dest, label)
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)

    # glob gives no ordering guarantee, so sort to process files in the same
    # order on every run. On a case-sensitive filesystem the pattern matches
    # the lowercase ".cif" extension only.
    pathnames = sorted(glob.glob(os.path.join(src_dir, "*.cif")))

    if not pathnames:
        # A missing or empty source directory is most likely a path or label
        # mistake; report it instead of passing an empty list to the parser
        # and finishing as if the label had been processed.
        print("No .cif files found in: {} (skipping)".format(src_dir))
        continue

    deepometry.parse.parse(pathnames, dest_dir, image_size, channels)

print("Done")