In [None]:
import glob
import os.path

import bioformats
import javabridge
import numpy

import deepometry.parse

# Parse

Use the `deepometry.parse` module to transform .CIF files into NumPy arrays. In this example, our .CIF images are stored at `/data/cifs` in subdirectories corresponding to the class label. Image filenames begin with the patient number, and there may be more than one image per patient per class label.

    /data/cifs/
        A/
            001_foo.cif
            001_bar.cif
            002_foo.cif
            003_foo.cif
        B/
            001_foo.cif
            002_foo.cif
            002_bar.cif
            003_foo.cif
        ...

For each class label, we aggregate all of a patient's data into a single NumPy array. The arrays are stored at `/data/parsed` in subdirectories corresponding to the input data's class label. Each array's filename is the patient number.

    /data/parsed/
        A/
            001.npy
            002.npy
            003.npy
        B/
            001.npy
            002.npy
            003.npy
        ...

In [None]:
# Start the Java virtual machine with the Bio-Formats JARs on its class path
# and a maximum heap size of "8G".
# NOTE(review): presumably the JVM must be running before deepometry.parse
# can read .CIF files through bioformats -- confirm.
javabridge.start_vm(class_path=bioformats.JARS, max_heap_size="8G")

In [None]:
# Root directory holding the input .CIF files, one subdirectory per class label.
src = "/data/cifs"

# Root directory the parsed .npy arrays are written to, one subdirectory per class label.
dest = "/data/parsed"

# Class labels to process; each one is joined onto both `src` and `dest`
# to form the per-label input and output directories.
labels = ["A", "B", "C", "D"]

In [None]:
# Patient numbers to process. Each is used as a filename prefix when globbing
# for that patient's .cif files, and as the stem of the saved .npy file.
patients = ["001", "002", "003"]

In [None]:
# Passed as the third argument to deepometry.parse.parse.
# NOTE(review): presumably the indices of the image channels to extract -- confirm.
channels = [0, 5, 6]

In [None]:
# Passed as the second argument to deepometry.parse.parse.
# NOTE(review): presumably the side length, in pixels, of each parsed image -- confirm.
image_size = 48

In [None]:
# For every class label, parse each patient's .CIF files and save one
# aggregated array per (label, patient) at <dest>/<label>/<patient>.npy.
for label in labels:
    print("Parsing directory: {}".format(label))

    src_dir = os.path.join(src, label)

    dest_dir = os.path.join(dest, label)

    # numpy.save does not create missing parent directories, so make sure the
    # per-label output directory exists before anything is written into it.
    if not os.path.isdir(dest_dir):
        os.makedirs(dest_dir)

    for patient in patients:
        # glob returns matches in arbitrary order; sort them so the row order
        # of the saved array is reproducible from run to run.
        # NOTE(review): the prefix pattern would also match a longer patient
        # number that starts with the same digits (e.g. "0010_foo.cif").
        pathnames = sorted(glob.glob(os.path.join(src_dir, "{}*.cif".format(patient))))

        # Not every patient necessarily has data for every label, and
        # numpy.concatenate raises ValueError on an empty list. Skip the
        # patient instead of aborting the whole run.
        if not pathnames:
            print("\tskipped patient {} (no .cif files found)".format(patient))
            continue

        parsed = [deepometry.parse.parse(pathname, image_size, channels) for pathname in pathnames]

        # Stack the per-file arrays along the first axis into one array per patient.
        images = numpy.concatenate(parsed)

        numpy.save(os.path.join(dest_dir, "{}.npy".format(patient)), images)

        print("\tparsed patient {}".format(patient))