### Map each cell label to its sorted boundary coordinates across image planes.

In [23]:
import pandas as pd
import numpy as np
from collections import defaultdict
from scipy.sparse import coo_matrix
import json
import pciSeq
import os
from joblib import Parallel, delayed
from pciSeq.src.preprocess.cell_processing import extract_borders
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
import time

In [24]:
# Root folder of the pipeline output; the mask file paths are joined onto it.
root_dir = r'/home/dimitris/data/Izzie/Aang_coppa_v1_0_0_output/'

In [32]:
# Load the saved mask stack and keep every slice along its first axis as a
# sparse COO matrix.
# Alternative input: os.path.join(root_dir, 'v1-aang2D_unfiltered-dapi_masks.npy')
ca1_masks = [
    coo_matrix(plane)
    for plane in np.load(
        os.path.join(root_dir, '3d_masks', 'ca1_masks_normalised.npy')
    )
]

In [34]:
# extract_borders(ca1_masks[54].toarray().astype(np.uint32)).head(2).to_clipboard()

### Simple function

In [35]:
def process_ca1_masks(ca1_masks):
    """Collect the cell boundaries of every plane, grouped by cell label.

    Args:
        ca1_masks: Iterable of per-plane masks. Each item must provide
            ``toarray()``; the dense array is cast to ``np.uint32`` and
            passed to ``extract_borders``.

    Returns:
        dict: ``{label: {plane_num: coords}}``, where ``plane_num`` is the
        position of the plane in ``ca1_masks`` and each inner dict is
        ordered by ascending plane number. Labels appear in the order they
        are first encountered while walking the planes in sequence. An
        empty ``ca1_masks`` yields an empty dict.
    """
    label_coords_dict = defaultdict(dict)

    # Fill the mapping plane by plane. This avoids concatenating all frames
    # first (pd.concat raises on an empty list) and the slow row-wise
    # iterrows() pass over the combined frame.
    for plane_num, mask in enumerate(ca1_masks):
        df_plane = extract_borders(mask.toarray().astype(np.uint32))
        # itertuples() yields nothing for a plane without cells, so the
        # 'label'/'coords' columns are only read when rows exist.
        for row in df_plane.itertuples(index=False):
            label_coords_dict[row.label][plane_num] = row.coords

    # Planes are visited in ascending order already; the sort keeps the
    # ordering guarantee explicit.
    return {
        label: dict(sorted(plane_dict.items()))
        for label, plane_dict in label_coords_dict.items()
    }

In [36]:
# Time the serial implementation. perf_counter() is a monotonic,
# high-resolution clock meant for measuring elapsed intervals, whereas
# time.time() follows the wall clock and can jump if the system time changes.
start_time = time.perf_counter()
out_1 = process_ca1_masks(ca1_masks)
end_time = time.perf_counter()
print(f"Execution time: {end_time - start_time:.2f} seconds")

Execution time: 47.11 seconds


### Optimised, using ProcessPoolExecutor

In [37]:
def process_one_plane(args):
    """Extract the cell borders of a single plane.

    Args:
        args: ``(plane_num, mask)`` pair; ``mask`` must provide
            ``toarray()``. Packed into one argument so a whole job can be
            handed to an executor as a single value.

    Returns:
        list: One ``(label, plane_num, coords)`` tuple per row of the
        frame returned by ``extract_borders``.
    """
    plane_idx, sparse_mask = args
    borders = extract_borders(sparse_mask.toarray().astype(np.uint32))

    triples = []
    for rec in borders.itertuples(index=False):
        triples.append((rec.label, plane_idx, rec.coords))
    return triples

def process_ca1_masks_parallel(ca1_masks, n_workers=None):
    """Collect per-label cell boundaries, processing planes in parallel.

    Each plane is handed to ``process_one_plane`` in a worker process and
    the returned ``(label, plane, coords)`` tuples are merged into a single
    mapping.

    Args:
        ca1_masks: Iterable of per-plane masks; the position of a mask in
            the iterable is used as its plane number.
        n_workers: Forwarded as ``max_workers`` to ``ProcessPoolExecutor``;
            ``None`` lets the executor pick its default.

    Returns:
        dict: ``{label: {plane: coords}}`` with each inner dict ordered by
        ascending plane number. Labels appear in the order they are first
        encountered when walking the planes in sequence. An empty input
        yields an empty dict.
    """
    jobs = list(enumerate(ca1_masks))
    if not jobs:
        # Nothing to process; do not start a process pool for no work.
        return {}

    label_coords = defaultdict(dict)

    with ProcessPoolExecutor(max_workers=n_workers) as exe:
        # map() submits every job up front and yields the results in
        # submission (plane) order, so the label order of the output does
        # not depend on which worker happens to finish first.
        for plane_results in exe.map(process_one_plane, jobs):
            for label, plane, coords in plane_results:
                label_coords[label][plane] = coords

    # sort plane dicts
    return {
        lbl: dict(sorted(planes.items()))
        for lbl, planes in label_coords.items()
    }



In [38]:
# Time the parallel implementation. perf_counter() is a monotonic,
# high-resolution clock meant for measuring elapsed intervals, whereas
# time.time() follows the wall clock and can jump if the system time changes.
start_time = time.perf_counter()
out_2 = process_ca1_masks_parallel(ca1_masks)
end_time = time.perf_counter()
print(f"Execution time: {end_time - start_time:.2f} seconds")

Execution time: 23.22 seconds


### Save as a json file

In [39]:
# Wrap the result with a metadata section describing the layout of the file.
# json.dump coerces dict keys to strings, so the description states that the
# integer labels and plane numbers appear as string keys in the JSON output.
to_dump = {
    "__metadata__": {
        "description": "cell_boundaries maps each cell label to a dict; "
                       "for each label (int), the value is another dict "
                       "mapping plane numbers (int) to a list of "
                       "boundary coordinates, where each coordinate "
                       "is [x, y]. Note: JSON object keys are always "
                       "strings, so labels and plane numbers are stored "
                       "as strings in this file."
    },
    "cell_boundaries": out_2
}

# Write it out (explicit encoding so the result does not depend on the
# platform default).
with open("cell_boundaries.json", "w", encoding="utf-8") as f:
    json.dump(to_dump, f, indent=2)

In [None]:
# Check that the serial and parallel results agree: both must contain the
# same set of labels and, for every label, equal per-plane boundaries.
# Comparing the key sets first also catches labels present in only one result.
out_1.keys() == out_2.keys() and all(out_2[i] == out_1[i] for i in out_1)

In [12]:
# Display the list of per-plane sparse masks (notebook cell output).
ca1_masks

[<COOrdinate sparse matrix of dtype 'uint16'
 	with 0 stored elements and shape (2000, 5130)>,
 <COOrdinate sparse matrix of dtype 'uint16'
 	with 0 stored elements and shape (2000, 5130)>,
 <COOrdinate sparse matrix of dtype 'uint16'
 	with 8888 stored elements and shape (2000, 5130)>,
 <COOrdinate sparse matrix of dtype 'uint16'
 	with 51507 stored elements and shape (2000, 5130)>,
 <COOrdinate sparse matrix of dtype 'uint16'
 	with 121652 stored elements and shape (2000, 5130)>,
 <COOrdinate sparse matrix of dtype 'uint16'
 	with 256100 stored elements and shape (2000, 5130)>,
 <COOrdinate sparse matrix of dtype 'uint16'
 	with 454727 stored elements and shape (2000, 5130)>,
 <COOrdinate sparse matrix of dtype 'uint16'
 	with 742317 stored elements and shape (2000, 5130)>,
 <COOrdinate sparse matrix of dtype 'uint16'
 	with 1059724 stored elements and shape (2000, 5130)>,
 <COOrdinate sparse matrix of dtype 'uint16'
 	with 1340523 stored elements and shape (2000, 5130)>,
 <COOrdinat