## Combine Nuclear Centroid Files

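This script combines the per-chunk nuclear centroid files (raw int64 records of segment ID, summed X/Y/Z coordinates, and voxel count) into a single table with one centroid per nucleus, and writes the result to a CSV file.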

In [None]:
import numpy as np
import os
import pandas as pd
from cloudfiles import CloudFiles
import matplotlib.pyplot as plt

In [None]:
# Where the centroid files are read from: either a local path or a
# "gs:" (Google Storage) link.
input_path = "gs://zheng_mouse_hippocampus_scratch_30/nuclei/com/seg/20240519214242"
# Where the combined CSV is written; for now this has to be a local path.
output_path = "~/zheng-mouse-hippo/nucleus-centroids.csv"

In [None]:
# Load every centroid file found at input_path and stack the records into a
# single int64 array with 5 columns. Each file is read as raw int64 values
# and reshaped to 5 values per row.
com_list = []
if input_path.startswith("gs:"):
    # Cloud source: download everything CloudFiles lists under input_path.
    from cloudfiles import CloudFiles
    cf = CloudFiles(input_path)
    files = cf.get(list(cf))
    for f in files:
        print(f"\n{f['path']}", end="")
        com_list.append(np.frombuffer(f["content"], dtype=np.int64).reshape([-1, 5]))
    print('\n')
else:
    # Local source: read every file in the *input* directory. The files must
    # be listed and opened relative to input_path; output_path is only the
    # CSV destination and is not a directory of centroid files.
    for fn in os.listdir(input_path):
        fp = os.path.join(input_path, fn)
        with open(fp, "rb") as f:
            print(f"\r{fn}", end="")
            com_list.append(np.frombuffer(f.read(), dtype=np.int64).reshape([-1, 5]))
# np.concatenate raises ValueError if no files were found (empty list).
centroids = np.concatenate(com_list)

In [None]:
# Widen NumPy's print line width to 120 characters so the 5-column rows are
# less likely to wrap when the array is displayed.
np.set_printoptions(linewidth=120)
# Bare expression as the last line of the cell: the notebook displays it.
centroids

Each row of the data above consists of a segment (nucleus) ID, followed by the sums of the X, Y, and Z values and a voxel count (mass). A single ID may appear on multiple rows (when a nucleus is split across chunks or otherwise oversegmented). To collapse this down into a single centroid for each nucleus, we need to group by ID, sum across the rows in each group, and then divide the coordinate sums by the mass (total voxel count).

In [None]:
# Key the table by segment ID (column 0 of the array) and add up all rows
# that share an ID, giving per-nucleus coordinate sums and total mass.
df = (
    pd.DataFrame(
        data=centroids[:, 1:],
        index=centroids[:, 0],
        columns=["x", "y", "z", "m"],
    )
    .groupby(level=0)
    .sum()
)
# Centroid = coordinate sums divided by total voxel count, rounded to whole
# numbers; the mass column is carried along unchanged.
df64 = (
    df.loc[:, ["x", "y", "z"]]
    .div(df["m"], axis="index")
    .round()
    .astype(int)
    .assign(m=df["m"])
)
df64.to_csv(output_path, index_label="id")
df64

In [None]:
# Plot a histogram of "m" (total volume) values
axes = df['m'].hist(bins=30)  # You can adjust the number of bins as needed
axes.set_yscale('log')
plt.title('Distribution of Nucleus Volume')
plt.xlabel('Volume')
plt.ylabel('Frequency')

In [None]:
# Same as above, but using log binning
min_value = df['m'].min()
max_value = df['m'].max()
bins = np.logspace(np.log10(min_value), np.log10(max_value), num=30)  # Adjust the number of bins as needed
df['m'].hist(bins=bins)
plt.xscale('log')
plt.yscale('log')
plt.title('Distribution of Nucleus Volume')
plt.xlabel('Volume')
plt.ylabel('Frequency')

In [None]:
# Pick a threshold based on the above, and filter out all the small junk.
threshold = 60000

df_filtered = df64[df64['m'] >= threshold]
df_filtered

In [None]:
# Write the filtered table next to the unfiltered CSV, inserting "-filtered"
# before the file extension. os.path.splitext is used instead of slicing off
# the last four characters so that an output_path with a different (or no)
# extension is not mangled; for a ".csv" path the result is unchanged.
out_stem, out_ext = os.path.splitext(output_path)
df_filtered.to_csv(out_stem + "-filtered" + (out_ext or ".csv"), index_label="id")