# ECM pixel cluster stats

In [None]:
# Root of the TONIC cohort share; every path used in this notebook hangs off it.
_cohort_root = "/Volumes/Shared/Noah Greenwald/TONIC_Cohort"

# Intermediate outputs of the ECM pixel clustering (pixie output, shape analysis, neighborhoods).
base_dir = _cohort_root + "/intermediate_files/ecm_pixel_clustering"
# Per-FOV image folders (one sub-folder per FOV).
tiff_dir = _cohort_root + "/image_data/samples"

## Pixel cluster counts

In [None]:
"""
Count pixel clusters

Author: Candace Liu
Date: 6/19/23

"""

import numpy as np
import skimage.io as io
import pandas as pd
import os

pixie_dir = os.path.join(base_dir, "pixie", "ecm_061423_pixel_output_dir")

# Only keep real mask files: the folder listing may contain unrelated entries
# (e.g. hidden OS files) that io.imread cannot open. This matches the filter
# used by the other cells that list the pixel_masks folder.
all_fovs = os.listdir(os.path.join(pixie_dir, "pixel_masks"))
all_fovs = [x for x in all_fovs if "_pixel_mask.tiff" in x]
all_fovs = [x.replace("_pixel_mask.tiff","") for x in all_fovs]

# One long-format table per FOV: one row per cluster id found in the mask,
# with the number of pixels carrying that id.
all_df = []
for fov in all_fovs:
    print(fov)
    one_fov = io.imread(os.path.join(pixie_dir, "pixel_masks", fov+"_pixel_mask.tiff"))
    cluster, counts = np.unique(one_fov, return_counts=True)
    df = pd.DataFrame({'pixel_meta_cluster': cluster, 'counts': counts})
    df['fov'] = fov
    all_df.append(df)

all_dat = pd.concat(all_df, ignore_index=True)

# Add pixel meta cluster names
mapping = pd.read_csv(os.path.join(pixie_dir, "pixel_channel_avg_meta_cluster.csv"))
mapping_unique = mapping[['pixel_meta_cluster','pixel_meta_cluster_rename']].drop_duplicates()
# pd.merge defaults to an inner join, so ids that are missing from the mapping
# table are dropped from the output.
all_dat = pd.merge(all_dat, mapping_unique, on='pixel_meta_cluster')
all_dat.to_csv(os.path.join(pixie_dir, "fov_pixel_cluster_counts.csv"), index=False)



## Shape Analysis
### Save binary cluster tiffs

In [None]:
"""
Save 1 binary TIFF per cluster

Author: Candace Liu
Date: 6/14/23

"""

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from matplotlib import colors
from PIL import Image
import skimage.io as io

# Input locations (pixie output) and output location (shape analysis)
pixie_dir = os.path.join(base_dir, "pixie", "ecm_061423_pixel_output_dir")
pixel_mask_dir = os.path.join(pixie_dir, "pixel_masks")
clust_to_pheno_path = os.path.join(pixie_dir, "pixel_channel_avg_meta_cluster.csv")

shape_analysis_dir = os.path.join(base_dir, "shape_analysis")
output_dir = os.path.join(shape_analysis_dir, "one_cluster_masks")

# Lookup table: meta cluster id -> renamed (human readable) cluster name
clust_to_pheno = pd.read_csv(clust_to_pheno_path)
clusters_to_name = dict(zip(clust_to_pheno.pixel_meta_cluster,
                            clust_to_pheno.pixel_meta_cluster_rename.values))

# Create the top-level output folder on first run
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# FOV names are the mask file names without the "_pixel_mask.tiff" suffix
all_fovs = [x.replace("_pixel_mask.tiff","") for x in os.listdir(pixel_mask_dir)
            if "_pixel_mask.tiff" in x]
for fov in all_fovs:

    fov_output_dir = os.path.join(output_dir, fov)
    if not os.path.exists(fov_output_dir):
        os.makedirs(fov_output_dir)

    # Load the cluster mask and list the ids present, ignoring id 0
    clust_array = np.array(io.imread(os.path.join(pixel_mask_dir, fov+"_pixel_mask.tiff")))
    fov_clusters = [x for x in np.unique(clust_array) if x != 0]

    for clust in fov_clusters:
        clust_name = clusters_to_name[clust]
        # 1 where the pixel belongs to this cluster, 0 everywhere else
        one_clust_array = (clust_array == clust).astype(np.int16)

        # One binary TIFF per cluster, named after the cluster
        im = Image.fromarray(one_clust_array)
        im.save(os.path.join(fov_output_dir, clust_name+".tiff"))

    print(fov)

### Extract properties from connected objects in cluster masks

In [None]:
import os
import numpy as np
from skimage.measure import label, regionprops_table
import skimage.io as io
from PIL import Image
import pandas as pd

shape_analysis_dir = os.path.join(base_dir, "shape_analysis")

one_cluster_dir = os.path.join(shape_analysis_dir, "one_cluster_masks")
output_dir = os.path.join(shape_analysis_dir, "object_masks")

# Only sub-folders are FOVs; ignore any stray files next to them.
all_fovs = [x for x in os.listdir(one_cluster_dir)
            if os.path.isdir(os.path.join(one_cluster_dir, x))]

# Measurements exported for every connected object
region_properties = ('label',
                     'area',
                     'centroid',
                     'perimeter',
                     'major_axis_length',
                     'minor_axis_length')

for fov in all_fovs:
    print(fov)

    fov_input_dir = os.path.join(one_cluster_dir, fov)
    fov_output_dir = os.path.join(output_dir, fov)
    os.makedirs(fov_output_dir, exist_ok=True)

    # List the clusters of *this* FOV. Taking the list from the first FOV only
    # would silently skip every cluster that happens to be absent from it.
    fov_clusters = [x.replace(".tiff","") for x in os.listdir(fov_input_dir)
                    if x.endswith(".tiff")]

    for cluster in fov_clusters:
        binary_im = io.imread(os.path.join(fov_input_dir, cluster+".tiff"))
        # connectivity=2: diagonal neighbours belong to the same object
        labels = label(binary_im, connectivity=2)

        # Save object mask
        # NOTE(review): int16 cannot hold labels above 32767; confirm no FOV
        # has more objects than that for a single cluster.
        im = Image.fromarray(labels.astype(np.int16))
        im.save(os.path.join(fov_output_dir, cluster+".tiff"))

        # Extract region props, one row per labelled object
        tab = pd.DataFrame(regionprops_table(labels, properties=region_properties))
        tab.to_csv(os.path.join(fov_output_dir, cluster+"_regions.csv"))



### Stats calculation

In [None]:
## Quantify pixel cluster object masks

import os
import pandas as pd
import numpy as np
from alpineer import io_utils

shape_analysis_dir = os.path.join(base_dir, "shape_analysis")

shape_mask_dir = os.path.join(shape_analysis_dir, 'object_masks')
fovs = io_utils.list_folders(shape_mask_dir)

clusters = ['Collagen', 'FAP', 'FAP_Collagen', 'FAP_Fibronectin', 'FAP_SMA', 'Fibronectin', 'Fibronectin_Collagen', 
            'SMA', 'SMA_Collagen', 'SMA_Fibronectin', 'Vim', 'Vim_Collagen', 'Vim_FAP', 'Vim_Fibronectin', 'Vim_SMA']


# Collect one table of per-object shape statistics for every FOV / cluster pair
all_data = []
for fov in fovs:
    fov_dir = os.path.join(shape_mask_dir, fov)
    
    for cluster in clusters:
        regions_path = os.path.join(fov_dir, cluster + '_regions.csv')
        # A regions file only exists for clusters that occur in this FOV
        if not os.path.exists(regions_path):
            continue

        cluster_object_table = pd.read_csv(regions_path)
        # The per-cluster regions files written by the region-props cell carry
        # no FOV column, so take the FOV name from the folder when it is absent.
        if 'fov' not in cluster_object_table.columns:
            cluster_object_table['fov'] = fov

        # rename() needs columns=...; without it the mapping is applied to the
        # row index and the major_axis / minor_axis columns never exist.
        cluster_object_table_new = cluster_object_table[
            ['fov', 'area', 'major_axis_length', 'minor_axis_length']
        ].rename(columns={'major_axis_length': 'major_axis',
                          'minor_axis_length': 'minor_axis'})
        
        cluster_object_table_new['cluster'] = cluster
        # Elongation measures; a zero minor axis yields inf in axis_ratio
        cluster_object_table_new['axis_ratio'] = cluster_object_table_new.major_axis / cluster_object_table_new.minor_axis
        cluster_object_table_new['axis_diff_norm'] = (cluster_object_table_new.major_axis - cluster_object_table_new.minor_axis)/ cluster_object_table_new.area
        
        all_data.append(cluster_object_table_new)
        
all_data = pd.concat(all_data)
all_data.to_csv(os.path.join(shape_mask_dir, 'object_mask_properties.csv'))

In [None]:
## Summarize FOV level data

mask_data = pd.read_csv(os.path.join(shape_mask_dir, "object_mask_properties.csv"))

# Filter for objects with area greater than 10
mask_data = mask_data[mask_data.area > 10]

# Mean of each shape statistic per FOV and cluster
cluster_stats = (
    mask_data[['fov', 'cluster', 'axis_ratio', 'axis_diff_norm']]
    .groupby(by=['fov', 'cluster'])
    .mean()
    .reset_index()
)
# Turn infinite means into NaN so dropna() below removes them.
# replace(..., inplace=True) returns None, so the result must not be assigned
# back when inplace is used; use the returned copy instead.
cluster_stats = cluster_stats.replace([np.inf, -np.inf], np.nan)

# One output file per statistic, each without the rows where it is undefined
ratio_mean = cluster_stats[['fov', 'cluster', 'axis_ratio']].dropna()
ratio_mean.to_csv(os.path.join(shape_mask_dir,  "fov_object_mean_ratio.csv"), index=False)

diff_norm_mean = cluster_stats[['fov', 'cluster', 'axis_diff_norm']].dropna()
diff_norm_mean.to_csv(os.path.join(shape_mask_dir,  "fov_object_mean_diff_norm.csv"), index=False)

## Neighborhood Analysis
### Kmeans clustering

In [None]:
import os
import pandas as pd
import numpy as np
from PIL import Image
import time
import feather
from joblib import Parallel, delayed

# Inputs
pixie_dir = os.path.join(base_dir, "pixie", "ecm_061423_pixel_output_dir")
cluster_overlay_dir = os.path.join(pixie_dir, "pixel_masks")

# Parameters
step_size = 1
window_size = 50
training_subset = 0.1
seed = 329

# Outputs (relative to the working directory): the full per-FOV feature
# tables and the random subsets kept for training
output_dir = "neighborhood_mats"+"_window"+str(window_size)
subset_dir = "neighborhood_mats_subset"+"_window"+str(window_size)

# Create both output folders on first run
for out_path in (output_dir, subset_dir):
    if not os.path.exists(out_path):
        os.makedirs(out_path)

# Every FOV that has a pixel mask
all_fovs = [x.replace("_pixel_mask.tiff","") for x in os.listdir(cluster_overlay_dir)
            if "_pixel_mask.tiff" in x]
# FOVs whose training subset has already been written
done_fovs = [x.replace(".feather","") for x in os.listdir(subset_dir)
             if ".feather" in x]
# Remaining work: FOVs without a subset file yet
fovs = [x for x in all_fovs if x not in done_fovs]

# Cluster ids to count inside every window
cluster_data = pd.read_csv(os.path.join(pixie_dir, "pixel_channel_avg_meta_cluster.csv"))
clusters = cluster_data['pixel_meta_cluster'].values

# Create generator for all windows
def sliding_window(image, windowSize):
    """Yield a clamped window around every non-zero pixel of a 2D image.

    Args:
        image: 2D array indexed as ``image[x, y]``, where ``x`` runs over
            axis 0 and ``y`` over axis 1.
        windowSize: number of pixels the window extends from the centre
            along each axis.

    Yields:
        Tuples ``(x, y, startx, endx, starty, endy)`` meant to be used as
        ``image[startx:endx, starty:endy]``. The bounds are
        ``[x - windowSize, x + windowSize)`` and
        ``[y - windowSize, y + windowSize)``, clamped to the image; the end
        bound is exclusive. Pixels equal to 0 are skipped.
    """
    n_rows, n_cols = image.shape[0], image.shape[1]
    for x in range(n_rows):
        startx = max(x - windowSize, 0)
        # x indexes axis 0, so it must be clamped with the axis-0 length
        endx = min(x + windowSize, n_rows)
        for y in range(n_cols):
            # Don't return if pixel is 0
            if image[x, y] != 0:
                starty = max(y - windowSize, 0)
                # y indexes axis 1, so it must be clamped with the axis-1 length
                endy = min(y + windowSize, n_cols)
                yield (x, y, startx, endx, starty, endy)

# Count clusters in each window
def extractFeatures(im, window):
    """Count the pixels of each cluster inside one window.

    Args:
        im: 2D cluster mask.
        window: tuple ``(x, y, startx, endx, starty, endy)`` as produced by
            ``sliding_window``.

    Returns:
        A list ``[x, y, n_1, n_2, ...]`` where ``n_i`` is the number of pixels
        in ``im[startx:endx, starty:endy]`` equal to the i-th entry of the
        cell-level ``clusters``.
    """
    x, y, startx, endx, starty, endy = window
    patch = im[startx:endx, starty:endy]
    per_cluster = [np.count_nonzero(patch == c) for c in clusters]
    return [x, y] + per_cluster

# Get neighbors for each pixel
def one_fov(fov):
    """Build and save the per-pixel neighborhood feature table of one FOV.

    Reads ``<fov>_pixel_mask.tiff`` from ``cluster_overlay_dir``, counts the
    clusters in the window around every non-zero pixel and writes two feather
    files named ``<fov>.feather``: the full table into ``output_dir`` and a
    random ``training_subset`` fraction of its rows into ``subset_dir``.

    Args:
        fov: FOV name (mask file name without the ``_pixel_mask.tiff`` suffix).
    """
    im = np.array(Image.open(os.path.join(cluster_overlay_dir, fov+"_pixel_mask.tiff")))

    features = []
    for window in sliding_window(im, window_size):
        featureVector = extractFeatures(im, window)
        # Sum up features (first two elements are x and y); skip windows that
        # contain none of the listed clusters
        if np.sum(featureVector[2:]) != 0:
            featureVector.insert(0, fov)
            features.append(featureVector)

    header = ['fov','x','y'] + ['clust_'+str(c) for c in clusters]
    if features:
        # NOTE(review): each row mixes the FOV name with integers, so
        # np.asarray stores every value as text; the downstream cells cast the
        # numeric columns back to int. Kept as is so the files stay
        # compatible with FOVs that were already processed.
        df = pd.DataFrame(np.asarray(features), columns=header)
    else:
        # A mask without any usable pixel gives no rows. Build an empty table
        # with the right columns instead of failing on the header assignment.
        df = pd.DataFrame(columns=header)

    # Write to file
    feather.write_dataframe(df, os.path.join(output_dir,fov+'.feather'), compression='uncompressed')

    # Get random subset for training
    df_subset = df.sample(frac=training_subset, random_state=seed)
    feather.write_dataframe(df_subset, os.path.join(subset_dir,fov+'.feather'), compression='uncompressed')

# Process the remaining FOVs on 4 workers; one_fov writes its outputs to disk
# and returns nothing, so `results` only holds one None per FOV.
fov_jobs = (delayed(one_fov)(fov_name) for fov_name in fovs)
results = Parallel(n_jobs=4)(fov_jobs)

In [None]:
import feather
import os
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from ark.analysis import visualize
from scipy.stats import zscore
from PIL import Image
import joblib

train_dat_dir = "neighborhood_mats_subset_window50"
all_fov_dir = "neighborhood_mats_window50"
pixie_dir = os.path.join(base_dir, "pixie", "ecm_061423_pixel_output_dir")

clust_to_pheno_path = os.path.join(pixie_dir, "pixel_channel_avg_meta_cluster.csv")
neighborhood_dir = os.path.join(base_dir, "neighborhood", "neighborhood_masks")
cluster_num  = 5
seed = 329

# Make output directory
if not os.path.exists(neighborhood_dir):
    os.makedirs(neighborhood_dir)

# Read in data. Only feather files are training tables; skip anything else
# that may sit in the folder (same filter as the full-FOV listing further down).
train_fovs = [x for x in os.listdir(train_dat_dir) if ".feather" in x]
train_data = pd.concat(
    [feather.read_dataframe(os.path.join(train_dat_dir,fov)) for fov in train_fovs]
)
# Feature columns are the per-cluster counts; make sure they are numeric
train_cols = [x for x in train_data.columns.values if "clust_" in x]
train_data[train_cols] = train_data[train_cols].astype(np.int32)

# Get phenotype mapping (meta cluster id -> renamed cluster name)
clust_to_pheno = pd.read_csv(clust_to_pheno_path)
clust_to_pheno_dict = pd.Series(clust_to_pheno.pixel_meta_cluster_rename.values, index=clust_to_pheno.pixel_meta_cluster).to_dict()

# k-means clustering; the fitted model is saved so it can be reloaded later
cluster_fit = KMeans(n_clusters=cluster_num, random_state=seed, n_init=10).fit(train_data[train_cols])
joblib.dump(cluster_fit, "kmeans_model.joblib")
# Neighborhoods are numbered from 1 (k-means labels start at 0)
train_data['neighborhood'] = cluster_fit.labels_ + 1

# Make heatmap of the z-scored mean cluster counts of every neighborhood
keep_cols = train_cols+['neighborhood']
train_data_keep = train_data[keep_cols]
mean_dat = train_data_keep.groupby('neighborhood', as_index=False).mean(numeric_only=True)
mean_dat.to_csv("neighborhood_cluster_mean.csv", index=False)
mean_dat_values = mean_dat.drop('neighborhood', axis=1)
visualize.draw_heatmap(data=mean_dat_values.apply(zscore).values,
                       x_labels=["Cluster"+str(x) for x in mean_dat['neighborhood'].values],
                       y_labels=[clust_to_pheno_dict[int(x.replace("clust_",""))] for x in mean_dat_values.columns.values],
                       center_val=0,
                       save_dir=".",
                       save_file="training_k"+str(cluster_num)+"_heatmap")

## Assign neighborhood to all FOVs
all_fovs = [x.replace(".feather","") for x in os.listdir(all_fov_dir) if ".feather" in x]

# Define function to assign labels
def assign_label(clust_array,x,y,clust):
    """Write `clust` into `clust_array[x, y]` in place.

    `x`, `y` and `clust` may be scalars or equal-length arrays; with arrays
    all positions are filled in a single indexing operation.
    """
    clust_array[x,y] = clust

keep_cols = ['x','y']
for fov in all_fovs:
    print(fov)
    fov_data = feather.read_dataframe(os.path.join(all_fov_dir,fov+".feather"))

    # Get size
    sample_im = np.array(Image.open(os.path.join(tiff_dir,fov,"Au.tiff")))
    # Create array for cluster labels, fill with 0's (0 = no neighborhood)
    nh_array = np.full((sample_im.shape), 0, dtype=int)

    # A table without rows has nothing to predict; its mask stays all 0
    if len(fov_data) > 0:
        # Same numeric cast as applied to the training data
        fov_data[train_cols] = fov_data[train_cols].astype(np.int32)
        # predict() returns labels starting at 0. Shift by 1 as done for the
        # training labels, otherwise the first neighborhood is written as 0
        # and becomes indistinguishable from the empty background.
        fov_data['neighborhood'] = cluster_fit.predict(fov_data[train_cols]) + 1
        fov_data[keep_cols] = fov_data[keep_cols].astype(np.int32)

        # Fill in array
        assign_label(nh_array,
                     fov_data['x'].values,
                     fov_data['y'].values,
                     fov_data['neighborhood'].values)

    # Save
    im = Image.fromarray(nh_array.astype(np.int32))
    im.save(os.path.join(neighborhood_dir, fov+"_nh_mask.tiff"))



### Generate FOV neighborhood counts

In [None]:
import skimage.io as io
import pandas as pd
import numpy as np
import os

# Folder the neighborhood masks were written to. Defined here so the cell does
# not depend on a variable left over from another cell, and so the masks are
# read from the same place they were saved.
neighborhood_dir = os.path.join(base_dir, "neighborhood", "neighborhood_masks")
neighborhood_mask_dir = neighborhood_dir
num_clusters = 5

all_fovs = [x.replace("_nh_mask.tiff","") for x in os.listdir(neighborhood_mask_dir)
            if "_nh_mask.tiff" in x]

# Output columns: fov, Cluster1..ClusterN, total
cluster_ids = list(range(1,num_clusters+1))
cluster_cols = ['Cluster'+str(x) for x in cluster_ids]
cols = ['fov']+cluster_cols+['total'] 

rows = []
for fov in all_fovs:
    print(fov)
    nh_mask = io.imread(os.path.join(neighborhood_mask_dir, fov+"_nh_mask.tiff"))
    unique, counts = np.unique(nh_mask, return_counts=True)
    count_dict = dict(zip(unique.tolist(), counts.tolist()))

    # Pixels per neighborhood; neighborhoods absent from this FOV count as 0
    row = {'fov': fov}
    for col, cluster_id in zip(cluster_cols, cluster_ids):
        row[col] = count_dict.get(cluster_id, 0)
    # Total number of pixels assigned to any neighborhood (non-zero mask pixels)
    row['total'] = int(np.count_nonzero(nh_mask))
    rows.append(row)

all_df = pd.DataFrame(rows, columns=cols)
all_df.to_csv(os.path.join(neighborhood_dir, "fov_neighborhood_counts.csv"), index=False)
