# Cell-cell enrichment

In [None]:
import os
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist
import random
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns

## Get enrichment scores

In [None]:
# Per-cell table; the cells below read its 'fov', 'centroid-0', 'centroid-1'
# and 'cell_meta_cluster' columns
cell_tab_path = "../data/tables/cell_table_size_normalized.csv"
# One CSV of per-FOV z-scores is written here for every phenotype pair
output_dir = "../data/spatial_analysis/cell_cell_enrichment_20um/tables"
# exist_ok avoids the check-then-create race of testing os.path.exists first
os.makedirs(output_dir, exist_ok=True)

cell_table = pd.read_csv(cell_tab_path)
# Sorted unique FOV names present in the cell table
all_fovs = np.unique(cell_table['fov'])

# Seed Python's `random` module so the random relabelings below are reproducible
# when the cells are run in order
random.seed(329)

In [None]:
# Phenotype labels (values of the 'cell_meta_cluster' column) used as the fixed
# cell type (pheno1) and as the neighboring cell type (pheno2)
all_pheno1s = ['APC', 'B', 'CD11c', 'CD11c_CD14', 'CD11c_CD68', 'CD14',
               'CD14_CD68_CD163', 'CD4T', 'CD68', 'CD8T', 'Endothelial',
               'FDC', 'Foxp3', 'Mast', 'NK', 'Neutrophils', 'SMA', 'Tfh']
all_pheno2s = ['APC', 'B', 'CD11c', 'CD11c_CD14', 'CD11c_CD68', 'CD14',
               'CD14_CD68_CD163', 'CD4T', 'CD68', 'CD8T', 'Endothelial',
               'FDC', 'Foxp3', 'Mast', 'NK', 'Neutrophils', 'SMA', 'Tfh']
# Every ordered (pheno1, pheno2) pair of two different phenotypes
pheno_list = [(pheno1, pheno2) for pheno1 in all_pheno1s for pheno2 in all_pheno2s if pheno1 != pheno2]

# Two cells are "close" when their centroid distance is below this threshold
# (same units as the centroid columns)
# NOTE(review): presumably 20 um converted to pixels for 2048 px / 800 um images -- confirm
dist_thresh = 20*(2048/800)
# Number of random relabelings used to build each null distribution
bootstrap_n = 100

# The phenotype labels and centroids of a FOV do not depend on the phenotype pair,
# so extract them once here instead of re-filtering the cell table for every pair
fov_inputs = []
for fov in all_fovs:
    fov_cell_table = cell_table.loc[cell_table['fov'] == fov].reset_index(drop=True)
    fov_labels = fov_cell_table['cell_meta_cluster']
    fov_centroids = fov_cell_table[['centroid-0', 'centroid-1']].to_numpy(dtype=float)
    fov_inputs.append((fov, fov_labels, fov_centroids))

for pheno1, pheno2 in pheno_list:
    data_list = []
    for fov, fov_labels, fov_centroids in fov_inputs:
        # Row positions of the pheno1 / pheno2 cells within this FOV
        # (the FOV table index was reset, so positions and index labels coincide)
        is_pheno1 = (fov_labels == pheno1).to_numpy()
        is_pheno2 = (fov_labels == pheno2).to_numpy()
        pheno1_idx = np.flatnonzero(is_pheno1).tolist()
        pheno2_idx = np.flatnonzero(is_pheno2).tolist()

        # Check if the cell type exists in the data; if not, record an empty z
        # (written as an empty field in the CSV) and skip the distance calculation
        if not pheno1_idx or not pheno2_idx:
            data_list.append([pheno1, pheno2, fov, ""])
            continue

        # Distances from every pheno1 cell (rows) to every cell in the FOV (columns).
        # Only these rows are ever used, so the full cell-by-cell matrix is not built.
        pheno1_dist_mat = cdist(fov_centroids[pheno1_idx], fov_centroids, 'euclidean')
        # Binarize: 1 where the distance is within the threshold; distances of
        # exactly 0 (a cell compared with itself) are excluded
        bin_mask = (pheno1_dist_mat < dist_thresh) & (pheno1_dist_mat > 0)
        pheno1_dist_mat_bin = bin_mask*1

        # Keep only the pheno2 columns: shape is (# pheno1 cells, # pheno2 cells)
        true_dist_mat_bin = pheno1_dist_mat_bin[:, pheno2_idx]

        # For each pheno1 cell, count number of "close" contacts with pheno2 cells
        true_close_contacts = np.sum(true_dist_mat_bin, axis=1)
        # Take the average across all pheno1 cells
        true_close_contacts_mean = np.mean(true_close_contacts)

        # Pool of cells that may be relabeled as pheno2: every cell that is not pheno1
        # (pheno1 cells are held constant in this randomization). Taken from the boolean
        # mask, in ascending order, rather than by repeated list-membership tests.
        all_idx = np.flatnonzero(~is_pheno1).tolist()
        # Get total number of cells that are pheno2
        num_pheno2 = len(pheno2_idx)

        # Randomly sample cells to be labeled as pheno2 (bootstrapping)
        all_bootstrap = []
        for _ in range(bootstrap_n):
            # Draw num_pheno2 distinct cell positions from the pool
            random_pheno2_idx = random.sample(all_idx, num_pheno2)
            # Subset the binarized matrix to only keep these randomly selected cells
            keep_dist_mat_bin = pheno1_dist_mat_bin[:, random_pheno2_idx]
            # Count close contacts between each pheno1 cell and the randomly selected cells
            close_contacts = np.sum(keep_dist_mat_bin, axis=1)
            # Take the mean across all cells of pheno1
            close_contacts_mean = np.mean(close_contacts)
            # Add this value to the list of all bootstraps
            all_bootstrap.append(close_contacts_mean)

        # Mean and standard deviation of a normal fit to the null distribution
        muhat, sigmahat = stats.norm.fit(all_bootstrap)
        # z score of the observed mean against the null. When every bootstrap mean is
        # identical, sigmahat is 0 and z is not finite; such values are kept as-is here.
        z = (true_close_contacts_mean - muhat) / sigmahat

        data_list.append([pheno1, pheno2, fov, z])

    # One CSV per phenotype pair, one row per FOV
    save_df = pd.DataFrame(data_list, columns=['pheno1', 'pheno2', 'fov', 'z'])
    save_df.to_csv(os.path.join(output_dir, pheno1+"_"+pheno2+".csv"), index=False)

## Heatmaps of all z-scores

In [None]:
output_dir = "../data/spatial_analysis/cell_cell_enrichment_20um/tables"
# Only read CSV files (a stray non-CSV entry in the directory would break pd.read_csv);
# sorted so the processing order does not depend on the directory listing order
all_filenames = sorted(name for name in os.listdir(output_dir) if name.endswith(".csv"))
metadata_tab = pd.read_csv("../data/tables/metadata.csv")

# Groups of FOVs to summarize: (output column prefix, metadata column, value to match).
# A metadata column of None means "all FOVs".
z_groups = [
    ('all', None, None),
    ('hivneg', 'status', 'hiv_neg'),
    ('hivpos', 'status', 'hiv_pos'),
    ('hivpos_viremic', 'status_with_viremia', 'hiv_pos_viremic'),
    ('hivpos_aviremic', 'status_with_viremia', 'hiv_pos_aviremic'),
    ('hivpos_p24pos', 'status_with_p24', 'hiv_pos_p24pos'),
    ('hivpos_p24neg', 'status_with_p24', 'hiv_pos_p24neg'),
]


def _summarize_z(z_values):
    """Return [mean, std, cv] of the finite entries of a Series of z-scores.

    NaN and +/-inf entries are dropped first. The standard deviation is the one
    computed by ``np.std`` and cv is std divided by mean, so cv is not finite
    when the mean is 0 or when no finite value is left.
    """
    finite_z = z_values[np.isfinite(z_values)]
    z_mean = np.mean(finite_z)
    z_std = np.std(finite_z)
    return [z_mean, z_std, z_std / z_mean]


all_lists = []
for one_file in all_filenames:
    # Per-FOV z-scores of one phenotype pair, annotated with the FOV metadata
    # (inner merge: FOVs missing from the metadata table are dropped)
    one_tab = pd.read_csv(os.path.join(output_dir, one_file))
    one_tab = pd.merge(one_tab, metadata_tab, on='fov')

    # Each file holds a single phenotype pair, so the first unique value identifies it
    row = [np.unique(one_tab['pheno1'])[0], np.unique(one_tab['pheno2'])[0]]
    for _, meta_col, meta_val in z_groups:
        if meta_col is None:
            group_z = one_tab['z']
        else:
            group_z = one_tab.loc[one_tab[meta_col] == meta_val, 'z']
        row.extend(_summarize_z(group_z))
    all_lists.append(row)

# Columns: pheno1, pheno2, then <prefix>_mean, <prefix>_std, <prefix>_cv for each group
all_columns = ['pheno1', 'pheno2'] + [prefix + "_" + stat_name
                                      for prefix, _, _ in z_groups
                                      for stat_name in ('mean', 'std', 'cv')]
all_df = pd.DataFrame(all_lists, columns=all_columns)

In [None]:
# Summary column of all_df to display
col_name = "hivpos_p24pos_cv"
# Matrix form: one row per pheno1, one column per pheno2
plot_df = all_df.pivot(index='pheno1', columns='pheno2', values=col_name)

# Diverging colormap centered on 0, with the color scale clipped to [-15, 15]
heatmap_options = dict(cmap="coolwarm", center=0, linewidths=0.5, vmin=-15, vmax=15)
sns.heatmap(plot_df, **heatmap_options)

## Enrichment score calculation for one example image

In [None]:
# FOV used for the step-by-step walkthrough
ex_fov = "sample1_fov1"

# Cell types to look at: pheno1 cells stay fixed, pheno2 is the neighbor type
# whose labels are randomized to build the null distribution
pheno1, pheno2 = "CD8T", "APC"

# Two cells are "close" when their centroid distance is below this threshold
# NOTE(review): the batch calculation above uses 20*(2048/800) = 51.2 -- confirm
# whether the rounded value here is intentional
dist_thresh = 50

# Number of random relabelings used to generate the null distribution
bootstrap_n = 100

### Calculate the distance between all cells in the FOV

In [None]:
# Keep only the cells of the example FOV; the index is reset so that row
# positions can be used directly as indices into the distance matrix
in_example_fov = cell_table['fov'] == ex_fov
fov_cell_table = cell_table.loc[in_example_fov].reset_index(drop=True)
# One (centroid-0, centroid-1) pair per cell
all_centroids = list(zip(fov_cell_table['centroid-0'], fov_cell_table['centroid-1']))
# Pairwise Euclidean distance between all cells (cells x cells)
dist_mat = cdist(all_centroids, all_centroids, metric='euclidean')
print("Dimensions of dist_mat: ", dist_mat.shape)

dist_mat

### Count number of close contacts between pheno1 and pheno2

In [None]:
# Row positions of the cells labeled pheno1 and pheno2
cell_labels = fov_cell_table['cell_meta_cluster']
pheno1_idx = fov_cell_table[cell_labels == pheno1].index.to_list()
pheno2_idx = fov_cell_table[cell_labels == pheno2].index.to_list()

# Rows: pheno1 cells only; columns: every cell in the FOV
pheno1_dist_mat = dist_mat[pheno1_idx,:]
# True where two cells are within the threshold; a distance of exactly 0
# (a cell compared with itself) does not count as a contact
bin_mask = (pheno1_dist_mat > 0) & (pheno1_dist_mat < dist_thresh)
# Change true/false to 1/0
pheno1_dist_mat_bin = bin_mask.astype(int)

# Keep only the pheno2 columns
true_dist_mat_bin = pheno1_dist_mat_bin[:,pheno2_idx]
# Shape should be number of pheno1 cells x number of pheno2 cells.
# Each element is 1 if that pheno1 cell and that pheno2 cell are "close", else 0.
print("Shape of subsetted distance matrix: ", true_dist_mat_bin.shape)

# Number of "close" pheno2 cells for each pheno1 cell
true_close_contacts = true_dist_mat_bin.sum(axis=1)
# Average over all pheno1 cells
true_close_contacts_mean = true_close_contacts.mean()
print("Average number of close contacts between pheno1 and pheno2 cells: ", true_close_contacts_mean)

### Generate null distribution by bootstrapping

In [None]:
# Pool of cells that may be relabeled as pheno2: every cell in the FOV except the
# pheno1 cells, which are held constant in this randomization.
# A set makes each membership test O(1) instead of a scan of the pheno1_idx list.
pheno1_idx_set = set(pheno1_idx)
all_idx = [x for x in fov_cell_table.index.to_list() if x not in pheno1_idx_set]
# Get total number of cells that are pheno2
num_pheno2 = len(pheno2_idx)

# Randomly sample cells to be labeled as pheno2 (bootstrapping)
all_bootstrap = []
for _ in range(bootstrap_n):
    # Draw num_pheno2 distinct cell indices from the pool
    random_pheno2_idx = random.sample(all_idx, num_pheno2)
    # Subset the binarized matrix to only keep these randomly selected cells
    keep_dist_mat_bin = pheno1_dist_mat_bin[:,random_pheno2_idx]
    # Count close contacts between each pheno1 cell and the randomly selected cells
    close_contacts = np.sum(keep_dist_mat_bin, axis=1)
    # Take the mean across all cells of pheno1
    close_contacts_mean = np.mean(close_contacts)
    # Add this value to the list of all bootstraps
    all_bootstrap.append(close_contacts_mean)

### Compare null distribution to actual number of close contacts

In [None]:
fig, ax = plt.subplots(figsize=(5, 3))
# Histogram: null distribution of mean close contacts from the random relabelings
ax.hist(all_bootstrap, bins=10, density=True, alpha=0.5)
# Dashed red line: observed mean number of close contacts
ax.axvline(x=true_close_contacts_mean, color='red', linestyle='--', linewidth=2)
plt.show()

In [None]:
# Fit a normal distribution to the null values: muhat is its mean, sigmahat its
# standard deviation
muhat, sigmahat = stats.norm.fit(all_bootstrap)
# z score: how many null standard deviations the observed mean lies from the null mean
observed_minus_null = true_close_contacts_mean - muhat
z = observed_minus_null / sigmahat
print("z-score: ", z)