In [None]:
# Parameters
# Inputs: the cell table (read with anndata) and the region polygons (read with geopandas).
ad_path = "/home/x-aklein2/projects/aklein/BICAN/BG/data/BICAN_BG_CPS.h5ad"
geom_store_path = "/home/x-aklein2/projects/aklein/BICAN/BG/data/regions/region_geometries_cps.parquet"

# Directory that receives one CSV per section / label level / region type.
output_path = "/home/x-aklein2/projects/aklein/BICAN/BG/data/CPS/ms_enrichment"

# Number of permutations used to build each null distribution.
N_permute = 1_000

In [None]:
# imports
import os
from pathlib import Path
import itertools
from tqdm import tqdm

import numpy as np
import pandas as pd
import anndata as ad
from scipy.stats import norm
from statsmodels.stats.multitest import multipletests

import geopandas as gpd

### Helper functions

In [None]:
def permute_geometry(geometry_col, rng=None):
    """
    Randomly permute a GeoSeries of point geometries.

    The geometries are reassigned to the existing index labels in a random
    order; the index itself (and the CRS) is left untouched, so the result
    can be assigned straight back to the frame it came from.

    Parameters
    ----------
    geometry_col : geopandas.GeoSeries
        A column of shapely Points (e.g., gdf.geometry). Anything else is
        first wrapped in a GeoSeries.
    rng : numpy.random.Generator, optional
        Random generator used to draw the permutation. When omitted, the
        legacy global NumPy state (``np.random``) is used, so existing
        ``np.random.seed`` calls keep working.

    Returns
    -------
    geopandas.GeoSeries
        Shuffled GeoSeries (same geometries, same index, new order).
    """
    # Ensure input is a GeoSeries
    if not isinstance(geometry_col, gpd.GeoSeries):
        geometry_col = gpd.GeoSeries(geometry_col)

    # Draw a random ordering of the row positions rather than shuffling the
    # geometry objects themselves.
    n_geoms = len(geometry_col)
    order = np.random.permutation(n_geoms) if rng is None else rng.permutation(n_geoms)

    # Rebuild a real GeoSeries (the previous implementation returned a bare
    # object ndarray, dropping index and CRS despite the documented contract).
    return gpd.GeoSeries(
        geometry_col.values[order],
        index=geometry_col.index,
        crs=geometry_col.crs,
    )

# The functions below come from the xingjiepan 2023 mouse atlas paper.
def adjust_p_value_matrix_by_BH(p_val_mtx):
    '''Apply Benjamini/Hochberg correction to a symmetric p-value matrix.

    Only the upper triangle (diagonal included) is read, so each symmetric
    pair is counted once in the correction; the adjusted values are then
    written to both the upper and the lower triangle of the result.
    '''
    size = p_val_mtx.shape[0]

    # Row-major upper-triangle positions: (0,0), (0,1), ..., (1,1), (1,2), ...
    upper_rows, upper_cols = np.triu_indices(size)

    corrected = multipletests(p_val_mtx[upper_rows, upper_cols], method='fdr_bh')[1]

    adjusted_p_val_mtx = np.zeros((size, size))
    adjusted_p_val_mtx[upper_rows, upper_cols] = corrected
    adjusted_p_val_mtx[upper_cols, upper_rows] = corrected

    return adjusted_p_val_mtx

def one_sided_pval(real, null_dist, alternative="observed"):
    """
    Z-score each real value against its null distribution and derive p-values.

    For every key the null mean and standard deviation (population std,
    floored at 1e-6 to avoid division by zero) are used to compute a z-score,
    which is converted to a p-value under a normal approximation. The raw
    p-values of all keys are then corrected together with Benjamini/Hochberg.

    Parameters
    ----------
    real : dict
        Dict of cell_type to real value
    null_dist : dict(array-like)
        Dict of cell type to array of the null distribution values. Must
        contain every key of ``real``.
    alternative : {"observed", "greater", "less", "two-sided"}, optional
        Which tail to evaluate.

        * ``"observed"`` (default, the historical behaviour): upper tail of
          ``|z|``, i.e. the one-sided tail on whichever side the real value
          lies. NOTE(review): because the direction is chosen after seeing
          the data, this equals half of the two-sided p-value; use
          ``"two-sided"`` if no direction was fixed in advance.
        * ``"greater"``: enrichment only, ``P(Z >= z)``.
        * ``"less"``: depletion only, ``P(Z <= z)``.
        * ``"two-sided"``: ``2 * P(Z >= |z|)``.

    Returns
    -------
    z_scores : dict
        Z-scores.
    p_vals : dict
        Raw p-values.
    adj_p_value : dict
        Adjusted p-values (Benjamini/Hochberg).

    Raises
    ------
    ValueError
        If ``alternative`` is not one of the supported options.
    """
    if alternative not in ("observed", "greater", "less", "two-sided"):
        raise ValueError(
            f"alternative must be 'observed', 'greater', 'less' or 'two-sided', got {alternative!r}"
        )

    z_scores = {}
    p_vals = {}
    for _key, _real in real.items():
        _null_dist = null_dist[_key]
        null_mean = np.mean(_null_dist)
        # Floor the spread so a constant null distribution does not divide by zero.
        null_std = np.maximum(np.std(_null_dist), 1e-6)
        z_score = (_real - null_mean) / null_std

        if alternative == "observed":
            p = norm.sf(np.abs(z_score))
        elif alternative == "greater":
            p = norm.sf(z_score)
        elif alternative == "less":
            p = norm.cdf(z_score)
        else:  # "two-sided"
            p = 2 * norm.sf(np.abs(z_score))

        z_scores[_key] = z_score
        p_vals[_key] = p

    # Nothing to correct: return empty results instead of handing an empty
    # list to multipletests.
    if not p_vals:
        return z_scores, p_vals, {}

    adj_p_value = multipletests(list(p_vals.values()), method='fdr_bh')[1]
    return z_scores, p_vals, dict(zip(p_vals.keys(), adj_p_value))

### Read inputs

In [None]:
# Create the output directory (and any missing parents); a no-op if it already exists.
os.makedirs(output_path, exist_ok=True)

In [None]:
# Region polygons, one row per annotated geometry.
geoms = gpd.read_parquet(geom_store_path)
# Cell-level table; only `.obs` (metadata and cell centres) is used below.
adata = ad.read_h5ad(ad_path)

In [None]:
# Every (donor, brain region, replicate) combination is visited by the loops below.
obs = adata.obs
donors = obs['donor'].unique().tolist()
replicates = obs['replicate'].unique().tolist()

# Fixed subset of regions; the full list would be adata.obs['brain_region'].unique().tolist()
brain_regions = [
    'CAH',
    'CAB',
    'PU',
    'NAC',
    'MGM1',
    'SUBTH',
]

# (donor, brain_region, replicate) combinations that are never processed.
skip = [("UWA7648", "CAT", _lab) for _lab in ("ucsd", "salk")]

In [None]:
# White-matter enrichment, run independently for every
# donor x brain region x replicate section.
#
# For each section:
#   1. find the cells whose centre lies within a White_Matter polygon of THAT
#      section (the polygons are taken from geoms_sub, not from all sections);
#   2. count the cells inside, per Subclass and per Group;
#   3. build the null by shuffling which cells are "inside". This is the same
#      as permuting the cell coordinates and redoing the spatial join, but the
#      join only has to be computed once;
#   4. z-score / p-value the real counts against the null and write one CSV
#      per label level.
label_columns = {"subclass": "Subclass", "group": "Group"}   # file-name prefix -> obs column
region_types = {"white_matter": "White_Matter"}              # file-name suffix -> geoms['type'] value
rng = np.random.default_rng(0)  # fixed seed so a fresh run reproduces the same null distributions

n_sections = len(donors) * len(brain_regions) * len(replicates)
pbar = tqdm(itertools.product(donors, brain_regions, replicates), total=n_sections)
for _i in pbar:
    if _i in skip:
        continue
    _donor, _brain_region, _replicate = _i
    pbar.set_description(f"Processing {_donor} | {_brain_region} | {_replicate}")

    # Only the per-cell metadata is needed, so subset .obs instead of copying the AnnData.
    obs_sub = adata.obs.loc[(adata.obs['donor'] == _donor) &
                            (adata.obs['brain_region'] == _brain_region) &
                            (adata.obs['replicate'] == _replicate)].copy()
    geoms_sub = geoms[(geoms['donor'] == _donor) &
                      (geoms['brain_region'] == _brain_region) &
                      (geoms['lab'] == _replicate)]
    if geoms_sub.shape[0] == 0 or obs_sub.shape[0] == 0:
        continue

    gdf = gpd.GeoDataFrame(obs_sub, geometry=gpd.points_from_xy(obs_sub['CENTER_X'], obs_sub['CENTER_Y']), crs=None)

    # Boolean membership mask per region type, aligned with the rows of gdf.
    in_region = {}
    for _suffix, _type in region_types.items():
        regions = geoms_sub[geoms_sub['type'] == _type]
        if regions.shape[0] == 0:
            # No polygon of this type in this section: nothing to test.
            continue
        hits = gpd.sjoin(gdf, regions, how="inner", predicate='within')
        # isin() collapses cells that fall inside several polygons to a single hit.
        in_region[_suffix] = gdf.index.isin(hits.index)
    if not in_region:
        continue

    # Integer-encode the labels once. Only cell types present in this section
    # are kept; cells with a missing label get code -1 and are never counted.
    labels = {}
    for _prefix, _column in label_columns.items():
        categorical = gdf[_column].astype("category").cat.remove_unused_categories()
        cell_types = categorical.cat.categories.tolist()
        if len(cell_types) == 0:
            continue
        labels[_prefix] = (cell_types, categorical.cat.codes.to_numpy(dtype=np.intp))
    if not labels:
        continue

    # Real counts, plus one (N_permute x n_cell_types) null matrix per output table.
    real_counts = {}
    null_counts = {}
    for _prefix, (cell_types, codes) in labels.items():
        for _suffix, mask in in_region.items():
            inside = codes[mask]
            real_counts[_prefix, _suffix] = np.bincount(inside[inside >= 0], minlength=len(cell_types))
            null_counts[_prefix, _suffix] = np.zeros((N_permute, len(cell_types)), dtype=np.int64)

    for k in range(N_permute):
        # One permutation of the cells is shared by all label levels and region types.
        perm = rng.permutation(gdf.shape[0])
        for _suffix, mask in in_region.items():
            shuffled = mask[perm]
            for _prefix, (cell_types, codes) in labels.items():
                inside = codes[shuffled]
                null_counts[_prefix, _suffix][k] = np.bincount(inside[inside >= 0], minlength=len(cell_types))

    for (_prefix, _suffix), real in real_counts.items():
        _name = f"{_prefix}_{_suffix}"
        cell_types = labels[_prefix][0]
        null = null_counts[_prefix, _suffix]

        _real = dict(zip(cell_types, real))
        _null = {ct: null[:, j] for j, ct in enumerate(cell_types)}
        z_score, ps, adj_ps = one_sided_pval(_real, _null)

        null_mean = null.mean(axis=0)
        result_df = pd.DataFrame({
            "cell_type": cell_types,
            "real_count": real,
            "mean_null_count": null_mean,
            "std_null_count": null.std(axis=0),
            "z_score": [z_score[ct] for ct in cell_types],
            "p_value": [ps[ct] for ct in cell_types],
            "adj_p_value": [adj_ps[ct] for ct in cell_types],
            # +1 pseudocount keeps the ratio finite when a count is zero.
            "log_2FC": np.log2((real + 1) / (null_mean + 1)),
        })

        result_df.to_csv(Path(output_path) / f"ms_composition_{_name}_{_donor}_{_brain_region}_{_replicate}.csv", index=False)

In [None]:
# Restrict the region list for the loop below (replaces the list defined earlier).
brain_regions = [
    'CAH',
    'CAB',
    'PU',
    'NAC',
]

In [None]:
# Matrix / striosome enrichment, run independently for every
# donor x brain region x replicate section, after removing white-matter cells.
#
# For each section:
#   1. drop the cells whose centre lies within a White_Matter polygon of THAT
#      section (the polygons are taken from geoms_sub, not from all sections);
#   2. find which of the remaining cells lie within Matrix / Striosome polygons;
#   3. count the cells inside, per Subclass and per Group;
#   4. build the null by shuffling which cells are "inside". This is the same
#      as permuting the cell coordinates and redoing the spatial joins, but
#      each join only has to be computed once;
#   5. z-score / p-value the real counts against the null and write one CSV
#      per label level and region type.
label_columns = {"subclass": "Subclass", "group": "Group"}       # file-name prefix -> obs column
region_types = {"matrix": "Matrix", "striosome": "Striosome"}    # file-name suffix -> geoms['type'] value
rng = np.random.default_rng(0)  # fixed seed so a fresh run reproduces the same null distributions

n_sections = len(donors) * len(brain_regions) * len(replicates)
pbar = tqdm(itertools.product(donors, brain_regions, replicates), total=n_sections)
for _i in pbar:
    if _i in skip:
        continue
    _donor, _brain_region, _replicate = _i
    pbar.set_description(f"Processing {_donor} | {_brain_region} | {_replicate}")

    # Only the per-cell metadata is needed, so subset .obs instead of copying the AnnData.
    obs_sub = adata.obs.loc[(adata.obs['donor'] == _donor) &
                            (adata.obs['brain_region'] == _brain_region) &
                            (adata.obs['replicate'] == _replicate)].copy()
    geoms_sub = geoms[(geoms['donor'] == _donor) &
                      (geoms['brain_region'] == _brain_region) &
                      (geoms['lab'] == _replicate)]
    if geoms_sub.shape[0] == 0 or obs_sub.shape[0] == 0:
        continue

    gdf = gpd.GeoDataFrame(obs_sub, geometry=gpd.points_from_xy(obs_sub['CENTER_X'], obs_sub['CENTER_Y']), crs=None)

    # Removing all WM cells from this part of the analysis:
    wm_regions = geoms_sub[geoms_sub['type'] == 'White_Matter']
    if wm_regions.shape[0] > 0:
        wm_hits = gpd.sjoin(gdf, wm_regions, how="inner", predicate='within')
        gdf = gdf.loc[~gdf.index.isin(wm_hits.index)].copy()
    if gdf.shape[0] == 0:
        continue

    # Boolean membership mask per region type, aligned with the rows of gdf.
    in_region = {}
    for _suffix, _type in region_types.items():
        regions = geoms_sub[geoms_sub['type'] == _type]
        if regions.shape[0] == 0:
            # No polygon of this type in this section: nothing to test.
            continue
        hits = gpd.sjoin(gdf, regions, how="inner", predicate='within')
        # isin() collapses cells that fall inside several polygons to a single hit.
        in_region[_suffix] = gdf.index.isin(hits.index)
    if not in_region:
        continue

    # Integer-encode the labels once. Only cell types still present after the
    # white-matter removal are kept; cells with a missing label get code -1
    # and are never counted.
    labels = {}
    for _prefix, _column in label_columns.items():
        categorical = gdf[_column].astype("category").cat.remove_unused_categories()
        cell_types = categorical.cat.categories.tolist()
        if len(cell_types) == 0:
            continue
        labels[_prefix] = (cell_types, categorical.cat.codes.to_numpy(dtype=np.intp))
    if not labels:
        continue

    # Real counts, plus one (N_permute x n_cell_types) null matrix per output table.
    real_counts = {}
    null_counts = {}
    for _prefix, (cell_types, codes) in labels.items():
        for _suffix, mask in in_region.items():
            inside = codes[mask]
            real_counts[_prefix, _suffix] = np.bincount(inside[inside >= 0], minlength=len(cell_types))
            null_counts[_prefix, _suffix] = np.zeros((N_permute, len(cell_types)), dtype=np.int64)

    for k in range(N_permute):
        # One permutation of the cells is shared by all label levels and region types.
        perm = rng.permutation(gdf.shape[0])
        for _suffix, mask in in_region.items():
            shuffled = mask[perm]
            for _prefix, (cell_types, codes) in labels.items():
                inside = codes[shuffled]
                null_counts[_prefix, _suffix][k] = np.bincount(inside[inside >= 0], minlength=len(cell_types))

    for (_prefix, _suffix), real in real_counts.items():
        _name = f"{_prefix}_{_suffix}"
        cell_types = labels[_prefix][0]
        null = null_counts[_prefix, _suffix]

        _real = dict(zip(cell_types, real))
        _null = {ct: null[:, j] for j, ct in enumerate(cell_types)}
        z_score, ps, adj_ps = one_sided_pval(_real, _null)

        null_mean = null.mean(axis=0)
        result_df = pd.DataFrame({
            "cell_type": cell_types,
            "real_count": real,
            "mean_null_count": null_mean,
            "std_null_count": null.std(axis=0),
            "z_score": [z_score[ct] for ct in cell_types],
            "p_value": [ps[ct] for ct in cell_types],
            "adj_p_value": [adj_ps[ct] for ct in cell_types],
            # +1 pseudocount keeps the ratio finite when a count is zero.
            "log_2FC": np.log2((real + 1) / (null_mean + 1)),
        })

        result_df.to_csv(Path(output_path) / f"ms_composition_{_name}_{_donor}_{_brain_region}_{_replicate}.csv", index=False)