## Import

In [47]:
import numpy as np
import pandas as pd
import anndata as ad
import os
import scanpy as sc
import json
from scipy.stats import fisher_exact
import pickle

In [48]:
# Silence pandas PerformanceWarning messages for the rest of the session.
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [49]:
# Gzipped, tab-separated PanglaoDB marker table; read in the validation section.
panglaodb_path = "../input data/PanglaoDB_markers_27_Mar_2020_allcells.tsv.gz"

In [50]:
# Working directory: the input AnnData file is read from here and every output
# of this notebook (pickle, CSV, JSON) is written here.
path = "output_phenotypes"

## Functions

### bin_cells()

In [5]:
def bin_cells(adata, bin_size = 100, bin_key='bin', spatial_key='spatial', center=True):
    """Assign every cell to a square spatial bin and record it in ``adata.obs``.

    Parameters
    ----------
    adata
        AnnData-like object. ``adata.obsm[spatial_key]`` must provide two
        columns (x, y) per cell, in the same order as ``adata.obs``.
    bin_size
        Side length of a bin, in the units of the coordinates.
    bin_key
        Name of the ``obs`` column that receives the bin label
        (``"<bin_key>_x<i>_y<j>"``, e.g. ``bin_x3_y5``). The integer bin
        indices are written to ``<bin_key>_x`` and ``<bin_key>_y``.
    spatial_key
        Key of the coordinate array in ``adata.obsm``.
    center
        If truthy, the grid is aligned to the minimum x/y of the coordinates;
        otherwise it starts from the origin (0, 0).

    Returns
    -------
    The same ``adata`` object, whose ``obs`` now holds the three bin columns.
    Columns with the same names are overwritten.
    """
    # One row per cell with its coordinates.
    coords = pd.DataFrame(adata.obsm[spatial_key], columns=['x', 'y'], index=adata.obs.index)
    if center:
        x0, y0 = coords.x.min(), coords.y.min()
    else:
        x0, y0 = 0, 0

    # Discrete bin coordinates of every cell and the derived string label.
    x_col, y_col = bin_key + '_x', bin_key + '_y'
    coords[x_col] = ((coords.x - x0) // bin_size).astype(int)
    coords[y_col] = ((coords.y - y0) // bin_size).astype(int)
    coords[bin_key] = bin_key + '_x' + coords[x_col].astype(str) + '_y' + coords[y_col].astype(str)

    bins_x = coords[x_col].max() + 1
    bins_y = coords[y_col].max() + 1
    print('Generated', str(bins_x), 'x-bins')
    print('Generated', str(bins_y), 'y-bins')

    # Assign column by column instead of DataFrame.update()/join(): update()
    # only touches columns that already exist (so a missing <bin_key>_x or
    # <bin_key>_y would never be created) and cannot write new labels into a
    # categorical column; join() fails when some of the columns already exist.
    obs = adata.obs.copy()
    for col in (bin_key, x_col, y_col):
        obs[col] = coords[col]
    adata.obs = obs

    return adata

### aggregate_counts()

In [6]:
# Aggregates the AnnData to group level (e.g. bin x cell type), computing the sum or
# the mean of the expression of every gene within each group.
# Needed to prepare the data for pseudo-bulk analysis.
def aggregate_counts(adata, group_by, fun=None, layer=None, sep="_", min_cells=0):
    """Aggregate expression and metadata over groups of cells.

    Parameters
    ----------
    adata
        AnnData-like object providing ``to_df(layer=...)`` (cells x genes) and ``obs``.
    group_by
        List of ``obs`` column names that define the groups.
    fun
        Aggregation function, ``'sum'`` or ``'mean'``. Must be given explicitly.
    layer
        Layer passed to ``adata.to_df``; ``None`` is forwarded unchanged.
    sep
        Separator used to join the group keys into one name when
        ``group_by`` has more than one column.
    min_cells
        Groups with fewer cells than this are dropped.

    Returns
    -------
    (expression, metadata)
        ``expression`` is a groups x genes DataFrame. ``metadata`` is a
        DataFrame with the same index holding, for every ``obs`` column, the
        value shared by all cells of the group (NaN when the cells disagree),
        plus an ``n_cells`` column. Metadata columns that are NaN for every
        kept group are removed.

    Raises
    ------
    ValueError
        If ``fun`` is neither ``'sum'`` nor ``'mean'``.
    """
    # Validate first so that a bad argument fails before the expensive
    # expression-matrix conversion below.
    if fun not in ('sum', 'mean'):
        raise ValueError("fun must be 'mean' or 'sum'")

    group_keys = [adata.obs[col] for col in group_by] # Series list

    def unique_per_group(series):
        # Value of the group if all its cells share it, NaN otherwise.
        grouped_meta = series.groupby(group_keys, observed=True)
        return grouped_meta.apply(lambda x: x.iloc[0] if x.nunique() == 1 else np.nan)

    expr_df = adata.to_df(layer=layer)

    # df groups x genes
    grouped = expr_df.groupby(group_keys, observed=True)
    grouped_expr = grouped.sum() if fun == 'sum' else grouped.mean()

    # Filter out groups with less than 'min_cells' cells
    cell_counts = adata.obs.groupby(group_keys, observed=True).size()
    valid_groups = cell_counts[cell_counts >= min_cells].index
    grouped_expr = grouped_expr.loc[valid_groups]

    aggregated_counts_df = grouped_expr.T
    # rename columns (=groups)
    # if more than 1 group key, build a combined name using 'sep'
    if len(group_by) > 1:
        aggregated_counts_df.columns = [sep.join(map(str, col)) for col in aggregated_counts_df.columns]
    else:
        aggregated_counts_df.columns = aggregated_counts_df.columns.astype(str)

    # per-group metadata: one column per obs column
    aggregated_metadata_df = pd.DataFrame({
        col: unique_per_group(adata.obs[col]) for col in adata.obs.columns
    })

    aggregated_metadata_df["n_cells"] = cell_counts
    aggregated_metadata_df = aggregated_metadata_df.loc[valid_groups]
    aggregated_metadata_df = aggregated_metadata_df.dropna(axis=1, how="all") # remove empty columns

    # rename indices to be consistent with the expression matrix
    if len(group_by) > 1:
        aggregated_metadata_df.index = [sep.join(map(str, idx)) for idx in aggregated_metadata_df.index]
    else:
        aggregated_metadata_df.index = aggregated_metadata_df.index.astype(str)
    aggregated_metadata_df = aggregated_metadata_df.loc[aggregated_counts_df.columns]

    return aggregated_counts_df.T, aggregated_metadata_df

### get_background_features()

In [7]:
# Builds a background map to understand whether the expression of a gene in a given cell type (target) is
#    1) correlated with the expression in other cell types (similar spatial patterns)
#    2) higher or lower than the expression of the other cell types in the same bin
#
# high corr means unspecific expression (background)
# low corr and low expr in other cell types means the gene is specific for the target

def get_background_features(aggr_sum, aggr_mean, target, celltype_key='celltype', bin_key='bin'):
    """Compute, for every gene, how each cell type relates to ``target`` across bins.

    Parameters
    ----------
    aggr_sum, aggr_mean
        AnnData-like objects with one observation per (bin, cell type) group,
        holding summed and mean expression respectively. Their ``obs`` must
        contain the columns ``celltype_key`` and ``bin_key``; ``X`` must be
        convertible with ``np.asarray`` (assumed dense - TODO confirm for
        sparse input).
    target
        Cell type used as reference; it must be a value of ``celltype_key``
        in ``aggr_mean.obs``.
    celltype_key, bin_key
        Names of the ``obs`` columns holding the cell type and the bin label.

    Returns
    -------
    dict
        ``{gene: DataFrame}``, each DataFrame indexed by cell type with
        * ``expr``: fraction of bins (among those where the cell type has a
          mean value) in which its mean expression is not lower than the
          target's; 0 for the target itself;
        * ``cor``: Spearman correlation across bins between the target's mean
          expression and the cell type's summed expression; 0 for the target
          itself.
    """
    features = list(aggr_sum.var.index) # genes list

    # dataframes of summed/mean expression (aggregated groups x genes)
    expr_mean = pd.DataFrame(np.asarray(aggr_mean.X), index=aggr_mean.obs_names, columns=list(aggr_mean.var.index))
    expr_sum = pd.DataFrame(np.asarray(aggr_sum.X), index=aggr_sum.obs_names, columns=list(aggr_sum.var.index))

    # group labels do not depend on the gene: slice them once
    sum_groups = aggr_sum.obs[[celltype_key, bin_key]]
    mean_groups = aggr_mean.obs[[celltype_key, bin_key]]

    bg_features = {}
    for f in features:
        # For each gene, a bins x cell types matrix. The pivot inserts NaN where a
        # cell type is absent from a bin, so all columns span the same set of bins.
        binned_counts = sum_groups.join(expr_sum[f]).pivot(index=bin_key, columns=celltype_key, values=f)
        binned_counts_mean = mean_groups.join(expr_mean[f]).pivot(index=bin_key, columns=celltype_key, values=f)

        # correlation (is the expression in the target correlated with the expression in other cell types?)
        # the target column is replaced by its mean expression before correlating
        binned_counts[target] = binned_counts_mean[target]
        bin_cor = binned_counts.corr(method='spearman') # pairwise: uses only bins where both values are not NaN
        bin_cor.loc[target] = 0 # self correlation

        # expression (is the expression in the other cell types at least as high as in the target?)
        # 1 if expression >= target (or not comparable), 0 if expression < target
        bin_expr_higher = (~binned_counts_mean.lt(binned_counts_mean[target], axis=0)).astype(float)
        bin_expr_higher[np.isnan(binned_counts_mean)] = np.nan # cell type absent from the bin
        bin_expr_higher[target] = 0

        # mean(axis=0) ignores NaN
        bg_features[f] = pd.DataFrame({'expr': bin_expr_higher.mean(axis=0), 'cor': bin_cor[target]})

    return bg_features

'''
For each cell type we get this dictionary:
{
  'GeneA':   expr | cor
             -----|-----
             celltype1 | 0.2 | 0.8
             celltype2 | 0.7 | 0.3
             target    | 0.0 | 0.0

  'GeneB':   expr | cor
             -----|-----
             celltype1 | 0.4 | 0.6
             ...
}
'''

"\nFor each cell type we get this dictionary:\n{\n  'GeneA':   expr | cor\n             -----|-----\n             celltype1 | 0.2 | 0.8\n             celltype2 | 0.7 | 0.3\n             target    | 0.0 | 0.0\n\n  'GeneB':   expr | cor\n             -----|-----\n             celltype1 | 0.4 | 0.6\n             ...\n}\n"

### get_background_sources()

In [8]:
# For each target cell type and each gene, lists which other cell types can be considered
# a background source for that gene (where the signal may originate from).
def get_background_sources(bg_features, expr_thres=0.75, cor_thres=0.25):
    """Turn per-gene background features into a gene x target table of source cell types.

    ``bg_features`` maps target cell type -> gene -> DataFrame with columns
    ``expr`` and ``cor`` indexed by cell type. A cell type is reported as a
    source when BOTH ``expr > expr_thres`` and ``cor > cor_thres`` hold.

    Returns a DataFrame (rows: genes, columns: targets) whose cells hold the
    list of source cell types, or None when there is none.

    NOTE(review): the previous inline comment described the rule as
    "high expr or high spatial corr", whereas the code combines the two
    conditions with a logical AND - confirm which one is intended.
    """
    def _sources_for(table):
        passes = (table['expr'] > expr_thres) & (table['cor'] > cor_thres)
        found = list(table[passes].index)
        # an empty selection is stored as None rather than as an empty list
        return found if found else None

    per_target = {
        target: {gene: _sources_for(table) for gene, table in gene_tables.items()}
        for target, gene_tables in bg_features.items()
    }
    return pd.DataFrame(per_target)

### filter_sources()

In [9]:
def filter_sources(df):
    """Stop 'Cancer cell' and 'Epithelial cell' from being reported as sources of each other.

    In the 'Cancer cell' column every source list loses 'Epithelial cell', and
    in the 'Epithelial cell' column every source list loses 'Cancer cell'.
    Lists that become (or already are) empty are replaced by None. Cells that
    are not lists and all other columns are left untouched.

    Either column may be absent from ``df``; it is then simply skipped.

    Returns a filtered copy; ``df`` itself is not reassigned.
    """
    # column -> cell type that must not be listed as a source in that column
    excluded_source = {
        'Cancer cell': 'Epithelial cell',
        'Epithelial cell': 'Cancer cell',
    }

    df_filtered = df.copy()
    for col, excluded in excluded_source.items():
        if col not in df.columns:
            continue
        for idx, val in df[col].items():
            if not isinstance(val, list):
                continue
            kept = [x for x in val if x != excluded]
            # set to None if the list is empty
            df_filtered.at[idx, col] = kept if kept else None
    return df_filtered

## Load adata

In [10]:
# Load the AnnData object stored as 'adata_phenotype.h5ad' in the working directory.
adata = sc.read_h5ad(os.path.join(path, 'adata_phenotype.h5ad'))

In [11]:
# Names of the adata.obs columns used by the analysis.
# NOTE(review): in the visible part of this notebook only `cell_type_key` is
# referenced afterwards; the other keys may be used elsewhere - confirm.
phenotype_key = 'phenotype'
cell_type_key = 'celltype'
sample_key = 'name'
patient_key = 'patient_id'
tissue_region_key = 'tissue_region'

## Background removal

In [12]:
# background features
# Bin the cells on a grid with bin_size=200, then aggregate the 'norm' layer per
# (bin, cell type) group. Each call returns a (expression, metadata) tuple.
# The summed table keeps every group (min_cells=0); the mean table keeps only
# groups with at least 5 cells.
adata = bin_cells(adata, bin_size=200)
aggr_sum = aggregate_counts(adata, group_by=['bin', cell_type_key], layer='norm', min_cells=0, fun='sum')
aggr_mean = aggregate_counts(adata, group_by=['bin', cell_type_key], layer='norm', min_cells=5, fun='mean')

Generated 484 x-bins
Generated 80 y-bins


In [13]:
# convert to adata
# Wrap each (expression, metadata) tuple into an AnnData object (groups x genes)
# and drop the groups whose cell type is missing in the aggregated metadata.
aggr_sum = ad.AnnData(aggr_sum[0], obs=aggr_sum[1])
aggr_sum = aggr_sum[~aggr_sum.obs[cell_type_key].isna()].copy()
aggr_mean = ad.AnnData(aggr_mean[0], obs=aggr_mean[1])
aggr_mean = aggr_mean[~aggr_mean.obs[cell_type_key].isna()].copy()

In [14]:
# Recover the bin label from every group name: group names start with the bin
# label, which has the form "bin_<x part>_<y part>" (e.g. "bin_x3_y5").
bin_label_pattern = r'^(bin_[^_]+_[^_]+)'
aggr_sum.obs['bin'] = aggr_sum.obs.index.str.extract(bin_label_pattern, expand=False)
aggr_mean.obs['bin'] = aggr_mean.obs.index.str.extract(bin_label_pattern, expand=False)

In [15]:
# Dense expression tables (rows: bin/cell-type groups, columns: genes).
# NOTE(review): these two variables are not referenced again in the visible
# part of the notebook (get_background_features builds its own copies) -
# possibly left over from interactive inspection.
expr_mean = pd.DataFrame(np.asarray(aggr_mean.X), index=aggr_mean.obs_names, columns=list(aggr_mean.var.index)) # group (bin_celltype) x gene
expr_sum = pd.DataFrame(np.asarray(aggr_sum.X), index=aggr_sum.obs_names, columns=list(aggr_sum.var.index))

In [16]:
# Background features of every cell type, each one used in turn as the target.
# The cell types are read through `cell_type_key`, like the aggregation above,
# and missing values are skipped: groups without a cell type were removed from
# the aggregated data, so a NaN target could never be found there.
bg_features = {}
for ct in adata.obs[cell_type_key].dropna().unique():
    print(ct)
    bg_features[ct] = get_background_features(aggr_sum, aggr_mean, target=ct)

Macrophage
Plasma cell
Fibroblast
Cancer cell
B cell
Dendritic cell
T cell
Neutrophil
Smooth muscle cell
Endothelial cell
Epithelial cell
Mast cell
Schwann cell


In [52]:
# Gene x target table of background sources, using thresholds lower than the
# function defaults (expr > 0.55 and cor > 0.15 instead of 0.75 / 0.25).
sources = get_background_sources(bg_features, expr_thres=0.55, cor_thres=0.15)

In [53]:
# Remove 'Epithelial cell' / 'Cancer cell' from each other's source lists.
filtered_sources = filter_sources(sources)

In [54]:
filtered_sources

Unnamed: 0,Macrophage,Plasma cell,Fibroblast,Cancer cell,B cell,Dendritic cell,T cell,Neutrophil,Smooth muscle cell,Endothelial cell,Epithelial cell,Mast cell,Schwann cell
A2M,[Fibroblast],"[Cancer cell, Dendritic cell, Endothelial cell...","[Endothelial cell, Smooth muscle cell]","[B cell, Dendritic cell, Endothelial cell, Fib...","[Cancer cell, Endothelial cell, Epithelial cel...","[Cancer cell, Endothelial cell, Epithelial cel...","[Cancer cell, Dendritic cell, Endothelial cell...","[Dendritic cell, Endothelial cell, Fibroblast,...","[B cell, Dendritic cell, Endothelial cell, Epi...",,"[B cell, Dendritic cell, Endothelial cell, Fib...","[B cell, Cancer cell, Dendritic cell, Epitheli...","[B cell, Cancer cell, Endothelial cell, Epithe..."
ACE2,"[Cancer cell, Endothelial cell, Epithelial cel...",,"[Cancer cell, Endothelial cell, Epithelial cel...",[Schwann cell],[Schwann cell],"[Cancer cell, Fibroblast, Macrophage]","[Cancer cell, Endothelial cell, Epithelial cel...","[Cancer cell, Fibroblast, Macrophage, T cell]",,"[Cancer cell, Epithelial cell, Fibroblast, Mac...","[Endothelial cell, Fibroblast, Macrophage, Neu...",[Epithelial cell],"[Cancer cell, Epithelial cell, Fibroblast, Mac..."
ACTA2,"[Dendritic cell, Endothelial cell, Fibroblast,...","[Fibroblast, Macrophage, Schwann cell]",,"[Macrophage, Schwann cell]","[Cancer cell, Dendritic cell, Epithelial cell,...","[Endothelial cell, Epithelial cell, Fibroblast...","[Dendritic cell, Epithelial cell, Fibroblast, ...","[Endothelial cell, Fibroblast, Schwann cell]",,[Fibroblast],"[B cell, Dendritic cell, Macrophage, Plasma cell]","[Fibroblast, Plasma cell, Schwann cell]","[Cancer cell, Dendritic cell, Endothelial cell..."
ACTB,[Cancer cell],"[Fibroblast, Macrophage, Mast cell]",[Cancer cell],"[Macrophage, Schwann cell]","[Cancer cell, Dendritic cell, Endothelial cell...",[Macrophage],[Cancer cell],"[Cancer cell, Endothelial cell, Fibroblast, Ma...",[Schwann cell],"[Macrophage, Neutrophil]","[Macrophage, Neutrophil]","[Cancer cell, Fibroblast]","[Cancer cell, Dendritic cell, Fibroblast, Macr..."
ADAM28,"[B cell, Dendritic cell]","[B cell, Epithelial cell, Fibroblast, Macropha...",[Macrophage],[Fibroblast],[Epithelial cell],[B cell],"[B cell, Epithelial cell, Fibroblast, Macrophage]",,[Epithelial cell],"[Fibroblast, Macrophage]",[Fibroblast],"[Epithelial cell, Fibroblast, Schwann cell]",
...,...,...,...,...,...,...,...,...,...,...,...,...,...
VEGFA,"[Cancer cell, Fibroblast, Mast cell, Schwann c...","[Cancer cell, Dendritic cell, Endothelial cell...","[Cancer cell, Macrophage, Neutrophil, Schwann ...",[Schwann cell],"[Dendritic cell, Endothelial cell, Fibroblast,...","[B cell, Endothelial cell, Fibroblast, Macroph...","[B cell, Cancer cell, Dendritic cell, Endothel...","[Cancer cell, Epithelial cell, Macrophage, Sch...","[B cell, Dendritic cell, Endothelial cell, Mac...","[Cancer cell, Dendritic cell, Fibroblast, Macr...","[Endothelial cell, Macrophage, Neutrophil, Sch...","[B cell, Cancer cell, Dendritic cell, Endothel...","[B cell, Cancer cell, Dendritic cell, Endothel..."
VSIG4,,"[Endothelial cell, Fibroblast, Macrophage]","[Cancer cell, Macrophage]","[Fibroblast, Macrophage, T cell]",[Schwann cell],[Smooth muscle cell],"[B cell, Endothelial cell, Fibroblast, Macroph...",,"[Dendritic cell, Macrophage, T cell]","[Fibroblast, Macrophage, T cell]",,[Macrophage],"[B cell, Fibroblast, Macrophage, T cell]"
VSIR,"[Dendritic cell, Endothelial cell, Neutrophil]","[Endothelial cell, Epithelial cell, Fibroblast]","[Dendritic cell, Endothelial cell, Macrophage,...",,"[Dendritic cell, Fibroblast, Macrophage, T cell]",,"[Dendritic cell, Endothelial cell, Epithelial ...","[Dendritic cell, Endothelial cell, Fibroblast,...",,"[Dendritic cell, Neutrophil]",[T cell],,"[Epithelial cell, Fibroblast, Mast cell]"
XBP1,"[Cancer cell, Epithelial cell]","[Endothelial cell, Epithelial cell]","[Cancer cell, Epithelial cell]",,"[Fibroblast, Macrophage, Plasma cell, T cell]",[Plasma cell],"[Cancer cell, Epithelial cell]","[Cancer cell, Epithelial cell]",,"[Cancer cell, Epithelial cell, Plasma cell]","[Fibroblast, Smooth muscle cell]",[Plasma cell],"[B cell, Cancer cell]"


In [55]:
# dict: for each target cell type, the list of genes for which no background
# source was found (missing entry in that target's column of filtered_sources)
genes_no_source = {col: filtered_sources.index[filtered_sources[col].isna()].tolist() for col in filtered_sources.columns}

In [56]:
# Summary: number of source-free genes per cell type and the first ten of them
# ("geni" in the printed text is Italian for "genes").
for ct, genes in genes_no_source.items():
    print(f"{ct}: {len(genes)} geni -> {genes[:10]} ...")

Macrophage: 57 geni -> ['AIRE', 'APOE', 'ARG1', 'BRAF', 'C1QB', 'CCL16', 'CCR1', 'CCR3', 'CD14', 'CD163'] ...
Plasma cell: 55 geni -> ['ACE2', 'AIRE', 'ARG1', 'CCL15', 'CCL16', 'CCL7', 'CCR3', 'CD1A', 'CD40LG', 'CDK2'] ...
Fibroblast: 30 geni -> ['ACTA2', 'AIRE', 'ARG1', 'C1R', 'C1S', 'CCL16', 'CCR3', 'CDKN2B', 'CDKN2D', 'CFC1'] ...
Cancer cell: 158 geni -> ['AIRE', 'ANPEP', 'ARG1', 'ARID1A', 'ARPC3', 'ARPC5', 'BANK1', 'BATF3', 'BRAF', 'C1QBP'] ...
B cell: 94 geni -> ['AIRE', 'ARPC3', 'CCL15', 'CCL16', 'CCL26', 'CCL3L1', 'CCR3', 'CD1A', 'CD37', 'CD40'] ...
Dendritic cell: 80 geni -> ['AIF1', 'AIRE', 'ARG1', 'ARID1A', 'ARPC3', 'CCL15', 'CCL16', 'CCL7', 'CCR1', 'CCR5'] ...
T cell: 57 geni -> ['ARG1', 'BATF3', 'CCL16', 'CCL26', 'CCL7', 'CCR3', 'CCR5', 'CD1A', 'CD2', 'CD247'] ...
Neutrophil: 163 geni -> ['ADAM28', 'AIRE', 'ARG1', 'ARID1A', 'ATM', 'BANK1', 'BATF3', 'BCL2', 'BRAF', 'CCL15'] ...
Smooth muscle cell: 158 geni -> ['ACE2', 'ACTA2', 'AIRE', 'AKT1', 'AREG', 'ARG1', 'ARID1A', 'ARPC3

## Validation

### Fisher exact test with PanglaoDB

In [57]:
# Load the PanglaoDB marker table (tab-separated).
panglao = pd.read_csv(panglaodb_path, sep="\t")

In [58]:
#panglao['organ'].unique().tolist()

In [59]:
# only protein-coding + human + organs of interest
# Rows are kept when all three conditions hold: organ in `relevant_organs`,
# gene type "protein-coding gene", and species "Hs" or "Mm Hs".
relevant_organs = ['GI tract', 'Immune system', 'Connective tissue', 'Vasculature', 'Smooth muscle', 'Epithelium', 'Blood', 'Brain']

panglao_human = panglao[
    (panglao['organ'].isin(relevant_organs)) &
    (panglao["gene type"] == "protein-coding gene") &
    (panglao["species"].isin(["Hs", "Mm Hs"]))
]

#### Self Specificity 

In [60]:
# Broad mapping for the self-specificity test: each cell type of this dataset
# -> the PanglaoDB 'cell type' labels whose markers count as its known markers.
# Note that 'Cancer cell' and 'Epithelial cell' share several labels
# ('Epithelial cells', 'Luminal epithelial cells', 'Enterocytes', 'Goblet cells').
panglao_map = {
    'Macrophage': ['Macrophages', 'Alveolar macrophages', 'Kupffer cells', 'Red pulp macrophages', 'Myeloid-derived suppressor cells'],
    'Plasma cell': ['Plasma cells'],
    'Fibroblast': ['Fibroblasts', 'Myofibroblasts', 'Pancreatic stellate cells', 'Hepatic stellate cells', 'Stromal cells'],
    'Cancer cell': ['Epithelial cells', 'Luminal epithelial cells', 'Mammary epithelial cells', 'Enterocytes', 'Goblet cells'],
    'B cell': ['B cells', 'B cells naive', 'B cells memory'],
    'Dendritic cell': ['Dendritic cells', 'Plasmacytoid dendritic cells'],
    'T cell': ['T cells', 'T helper cells', 'T cytotoxic cells', 'T regulatory cells', 'T memory cells', 'T follicular helper cells', 'Gamma delta T cells'],
    'Neutrophil': ['Neutrophils'],
    'Smooth muscle cell': ['Smooth muscle cells', 'Vascular smooth muscle cells', 'Pulmonary vascular smooth muscle cells'],
    'Endothelial cell': ['Endothelial cells', 'Endothelial cells (aorta)', 'Endothelial cells (blood brain barrier)'],
    'Epithelial cell': ['Epithelial cells', 'Enterocytes', 'Goblet cells', 'Crypt cells', 'Luminal epithelial cells'],
    'Mast cell': ['Mast cells', 'Basophils'],
    'Schwann cell': ['Schwann cells', 'Peri-islet Schwann cells', 'Satellite glial cells']
}

In [61]:
# define total genes
# Keep only the PanglaoDB rows of the mapped cell types; the gene universe of the
# Fisher tests is the union of their gene symbols and of all source-free genes.
# NOTE(review): this universe contains database genes that may not be measured
# in the dataset and therefore can never appear among the source-free genes;
# restricting it to the measured genes may be more appropriate - confirm.
panglao_cts_used = [ct for sublist in panglao_map.values() for ct in sublist] # cell types in the map
panglao_filtered = panglao_human[panglao_human['cell type'].isin(panglao_cts_used)]
total_genes = len(set(panglao_filtered['official gene symbol']) | set.union(*[set(g) for g in genes_no_source.values()]))

In [62]:
# Self specificity (PanglaoDB): for every cell type, test whether its
# source-free genes are enriched in the known markers of that same cell type.
results = []

# loop on cell types
for ct in genes_no_source.keys():
    # cell types without an entry in the map get an empty marker set
    panglao_cts = panglao_map.get(ct, [])
    
    known_markers = set(panglao_filtered.loc[panglao_filtered['cell type'].isin(panglao_cts), 'official gene symbol'])
    my_genes = set(genes_no_source[ct])
    
    # compute overlap
    overlap = known_markers & my_genes
    n_overlap = len(overlap)
    
    # contingency table
    # rows: source-free gene yes / no; columns: known marker yes / no
    #   [in both,             marker only      ]
    #   [source-free only,    in neither       ]
    table = [
        [n_overlap, len(known_markers - overlap)],
        [len(my_genes - overlap), total_genes - len(known_markers | my_genes)]
    ]
    
    # Fisher test (one-sided)
    # alternative='greater' tests for enrichment (odds ratio > 1)
    odds, pval = fisher_exact(table, alternative='greater')

    results.append({
        'cell_type': ct,
        'n_genes_no_source': len(my_genes),
        'n_known_markers': len(known_markers),
        'n_overlap': n_overlap,
        'overlap': overlap,
        'odds_ratio': odds,
        'p_value': pval
    })

# one row per cell type, most significant first (p-values are not corrected
# for multiple testing)
fisher_results = pd.DataFrame(results).sort_values('p_value')
fisher_results

Unnamed: 0,cell_type,n_genes_no_source,n_known_markers,n_overlap,overlap,odds_ratio,p_value
6,T cell,57,215,19,"{CTLA4, IL2RB, PDCD1, TBX21, IL13, CD3D, PTPRC...",3.596939,4.2e-05
0,Macrophage,57,142,14,"{CXCL16, ITGAM, VSIG4, CD86, CCR3, CSF1R, LILR...",3.759448,0.000179
2,Fibroblast,30,203,9,"{FN1, ACTA2, SPON2, PDGFRA, LUM, IL1R1, DCN, C...",3.17894,0.007205
5,Dendritic cell,80,166,8,"{CX3CR1, CXCL16, ITGAE, CD86, PTPRC, CD4, AIF1...",1.00211,0.555549
4,B cell,94,145,8,"{CD40, CD79B, FCGR2B, FKBP11, PTPRC, IRF8, PDC...",0.972331,0.586126
9,Endothelial cell,70,274,9,"{CDKN1C, RGS5, PLVAP, IGFBP7, SPARC, FLT1, SPA...",0.739375,0.841335
11,Mast cell,71,207,6,"{ITGAM, PLEK, CCL7, FCER2, IL4, SOCS2}",0.638806,0.895325
12,Schwann cell,59,70,1,{TNF},0.383558,0.924496
7,Neutrophil,163,69,1,{FCGR1A},0.129993,0.999308
8,Smooth muscle cell,158,86,1,{ACTA2},0.106407,0.999853


#### Non-self Un-specificity

In [63]:
# Narrow mapping for the non-self test: exactly one PanglaoDB label per cell
# type. This replaces the broad `panglao_map` defined for the self-specificity
# test. 'Cancer cell' and 'Epithelial cell' both map to 'Epithelial cells'.
panglao_map = {
    'Macrophage': ['Macrophages'],
    'Plasma cell': ['Plasma cells'],
    'Fibroblast': ['Fibroblasts'],
    'Cancer cell': ['Epithelial cells'],
    'B cell': ['B cells'],
    'Dendritic cell': ['Dendritic cells'],
    'T cell': ['T cells'],
    'Neutrophil': ['Neutrophils'],
    'Smooth muscle cell': ['Smooth muscle cells'],
    'Endothelial cell': ['Endothelial cells'],
    'Epithelial cell': ['Epithelial cells'],
    'Mast cell': ['Mast cells'],
    'Schwann cell': ['Schwann cells']
}

In [64]:
# define total genes
# Recompute the filtered PanglaoDB table and the gene universe for the mapping
# currently stored in `panglao_map` (union of the database gene symbols of the
# mapped cell types and of all source-free genes).
panglao_cts_used = [ct for sublist in panglao_map.values() for ct in sublist] # cell types in the map
panglao_filtered = panglao_human[panglao_human['cell type'].isin(panglao_cts_used)]
total_genes = len(set(panglao_filtered['official gene symbol']) | set.union(*[set(g) for g in genes_no_source.values()]))

In [65]:
# Non-self test (PanglaoDB): for every ordered pair (cell type, other cell
# type), test whether the source-free genes of the first are enriched in the
# known markers of the second.
results = []

for ct in genes_no_source.keys():
    my_genes = set(genes_no_source[ct])
    other_cts = [o for o in genes_no_source.keys() if o != ct]
    
    for other_ct in other_cts:
        # markers of the other cell type (empty set if it has no map entry)
        other_markers = set(panglao_filtered.loc[
            panglao_filtered['cell type'].isin(panglao_map.get(other_ct, [])),
            'official gene symbol'
        ])
        
        overlap = my_genes & other_markers
        n_overlap = len(overlap)
        
        # 2x2 table: [in both, marker only], [source-free only, in neither]
        table = [
            [n_overlap, len(other_markers - overlap)],
            [len(my_genes - overlap), total_genes - len(my_genes | other_markers)]
        ]
        
        # one-sided test for enrichment
        odds, pval = fisher_exact(table, alternative='greater')
        
        results.append({
            'cell_type': ct,
            'other_cell_type': other_ct,
            'n_genes_no_source': len(my_genes),
            'n_known_markers_other': len(other_markers),
            'n_overlap': n_overlap,
            'overlap_genes': overlap,
            'odds_ratio': odds,
            'p_value': pval
        })

fisher_results_aspecific = pd.DataFrame(results).sort_values(['p_value'])
#fisher_results_aspecific

# Keep the pairs with an uncorrected p-value below 0.05.
# NOTE(review): one test is run per ordered pair of cell types and no
# multiple-testing correction is applied - consider adjusting the p-values.
significant_results = fisher_results_aspecific[fisher_results_aspecific['p_value'] < 0.05]
significant_results.reset_index(drop=True)

Unnamed: 0,cell_type,other_cell_type,n_genes_no_source,n_known_markers_other,n_overlap,overlap_genes,odds_ratio,p_value
0,Neutrophil,B cell,163,89,21,"{SPIB, CD27, CD86, MS4A1, PDCD1, CD79A, CD38, ...",2.240058,0.002971
1,Epithelial cell,Macrophage,126,128,19,"{CD86, FCGR3A, CD163, GPR34, LILRA5, ITGAX, CD...",1.67144,0.042878
2,Epithelial cell,Dendritic cell,126,121,18,"{CX3CR1, CXCL16, ITGAM, IL6, ITGAE, VSIG4, TRE...",1.669903,0.04743


### Cell Markers 2.0

In [66]:
# Load the human Cell Marker table from its Excel file (read with openpyxl).
cellmarker_path = "../input data/Cell_marker_Human.xlsx"
cellmarker = pd.read_excel(cellmarker_path, engine='openpyxl')

In [67]:
# Tissue filters for the Cell Marker table. A row is kept only when its
# 'tissue_type' is in the first list AND its 'tissue_class' is in the second.
# Matching is exact, which is why both 'Large intestine' and 'Large Intestine'
# are listed.
relevant_tissue_type = [
    # Colon
    'Colon', 'Colon epithelium', 'Large intestine', 'Rectum', 'Colorectum',
    'Small intestine', 'Intestine', 'Duodenum', 'Ileum', 'Jejunum',
    'Large Intestine', 'Small intestinal crypt', 'Intestinal crypt', 'Myenteric plexus',
    # Stromal
    'Connective tissue', 'Mesenchyme', 'Smooth muscle', 'Vasculature', 'Blood vessel',
    # Immunity
    'Immune system', 'Lymph node', 'Lymphoid tissue', 'Blood', 'Peripheral blood',
    # Others
    'Epithelium', 'Adventitia', 'Peritoneum', 'Pancreas'
]

relevant_tissue_class = [
    'Colon', 'Intestine', 'Gut', 'Gastrointestinal tract',
    'Muscle', 'Smooth muscle', 'Blood vessel', 'Vein', 'Artery', 'Peritoneum', 'Adventitia',
    'Lymph', 'Lymph node', 'Lymphoid tissue', 'Blood',
    'Epithelium', 
    'Pancreas'
]

# .copy() makes the filtered table independent of the full one
cellmarker_filtered_tissue = cellmarker[
    (cellmarker['tissue_type'].isin(relevant_tissue_type)) &
    (cellmarker['tissue_class'].isin(relevant_tissue_class))
].copy()

In [68]:
#sorted(cellmarker_filtered_tissue['cell_name'].unique().tolist())

#### Self Specificity 

In [69]:
# Broad mapping for the self-specificity test: each cell type of this dataset
# -> the Cell Marker 'cell_name' labels whose markers count as its known markers.
# Some labels are shared between entries: 'Plasma cell' and 'Plasmablast'
# appear under both 'B cell' and 'Plasma cell', and 'Cancer-associated
# fibroblast' under both 'Cancer cell' and 'Fibroblast'.
cellmarker_map = {
    'Macrophage': [
        'Macrophage', 'M1 macrophage', 'M2 macrophage', 'Tissue resident macrophage', 
        'Pro-inflammatory macrophage', 'Foam cell', 'Pan-macrophage'
    ],
    'B cell': [
        'B cell', 'Activated B cell', 'Memory B cell', 'Naive B cell', 
        'Plasma cell', 'Plasmablast', 'Follicular B cell', 'Germinal center B cell', 
        'Transitional B cell', 'Regulatory B cell'
    ],
    'T cell': [
        'T cell', 'CD4+ T cell', 'CD8+ T cell', 'Naive T cell', 'Regulatory T(Treg) cell',
        'Effector T cell', 'Memory T cell', 'Follicular helper(Tfh) cell', 
        'Natural killer T (NKT) cell'
    ],
    'Dendritic cell': [
        'Dendritic cell', 'Conventional dendritic cell', 'Plasmacytoid dendritic cell', 
        'Myeloid dendritic cell', 'Monocyte derived dendritic cell'
    ],
    'Cancer cell': [
        'Cancer cell', 'Cancer Stem cell', 'Cancer-initiating cell', 
        'Malignant cell', 'Cancer-associated fibroblast', 'Cancer associated fibroblast(CAF)'
    ],
    'Fibroblast': [
        'Fibroblast', 'Myofibroblast', 'Cancer-associated fibroblast', 'Mesenchymal cell'
    ],
    'Smooth muscle cell': [
        'Smooth muscle cell', 'Vascular smooth muscle cell(VSMC)', 'Muscle'
    ],
    'Endothelial cell': [
        'Endothelial cell', 'Endothelial progenitor cell', 'Endothelial stem cell',
        'Neovascular endothelial cell', 'Pan-endothelial cell'
    ],
    'Epithelial cell': [
        'Epithelial cell', 'Coloncyte', 'Enterocyte', 'Enteroendocrine cell', 'Goblet cell', 
        'Crypt cell', 'Basal cell', 'Ki-67+ epithelial cell', 'Esophageal cell', 'Mature epithelial cell'
    ],
    'Plasma cell': [
        'Plasma cell', 'Antibody Secreting B cell', 'Plasmablast', 'Plasmablast B cell'
    ],
    'Mast cell': ['Mast cell', 'Basophil'],  # basophils are often grouped together with mast cells
    'Neutrophil': ['Neutrophil', 'Polymorphonuclear neutrophil', 'Pre-neutrophil'],
    'Schwann cell': ['Schwann cell']
}

In [70]:
# define total genes
# Keep only the Cell Marker rows of the mapped cell names; the gene universe of
# the Fisher tests is the union of their symbols and of all source-free genes.
# NOTE(review): set() keeps missing 'Symbol' values (NaN) as elements, if the
# column contains any - confirm whether they should be dropped first.
cellmarker_cts_used = [ct for sublist in cellmarker_map.values() for ct in sublist] # cell types in the map
cellmarker_filtered = cellmarker_filtered_tissue[cellmarker_filtered_tissue['cell_name'].isin(cellmarker_cts_used)].copy()
total_genes = len(set(cellmarker_filtered['Symbol']) | set.union(*[set(g) for g in genes_no_source.values()]))

In [71]:
# Self specificity (Cell Marker): for every cell type, test whether its
# source-free genes are enriched in the known markers of that same cell type.
results = []

# loop on cell types
for ct in genes_no_source.keys():
    # cell types without an entry in the map get an empty marker set
    cellmarker_cts = cellmarker_map.get(ct, [])
    
    known_markers = set(cellmarker_filtered.loc[cellmarker_filtered['cell_name'].isin(cellmarker_cts), 'Symbol'])
    my_genes = set(genes_no_source[ct])
    
    # compute overlap
    overlap = known_markers & my_genes
    n_overlap = len(overlap)
    
    # contingency table
    # [in both, marker only], [source-free only, in neither]
    table = [
        [n_overlap, len(known_markers - overlap)],
        [len(my_genes - overlap), total_genes - len(known_markers | my_genes)]
    ]
    
    # Fisher test (one-sided)
    # alternative='greater' tests for enrichment (odds ratio > 1)
    odds, pval = fisher_exact(table, alternative='greater')

    results.append({
        'cell_type': ct,
        'n_genes_no_source': len(my_genes),
        'n_known_markers': len(known_markers),
        'n_overlap': n_overlap,
        'overlap': overlap,
        'odds_ratio': odds,
        'p_value': pval
    })

# one row per cell type, most significant first; this overwrites the
# `fisher_results` table computed with PanglaoDB
fisher_results = pd.DataFrame(results).sort_values('p_value')
fisher_results

Unnamed: 0,cell_type,n_genes_no_source,n_known_markers,n_overlap,overlap,odds_ratio,p_value
0,Macrophage,57,109,16,"{APOE, ITGAM, VSIG4, CD86, CSF1R, CTSD, FCER2,...",5.186467,3e-06
2,Fibroblast,30,101,8,"{C1R, FN1, IGFBP7, ACTA2, PDGFRA, LUM, DCN, CX...",4.938416,0.000919
6,T cell,57,195,11,"{STAT4, CTLA4, IL4R, CCR5, CD3D, PTPRC, CD3E, ...",1.488067,0.165974
7,Neutrophil,163,64,10,"{CCR7, CD19, CD86, MS4A1, CD80, FCGR1A, PDCD1,...",1.414912,0.211103
3,Cancer cell,158,79,11,"{CX3CR1, CCR7, CD44, KRAS, MS4A1, PDCD1, PIK3C...",1.276511,0.28307
11,Mast cell,71,20,1,{CD19},0.974436,0.653259
8,Smooth muscle cell,158,9,1,{ACTA2},0.971338,0.664681
4,B cell,94,143,7,"{KIT, CD40, CD79B, PTPRC, IRF4, CD37, FCGR2A}",0.683908,0.872495
9,Endothelial cell,70,114,3,"{PLVAP, ENTPD1, FLT1}",0.486083,0.939122
5,Dendritic cell,80,401,7,"{IQGAP2, ITGAE, CD86, CSF1R, PTPRC, CD4, PLD4}",0.22196,0.999999


#### Non-self Un-specificity

In [72]:
# Narrow mapping for the non-self test: one Cell Marker label per cell type,
# except 'Cancer cell', which also uses the 'Epithelial cell' markers.
# This replaces the broad `cellmarker_map` defined for the self-specificity test.
cellmarker_map = {
    'Macrophage': ['Macrophage'],
    'B cell': ['B cell'],
    'T cell': ['T cell'],
    'Dendritic cell': ['Dendritic cell'],
    'Cancer cell': ['Cancer cell', 'Epithelial cell'],
    'Fibroblast': ['Fibroblast'],
    'Smooth muscle cell': ['Smooth muscle cell'],
    'Endothelial cell': ['Endothelial cell'],
    'Epithelial cell': ['Epithelial cell'],
    'Plasma cell': ['Plasma cell'],
    'Mast cell': ['Mast cell'],
    'Neutrophil': ['Neutrophil'],
    'Schwann cell': ['Schwann cell']
}

In [73]:
# define total genes
# Recompute the filtered Cell Marker table and the gene universe for the mapping
# currently stored in `cellmarker_map` (union of the database symbols of the
# mapped cell names and of all source-free genes).
cellmarker_cts_used = [ct for sublist in cellmarker_map.values() for ct in sublist] # cell types in the map
cellmarker_filtered = cellmarker_filtered_tissue[cellmarker_filtered_tissue['cell_name'].isin(cellmarker_cts_used)].copy()
total_genes = len(set(cellmarker_filtered['Symbol']) | set.union(*[set(g) for g in genes_no_source.values()]))

In [74]:
# Non-self test (Cell Marker): for every ordered pair (cell type, other cell
# type), test whether the source-free genes of the first are enriched in the
# known markers of the second.
results = []

for ct in genes_no_source.keys():
    my_genes = set(genes_no_source[ct])
    other_cts = [o for o in genes_no_source.keys() if o != ct]
    
    for other_ct in other_cts:
        # markers of the other cell type (empty set if it has no map entry)
        other_markers = set(cellmarker_filtered.loc[
            cellmarker_filtered['cell_name'].isin(cellmarker_map.get(other_ct, [])),
            'Symbol'
        ])
        
        overlap = my_genes & other_markers
        n_overlap = len(overlap)
        
        # 2x2 table: [in both, marker only], [source-free only, in neither]
        table = [
            [n_overlap, len(other_markers - overlap)],
            [len(my_genes - overlap), total_genes - len(my_genes | other_markers)]
        ]
        
        # one-sided test for enrichment
        odds, pval = fisher_exact(table, alternative='greater')
        
        results.append({
            'cell_type': ct,
            'other_cell_type': other_ct,
            'n_genes_no_source': len(my_genes),
            'n_known_markers_other': len(other_markers),
            'n_overlap': n_overlap,
            'overlap_genes': overlap,
            'odds_ratio': odds,
            'p_value': pval
        })

# sorted by cell type first, then by p-value within each cell type
fisher_results_aspecific = pd.DataFrame(results).sort_values(['cell_type', 'p_value'])
#fisher_results_aspecific

# Keep the pairs with an uncorrected p-value below 0.05.
# NOTE(review): one test is run per ordered pair of cell types and no
# multiple-testing correction is applied - consider adjusting the p-values.
significant_results = fisher_results_aspecific[fisher_results_aspecific['p_value'] < 0.05]
significant_results.reset_index(drop=True)

Unnamed: 0,cell_type,other_cell_type,n_genes_no_source,n_known_markers_other,n_overlap,overlap_genes,odds_ratio,p_value
0,Epithelial cell,Macrophage,126,77,19,"{CD86, FCGR3A, CD163, ITGAX, ITGAE, CTSB, IL10...",1.702224,0.046326
1,Neutrophil,T cell,163,90,30,"{CD27, KLRB1, CTLA4, CCR6, PDCD1, CXCR6, GZMH,...",1.943609,0.005594


## Save

In [24]:
# Persist the background features (nested dict of DataFrames) with pickle.
with open(os.path.join(path, "bg_features.pkl"), "wb") as f:
    pickle.dump(bg_features, f)

In [51]:
# Reload the background features saved by this notebook.
# SECURITY: pickle.load can execute arbitrary code contained in the file;
# only load files that were produced locally by this notebook.
with open(os.path.join(path, "bg_features.pkl"), "rb") as f:
    bg_features = pickle.load(f)

In [75]:
# Save the filtered source table; "55_15" in the file name refers to the
# thresholds used above (expr 0.55, cor 0.15).
# NOTE(review): the cells hold Python lists, which CSV stores as text; reading
# the file back with pd.read_csv is expected to yield strings, not lists.
filtered_sources.to_csv(os.path.join(path, 'background_features_55_15.csv'))

In [None]:
#filtered_sources = pd.read_csv(os.path.join(path, 'background_features_55_15.csv'), index_col=0)

In [76]:
# Save the source-free genes per cell type as indented JSON
# (cell type -> list of gene symbols).
with open(os.path.join(path, "genes_no_source_55_15.json"), "w") as f:
    json.dump(genes_no_source, f, indent=4)

In [6]:
with open(os.path.join(path, "genes_no_source_55_15.json"), "r") as f: genes_no_source = json.load(f)