**Load ranking results** 

In [1]:
import pickle

def _load_ranking(path):
    """Unpickle and return the ranking results stored at `path`."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Rankings computed on the union of senescence markers
ranking_union = _load_ranking('results/gene_selection_ranking/all_selected_ranked_hlca_nonsmoker_union_only.pickle')

# Rankings computed on all expressed genes
ranking_all = _load_ranking('results/gene_selection_ranking/all_selected_ranked_hlca_nonsmoker.pickle')

# Remember the gene names of the "all_nonsmoker" record (the last one wins if
# several match). Every record is unpacked into its 12 fields.
for record in ranking_all:
    (ct_column, group, ct, gene_type, use_pca, nz_idx,
     gene_names, feature_names, pca_model, donor_num, scores, pvals) = record
    if gene_type != "all_nonsmoker":
        continue
    all_genes = gene_names

**Load senescence markers**

In [2]:
from functools import reduce
import pandas as pd
import numpy as np

# Senescence marker gene sets, keyed by a short source name.
# "fridman" and "sasp2" are hard-coded symbol lists; "senmayo" and "cellage"
# are read from two sheets of data/senescence_list.xlsx (note the differing
# column names: "symbol" vs "Symbol"). The workbook is opened once per sheet.
# NOTE(review): the hard-coded lists look like literature-derived signatures —
# confirm their provenance and record it here.
test_genes = {"fridman":["ALDH1A3", "AOPEP", "CCND1", "CD44", "CDKN1A", "CDKN1C", "CDKN2A", "CDKN2B", "CDKN2D", "CITED2",
                                "CLTB", "COL1A2","CREG1","CRYAB","CCN2","CXCL14","CYP1B1","EIF2S2","ESM1","F3","FILIP1L","FN1","GSN","GUK1","HBS1L",
                                "HPS5","HSPA2","HTATIP2","IFI16","IFNG","IGFBP1","IGFBP2","IGFBP3","IGFBP4","IGFBP5","IGFBP6","IGFBP7","IGSF3",
                                "ING1","IRF5","IRF7","ISG15","MAP1LC3B","MAP2K3","MDM2","MMP1","NDN","NME2","NRG1","OPTN","PEA15","RAB13","RAB31",
                                "RAB5B","RABGGTA","RAC1","RBL2","RGL2","RHOB","RRAS","S100A11","SERPINB2","SERPINE1","SMPD1","SMURF2","SOD1","SPARC",
                                "STAT1","TES","TFAP2A","TGFB1I1","THBS1","TNFAIP2","TNFAIP3","TP53","TSPYL5","VIM","ALDH1A1","BMI1","CCNB1","CDC25B",
                                "CKS1BP7","COL3A1","E2F4","EGR1","ID1","LAMA1","LDB2","MARCKS","CCN4"],
              "sasp2":["VEGFA", "TNFRSF12A", "TNFRSF10C", "TNFRSF10B", "TIMP2", "TIMP1", "TGFB1", "SERPINE1", "TNFRSF1A",
                                    "PLAUR", "PLAU", "MMP14", "MMP13", "MMP7", "MMP3", "MIF", "LMNA", "KITLG", "IL32", "IGFBP7", "IGFBP2",
                                     "ICAM1", "FAS", "EREG", "CXCL17", "CXCL16", "CXCL8", "CXCL1", "CTSB", "CLU", "CCL20", "CCL2", "BTC",
                                     "AREG"
                                  ],
              # Sheet "SenMayo", column "symbol"
              "senmayo":pd.read_excel("data/senescence_list.xlsx",sheet_name="SenMayo")["symbol"].tolist(),
              # Sheet "CellAge Senescence Genes", column "Symbol"
              "cellage":pd.read_excel("data/senescence_list.xlsx",sheet_name="CellAge Senescence Genes")["Symbol"].tolist()
                }

# Sorted, de-duplicated union of the four marker sets (np.union1d folded pairwise)
marker_sets = [test_genes[source] for source in ("fridman", "sasp2", "senmayo", "cellage")]
test_genes["union"] = reduce(np.union1d, marker_sets)

**Collect ranking scores and p-values for 'Basal', 'Basal resting', and 'Suprabasal' (nonsmoker group)**

In [3]:
from tqdm import tqdm

# Per-cell-type score tables (all expressed genes), keyed by cell type name.
ct_scores_all = dict()
for ct_column,group,ct,gene_type,use_pca,nz_idx,gene_names,feature_names,pca_model,donor_num,scores,pvals in tqdm(ranking_all):
    # `and` binds tighter than `or`, so the previous `A or B or C and D` form
    # applied the group filter to "Suprabasal" only. Test the cell type as a
    # whole first, then require the nonsmoker group.
    select = ct in ("Basal", "Basal resting", "Suprabasal") and group == "nonsmoker"

    if select:
        # `assign` returns a new frame, so the tables held in `ranking_all`
        # are not modified. Rank is the 1-based row position in `scores`.
        ct_scores_all[ct] = scores.assign(
            rank=np.arange(1, scores.shape[0] + 1),
            pval=pvals,
        )

100%|██████████| 10/10 [00:00<00:00, 6721.64it/s]


In [4]:
# Per-cell-type score tables (union of senescence markers), keyed by cell type name.
ct_scores_union = dict()
for ct_column,group,ct,gene_type,use_pca,nz_idx,gene_names,feature_names,pca_model,donor_num,scores,pvals in tqdm(ranking_union):
    # `and` binds tighter than `or`, so the previous `A or B or C and D` form
    # applied the group filter to "Suprabasal" only. Test the cell type as a
    # whole first, then require the nonsmoker group.
    select = ct in ("Basal", "Basal resting", "Suprabasal") and group == "nonsmoker"

    if select:
        # `assign` returns a new frame, so the tables held in `ranking_union`
        # are not modified. Rank is the 1-based row position in `scores`.
        ct_scores_union[ct] = scores.assign(
            rank=np.arange(1, scores.shape[0] + 1),
            pval=pvals,
        )

100%|██████████| 10/10 [00:00<00:00, 8845.01it/s]


**Compute p-values for the overlap among three cell types**

In [5]:
def compute_pval(ct_scores, background_genes, sample_num=10000, alpha=0.05, seed=None):
    """Empirical p-value for the overlap of significant genes across cell types.

    For every entry of `ct_scores` the genes with ``pval <= alpha`` are taken
    as significant and their count is printed. The observed statistic is the
    number of genes shared by all of these sets. The null distribution is
    built by drawing, `sample_num` times, one random gene set per cell type
    (same size as its significant set, without replacement) from
    `background_genes` and counting the genes common to all draws.

    Parameters
    ----------
    ct_scores : dict
        Maps a cell type name to a DataFrame indexed by gene with a "pval" column.
    background_genes : array-like
        Genes to sample from; must hold at least as many entries as the
        largest significant set (sampling is without replacement).
    sample_num : int, default 10000
        Number of random draws.
    alpha : float, default 0.05
        Significance threshold applied to the "pval" column.
    seed : int or None, default None
        When given, a dedicated ``np.random.default_rng(seed)`` is used so the
        result is reproducible. When None, the global ``np.random`` state is
        used, as before.

    Returns
    -------
    float
        Fraction of draws whose overlap is at least as large as the observed
        one. A value of 0.0 means "below 1 / sample_num", not exactly zero.
    """
    # Extract significant genes
    sig_genes = []
    for ct, scores in ct_scores.items():
        sig_genes.append(scores.loc[scores["pval"] <= alpha, ].index)
        print(f"{ct}: {sig_genes[-1].shape[0]}")

    observed_overlap = reduce(np.intersect1d, sig_genes).shape[0]
    set_sizes = [genes.shape[0] for genes in sig_genes]

    # Both the legacy module and a Generator expose choice(a, size, replace=...)
    rng = np.random if seed is None else np.random.default_rng(seed)

    # Generate null distribution
    distr = np.empty(sample_num, dtype=int)
    for i in tqdm(range(sample_num)):
        sampled_genes = [rng.choice(background_genes, size, replace=False) for size in set_sizes]
        distr[i] = reduce(np.intersect1d, sampled_genes).shape[0]

    # Compute p-value: the tail includes ties with the observed overlap (>=).
    # A strict ">" would understate the p-value, e.g. giving p < 1 for an
    # observed overlap of 0.
    pval = int(np.count_nonzero(distr >= observed_overlap)) / distr.shape[0]

    return pval

In [6]:
def get_common_genes_and_rank(ct_scores, alpha=0.05):
    """Mean rank of the genes that are significant in every cell type.

    Parameters
    ----------
    ct_scores : dict
        Maps a cell type name to a DataFrame indexed by gene with "pval" and
        "rank" columns. Must contain at least one entry.
    alpha : float, default 0.05
        Significance threshold applied to the "pval" column.

    Returns
    -------
    pandas.DataFrame
        One column, "rank", holding the mean rank across cell types for each
        gene significant in all of them, sorted in ascending order.
    """
    # Extract significant genes
    sig_genes = [scores.loc[scores["pval"] <= alpha, ].index for scores in ct_scores.values()]

    # Get overlap genes
    overlap = reduce(np.intersect1d, sig_genes)

    # Slice every table down to the overlap *before* summing. Folding over the
    # raw tables returned the whole unfiltered frame when only one cell type
    # was supplied, because reduce never invokes its function in that case.
    rank_frames = [scores.loc[overlap, ["rank"]] for scores in ct_scores.values()]
    sum_rank = reduce(lambda left, right: left + right, rank_frames)
    mean_rank = (sum_rank / len(rank_frames)).sort_values(by="rank")

    return mean_rank

**Run the overlap sampling test for both gene sets (all expressed genes and the senescence-marker union)**

In [7]:
# The null distributions are built by random sampling; fix the global NumPy
# RNG so the reported p-values are reproducible on Restart & Run All.
np.random.seed(0)

pval_all = compute_pval(ct_scores_all, background_genes = all_genes)
pval_union = compute_pval(ct_scores_union, background_genes = test_genes['union'])

Basal resting: 181
Basal: 35
Suprabasal: 202


100%|██████████| 10000/10000 [02:11<00:00, 76.01it/s]


Basal resting: 67
Basal: 63
Suprabasal: 36


100%|██████████| 10000/10000 [00:01<00:00, 8570.63it/s]


In [8]:
# Report both empirical p-values. A reported 0.0 means that none of the random
# draws fell in the tail, i.e. the p-value is below the sampling resolution
# (1 / sample_num), not that it is exactly zero.
print(f"p-value for all expressed genes: {pval_all}; p-value for union of senescence markers:{pval_union}")

p-value for all expressed genes: 0.0; p-value for union of senescence markers:0.0


In [9]:
# Mean rank of the shared significant genes: all expressed genes first, marker union second
rnk_all, rnk_union = map(get_common_genes_and_rank, (ct_scores_all, ct_scores_union))

In [18]:
# Genes as columns, mean rank rounded to 3 decimals
rnk_all.round(3).T

Unnamed: 0,THOC3,PMAIP1,RHOB,INKA1,FAHD2A,CARS1,FGF7
rank,18.667,20.333,27.333,31.667,67.333,80.0,91.667


In [11]:
# Genes as columns, mean rank rounded to 3 decimals
rnk_union.round(3).T

Unnamed: 0,FOS,JUN,IGFBP2,PSMB5,RHOB,MARCKS,EGR1,CD55,VIM,FGF7,GLB1,CBX7,TNFRSF1A,ISG15
rank,2.333,5.667,6.333,7.667,9.0,11.667,12.667,14.667,19.0,19.333,28.0,28.667,30.0,42.0
