In [1]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Notebook configuration: which dataset and which preprocessing variant to use.
dataset = "Liver"  # alternatives: "PBMC", "BM", "COVID", "Liver"
datatype = "seurat"  # alternative: "row"
# Dataset-specific folder; always ends with a path separator because later
# cells build file paths by plain string concatenation.
# NOTE(review): hard-coded absolute Windows path — adjust for other machines.
datafolder = "".join(["C:\\Users\\amruk\\source\\enrichment-auc\\data\\", dataset, "\\"])

## Geneset - remove empty IDs

In [3]:
# Load the gene-set metadata table; the first CSV column becomes the row index.
geneset_info = pd.read_csv(datafolder+"genesets_modules.csv", index_col=0)
# Show the last rows of the table.
geneset_info.tail()

Unnamed: 0,ID,Title,source,DataBase
210,-,Endothelial cell,CM,CellMarker
35,-,Kupffer cell,CM,CellMarker
41,-,Liver bud hepatic cell,CM,CellMarker
51,-,Mesenchymal cell,CM,CellMarker
61,-,T cell,CM,CellMarker


In [4]:
# Rows whose ID is the placeholder "-" get their Title as ID; all other IDs
# are kept unchanged.
ids = geneset_info['ID']
has_placeholder_id = ids == "-"
geneset_info['ID'] = np.where(has_placeholder_id, geneset_info['Title'], ids)
geneset_info.tail()

Unnamed: 0,ID,Title,source,DataBase
210,Endothelial cell,Endothelial cell,CM,CellMarker
35,Kupffer cell,Kupffer cell,CM,CellMarker
41,Liver bud hepatic cell,Liver bud hepatic cell,CM,CellMarker
51,Mesenchymal cell,Mesenchymal cell,CM,CellMarker
61,T cell,T cell,CM,CellMarker


In [5]:
# Write the table back to the same path it was loaded from, overwriting the
# original file with the filled-in IDs.
geneset_info.to_csv(datafolder+ "genesets_modules.csv")

## Filter gene sets by genes occurring in the dataset

In [6]:
# Reload the gene-set metadata (first CSV column is the index).
geneset_info = pd.read_csv(datafolder + "genesets_modules.csv", index_col=0)

# Read the expression table once, then split it into column labels, row labels
# and a plain float matrix.
expr_table = pd.read_csv(datafolder + datatype + "_filtered_data.csv", index_col=0)
patients_names = expr_table.columns.to_list()  # column labels
genes = expr_table.index.tolist()  # row labels
gene_expr = expr_table.to_numpy().astype(float)
# Only the numeric matrix is kept; drop the DataFrame so it does not linger in
# the kernel.
del expr_table

In [7]:
# Load the gene-set membership mapping from JSON; its keys are the gene-set names.
genesets_path = datafolder + "genesets_genes.json"
with open(genesets_path) as handle:
    genesets = json.load(handle)
gs_names = list(genesets)
# Number of gene sets loaded.
len(genesets)

168

In [8]:
# Collect the names of gene sets that are poorly covered by the dataset.
# Coverage = (number of dataset genes that belong to the gene set)
#            / (number of genes listed in the gene set).
MIN_COVERAGE = 0.35  # gene sets with coverage below this fraction are flagged

incorrect = []
for gs_name, geneset_genes in tqdm(genesets.items(), total=len(genesets)):
    # Set lookup keeps the membership test O(1) per dataset gene instead of
    # scanning the gene-set list for every gene.
    gs_members = set(geneset_genes)
    n_present = sum(gene in gs_members for gene in genes)
    # An empty gene set has no coverage by definition; flag it instead of
    # dividing by zero.
    if not geneset_genes or n_present / len(geneset_genes) < MIN_COVERAGE:
        incorrect.append(gs_name)

  0%|          | 0/168 [00:00<?, ?it/s]

100%|██████████| 168/168 [00:13<00:00, 12.50it/s]


In [9]:
# Drop every flagged gene set from both the metadata table (matched on 'ID')
# and the membership mapping (matched on key).
# NOTE(review): the recorded outputs show 165 remaining gene sets but only 158
# remaining metadata rows, so the two are not in one-to-one correspondence —
# confirm this is expected.
excluded = set(incorrect)
is_flagged = geneset_info['ID'].isin(incorrect)
filtered_geneset_info = geneset_info[~is_flagged]
filtered_genesets = {
    name: members for name, members in genesets.items() if name not in excluded
}

In [10]:
# Number of gene sets left after removing the names collected in `incorrect`.
len(filtered_genesets)

165

In [11]:
# Number of remaining metadata rows per value of the 'DataBase' column.
filtered_geneset_info.groupby(["DataBase"]).size()

DataBase
CIBERSORT     45
CellMarker     6
KEGG          29
MSigDB        34
PanglaoDB     13
tmod          31
dtype: int64

In [12]:
# Persist both filtered artefacts next to the inputs: metadata as CSV,
# membership mapping as JSON.
filtered_info_path = datafolder + "filtered_genesets_modules.csv"
filtered_genes_path = datafolder + "filtered_genesets_genes.json"

filtered_geneset_info.to_csv(filtered_info_path)
with open(filtered_genes_path, 'w') as out_file:
    json.dump(filtered_genesets, out_file)

## Get genesets for classification scores

In [110]:
# Root data directory. Note this is `data_folder` (not dataset-specific), as
# opposed to the per-dataset `datafolder` defined at the top of the notebook.
# NOTE(review): hard-coded absolute Windows path — adjust for other machines.
data_folder =  "C:\\Users\\amruk\\source\\enrichment-auc\\data\\"
# Tab-separated table of pathways; later cells use its 'Celltype' and 'Title' columns.
paths = pd.read_csv(data_folder+"paths_to_cells.txt", sep='\t')

In [None]:
# Drop pathways annotated with monocytes or dendritic cells. `.copy()` makes
# `by_celltype` an independent frame, so the column assignment below does not
# write into a slice of `paths` (SettingWithCopyWarning).
by_celltype = paths[~paths['Celltype'].str.contains("monocyte|dendritic", case=False)].copy()

# Ordered (old, new) label substitutions; order matters because later entries
# clean up the results of earlier ones.
celltype_renames = [
    ('CD4+ T cell; Cytotoxic T cell', 'T cell'),
    ('Cytotoxic T cell; CD4+ T cell', 'T cell'),
    ('CD4+ T cell', 'T cell'),
    ('Cytotoxic T cell', 'T cell'),
    ("T cell; T cell", 'T cell'),
    ("Natural killer cell", 'NK cell'),
    ("T cell; NK cell", 'NK cell; T cell'),
    # The original cell applied this substitution twice; the second pass is
    # kept so the result is unchanged for labels that still contain the pattern.
    ("T cell; NK cell", 'NK cell; T cell'),
]

celltype = by_celltype['Celltype']
for old_label, new_label in celltype_renames:
    # regex=False: labels such as 'CD4+ T cell' contain '+', which must be
    # matched literally. The default of `regex` differs between pandas
    # versions, so it is set explicitly.
    celltype = celltype.str.replace(old_label, new_label, regex=False)
by_celltype['Celltype'] = celltype

In [100]:
# Pick pathways whose cell type is annotated as "other"/"unknown" but whose
# title names an NK, B or T cell.
titles = by_celltype['Title']
title_condition = titles.str.contains("nk cell|natural killer|b cell|t cell", case=False)
other_cells = by_celltype['Celltype'].str.contains("other|unknown", case=False)
# "mast cell" contains the substring "t cell", so it would pass the title
# pattern above; exclude it explicitly.
not_mast = ~titles.str.contains("mast cell", case=False)

other_applicable = by_celltype.loc[title_condition & other_cells & not_mast]
other_applicable

Unnamed: 0,ID,Title,Category,Database,GSsize,GenesInGS,Perc,Celltype,Celltype_unclear
Sig.:21,Sig.:21,T CELLS GAMMA DELTA,Signature of PBMC,Signature,36,35,97.222222,other,no


In [None]:
# Relabel the rescued pathways as T cell. `assign` returns a new DataFrame, so
# nothing is written into a slice of `by_celltype` (avoids
# SettingWithCopyWarning) and the cell can be re-run safely.
# The label is "T cell" without a leading space, matching the labels used for
# every other row.
other_applicable = other_applicable.assign(Celltype="T cell")

In [106]:
# Final selection: rows whose Celltype mentions NK, T or B cells, followed by
# the relabelled `other_applicable` rows.
lymphocyte_rows = by_celltype['Celltype'].str.contains(
    "nk cell|natural killer|t cell|b cell", case=False
)
chosen = pd.concat([by_celltype[lymphocyte_rows], other_applicable])

In [114]:
# Preview the first rows of the selected pathways.
chosen.head()

Unnamed: 0,ID,Title,Category,Database,GSsize,GenesInGS,Perc,Celltype,Celltype_unclear
hsa04062,hsa04062,KEGG: Chemokine signaling pathway,KEGG: Organismal Systems; Immune system,KEGG,192,139,72.395833,T cell,no
hsa04610,hsa04610,KEGG: Complement and coagulation cascades,KEGG: Organismal Systems; Immune system,KEGG,85,46,54.117647,B cell; T cell,no
hsa04612,hsa04612,KEGG: Antigen processing and presentation,KEGG: Organismal Systems; Immune system,KEGG,78,66,84.615385,NK cell; T cell,no
hsa04650,hsa04650,KEGG: Natural killer cell mediated cytotoxicity,KEGG: Organismal Systems; Immune system,KEGG,131,94,71.755725,NK cell,no
hsa04657,hsa04657,KEGG: IL-17 signaling pathway,KEGG: Organismal Systems; Immune system,KEGG,94,63,67.021277,NK cell; T cell,no


In [128]:
# (rows, columns) of the final selection.
chosen.shape

(69, 9)

In [129]:
# Save the selection as a tab-separated file. The file is overwritten
# (mode='w') rather than appended to: appending would add a second header line
# and duplicate rows every time the notebook is re-run.
chosen.to_csv(data_folder+"chosen_paths.txt", sep='\t', mode='w')