In [1]:
import json
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
dataset = "PBMC"  # one of: "PBMC", "BM", "COVID", "Liver"
datatype = "seurat"  # alternative: "row"
# Root of the data directory. Defaults to the original hard-coded location;
# set the ENRICHMENT_AUC_DATA environment variable to run on another machine.
data_root = os.environ.get("ENRICHMENT_AUC_DATA", "C:\\Users\\amruk\\source\\enrichment-auc\\data\\")
# The trailing "" makes join() end the path with a separator, which the
# later string concatenations (datafolder + "<file name>") rely on.
datafolder = os.path.join(data_root, dataset, "")

## Genesets - replace empty IDs with titles

In [3]:
# Load the gene-set annotation table; the first CSV column becomes the index.
modules_csv = f"{datafolder}genesets_modules.csv"
geneset_info = pd.read_csv(modules_csv, index_col=0)
geneset_info.tail()

Unnamed: 0,ID,Title,source,DataBase
101,Monocyte,Monocyte,CM,CellMarker
111,Myeloid,Myeloid,CM,CellMarker
121,Natural killer cell,Natural killer cell,CM,CellMarker
131,Neutrophil,Neutrophil,CM,CellMarker
141,T cell,T cell,CM,CellMarker


In [4]:
# Rows whose ID is the "-" placeholder take their Title as the ID instead;
# every other row keeps its existing ID.
has_placeholder_id = geneset_info['ID'] == "-"
geneset_info['ID'] = np.where(has_placeholder_id, geneset_info['Title'], geneset_info['ID'])
geneset_info.tail()

Unnamed: 0,ID,Title,source,DataBase
101,Monocyte,Monocyte,CM,CellMarker
111,Myeloid,Myeloid,CM,CellMarker
121,Natural killer cell,Natural killer cell,CM,CellMarker
131,Neutrophil,Neutrophil,CM,CellMarker
141,T cell,T cell,CM,CellMarker


In [5]:
# Write the cleaned table back over the annotation file it was read from.
geneset_info.to_csv(f"{datafolder}genesets_modules.csv")

## Filter genesets by occurring genes

In [7]:
# Reload the annotation table and read the expression table for the
# configured dataset/datatype (first CSV column is used as the index).
geneset_info = pd.read_csv(f"{datafolder}genesets_modules.csv", index_col=0)
expression_table = pd.read_csv(f"{datafolder}{datatype}_filtered_data.csv", index_col=0)
# Keep the axis labels, since the values are reduced to a bare float array.
genes = expression_table.index.tolist()
patients_names = expression_table.columns.to_list()
gene_expr = expression_table.to_numpy().astype(float)

In [8]:
# Mapping of gene-set name -> list of member genes, as stored in the JSON file.
genesets_path = datafolder + "genesets_genes.json"
with open(genesets_path) as handle:
    genesets = json.load(handle)
gs_names = [*genesets]
len(genesets)

159

In [8]:
# Minimum fraction (after rounding to 2 decimals) of a gene set's genes that
# must be present in the dataset for the gene set to be kept.
MIN_COVERAGE = 0.65

# Names of gene sets whose coverage in the dataset is below MIN_COVERAGE.
incorrect = []
for gs_name, geneset_genes in tqdm(genesets.items(), total=len(genesets)):
    # Set lookup keeps the membership test O(1) per dataset gene instead of
    # scanning the gene-set list once for every gene.
    members = set(geneset_genes)
    N_gs = sum(gene in members for gene in genes)  # dataset genes found in the GS
    gs_size = len(geneset_genes)
    # An empty gene set has no coverage; report it instead of dividing by zero.
    coverage = N_gs / gs_size if gs_size else 0.0
    if round(coverage, 2) < MIN_COVERAGE:
        incorrect.append(gs_name)
        print(gs_name, coverage, N_gs, gs_size)

  0%|          | 0/159 [00:00<?, ?it/s]

Cancer cell 0.3333333333333333 5 15


  7%|▋         | 11/159 [00:00<00:06, 23.47it/s]

Endothelial cell 0.375 3 8


 10%|█         | 16/159 [00:00<00:06, 20.75it/s]

hsa04610 0.5411764705882353 46 85
hsa04613 0.5157894736842106 98 190


 25%|██▍       | 39/159 [00:02<00:06, 17.99it/s]

hsa05320 0.5283018867924528 28 53
hsa05322 0.3014705882352941 41 136


 33%|███▎      | 52/159 [00:03<00:05, 20.77it/s]

M39214 0.5979381443298969 58 97


 75%|███████▌  | 120/159 [00:04<00:00, 43.90it/s]

LI.M156.0 0.5116279069767442 22 43
LI.M152.0 0.12 3 25
LI.M152.1 0.09523809523809523 2 21
LI.M152.2 0.05555555555555555 1 18
LI.M217 0.4 4 10


 98%|█████████▊| 156/159 [00:05<00:00, 49.78it/s]

LI.M61.0 0.5833333333333334 14 24
LI.M61.1 0.38461538461538464 5 13


100%|██████████| 159/159 [00:05<00:00, 26.89it/s]


In [9]:
# Drop the flagged gene sets from both the annotation table (matched on its
# ID column) and the name -> genes mapping (matched on its keys).
is_flagged = geneset_info['ID'].isin(incorrect)
filtered_geneset_info = geneset_info[~is_flagged]
filtered_genesets = {
    name: members
    for name, members in genesets.items()
    if name not in incorrect
}

In [10]:
# Number of gene sets left after the coverage filter.
len(filtered_genesets)

145

In [11]:
# Remaining gene sets per source database.
filtered_geneset_info.groupby("DataBase").size()

DataBase
CIBERSORT     42
CellMarker    12
KEGG          25
MSigDB        11
PanglaoDB      7
tmod          41
dtype: int64

In [12]:
# Save the filtered annotation table and the filtered gene lists.
filtered_geneset_info.to_csv(f"{datafolder}filtered_genesets_modules.csv")
with open(f"{datafolder}filtered_genesets_genes.json", 'w') as out_file:
    json.dump(filtered_genesets, out_file)

## Get genesets for classification scores

In [110]:
# Root data directory. Defaults to the original hard-coded location; set the
# ENRICHMENT_AUC_DATA environment variable to run on another machine.
# Joining with "" guarantees the trailing separator that the string
# concatenations using data_folder rely on.
data_folder = os.path.join(
    os.environ.get("ENRICHMENT_AUC_DATA", "C:\\Users\\amruk\\source\\enrichment-auc\\data\\"),
    "",
)
# Tab-separated pathway table; the cells below use its Celltype and Title columns.
paths = pd.read_csv(data_folder + "paths_to_cells.txt", sep='\t')

In [None]:
# Drop pathways annotated with monocytes or dendritic cells. The .copy() makes
# by_celltype an independent frame, so the column assignments below do not
# write into a slice of `paths` (SettingWithCopyWarning).
by_celltype = paths[~paths['Celltype'].str.contains("monocyte|dendritic", case=False)].copy()

# Ordered literal label replacements. Order matters: compound labels are
# collapsed before their individual parts are rewritten.
celltype_renames = [
    ('CD4+ T cell; Cytotoxic T cell', 'T cell'),
    ('Cytotoxic T cell; CD4+ T cell', 'T cell'),
    ('CD4+ T cell', 'T cell'),
    ('Cytotoxic T cell', 'T cell'),
    ("T cell; T cell", 'T cell'),
    ("Natural killer cell", 'NK cell'),
    ("T cell; NK cell", 'NK cell; T cell'),
    # The original cell applied this last replacement twice; kept as-is.
    ("T cell; NK cell", 'NK cell; T cell'),
]
for old_label, new_label in celltype_renames:
    # regex=False: the labels are plain text. Without it, pandas < 2.0 treats
    # the pattern as a regular expression, where the "+" in "CD4+" is a
    # quantifier and the literal label "CD4+ T cell" is never matched.
    by_celltype['Celltype'] = by_celltype['Celltype'].str.replace(old_label, new_label, regex=False)

In [100]:
# Pick pathways whose cell type is "other"/"unknown" but whose title names an
# NK, B or T cell. "t cell" also matches inside "mast cell", so mast-cell
# titles are excluded explicitly.
titles = by_celltype['Title'].str
title_condition = titles.contains("nk cell|natural killer|b cell|t cell", case=False)
not_mast = ~titles.contains("mast cell", case=False)
other_cells = by_celltype['Celltype'].str.contains("other|unknown", case=False)
other_applicable = by_celltype.loc[title_condition & other_cells & not_mast]
other_applicable

Unnamed: 0,ID,Title,Category,Database,GSsize,GenesInGS,Perc,Celltype,Celltype_unclear
Sig.:21,Sig.:21,T CELLS GAMMA DELTA,Signature of PBMC,Signature,36,35,97.222222,other,no


In [None]:
# Relabel the selected "other" pathways as T cell. assign() returns a new frame
# instead of writing into a slice of by_celltype, and the label is spelled
# exactly like the normalised 'T cell' used elsewhere (no leading space).
other_applicable = other_applicable.assign(Celltype="T cell")

In [106]:
# Pathways whose cell type already names an NK, T or B cell, followed by the
# separately selected "other" rows.
lymphocyte_labelled = by_celltype['Celltype'].str.contains("nk cell|natural killer|t cell|b cell", case=False)
chosen = pd.concat([by_celltype[lymphocyte_labelled], other_applicable])

In [114]:
# Preview the first rows of the selected pathways.
chosen.head()

Unnamed: 0,ID,Title,Category,Database,GSsize,GenesInGS,Perc,Celltype,Celltype_unclear
hsa04062,hsa04062,KEGG: Chemokine signaling pathway,KEGG: Organismal Systems; Immune system,KEGG,192,139,72.395833,T cell,no
hsa04610,hsa04610,KEGG: Complement and coagulation cascades,KEGG: Organismal Systems; Immune system,KEGG,85,46,54.117647,B cell; T cell,no
hsa04612,hsa04612,KEGG: Antigen processing and presentation,KEGG: Organismal Systems; Immune system,KEGG,78,66,84.615385,NK cell; T cell,no
hsa04650,hsa04650,KEGG: Natural killer cell mediated cytotoxicity,KEGG: Organismal Systems; Immune system,KEGG,131,94,71.755725,NK cell,no
hsa04657,hsa04657,KEGG: IL-17 signaling pathway,KEGG: Organismal Systems; Immune system,KEGG,94,63,67.021277,NK cell; T cell,no


In [128]:
# (rows, columns) of the selection.
chosen.shape

(69, 9)

In [129]:
# Write the selection as a tab-separated file. The default write mode replaces
# the file, so re-running the notebook no longer appends a second header and a
# duplicate set of rows the way mode='a' did.
chosen.to_csv(data_folder + "chosen_paths.txt", sep='\t')