In [1]:
import numpy as np
import pandas as pd
import json
import collections

### **Generate pathway input from NCATS BioPlanet CSV**

In [2]:
# Load the BioPlanet pathway table from CSV into a DataFrame.
# The path is an absolute location on the authors' machine; adjust it to wherever the CSV lives.
df_pathway = pd.read_csv(
    filepath_or_buffer='/data/scPAFA_paper/pathway/bioplanet_pathway.csv',
)

In [11]:
# Preview the first ten rows of the BioPlanet table
df_pathway.head(10)

Unnamed: 0,PATHWAY_ID,PATHWAY_NAME,GENE_ID,GENE_SYMBOL,pathway_full_name
0,bioplanet_1,Inhibition of matrix metalloproteinases,3265,HRAS,bioplanet_1_Inhibition of matrix metalloprotei...
1,bioplanet_1,Inhibition of matrix metalloproteinases,4313,MMP2,bioplanet_1_Inhibition of matrix metalloprotei...
2,bioplanet_1,Inhibition of matrix metalloproteinases,4318,MMP9,bioplanet_1_Inhibition of matrix metalloprotei...
3,bioplanet_1,Inhibition of matrix metalloproteinases,4323,MMP14,bioplanet_1_Inhibition of matrix metalloprotei...
4,bioplanet_1,Inhibition of matrix metalloproteinases,7076,TIMP1,bioplanet_1_Inhibition of matrix metalloprotei...
5,bioplanet_1,Inhibition of matrix metalloproteinases,7077,TIMP2,bioplanet_1_Inhibition of matrix metalloprotei...
6,bioplanet_1,Inhibition of matrix metalloproteinases,7078,TIMP3,bioplanet_1_Inhibition of matrix metalloprotei...
7,bioplanet_1,Inhibition of matrix metalloproteinases,7079,TIMP4,bioplanet_1_Inhibition of matrix metalloprotei...
8,bioplanet_1,Inhibition of matrix metalloproteinases,8434,RECK,bioplanet_1_Inhibition of matrix metalloprotei...
9,bioplanet_10,Adhesion and diapedesis of granulocytes,727,C5,bioplanet_10_Adhesion and diapedesis of granul...


In [4]:
# Generate the pathway input: a dict mapping "<PATHWAY_ID>_<PATHWAY_NAME>" to the
# list of gene symbols that belong to that pathway.
df_pathway['pathway_full_name'] = df_pathway['PATHWAY_ID'] + '_' + df_pathway['PATHWAY_NAME']

# Group once instead of re-filtering the whole frame for every pathway
# (the per-key boolean mask costs O(rows) each time, i.e. O(rows x pathways) overall).
# sort=False keeps pathways in order of first appearance, and rows inside each
# group keep their original file order, so the resulting dict matches the
# row-filtering approach key-for-key and gene-for-gene.
# NOTE(review): rows whose full name is missing (NaN) are skipped by groupby;
# confirm the CSV has no missing PATHWAY_ID / PATHWAY_NAME values.
pathwaydict_bioplanet = {
    full_name: symbols.to_list()
    for full_name, symbols in df_pathway.groupby('pathway_full_name', sort=False)['GENE_SYMBOL']
}

In [12]:
# Inspect the gene list stored for one example BioPlanet pathway
example_bioplanet_key = 'bioplanet_1_Inhibition of matrix metalloproteinases'
pathwaydict_bioplanet[example_bioplanet_key]

['HRAS', 'MMP2', 'MMP9', 'MMP14', 'TIMP1', 'TIMP2', 'TIMP3', 'TIMP4', 'RECK']

In [7]:
# Sanity-check the pathway dictionary.
# Comparing len(dict) with len(set(dict.keys())) is always True because dict keys
# are unique by definition, so it cannot catch anything. Instead compare the
# number of dictionary entries with the number of distinct PATHWAY_ID values in
# the source table: a mismatch means one ID carries several names, or distinct
# IDs ended up under the same "<ID>_<NAME>" key.
print(df_pathway['PATHWAY_ID'].nunique() == len(pathwaydict_bioplanet))
# Number of pathways in the dictionary
print(len(pathwaydict_bioplanet))

True
1658


In [8]:
# Serialize the BioPlanet pathway dictionary and write it to disk as JSON
with open('../example_file/pathwaydict_bioplanet.json', "w") as out_handle:
    out_handle.write(json.dumps(pathwaydict_bioplanet))

### **Generate pathway input from MSigDB JSON**

In [9]:
# Load the MSigDB hallmark gene-set collection (50 pathways) from JSON,
# then keep only the 'geneSymbols' list of every pathway entry.
with open('/data/scPAFA_paper/pathway/h.all.v2023.2.Hs.json', 'r') as hallmark_handle:
    hallmark_json = json.load(hallmark_handle)

pathwaydict_hallmark = {
    pathway_name: pathway_record['geneSymbols']
    for pathway_name, pathway_record in hallmark_json.items()
}

In [13]:
# Inspect the gene list stored for one example hallmark pathway
example_hallmark_key = 'HALLMARK_TNFA_SIGNALING_VIA_NFKB'
pathwaydict_hallmark[example_hallmark_key]

['ABCA1',
 'ACKR3',
 'AREG',
 'ATF3',
 'ATP2B1',
 'B4GALT1',
 'B4GALT5',
 'BCL2A1',
 'BCL3',
 'BCL6',
 'BHLHE40',
 'BIRC2',
 'BIRC3',
 'BMP2',
 'BTG1',
 'BTG2',
 'BTG3',
 'CCL2',
 'CCL20',
 'CCL4',
 'CCL5',
 'CCN1',
 'CCND1',
 'CCNL1',
 'CCRL2',
 'CD44',
 'CD69',
 'CD80',
 'CD83',
 'CDKN1A',
 'CEBPB',
 'CEBPD',
 'CFLAR',
 'CLCF1',
 'CSF1',
 'CSF2',
 'CXCL1',
 'CXCL10',
 'CXCL11',
 'CXCL2',
 'CXCL3',
 'CXCL6',
 'DENND5A',
 'DNAJB4',
 'DRAM1',
 'DUSP1',
 'DUSP2',
 'DUSP4',
 'DUSP5',
 'EDN1',
 'EFNA1',
 'EGR1',
 'EGR2',
 'EGR3',
 'EHD1',
 'EIF1',
 'ETS2',
 'F2RL1',
 'F3',
 'FJX1',
 'FOS',
 'FOSB',
 'FOSL1',
 'FOSL2',
 'FUT4',
 'G0S2',
 'GADD45A',
 'GADD45B',
 'GCH1',
 'GEM',
 'GFPT2',
 'GPR183',
 'HBEGF',
 'HES1',
 'ICAM1',
 'ICOSLG',
 'ID2',
 'IER2',
 'IER3',
 'IER5',
 'IFIH1',
 'IFIT2',
 'IFNGR2',
 'IL12B',
 'IL15RA',
 'IL18',
 'IL1A',
 'IL1B',
 'IL23A',
 'IL6',
 'IL6ST',
 'IL7R',
 'INHBA',
 'IRF1',
 'IRS2',
 'JAG1',
 'JUN',
 'JUNB',
 'KDM6B',
 'KLF10',
 'KLF2',
 'KLF4',
 'KLF6',
 'KLF9

In [14]:
# Sanity check on the hallmark dictionary: first line compares the key count
# with the distinct-key count, second line reports the number of pathways.
# (Dict keys are always distinct, so the first line prints True by construction.)
print(
    len(pathwaydict_hallmark) == len(set(pathwaydict_hallmark)),
    len(pathwaydict_hallmark),
    sep='\n',
)

True
50
