In [None]:
"""
This notebook takes the gestalt output 'gestalt_results.csv', the DynGENIE3 output 'genes_in_low_or_high_with_source.csv',
the KEGG ID mapping file 'user_ko.tsv', and the WGCNA output 'gene_module_colors.csv'.
It subsets the gestalt results to KEGG IDs, human ortholog IDs, and pathway descriptions,
then merges them with the alpha and weight values from DynGENIE3.
Finally, the WGCNA output is merged in so the result can be used to create a network in Cytoscape.

"""

In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the input tables. The four comma-separated files are read in one pass;
# the order of the paths matches the order of the names on the left.
# NOTE(review): `tfs` is read from 'enzyme_proteins.csv' -- confirm this is the
# table that is meant to be bound to that name.
gestalt_results, dosed_genes, modules, tfs = [
    pd.read_csv(csv_path)
    for csv_path in (
        'gestalt_results.csv',                     # gestalt output
        'genes_in_low_or_high_with_source.csv',    # DynGENIE3 output
        'gene_module_colors.csv',                  # WGCNA results
        '../ortholog/enzyme_proteins.csv',         # table bound to `tfs`
    )
]
# The KEGG ID mapping file is tab-separated, so it is read on its own.
kegg_ids = pd.read_csv('../user_ko.tsv', sep='\t')
# Preview the gestalt table.
gestalt_results.head()

Unnamed: 0.1,Unnamed: 0,geneSet,description,link,enrichmentScore,normalizedEnrichmentScore,pValue,FDR,size,plotPath,leadingEdgeNum,leadingEdgeId,userId,method,condition,timepoint,overlap,expect,enrichmentRatio,overlapId
0,0,hsa03010,Ribosome,http://www.kegg.jp/kegg-bin/show_pathway?hsa03...,0.690073,2.551845,0.0,0.0,123,./Project_wg_result1722354351_GSEA/hsa03010.png,90.0,64963;51116;51081;63875;9801;65005;29088;6207;...,K02863,Deseq2,low,2H,,,,
1,1,hsa00190,Oxidative phosphorylation,http://www.kegg.jp/kegg-bin/show_pathway?hsa00...,0.658854,2.412224,0.0,0.0,105,./Project_wg_result1722354351_GSEA/hsa00190.png,84.0,525;526;9296;516;517;518;527;10312;23545;50617...,K00235,Deseq2,low,2H,,,,
2,2,hsa04966,Collecting duct acid secretion,http://www.kegg.jp/kegg-bin/show_pathway?hsa04...,0.801551,2.146047,0.0,0.0,21,./Project_wg_result1722354351_GSEA/hsa04966.png,21.0,525;526;9296;527;10312;23545;50617;535;245972;...,K02145,Deseq2,low,2H,,,,
3,3,hsa05120,Epithelial cell signaling in Helicobacter pylo...,http://www.kegg.jp/kegg-bin/show_pathway?hsa05...,0.660879,2.107016,0.0,0.0,47,./Project_wg_result1722354351_GSEA/hsa05120.png,22.0,1432;5600;5603;6300;525;526;9296;527;10312;235...,K02144,Deseq2,low,2H,,,,
4,4,hsa04145,Phagosome,http://www.kegg.jp/kegg-bin/show_pathway?hsa04...,0.605591,2.101155,0.0,0.0,77,./Project_wg_result1722354351_GSEA/hsa04145.png,40.0,525;526;9296;527;10312;23545;50617;535;245972;...,K02144,Deseq2,low,2H,,,,


In [4]:
# Keep only the gestalt columns used downstream: pathway ID and description,
# the KEGG ID ('userId'), both enrichment scores, the run metadata and the p-value.
gestalt_subset = gestalt_results[['geneSet','description','userId','enrichmentScore','normalizedEnrichmentScore',
                                  'method','condition','timepoint','pValue']]
# Drop mapping rows that have no KO and strip a trailing 't1' from every value
# in the 'target' column. The column is rebuilt with .assign() on the dropna()
# result instead of being assigned into it afterwards, which avoids pandas'
# SettingWithCopyWarning while producing the same frame.
kegg_ids = (
    kegg_ids
    .dropna(subset=['KO'])
    .assign(target=lambda mapping: mapping['target'].str.replace('t1$', '', regex=True))
)
# Attach each target gene's KO to the DynGENIE3 rows (pd.merge defaults to an
# inner join, so targets without a KO are dropped here).
dyngenie_kegg = pd.merge(kegg_ids, dosed_genes, on='target')
# Relabel the mapping table so the same IDs can be joined on the regulatory gene.
kegg_ids_renamed = kegg_ids.rename(columns={"target":"regulatory","KO":"KO_regulatory"})
# Attach each regulatory gene's KO as 'KO_regulatory'.
dyngenie_kegg_merged = pd.merge(kegg_ids_renamed, dyngenie_kegg, on='regulatory')
# Keep only rows whose target has a KEGG ID.
dyngenie_kegg_filtered = dyngenie_kegg_merged.dropna(subset=['KO'])
# Subset to the columns needed for the network table.
dyngenie_subset = dyngenie_kegg_filtered[['target','KO','regulatory','alpha','weight','out.degree','KO_regulatory']]
# gestalt stores the KEGG ID in 'userId'; rename it so both tables share the 'KO' key.
gestalt_renamed = gestalt_subset.rename(columns={"userId":"KO"})
# Join the enrichment results to the DynGENIE3 rows on the target's KEGG ID.
gestalt_dyngenie = pd.merge(gestalt_renamed,dyngenie_subset,on="KO")
# Absolute enrichment score, used for visualization and for the sort below.
gestalt_dyngenie['abs_es'] = gestalt_dyngenie['enrichmentScore'].abs()
# Strongest enrichment (by absolute score) first.
gestalt_sorted = gestalt_dyngenie.sort_values(by=['abs_es'],ascending=False)
# Save to csv.
gestalt_sorted.to_csv('gestalt_dyngenie.csv')
# Display the distinct KEGG IDs that survived the joins.
gestalt_sorted['KO'].unique()

array(['K01047', 'K01897', 'K01672', 'K03283', 'K00002', 'K01084',
       'K18246', 'K10879', 'K00699', 'K09448', 'K00031', 'K05210',
       'K00033', 'K01363', 'K01596', 'K00236', 'K02649', 'K01077',
       'K00889', 'K00710', 'K00182', 'K11251', 'K00507', 'K00623',
       'K02183', 'K00261', 'K04632', 'K00079', 'K00799', 'K01593',
       'K04362', 'K07198', 'K00910', 'K02871', 'K02864', 'K00070',
       'K01539', 'K02295', 'K00012', 'K01081', 'K00604', 'K00016',
       'K01866', 'K01176', 'K00657', 'K00058', 'K00654', 'K00922',
       'K02677', 'K01230', 'K05870', 'K04962', 'K00234', 'K02326',
       'K01349', 'K05175', 'K06237', 'K13524', 'K00444', 'K00416',
       'K00572', 'K04615', 'K09201'], dtype=object)

In [5]:
# Add the WGCNA module colour of each regulatory gene.
# The module table keys genes by 'Gene'; relabel that column to 'regulatory'
# so it can be used as the join key.
modules_renamed = modules.rename(columns={"Gene":"regulatory"})
dyngenie_wgcna = gestalt_sorted.merge(modules_renamed, on='regulatory')

# Write the combined table to disk, then preview it.
dyngenie_wgcna.to_csv('dyngenie_wgcna.csv')
dyngenie_wgcna.head()

Unnamed: 0,geneSet,description,KO,enrichmentScore,normalizedEnrichmentScore,method,condition,timepoint,pValue,target,regulatory,alpha,weight,out.degree,KO_regulatory,abs_es,ModuleColor
0,hsa00591,Linoleic acid metabolism,K01047,0.999187,1.900848,Deseq2,high,6H,0.0,Dapma7bEVm028843,Dapma7bEVm010536,0.004584,0.018643,36.885746,K09073,0.999187,#E41A1C
1,hsa00591,Linoleic acid metabolism,K01047,0.999187,1.900848,Deseq2,high,6H,0.0,Dapma7bEVm006657,Dapma7bEVm010536,0.017974,0.00065,36.885746,K09073,0.999187,#E41A1C
2,hsa00591,Linoleic acid metabolism,K01047,0.999187,1.900848,Deseq2,high,6H,0.0,Dapma7bEVm005643,Dapma7bEVm010536,0.005027,0.00013,36.885746,K09073,0.999187,#E41A1C
3,hsa00592,alpha-Linolenic acid metabolism,K01047,0.997786,1.912247,Deseq2,high,6H,0.0,Dapma7bEVm005643,Dapma7bEVm010536,0.005027,0.00013,36.885746,K09073,0.997786,#E41A1C
4,hsa00592,alpha-Linolenic acid metabolism,K01047,0.997786,1.912247,Deseq2,high,6H,0.0,Dapma7bEVm028843,Dapma7bEVm010536,0.004584,0.018643,36.885746,K09073,0.997786,#E41A1C


In [6]:
# Write a second copy of the table, joined to the `tfs` table.

# `tfs` holds its KEGG IDs in 'KO'; relabel that column to 'KO_regulatory' so
# the join below matches on the regulatory gene's KEGG ID.
tfs = tfs.rename(columns={'KO':'KO_regulatory'})
tfs_wgcna = dyngenie_wgcna.merge(tfs, on='KO_regulatory')
tfs_wgcna.to_csv("tfs_wgcna.csv")

In [35]:
# Build the table for DESeq2: one row per distinct
# (description, KO, condition, pValue) combination, with incomplete rows removed.
# NOTE(review): no filter on the 'method' column is applied here -- confirm the
# input only contains DESeq2 rows if that is what this table is meant to hold.
deseq_table = (
    dyngenie_wgcna[['description','KO','condition','pValue']]
    .drop_duplicates()
    .dropna()
)
# Keep the rows whose p-value is not exactly 0.
# NOTE(review): the `_sig` suffix suggests a significance cut-off, but this
# filter only excludes pValue == 0 -- confirm that this is the intended rule.
nonzero_pvalue = deseq_table['pValue'].ne(0)
deseq_table_sig = deseq_table.loc[nonzero_pvalue]
deseq_table_sig.to_csv('deseq_table_sig.csv')