In [None]:
"""
This notebook takes the gestalt output 'gestalt_results.csv', the DynGENIE3 output 'genes_in_low_or_high_with_source.csv',
the KEGG ID mapping file 'user_ko.tsv', and the WGCNA output 'gene_module_colors.csv'.
It subsets the gestalt results to KEGG IDs, human ortholog IDs, and pathway descriptions,
and merges them with the alpha and weight values from DynGENIE3.
The WGCNA output is then merged in to produce the table used to create a network in Cytoscape.

"""

In [2]:
import pandas as pd
import numpy as np

In [10]:
# Load the four input tables used by the rest of the notebook.

# gestalt enrichment results
gestalt_results = pd.read_csv('gestalt_results.csv')
# DynGENIE3 output
dosed_genes = pd.read_csv('genes_in_low_or_high_with_source.csv')
# KEGG ID mapping file (tab-separated, one directory above the notebook)
kegg_ids = pd.read_table('../user_ko.tsv')
# WGCNA results
modules = pd.read_csv('gene_module_colors.csv')

In [27]:
# Combine the gestalt enrichment results with the DynGENIE3 table via KEGG IDs,
# rank the combined rows by absolute enrichment score, and write them to
# 'gestalt_dyngenie.csv'.

# Keep the five gestalt columns used downstream: gene set ID, description,
# user ID (holds the KEGG ID; renamed to 'KO' below) and both enrichment scores.
gestalt_subset = gestalt_results[['geneSet','description','userId','enrichmentScore','normalizedEnrichmentScore']]

# Drop mapping rows without a KO and strip a trailing 't1' from the 'target'
# column, which is the join key for the DynGENIE3 merge below.
# The cleaned column is attached with .assign() on the dropna() result instead
# of a separate column assignment, which avoids pandas' SettingWithCopyWarning.
kegg_ids = (
    kegg_ids
    .dropna(subset=['KO'])
    .assign(target=lambda df: df['target'].str.replace('t1$', '', regex=True))
)

# Inner join: annotate each DynGENIE3 row with the KO of its 'target'.
dyngenie_kegg = pd.merge(kegg_ids, dosed_genes, on='target')

# Relabel the same mapping so it can annotate the 'regulatory' column as well.
kegg_ids_renamed = kegg_ids.rename(columns={"target":"regulatory","KO":"KO_regulatory"})

# Inner join: add the KO of the 'regulatory' entry as 'KO_regulatory'.
dyngenie_kegg_merged = pd.merge(kegg_ids_renamed, dyngenie_kegg, on='regulatory')

# Keep only rows whose target has a KEGG ID.
dyngenie_kegg_filtered = dyngenie_kegg_merged.dropna(subset=['KO'])

# Subset to the columns carried into the final table.
dyngenie_subset = dyngenie_kegg_filtered[['target','KO','regulatory','alpha','weight','out.degree','KO_regulatory']]

# Rename gestalt's 'userId' to 'KO' so both tables share the join key.
gestalt_renamed = gestalt_subset.rename(columns={"userId":"KO"})

# Merge on KEGG IDs and add the absolute enrichment score used for
# visualization and for the ranking below.
gestalt_dyngenie = (
    pd.merge(gestalt_renamed, dyngenie_subset, on="KO")
    .assign(abs_es=lambda df: df['enrichmentScore'].abs())
)

# Sort by absolute enrichment score, largest first.
gestalt_sorted = gestalt_dyngenie.sort_values(by=['abs_es'], ascending=False)

# Save to csv.
# NOTE(review): the row index is written as an unnamed first column; pass
# index=False if the consumer of this file does not expect it.
gestalt_sorted.to_csv('gestalt_dyngenie.csv')

# Cell output: the distinct KEGG IDs present in the merged table.
gestalt_sorted['KO'].unique()

array(['K01047', 'K01897', 'K01672', 'K03283', 'K00002', 'K01084',
       'K18246', 'K10879', 'K00699', 'K09448', 'K00031', 'K05210',
       'K00033', 'K01363', 'K01596', 'K00236', 'K02649', 'K01077',
       'K00889', 'K00710', 'K00182', 'K11251', 'K00507', 'K00623',
       'K02183', 'K00261', 'K04632', 'K00079', 'K00799', 'K01593',
       'K04362', 'K07198', 'K00910', 'K02871', 'K02864', 'K00070',
       'K01539', 'K02295', 'K00012', 'K01081', 'K00604', 'K00016',
       'K01866', 'K01176', 'K00657', 'K00058', 'K00654', 'K00922',
       'K02677', 'K01230', 'K05870', 'K04962', 'K00234', 'K02326',
       'K01349', 'K05175', 'K06237', 'K13524', 'K00444', 'K00416',
       'K00572', 'K04615', 'K09201'], dtype=object)

In [42]:
# Merge the ranked gestalt/DynGENIE3 table with the WGCNA results to get module
# colors, and write the result to 'dyngenie_wgcna.csv'.

# Rename WGCNA's 'Gene' column to 'KO' and strip a trailing '.<digits>' suffix
# so the values can be matched against the 'KO' column of gestalt_sorted.
modules_renamed = (
    modules
    .rename(columns={"Gene":"KO"})
    .assign(KO=lambda df: df['KO'].str.replace(r'\.\d+$', '', regex=True))
)

# Inner join on 'KO'.
# NOTE(review): if several WGCNA entries collapse to the same KO once the
# suffix is removed, this join repeats rows — confirm that is intended.
dyngenie_wgcna = pd.merge(gestalt_sorted, modules_renamed, on='KO')

# Save to csv (the row index is written as an unnamed first column).
dyngenie_wgcna.to_csv('dyngenie_wgcna.csv')

# Cell output: the distinct pathway descriptions that survive the merge.
dyngenie_wgcna['description'].unique()

array(['Linoleic acid metabolism', 'alpha-Linolenic acid metabolism',
       'Fat digestion and absorption', 'Arachidonic acid metabolism',
       'Vascular smooth muscle contraction', 'Ether lipid metabolism',
       'Pancreatic secretion', 'Fatty acid biosynthesis', 'Ferroptosis',
       'Fatty acid degradation', 'Fatty acid metabolism',
       'PPAR signaling pathway', 'Thermogenesis', 'Nitrogen metabolism',
       'Antigen processing and presentation',
       'Estrogen signaling pathway', 'Legionellosis',
       'Longevity regulating pathway', 'Measles', 'Toxoplasmosis',
       'Protein processing in endoplasmic reticulum',
       'Lipid and atherosclerosis', 'Spliceosome',
       'Ascorbate and aldarate metabolism',
       'Pentose and glucuronate interconversions', 'Pyruvate metabolism',
       'Adipocytokine signaling pathway', 'Starch and sucrose metabolism',
       'Galactose metabolism', 'Insulin resistance',
       'Carbohydrate digestion and absorption', 'AMPK signaling pat