In [None]:
'''
This file takes the top 10 enrichment results files from Gestalt for each condition/timepoint.
It concats the data and adds annotation about where the data came from based on file names (low, high, timepoint, Deseq2/dyngenie).
It also cleans the KEGG IDs so that one ID is retained for each row.
This df is saved to a csv for visualization in Tableau.

'''

In [11]:
import pandas as pd
import numpy as np
import glob

In [19]:
LRT_genes = pd.read_csv('GSEA tables/Deseq2_LRT_GSEA_results.txt',sep='\t')
LRT_genes.head()

Unnamed: 0,geneSet,description,link,enrichmentScore,normalizedEnrichmentScore,pValue,FDR,size,plotPath,leadingEdgeNum,leadingEdgeId,userId
0,hsa05168,Herpes simplex virus 1 infection,http://www.kegg.jp/kegg-bin/show_pathway?hsa05...,0.733297,4.059869,0,0.0,360,./Project_wg_result1747077891_GSEA/hsa05168.png,334,1967;100129543;100129842;100131017;100131827;1...,K03239;K09228;K09228;K09228;K09228;K09228;K092...
1,hsa00514,Other types of O-glycan biosynthesis,http://www.kegg.jp/kegg-bin/show_pathway?hsa00...,0.893547,3.299792,0,0.0,31,./Project_wg_result1747077891_GSEA/hsa00514.png,8,10585;29954;56913;283464;727936;3955;4242;5986,K00728;K00728;K00731;K05948;K05948;K05948;K136...
2,hsa05322,Systemic lupus erythematosus,http://www.kegg.jp/kegg-bin/show_pathway?hsa05...,0.733045,3.288947,0,0.0,73,./Project_wg_result1747077891_GSEA/hsa05322.png,29,2904;115482686;221613;3012;3013;3014;3015;3177...,K05210;K11090;K11251;K11251;K11251;K11251;K112...
3,hsa03082,ATP-dependent chromatin remodeling,http://www.kegg.jp/kegg-bin/show_pathway?hsa03...,0.785735,3.158816,0,0.0,42,./Project_wg_result1747077891_GSEA/hsa03082.png,27,115482686;221613;3012;3013;3014;3015;317772;47...,K11251;K11251;K11251;K11251;K11251;K11251;K112...
4,hsa04613,Neutrophil extracellular trap formation,http://www.kegg.jp/kegg-bin/show_pathway?hsa04...,0.651065,3.077983,0,0.0,87,./Project_wg_result1747077891_GSEA/hsa04613.png,28,5335;115482686;221613;3012;3013;3014;3015;3177...,K01116;K11251;K11251;K11251;K11251;K11251;K112...


In [12]:
# Define the folder path
folder_path = "GSEA tables"

# Get a list of all text files in the folder
file_list = glob.glob(f"{folder_path}/*.txt")

# Initialize a list to store DataFrames
dataframes = []

# Loop through the file list and read each file into a DataFrame
for file in file_list:
    df = pd.read_csv(file, sep="\t")
    
    # Extract information from the file name
    base_name = file.split('/')[-1].replace('.txt', '')
    parts = base_name.split('_')
    
    if 'alphas' in parts:
        method = 'alphas'
    elif 'Deseq2' in parts:
        method = 'Deseq2'
    
    if 'low' in parts:
        condition = 'low'
    elif 'high' in parts:
        condition = 'high'
    
    timepoints = ['1H', '2H', '6H', '12H', '24H', '4D', '5D', '6D', '7D']
    timepoint = 'NA'
    for tp in timepoints:
        if tp in parts:
            timepoint = tp
            break
    
    # Add new columns to the DataFrame
    df['method'] = method
    df['condition'] = condition
    df['timepoint'] = timepoint
    
    # Append the DataFrame to the list
    dataframes.append(df)

# Concatenate all DataFrames into one
final_df = pd.concat(dataframes, ignore_index=True)

In [13]:
# Function to retain only the first ID
def clean_user_ids(user_ids):
    return user_ids.split(';')[0]

# Apply the function to the 'UserIDs' column
final_df['userId'] = final_df['userId'].apply(clean_user_ids)

#final_df.to_csv('GSEA tables/gestalt_results.csv')

In [10]:
final_df.head()

Unnamed: 0,geneSet,description,link,enrichmentScore,normalizedEnrichmentScore,pValue,FDR,size,plotPath,leadingEdgeNum,leadingEdgeId,userId,method,condition,timepoint
0,hsa05322,Systemic lupus erythematosus,http://www.kegg.jp/kegg-bin/show_pathway?hsa05...,0.756303,2.669571,0.0,0.0,89,./Project_wg_result1747073913_GSEA/hsa05322.png,60,2904;126961;3020;3021;333932;440093;653604;829...,K05210,Deseq2,low,7D
1,hsa04613,Neutrophil extracellular trap formation,http://www.kegg.jp/kegg-bin/show_pathway?hsa04...,0.596004,2.221414,0.0,0.0,129,./Project_wg_result1747073913_GSEA/hsa04613.png,70,3146;5335;126961;3020;3021;333932;440093;65360...,K01116,Deseq2,low,7D
2,hsa00982,Drug metabolism,http://www.kegg.jp/kegg-bin/show_pathway?hsa00...,-0.704906,-2.131923,0.0,0.0,45,./Project_wg_result1747073913_GSEA/hsa00982.png,19,10720;10941;54490;54575;54576;54577;54578;5457...,K00699,Deseq2,low,7D
3,hsa00830,Retinol metabolism,http://www.kegg.jp/kegg-bin/show_pathway?hsa00...,-0.790168,-2.170939,0.0,0.0,25,./Project_wg_result1747073913_GSEA/hsa00830.png,20,10720;10941;54490;54575;54576;54577;54578;5457...,K00699,Deseq2,low,7D
4,hsa00860,Porphyrin metabolism,http://www.kegg.jp/kegg-bin/show_pathway?hsa00...,-0.766302,-2.276019,0.0,0.0,38,./Project_wg_result1747073913_GSEA/hsa00860.png,27,3145;3163;211;212;2058;1371;10720;10941;54490;...,K00228,Deseq2,low,7D


In [24]:
#filter pathways by if they are significant across time according to LRT
filtered_pathways = final_df[final_df['geneSet'].isin(LRT_genes['geneSet'])]
filtered_pathways.to_csv('GSEA tables/gestalt_results.csv')