In [None]:
'''
This notebook reads the top 10 enrichment results files from Gestalt for each condition/timepoint.
It concatenates the data and adds annotations describing where each row came from, based on the file names (low, high, timepoint, limma/dyngenie).
It also cleans the KEGG IDs so that only one ID is retained for each row.
The resulting DataFrame is saved to a CSV file for visualization in Tableau.

'''

In [17]:
import glob
import os

import numpy as np
import pandas as pd

In [21]:
# Define the folder path
folder_path = "gestalt_tables"

# Get a list of all text files in the folder. glob does not guarantee any
# particular ordering, so sort the matches to keep the row order of the
# combined table reproducible from run to run.
file_list = sorted(glob.glob(f"{folder_path}/*.txt"))

# Fail early with a clear message instead of letting pd.concat raise
# "No objects to concatenate" on an empty list.
if not file_list:
    raise FileNotFoundError(f"No .txt files found in '{folder_path}'")

# Tokens recognised in the '_'-separated file name. Earlier entries win when
# more than one token of a group is present, matching the if/elif priority
# this cell has always used.
methods = ['alphas', 'limma']
conditions = ['low', 'high']
timepoints = ['1H', '2H', '6H', '12H', '24H', '4D', '5D', '6D', '7D']

# Initialize a list to store DataFrames
dataframes = []

# Loop through the file list and read each file into a DataFrame
for file in file_list:
    df = pd.read_csv(file, sep="\t")

    # Extract information from the file name. os.path handles both '/' and
    # '\\' separators and strips only the trailing extension.
    base_name = os.path.splitext(os.path.basename(file))[0]
    parts = base_name.split('_')

    # Every annotation is recomputed for each file and falls back to 'NA'
    # when no token matches, so a file can never inherit the value that was
    # detected for the previously processed file.
    method = next((m for m in methods if m in parts), 'NA')
    condition = next((c for c in conditions if c in parts), 'NA')
    timepoint = next((tp for tp in timepoints if tp in parts), 'NA')

    # Add new columns to the DataFrame
    df['method'] = method
    df['condition'] = condition
    df['timepoint'] = timepoint

    # Append the DataFrame to the list
    dataframes.append(df)

# Concatenate all DataFrames into one
final_df = pd.concat(dataframes, ignore_index=True)

In [25]:
# Function to retain only the first ID
def clean_user_ids(user_ids):
    """Return the first ID from a ';'-separated string of IDs.

    Parameters
    ----------
    user_ids : str or any
        Cell value to clean. Strings are split on ';' and the first piece is
        kept. Any non-string value (for example the NaN pandas uses for an
        empty cell) has no ``split`` method, so it is returned unchanged
        instead of raising ``AttributeError``.

    Returns
    -------
    str or any
        The text before the first ';' for string input (the whole string if
        it contains no ';'); otherwise the input value itself.
    """
    if not isinstance(user_ids, str):
        return user_ids
    return user_ids.split(';')[0]

# Reduce every entry of the 'userId' column to its first ID
final_df['userId'] = final_df['userId'].map(clean_user_ids)

# Write the combined, annotated table to a CSV file
final_df.to_csv('gestalt_results.csv')