In [None]:
'''
This file can be used for DEGs output from DeSeq2 and files from output of DynGENIE3.

It adds KEGG IDs to both files, sorts the files by their scores (logFC and alphas, respectively), and filters so 10k genes
or less are in each file.

Alphas tend to have a lot of duplicate scores. Jitter is added and scores are aggregated for duplicate genes. Unique values of 
genes and scores must be used for Gestalt.

'''

In [36]:
import pandas as pd
import numpy as np
import re
import glob
import os

In [37]:
#import original gene expression files
expression_levels = pd.read_csv('../Data/omics/rna_vst_proc.csv')
#import DEGs from DynGENIE3
#dosed_genes = pd.read_csv('genes_in_low_or_high_with_source.csv')
#import human orthologs
human_orthologs = pd.read_csv('../Data/ortholog/dma_hsa.tsv',sep='\t')
#import DEGs from Deseq2
data_dir_deseq2 = "Deseq2 Results"
csv_files = glob.glob(f"{data_dir_deseq2}/*.csv")
deseq_dfs = [pd.read_csv(file) for file in csv_files]
#import pathways
#pathways = pd.read_csv('

  expression_levels = pd.read_csv('../Data/omics/rna_vst_proc.csv')


In [15]:
#split combined file to two conditions
#dosed_genes_low = dosed_genes.loc[dosed_genes['source'] == 'low']
#dosed_genes_high = dosed_genes.loc[dosed_genes['source'] == 'high']

Unnamed: 0,treatment,HSA
0,Dapma7bEVm020240,THAP9
1,Dapma7bEVm022481,
2,Dapma7bEVm029341,
3,Dapma7bEVm016833,C8orf33;LOC105373926;LOC105374103;LOC105376526...
4,Dapma7bEVm022524,


In [38]:
#get kegg IDs and orthologs, matched on Dapma gene
human_renamed = human_orthologs.rename(columns={"Daphnia_magna": "treatment"})
expression_levels_renamed = expression_levels.rename(columns={"Unnamed: 0": "KEGG"})
kegg_human = pd.merge(expression_levels_renamed,human_renamed,on='treatment')

In [39]:
#function to prepare Deseq2 files for gestalt
def deseq_gestalt(df,file_name):

    #rename index column
    df = df.rename(columns={df.columns[0]: 'GeneID'})
    #rename kegg_human['target'] to GeneID
    kegg_human_renamed = kegg_human.rename(columns={"treatment":"GeneID"})
    kegg_human_subset = kegg_human_renamed[['KEGG','GeneID']]
    #drop any values where KEGG ID is not found
    kegg_human_subset = kegg_human_subset.dropna(subset=['KEGG'])
    #clean human orthologs so only one entry for one gene
    def retain_first_gene(gene_string):
        return gene_string.split(';')[0]
    # Apply the function to the 'genes' column
    kegg_human_subset['KEGG'] = kegg_human_subset['KEGG'].apply(retain_first_gene)
    #add kegg IDs on geneID
    merged_kegg_deseq = pd.merge(kegg_human_subset,df,on='GeneID')
    #subset df to only two columns - KEGG and logfc
    kegg_deseq_subset = merged_kegg_deseq[['KEGG','log2FoldChange']]
    #order by absolute value of logfc
    deseq_sorted = kegg_deseq_subset.reindex(kegg_deseq_subset['log2FoldChange'].abs().sort_values(ascending=False).index)
    #get top 10k genes and save rnk for Gestalt
    deseq_head = deseq_sorted.head(10000)
    #rename columns
    deseq_head_renamed = deseq_head.rename(columns={"KEGG":"GeneID","log2FoldChange":"score"})
    #save to rnk file
    deseq_head_renamed.to_csv(file_name, sep='\t', index=False,header=False)

    return deseq_head

In [40]:
#run gestalt prep for 18 deseq files
for file in range(len(csv_files)):
    deseq_gestalt(deseq_dfs[file],os.path.splitext(csv_files[file])[0]+'_filtered.rnk')

In [12]:
#prepare output of DynGENIE3 for Gestalt
def process_gestalt(df): 
    
    #process files for GESTALT, requires target genes and their alphas
    target = df[['target','alpha']]
    
    # Rename the 'target' column to 'regulatory' in control_target
    target_renamed = target.rename(columns={'target': 'treatment'})
    
    #add KEGG IDs and remove NANs
    kegg_gene_file = expression_levels_renamed[['KEGG','treatment']]
    merged_keggs = pd.merge(kegg_gene_file,target_renamed,on='treatment')
    
    # Sort by 'alpha' column
    sorted_df = merged_keggs.sort_values(by='alpha',ascending=False).reset_index(drop=True)

    #drop nans and remove semi-colons from kegg ids, rename to prepare for GMT
    nans_dropped = sorted_df.dropna(subset=['KEGG'])
    filtered_df = nans_dropped[['KEGG','alpha']]
    filtered_df['KEGG'] = filtered_df['KEGG'].str.split(';').str[0]
    renamed_df = filtered_df.rename(columns={'KEGG': 'NAME'})

    return renamed_df

In [17]:
#run initial processing for treatment conditions
ranked_genes_low = process_gestalt(dosed_genes_low)
ranked_genes_high = process_gestalt(dosed_genes_high)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['KEGG'] = filtered_df['KEGG'].str.split(';').str[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['KEGG'] = filtered_df['KEGG'].str.split(';').str[0]


In [19]:
#add jitter and aggregate scores - Gestalt cannot handle duplicate scores or genes
def prepare_gestalt(df, file_name):
    
    # Add jitter to scores to handle ties
    np.random.seed(42)  # For reproducibility
    df['alpha'] = df['alpha'] + np.random.uniform(-0.01, 0.01, df.shape[0])

    # Aggregate scores for duplicated genes
    aggregated_scores = df.groupby('NAME')['alpha'].mean().reset_index()

    # Save to .rnk file
    aggregated_scores.to_csv(file_name, sep="\t", index=False, header=False)

# Prepare and save the .rnk files
prepare_gestalt(ranked_genes_low, 'gestalt_alphas_low.rnk')
prepare_gestalt(ranked_genes_high, 'gestalt_alphas_high.rnk')