Purpose: Extract top predictors (genes) from random forest results and combine into a putative core stress-responsive gene list.<br>
Author: Anna Pardo<br>
Date initiated: August 9, 2023

In [7]:
# load modules
import pandas as pd
import statistics
import scipy.stats as stats
import matplotlib.pyplot as plt
import os

In [34]:
# write function(s) for predictor extraction
def fc_enrichment(n,fcfidf,alternative="two-sided"):
    """
    Run Fisher's exact test on the 2x2 table of ("DE" vs "Not DE") x (top n genes vs all remaining genes).

    Args:
        n = set size/number of top genes to pull out
        fcfidf = dataframe containing both feature importance (descending sorted) and fold change information, including
        whether fold change indicates the gene is "DE" or not (uses the columns "GeneID" and "DE"; the "DE" column must
        hold the labels "DE" / "Not DE"). The dataframe is not modified.
        alternative = alternative hypothesis passed on to scipy.stats.fisher_exact. The default "two-sided" keeps the
        original behavior, which also reports a small p-value when the top set is depleted in DE genes; use "greater"
        to test only for enrichment of DE genes in the top set.
    Returns:
        p-value of the test (float)
    Raises:
        ValueError if the "DE" column contains labels other than "DE" / "Not DE"
    """
    # GeneIDs of the top n rows; isin() flags every row carrying one of those IDs in a single pass
    top_ids = fcfidf["GeneID"].head(n)
    topbottom = fcfidf["GeneID"].isin(top_ids).map({True:"Top",False:"Bottom"}).rename("TopBottom")
    # count genes per DE label / set membership without writing a helper column into the caller's dataframe
    counts = pd.crosstab(index=fcfidf["DE"],columns=topbottom)
    unexpected = set(counts.index) - {"DE","Not DE"}
    if unexpected:
        raise ValueError("Unexpected labels in DE column: "+str(sorted(map(str,unexpected))))
    # force a full 2x2 table: a category with no genes (e.g. n >= number of rows, so nothing is "Bottom") becomes a
    # zero row/column instead of a malformed table. Orientation [[DE&Top, DE&Bottom],[Not DE&Top, Not DE&Bottom]]
    # makes alternative="greater" mean "DE genes are over-represented in the top set".
    table = counts.reindex(index=["DE","Not DE"],columns=["Top","Bottom"],fill_value=0)
    # run Fisher's exact test; element 1 of the result is the p-value
    return float(stats.fisher_exact(table.to_numpy(),alternative=alternative)[1])

In [42]:
def extract_top_predictors(setsizes,fcfile,fifile,alpha=0.05,fc_upper=2,fc_lower=0.5):
    """
    Pick the largest candidate set size whose top genes differ significantly (Fisher's exact test via fc_enrichment)
    from the remaining genes in their share of "DE" genes, and return the GeneIDs of that top set.

    Args:
        setsizes = list of set sizes to use
        fcfile = full path to file containing fold change information (tab delimited; needs columns "GeneID" and
        "FoldChange")
        fifile = full path to file containing feature importance information, sorted in descending order by importance
        (tab delimited; needs column "GeneID")
        alpha = significance cutoff applied to the p-value of each set size (default 0.05)
        fc_upper = genes with FoldChange >= this value are labelled "DE" (default 2)
        fc_lower = genes with FoldChange <= this value are labelled "DE" (default 0.5)
    Returns:
        list of GeneIDs in the largest significant top set
    Raises:
        ValueError if none of the set sizes is significant
    """
    setsizes = list(setsizes)
    # load files
    fcdf = pd.read_csv(fcfile,sep="\t",header="infer")
    fidf = pd.read_csv(fifile,sep="\t",header="infer")
    # merge dataframes; the default inner merge keeps the row order of fidf (the left frame), so the result is still
    # sorted by descending importance and head(n) below really is "the top n genes"
    merged = fidf.merge(fcdf[["GeneID","FoldChange"]])
    # set up a variable for whether genes are "DE" or not (missing fold changes compare False, i.e. "Not DE")
    # NOTE(review): the default cutoffs 2 / 0.5 are symmetric for a linear fold change. The input files used in this
    # notebook are named log2FoldChange_*; if the FoldChange column is log2-scaled the cutoffs should probably be
    # 1 / -1 instead -- confirm against the file contents.
    is_de = (merged["FoldChange"] >= fc_upper) | (merged["FoldChange"] <= fc_lower)
    merged["DE"] = is_de.map({True:"DE",False:"Not DE"})

    # for each set size option, save the p-value from Fisher's exact test and whether or not it is significant
    pvals = [fc_enrichment(s,merged) for s in setsizes]
    sigdf = pd.DataFrame({"SetSize":setsizes,"p_value":pvals})

    # subset dataframe to only significant set sizes
    sigsets = sigdf[sigdf["p_value"] < alpha]
    if sigsets.empty:
        # previously this case only printed a message and then crashed on an undefined variable
        raise ValueError("No significant set sizes. Try a different set size list.")

    # find maximum significant set size
    maxsig = int(sigsets["SetSize"].max())

    # pull out the list of genes of that set size and return it
    return list(merged.head(n=maxsig)["GeneID"])

In [5]:
# set directory in which to look for input feature importance files
# (relative path, so it is resolved against the notebook's working directory; the trailing "/" matters because
# the file-listing cell below builds paths by plain string concatenation)
directory = "../../data/rf_outputs/upsample_13-Jul-2023/"

In [11]:
# build a list of input FI files: every "*_sorted.tsv" found one level below `directory`
# os.listdir() returns entries in arbitrary order and also returns plain files, so sort for a reproducible
# list and skip anything that is not a folder (listing a stray file such as .DS_Store would raise)
fifiles = []
for d in sorted(os.listdir(directory)):
    subdir = directory+d+"/"
    if not os.path.isdir(subdir):
        continue
    for f in sorted(os.listdir(subdir)):
        if f.endswith("_sorted.tsv"):
            fifiles.append(subdir+f)

In [17]:
# build a list of input FC files: every "*_meanTPM.tsv" in the data folder except the all-stress file
fcfiles = [
    "../../data/"+fname
    for fname in os.listdir("../../data/")
    if fname.endswith("_meanTPM.tsv") and fname != "log2FoldChange_allStress_meanTPM.tsv"
]

In [18]:
# display the collected fold change file paths to check that the all-stress file was left out
fcfiles

['../../data/log2FoldChange_Flooding_meanTPM.tsv',
 '../../data/log2FoldChange_Drought_meanTPM.tsv',
 '../../data/log2FoldChange_Salt_meanTPM.tsv',
 '../../data/log2FoldChange_Heat_meanTPM.tsv',
 '../../data/log2FoldChange_Low_Nitrogen_meanTPM.tsv',
 '../../data/log2FoldChange_Cold_meanTPM.tsv']

In [22]:
# display the fold change file paths again (repeat of the previous display cell)
fcfiles

['../../data/log2FoldChange_Flooding_meanTPM.tsv',
 '../../data/log2FoldChange_Drought_meanTPM.tsv',
 '../../data/log2FoldChange_Salt_meanTPM.tsv',
 '../../data/log2FoldChange_Heat_meanTPM.tsv',
 '../../data/log2FoldChange_Low_Nitrogen_meanTPM.tsv',
 '../../data/log2FoldChange_Cold_meanTPM.tsv']

In [19]:
# make a list of stressors
# these strings are compared exactly against the names parsed out of the FI and FC file paths in the
# re-ordering cell below, so spelling and capitalisation must match the file/folder names
stressors = ["Drought","Salt","Cold","Flooding","Heat","Low_Nitrogen"]

In [25]:
# re-order lists of files and bind into dataframe such that it's aligned with the stressors list
# each stressor must match exactly one FI file and one FC file; otherwise the three columns would silently
# shift against each other (or be truncated) when combined, so stop with an error instead
fi = []
fc = []
for s in stressors:
    # FI files: the name of the folder containing the file, up to the text "Test", is the stressor
    fi_hits = [j for j in fifiles if os.path.basename(os.path.dirname(j.strip())).split("Test")[0] == s]
    # FC files are named log2FoldChange_<stressor>_meanTPM.tsv
    fc_hits = [k for k in fcfiles if os.path.basename(k.strip()) == "log2FoldChange_"+s+"_meanTPM.tsv"]
    if len(fi_hits) != 1 or len(fc_hits) != 1:
        raise ValueError(
            "Expected exactly one FI file and one FC file for "+s
            +", found "+str(len(fi_hits))+" FI and "+str(len(fc_hits))+" FC files"
        )
    fi.append(fi_hits[0])
    fc.append(fc_hits[0])

# make dataframe
files = pd.DataFrame({"Stressor":stressors,"FI_file":fi,"FC_file":fc})

In [32]:
# preview the stressor / FI file / FC file table (head() shows only the first 5 of the 6 stressor rows)
files.head()

Unnamed: 0,Stressor,FI_file,FC_file
0,Drought,../../data/rf_outputs/upsample_13-Jul-2023/Dro...,../../data/log2FoldChange_Drought_meanTPM.tsv
1,Salt,../../data/rf_outputs/upsample_13-Jul-2023/Sal...,../../data/log2FoldChange_Salt_meanTPM.tsv
2,Cold,../../data/rf_outputs/upsample_13-Jul-2023/Col...,../../data/log2FoldChange_Cold_meanTPM.tsv
3,Flooding,../../data/rf_outputs/upsample_13-Jul-2023/Flo...,../../data/log2FoldChange_Flooding_meanTPM.tsv
4,Heat,../../data/rf_outputs/upsample_13-Jul-2023/Hea...,../../data/log2FoldChange_Heat_meanTPM.tsv


In [29]:
# make list of set sizes: 50, 100, ..., 5000 (steps of 50)
sets = list(range(50,5050,50))

In [43]:
# make a dictionary of the lists of top predictors, keyed by stressor
# (columns of `files` are, in order: stressor, FI file, FC file)
top_preds = {}
for stressor,fi_path,fc_path in files.itertuples(index=False,name=None):
    top_preds[stressor] = extract_top_predictors(sets,fc_path,fi_path)

In [44]:
# what are the set sizes for each stressor? (one "stressor: number of genes" line each)
for stressor,genes in top_preds.items():
    print(stressor,len(genes),sep=": ")

Drought: 5000
Salt: 5000
Cold: 5000
Flooding: 5000
Heat: 5000
Low_Nitrogen: 5000


In [45]:
# try taking the intersection of all the gene sets and see how long it is
# first turn each stressor's list of top predictors into a set
drought,salt,cold,flood,heat,ln = (
    set(top_preds[stressor])
    for stressor in ("Drought","Salt","Cold","Flooding","Heat","Low_Nitrogen")
)

In [46]:
# genes that appear in the top predictor set of every stressor
core = set.intersection(drought,salt,cold,flood,heat,ln)

In [47]:
# number of genes shared by the top predictor sets of all six stressors
len(core)

174