Purpose: To characterize the core genes and stress-responsive genes found in each coexpression module.<br>
Author: Anna Pardo<br>
Date initiated: Sept. 26, 2023

In [1]:
# import modules
import pandas as pd
import os

We have already established that modules containing core genes from enriched or near-enriched TF families are themselves enriched in other core genes.

In [2]:
# read the gene list of every coexpression module
# each "*_genes.csv" file holds one module; the module name is the part of the
# file name before the first underscore, and the gene IDs are in column "x"
directory = "../../data/WGCNA_output/outputs_from_hpcc/"
modgenes = {}
for filename in os.listdir(directory):
    # skip anything that is not a per-module gene file
    if not filename.endswith("_genes.csv"):
        continue
    modname = filename.partition("_")[0]
    module_table = pd.read_csv(directory + filename, sep="\t", header="infer")
    modgenes[modname] = list(module_table["x"])

In [3]:
# read the core gene table (tab-separated) and preview the first rows
core_genes_path = "../../data/CoreGenes_Final_Info.tsv"
cg = pd.read_csv(core_genes_path, sep="\t", header="infer")
cg.head()

Unnamed: 0,GeneID,Name,Upregulated,Downregulated,From_RF,From_Set_Ops
0,Zm00001eb418060,glutathione S-transferase2,Y,N,N,Y
1,Zm00001eb225440,,Y,Y,Y,N
2,Zm00001eb197060,,Y,N,N,Y
3,Zm00001eb322670,,Y,Y,Y,N
4,Zm00001eb089800,aconitase8,Y,Y,Y,N


In [4]:
# read the TPM table (tab-separated)
tpm_path = "../../data/rawtpm_bptreat_noPEG.tsv"
tpm = pd.read_csv(tpm_path, sep="\t", header="infer")

In [5]:
# reshape the TPM table: one row per gene, one column per sample
# 1) index by sample and drop the metadata columns
tpm_by_sample = tpm.set_index("Sample")
tpm_values = tpm_by_sample.drop(["BioProject", "Treatment"], axis=1)
# 2) flip so that genes become rows, then expose the gene IDs as a "GeneID" column
tpm_by_gene = tpm_values.transpose().reset_index()
ttpm = tpm_by_gene.rename(columns={"index": "GeneID"})
ttpm.head()

Sample,GeneID,SRR11933261,SRR11933272,SRR11933250,SRR11933029,SRR11933040,SRR11932822,SRR11932811,SRR11933230,SRR11932879,...,Ms71D3C,Ki3D1C,CML228D1D,CML333D3D,MO18WD3C,B73D3C,NC358D3C,P39D3D,M162WD3D,M162WD1D
0,Zm00001eb000010,12.553818,16.255838,9.028815,8.20134,10.371251,37.430009,39.925873,30.677016,23.393003,...,1.417104,1.923525,1.427602,9.580153,1.2281,2.966207,1.791556,4.286976,3.435711,3.498243
1,Zm00001eb000020,2.321077,3.110372,2.984479,2.385748,2.799099,27.508819,22.44068,24.648455,7.595576,...,0.0,1.799671,0.0,0.0,1.925157,0.561768,0.176413,0.781353,0.379497,0.463832
2,Zm00001eb000050,0.04252,0.405226,0.0,0.0,0.0,0.0,0.0,0.0,0.304751,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Zm00001eb000060,12.932676,7.214039,3.092442,1.726808,1.280629,29.510498,22.148225,22.170584,14.727189,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Zm00001eb000070,5.253755,1.902461,2.586555,1.926412,3.771234,7.005587,7.590336,5.274585,2.177748,...,0.0,0.451827,0.0,1.018369,0.0,0.0,0.0,0.0,1.660372,0.748587


In [6]:
# read every per-stressor fold change table into a dictionary keyed by stressor name
stress_fc = {}
for f in os.listdir("../../data"):
    # only the "*_meanTPM.tsv" files are wanted, and the combined all-stress file is excluded
    if f == "log2FoldChange_allStress_meanTPM.tsv" or not f.endswith("_meanTPM.tsv"):
        continue
    # the stressor name is what follows the first "e_" in the text before the first "_m"
    prefix = f.strip().split("_m")[0]
    s = prefix.split("e_")[1]
    stress_fc[s] = pd.read_csv("../../data/" + f, sep="\t", header="infer")

In [7]:
# split each fold change table into its stress-responsive genes:
#   dngenes -> FoldChange < 0.5
#   upgenes -> FoldChange > 2
#   srgenes -> both of the above stacked together (down first, then up)
srgenes, upgenes, dngenes = {}, {}, {}
for stress, fc_table in stress_fc.items():
    downregulated = fc_table[fc_table["FoldChange"] < 0.5]
    upregulated = fc_table[fc_table["FoldChange"] > 2]
    dngenes[stress] = downregulated
    upgenes[stress] = upregulated
    srgenes[stress] = pd.concat([downregulated, upregulated])

In [8]:
# read the table of core genes that are TFs from the families of interest, then preview it
core_tf_path = "../../data/core_genes_TF_fam_of_interest.tsv"
cgtf = pd.read_csv(core_tf_path, sep="\t", header="infer")
cgtf.head()

Unnamed: 0,gene ID,Name,Upregulated,Downregulated,From_RF,From_Set_Ops,family,protein name
0,Zm00001eb042240,DRE-binding protein 1,Y,N,Y,N,AP2/ERF-ERF,ZmEREB204
1,Zm00001eb249290,AP2-EREBP-transcription factor 195,N,Y,N,Y,AP2/ERF-AP2,ZmEREB195
2,Zm00001eb318890,AP2-EREBP-transcription factor 36,Y,Y,Y,N,AP2/ERF-AP2,ZmEREB36
3,Zm00001eb296430,AP2-EREBP-transcription factor 34,Y,Y,Y,N,AP2/ERF-AP2,ZmEREB34
4,Zm00001eb146690,bZIP-transcription factor 95,Y,N,Y,N,bZIP,ZmbZIP95


In [9]:
# read the transcription factor table (comma-separated) and preview it
tf_table_path = "../../data/data.csv"
tf = pd.read_csv(tf_table_path, sep=",", header="infer")
tf.head()

Unnamed: 0,number for sorting purposes,protein name,family,gene ID,synonym,clone,all gene IDs,Unnamed: 7
0,1,ZmABI1,ABI3-VP1,Zm00001eb143690,VP1,pUT3351,Zm00001eb143690 Zm00001d042396 GRMZM2G133398,
1,2,ZmABI2,ABI3-VP1,Zm00001eb427970,ABI2,pUT3663,Zm00001d026005 Zm00001eb427970 GRMZM2G018485,
2,3,ZmABI3,ABI3-VP1,Zm00001eb066270,ABI3,pUT3236,Zm00001eb066270 Zm00001d001838 GRMZM2G149940,
3,4,ZmABI4,ABI3-VP1,Zm00001eb051330,ABI4,pUT4675,Zm00001d033313 GRMZM2G098063 Zm00001eb051330,
4,5,ZmABI5,ABI3-VP1,Zm00001eb218990,ABI5,pUT4673,Zm00001d013722 Zm00001eb218990 GRMZM2G320754,


In [10]:
# merge the gene IDs of both TF tables into a single de-duplicated list
all_tf_ids = tf["gene ID"].unique()
core_tf_ids = cgtf["gene ID"].unique()
tfl = list(all_tf_ids)
cgtfl = list(core_tf_ids)
# a gene present in both tables is kept only once
tflist = list(set([*tfl, *cgtfl]))

In [14]:
# start with the easy one: what percent of each module is core genes?
## to do this, need to note for each gene in each module whether it is a core gene (Y/N),
## whether it is a TF (Y/N), and how it is regulated (Up, Down, or None) in response to each stress
## result: moddfs maps module name -> dataframe with one row per gene in that module

# Build every lookup once, as sets. Previously the unique-ID lists were rebuilt
# for every gene (and every stressor) and searched linearly, which made this
# cell scale with (number of genes) x (size of each table).
core_ids = set(cg["GeneID"])
tf_ids = set(tflist)

# the stressors that become the "<stressor>Reg" columns, in output column order
stress_names = ["Flooding", "Drought", "Salt", "Heat", "Low_Nitrogen", "Cold"]

# Fail loudly if a fold change table is missing. Previously a missing stressor
# left its list empty and zip() then silently produced zero-row dataframes.
# NOTE: stressors present in upgenes/dngenes but not listed above are ignored here.
missing = [s for s in stress_names if s not in upgenes or s not in dngenes]
if missing:
    raise KeyError("No fold change table loaded for: " + ", ".join(missing))

up_ids = {s: set(upgenes[s]["GeneID"]) for s in stress_names}
dn_ids = {s: set(dngenes[s]["GeneID"]) for s in stress_names}

moddfs = {}
for m, genelist in modgenes.items():
    # core gene / TF flags for every gene of this module
    df = pd.DataFrame({
        "GeneID": genelist,
        "isCore": ["Y" if g in core_ids else "N" for g in genelist],
        "isTF": ["Y" if g in tf_ids else "N" for g in genelist],
    })
    # regulation per stressor; "Up" is checked before "Down", as before
    for s in stress_names:
        df[s + "Reg"] = [
            "Up" if g in up_ids[s] else "Down" if g in dn_ids[s] else "None"
            for g in genelist
        ]
    # add Module column
    df["Module"] = m
    # add dataframe to dictionary
    moddfs[m] = df

In [17]:
# stack the per-module dataframes row-wise into one big dataframe
module_frames = [moddfs[name] for name in moddfs]
allmods = pd.concat(module_frames, axis=0)

In [18]:
# what percent of each module is core genes?
# build a two-column dataframe (Module, Percent_Core_Genes) to answer this question
modules = list(allmods["Module"].unique())
percent_core = []
for module_name in modules:
    # all rows of this module, and the subset flagged as core genes
    members = allmods[allmods["Module"] == module_name]
    core_members = members[members["isCore"] == "Y"]
    # the percentage is computed over unique gene IDs
    n_core = len(core_members["GeneID"].unique())
    n_total = len(members["GeneID"].unique())
    percent_core.append((n_core / n_total) * 100)

pcdf = pd.DataFrame({"Module": modules, "Percent_Core_Genes": percent_core})

In [19]:
# display the percent of core genes in each module
pcdf

Unnamed: 0,Module,Percent_Core_Genes
0,tan,2.431118
1,grey,33.333333
2,green,1.328413
3,turquoise,1.254871
4,darkgreen,0.0
5,greenyellow,5.487805
6,yellow,6.449975
7,royalblue,0.956938
8,darkred,2.926829
9,red,4.43459


In [20]:
# modules enriched in core genes (both including and excluding TFs of families of interest)
enrichmod = [
    "green",
    "turquoise",
    "greenyellow",
    "yellow",
    "red",
    "magenta",
    "lightcyan",
    "pink",
    "lightgreen",
    "black",
    "cyan",
    "brown",
    "grey60",
    "midnightblue",
]
# modules containing TFs of families of interest
tfintmod = [
    "turquoise",
    "greenyellow",
    "yellow",
    "red",
    "magenta",
    "lightcyan",
    "lightgreen",
    "black",
    "brown",
    "grey60",
    "midnightblue",
]

In [21]:
# add two Y/N columns to pcdf saying whether each module appears in
# enrichmod (CG_enriched) and in tfintmod (contains_TF_ofint)
module_names = list(pcdf["Module"])
isenriched = ["Y" if name in enrichmod else "N" for name in module_names]
istfint = ["Y" if name in tfintmod else "N" for name in module_names]

pcdf["CG_enriched"] = isenriched
pcdf["contains_TF_ofint"] = istfint

In [23]:
# show only the modules flagged as enriched in core genes
enriched_mask = pcdf["CG_enriched"] == "Y"
pcdf[enriched_mask]

Unnamed: 0,Module,Percent_Core_Genes,CG_enriched,contains_TF_ofint
2,green,1.328413,Y,N
3,turquoise,1.254871,Y,Y
5,greenyellow,5.487805,Y,Y
6,yellow,6.449975,Y,Y
9,red,4.43459,Y,Y
10,magenta,4.639805,Y,Y
12,lightcyan,7.356948,Y,Y
14,pink,1.086957,Y,N
15,lightgreen,9.363296,Y,Y
16,black,0.965195,Y,Y


In [None]:
# what percent of each module is responsive to each stressor? (exclude core genes)
# for this part, we are not separating upregulated & downregulated, just looking broadly at responsive genes
pdr = []
pfr = []
psr = []
plnr = []
phr = []
pcr = []
