In [2]:
import pandas as pd
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
import numpy as np
from statsmodels.formula.api import GLM
import statsmodels as sm
from statsmodels.genmod.families.family import NegativeBinomial, Gaussian

In [3]:
# Compound codes; each one selects the per-compound input and output files
# used by the loops further down.
compound_list = "HMDA PUTR HEXA OCTA IBUA COUM".split()

In [5]:
def calc_deviance(formula, data, family=NegativeBinomial()):
    """Fit a GLM to ``data`` and return the deviance of the fit.

    Parameters
    ----------
    formula : str
        Model formula handed to ``GLM.from_formula``.
    data : pandas.DataFrame
        Table holding the variables named in ``formula``.
    family : statsmodels family instance, optional
        Error distribution of the GLM; a negative binomial family by default.

    Returns
    -------
    The ``deviance`` attribute of the fitted results object.
    """
    glm = GLM.from_formula(formula=formula, data=data, family=family)
    return glm.fit().deviance

def analysis_of_deviance(data, response, factor1, factor2):
    """Analysis of deviance for a two-factor design.

    Four GLMs are fitted through ``calc_deviance`` (additive, additive plus
    interaction, and each factor on its own) and every term is scored by the
    drop in deviance obtained when that term is added to the model lacking it.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the response and both factor columns.
    response, factor1, factor2 : str
        Column names used to build the model formulas.

    Returns
    -------
    dict
        Deviance differences keyed by ``"factor1:factor2"``, ``factor1`` and
        ``factor2`` (inserted in that order).
    """
    interaction = "{}:{}".format(factor1, factor2)

    def deviance_of(*terms):
        # Deviance of the model "response ~ term + term + ...".
        return calc_deviance("{} ~ {}".format(response, " + ".join(terms)), data)

    additive = deviance_of(factor1, factor2)
    return {
        # Additive model vs. additive + interaction.
        interaction: additive - deviance_of(factor1, factor2, interaction),
        # Effect of a factor = deviance without it minus the additive deviance.
        factor1: deviance_of(factor2) - additive,
        factor2: deviance_of(factor1) - additive,
    }

# For every compound: load the FPKM table, code strain and medium as 0/1
# factors, run the per-gene analysis of deviance, attach the sign of each
# effect, and export one TSV per compound.
for comp in compound_list:
    print(comp)

    # Load the expression table (one row per gene) and index it by gene name.
    df = pd.read_csv("../Data/RNA-Seq/GSEA/FPKM_counts_{}.txt".format(comp), sep="\t")
    # The gene called "def" is prefixed with an underscore before being used as
    # a formula term (presumably because `def` is a Python keyword and cannot be
    # parsed as a variable name -- confirm).  The comparison must be an exact
    # match: `gene in ("def")` is a substring test against the string "def" and
    # would also rename genes called "d", "e", "f", "de" or "ef".
    df["NAME"] = ["_" + gene if gene == "def" else gene for gene in df["NAME"]]
    df.index = df["NAME"]
    del df["NAME"]
    del df["DESCRIPTION"]
    gene_list = list(df.index)
    df = df.transpose()  # now one row per sample, one column per gene

    # The third line of the .cls file is split on whitespace and assigned as
    # one class label per sample row.
    with open("../Data/RNA-Seq/GSEA/classes_{}.cls".format(comp)) as infile:
        classes = infile.readlines()[2].strip().split()

    df["CLASSES"] = classes
    # Factor coding: STRAIN is 0 when the label contains "MG1655", else 1;
    # MEDIUM is 0 when the label contains "m9" (case-insensitive), else 1.
    df["STRAIN"] = df["CLASSES"].map(lambda x: 0 if "MG1655" in x else 1)
    df["MEDIUM"] = df["CLASSES"].map(lambda x: 0 if "m9" in x.lower() else 1)
    df = df[["CLASSES", "STRAIN", "MEDIUM"] + gene_list]

    data = []
    for gene in gene_list:
        if df[gene].var() == 0:
            # A constant gene cannot be modelled; record zero for all three
            # terms.  "STRAIN" has to be present here, otherwise the row ends
            # up with a NaN and is silently removed by the not-null filter
            # below.  Key order mirrors analysis_of_deviance so the column
            # order of res_df does not depend on which gene comes first.
            di = {"MEDIUM:STRAIN": 0, "MEDIUM": 0, "STRAIN": 0}
        else:
            di = analysis_of_deviance(df, gene, "MEDIUM", "STRAIN")
        di["GENE"] = gene
        data.append(di)

    res_df = pd.DataFrame(data)
    # Drop genes for which any statistic is missing (e.g. a fit that yielded NaN).
    res_df = res_df[pd.notnull(res_df).all(axis=1)].copy()

    # Direction of each effect = sign of the difference in group means.  Only
    # the gene columns are averaged: the frame also holds the string column
    # CLASSES, and DataFrame.mean() raises on non-numeric columns in pandas >= 2.0.
    expr = df[gene_list]
    medium1, medium0 = df["MEDIUM"] == 1, df["MEDIUM"] == 0
    strain1, strain0 = df["STRAIN"] == 1, df["STRAIN"] == 0

    media_directions = np.sign(expr[medium1].mean() - expr[medium0].mean())
    strain_directions = np.sign(expr[strain1].mean() - expr[strain0].mean())
    # Interaction: medium effect within STRAIN == 1 minus medium effect within
    # STRAIN == 0.
    interaction_directions = np.sign(
        (expr[medium1 & strain1].mean() - expr[medium0 & strain1].mean())
        - (expr[medium1 & strain0].mean() - expr[medium0 & strain0].mean())
    )

    res_df["medium_direction"] = res_df["GENE"].map(media_directions.get)
    res_df["strain_direction"] = res_df["GENE"].map(strain_directions.get)
    res_df["interaction_direction"] = res_df["GENE"].map(interaction_directions.get)

    # NOTE(review): the escaped name "_def" is exported as-is; confirm the
    # downstream script expects that spelling.
    res_df.to_csv("../Data/RNA-Seq/ANOVA_results_{}.tsv".format(comp), sep="\t", index=False)

HMDA
PUTR
HEXA
OCTA
IBUA
COUM


### Run the R script "Gene_set_analysis.R" on the exported files (`ANOVA_results_<compound>.tsv`)



### Load and process the results of "Gene_set_analysis.R" (`gsa_result_<compound>.tsv`)

In [6]:
# Collapse the direction-specific p-value columns of every GSA result table
# into one column per effect, sort by the strain p-value and export to Excel.
for comp in compound_list:
    df = pd.read_csv("../Data/RNA-Seq/gsa_result_{}.tsv".format(comp), sep="\t")
    for col in ("Medium_p", "Strain_p", "Interaction_p"):
        up_col, dn_col = col + "_UP", col + "_DN"
        # Row-wise sum that skips missing values, so a row with a single
        # p-value keeps that value.  min_count=1 leaves the result missing when
        # BOTH columns are missing; filling with 0 before adding would report
        # such a row as p = 0, i.e. as the most significant hit.
        df[col + "_fdr"] = df[[up_col, dn_col]].sum(axis=1, min_count=1)
        df = df.drop(columns=[up_col, dn_col])
    df = df[["Medium_ES", "Medium_p_fdr", "Strain_ES", "Strain_p_fdr", "Interaction_ES", "Interaction_p_fdr"]]
    # Ascending sort; rows with a missing Strain_p_fdr are placed last.
    df = df.sort_values("Strain_p_fdr")
    df.to_excel("../Data/RNA-Seq/Cleaned/GSA_Result_{}.xlsx".format(comp))

In [9]:
# Show the GSA table currently bound to `df`, ordered by ascending Strain_p_fdr.
# NOTE(review): `df` is whatever an earlier cell left behind (presumably the
# last compound handled by the loop above) -- confirm which compound this is.
df.sort_values("Strain_p_fdr")

Unnamed: 0,Medium_ES,Medium_p_fdr,Strain_ES,Strain_p_fdr,Interaction_ES,Interaction_p_fdr
SIGMA38_POS,0.34889,1.00000,-0.77014,0.000000,0.41382,0.95880
RCSB_POS,0.41094,1.00000,-0.96429,0.006656,-0.93283,0.76580
GADX_POS,-0.45471,0.93866,-0.92691,0.008190,-0.86333,0.75893
PHOP_POS,-0.68401,0.51026,-0.86377,0.008190,0.46301,0.95376
GADE_POS,0.58735,1.00000,-0.89042,0.011503,-0.87850,1.00000
FLIZ_REP,-0.46335,0.92553,-0.91657,0.023037,-0.41843,0.96889
ECO03010,-0.57164,0.70636,-0.78685,0.052764,-0.35755,0.95393
H-NS_POS,0.43063,1.00000,-0.79626,0.102830,0.44798,0.96777
ECO00410,-0.46395,0.93607,-0.89868,0.110980,-0.52093,0.90660
YDEO_POS,0.47905,1.00000,-0.90518,0.112550,-0.76959,0.64170


In [106]:
# Inspect the raw GSA result for HEXA (directional p-values still in separate
# *_UP / *_DN columns).
pd.read_csv("../Data/RNA-Seq/gsa_result_HEXA.tsv", sep="\t")

Unnamed: 0,Medium_ES,Medium_p_UP,Medium_p_DN,Strain_ES,Strain_p_UP,Strain_p_DN,Interaction_ES,Interaction_p_UP,Interaction_p_DN
ECO00010,0.68754,0.184060,,-0.57859,,0.482520,0.60296,0.399610,
ECO00020,0.70950,0.290650,,0.37911,0.98934,,0.74320,0.146060,
ECO00030,0.74573,0.158920,,-0.43404,,0.818220,0.61804,0.428690,
ECO00040,-0.31367,,1.000000,0.46732,0.98153,,0.34157,0.912680,
ECO00051,0.75455,0.056491,,0.47836,0.96122,,0.74348,0.088827,
ECO00052,0.46761,0.679500,,-0.45006,,0.764620,0.68161,0.218830,
ECO00053,0.64636,0.610550,,0.61343,0.83083,,0.63020,0.556850,
ECO00061,-0.83731,,0.120250,0.80716,0.38073,,-0.69690,,0.480000
ECO00071,-0.52896,,0.844850,-0.86408,,0.071723,0.55290,0.650050,
ECO00130,-0.73962,,0.216670,0.46073,0.99087,,-0.58422,,0.599920


In [77]:
# Rows of res_df without a medium direction, i.e. genes whose name was not
# found when the direction Series was looked up by GENE.
res_df[pd.isnull(res_df["medium_direction"])]

Unnamed: 0,GENE,MEDIUM,STRAIN,STRAIN:MEDIUM,medium_direction,strain_direction,interaction_direction
631,def,42.89509,22.186114,8.137389,,,


In [17]:
# Samples coded MEDIUM == 0 and STRAIN == 0; with the coding used in the
# processing loop these are the ones whose class label contains "m9" and "MG1655".
df[(df["MEDIUM"] == 0) & (df["STRAIN"] == 0)]

NAME,CLASSES,STRAIN,MEDIUM,thrL,rnhB,yedL,intG,fliH,fliF,fliG,...,ftnA,tyrP,yecF,fliD,fliT,fliS,amyA,yedF,yedE,yedK
old_MG1655-M9-_1,MG1655_M9,0,0,0.0,126.027,175.638,0.0,11.7411,9.89887,6.56088,...,148.234,37.5162,315.02,4.81542,10.5005,12.2357,173.368,284.34,184.532,8.87312
old_MG1655-M9-_2,MG1655_M9,0,0,0.0,68.6303,117.979,0.0,5.54898,6.03487,11.35,...,92.5561,31.1865,148.914,7.32016,6.03769,14.0922,173.436,420.158,217.457,12.6152
old_MG1655-M9-_3,MG1655_M9,0,0,0.0,80.5611,126.208,0.0,7.24025,5.296,17.9025,...,110.928,39.7186,221.895,5.00129,2.73799,2.1092,187.652,212.647,165.767,13.7224


In [49]:
# Display the STRAIN column of `df`.
# NOTE(review): the saved output shows "mut"/"wt" labels, whereas the
# processing loop codes STRAIN as 0/1 -- this output predates the current code.
df["STRAIN"]

old_COUM4-2-coum-_1    mut
old_COUM4-2-coum-_2    mut
old_COUM4-2-coum-_3    mut
old_COUM4-2-M9-_1      mut
old_COUM4-2-M9-_2      mut
old_COUM4-2-M9-_3      mut
old_MG1655-coum-_1      wt
old_MG1655-coum-_2      wt
old_MG1655-coum-_3      wt
old_MG1655-M9-_1        wt
old_MG1655-M9-_2        wt
old_MG1655-M9-_3        wt
Name: STRAIN, dtype: object

In [14]:
# Number of distinct values in the NAME column.
# NOTE(review): the processing loop deletes NAME from `df`, so this cell
# presumably ran against a freshly loaded FPKM table -- it depends on kernel
# state that is not reproduced by running the notebook top to bottom.
len(set(df["NAME"]))

4240

In [96]:
def _transposed_gene_sets(gmx_file):
    """Read a tab-separated .gmx file and return it transposed.

    After transposition column 0 is overwritten with the placeholder "NA" and
    every remaining missing cell becomes an empty string.
    """
    table = pd.read_csv(gmx_file, sep="\t").transpose()
    table[0] = "NA"
    return table.fillna("")


# Pathway and regulon gene sets, stacked and written as one headerless,
# tab-separated .gmt file.
gs_path = _transposed_gene_sets("../Data/RNA-Seq/GSEA/pathway_genesets_KEGG_2016_UPPER.gmx")
gs_reg = _transposed_gene_sets("../Data/RNA-Seq/GSEA/regulon_genesets_2016_dualdeconvolute_UPPER.gmx")

gs_comb = pd.concat([gs_path, gs_reg])

gs_comb.to_csv("../Data/RNA-Seq/GSEA/Combined_genesets_pathway_regulon.gmt", sep="\t", header=None)

In [98]:
# Inspect the raw GSA result for COUM (directional p-values still in separate
# *_UP / *_DN columns).
pd.read_csv("../Data/RNA-Seq/gsa_result_COUM.tsv", sep="\t")

Unnamed: 0,Medium_ES,Medium_p_UP,Medium_p_DN,Strain_ES,Strain_p_UP,Strain_p_DN,Interaction_ES,Interaction_p_UP,Interaction_p_DN
ECO00010,0.47566,0.93726,,-0.51236,,0.766060,-0.74298,,0.044794
ECO00020,0.87034,0.40241,,-0.50414,,0.812660,0.41033,0.88522,
ECO00030,-0.44089,,0.85966,-0.50072,,0.811920,-0.80017,,0.045401
ECO00040,0.39040,0.95388,,0.59301,0.94993,,0.73558,0.39431,
ECO00051,-0.48046,,0.81260,0.45975,0.87719,,-0.40234,,0.829610
ECO00052,-0.65717,,0.54013,0.37314,0.92663,,-0.43908,,0.791810
ECO00053,0.83518,0.51766,,0.62503,0.96690,,0.80769,0.38619,
ECO00061,-0.79946,,0.41355,0.79171,0.75151,,-0.59628,,0.696470
ECO00071,-0.22466,,0.99676,-0.70231,,0.540090,0.65168,0.49522,
ECO00130,0.49004,0.92774,,-0.40967,,0.939470,0.32161,1.00000,
