This notebook contains code for doing GWAS regressions using the PyStan software.

In [2]:
import pandas as pd
import json
import pystan
from scipy import stats

In [34]:
def _load_json(path):
    """Read the JSON file at ``path`` and return the decoded object."""
    with open(path) as infile:
        return json.load(infile)


def _invert_mapping(key_to_values):
    """Invert a ``{key: [value, ...]}`` mapping into ``{value: [key, ...]}``.

    Keys are appended in the iteration order of ``key_to_values``, so the
    result is identical to the nested ``setdefault(...).append(...)`` loops
    this helper replaces (including repeated entries, if a key lists the same
    value more than once). A plain ``dict`` is returned so that looking up an
    unknown value still raises ``KeyError``.
    """
    inverted = {}
    for key, values in key_to_values.items():
        for value in values:
            inverted.setdefault(value, []).append(key)
    return inverted


# Growth-rate table: tab-separated, first column used as the index.
growth_df = pd.read_csv("../Data/Growth_data/Cross_tolerance/Cleaned_growth_rates.tsv", sep="\t", index_col=0)

# strain -> genes, inverted to gene -> strains.
genotypes = _load_json("../Data/Mutation_data/Strain_to_genes.json")
gene_to_strains = _invert_mapping(genotypes)
gene_list = sorted(gene_to_strains)

# Same structure, read from the "all genes" file.
full_genotypes = _load_json("../Data/Mutation_data/Strain_to_all_genes.json")
gene_to_strains_all = _invert_mapping(full_genotypes)
all_gene_list = sorted(gene_to_strains_all)

# mutation -> gene names, inverted to gene -> mutations.
mut_to_genes = _load_json("../Data/Mutation_data/Mutations_to_gene_names.json")
gene_to_muts = _invert_mapping(mut_to_genes)

In [12]:
# Keep only measurements from strains that have genotype information.
gwas_df = growth_df[growth_df["strain"].isin(list(genotypes))]

# Average all measurements per (strain, compound). numeric_only=True makes the
# "drop non-numeric columns" behaviour explicit: recent pandas versions raise
# on non-numeric columns instead of silently discarding them.
gwas_df = gwas_df.groupby(["strain", "grown_compound"]).mean(numeric_only=True).reset_index()
gwas_df = gwas_df[gwas_df["rel_growth_rate"].notna()]

# Only genes hit in more than one strain are used as regressors.
gwas_genes = [gene for gene in gene_list if len(gene_to_strains[gene]) > 1]

# An indicator column must never shadow a measurement column of the same name.
name_clashes = sorted(set(gwas_genes) & set(gwas_df.columns))
if name_clashes:
    raise ValueError("Gene names clash with existing columns: {}".format(name_clashes))

# 0/1 indicator per gene: 1 if the row's strain is listed for that gene.
# All columns are built first and attached with a single concat, rather than
# inserting them one at a time with a Python-level membership test per row.
gene_indicators = pd.DataFrame(
    {gene: gwas_df["strain"].isin(gene_to_strains[gene]).astype("int64") for gene in gwas_genes},
    index=gwas_df.index,
    columns=gwas_genes,
)
gwas_df = pd.concat([gwas_df, gene_indicators], axis=1)

In [14]:
# Preview the first rows only instead of displaying the whole frame.
gwas_df.head(10)

Unnamed: 0,strain,grown_compound,tray,repl,growth_rate,tOD1,rel_growth_rate,growth_ratio,argG,atpI,...,yicC,yihQ,yijD,yjiP,ymfE,yobF,yphF,ypjA,ypjC,ytfR
0,12PD4-6,"1,2-propanediol",11.000000,2.00,0.649652,13.000000,0.268549,1.704664,0,0,...,0,0,0,0,0,0,0,0,0,0
1,12PD4-6,"2,3-butanediol",4.000000,2.00,0.441688,15.833333,0.241234,2.203432,0,0,...,0,0,0,0,0,0,0,0,0,0
2,12PD4-6,HMDA,6.000000,2.00,0.216226,46.750000,-0.009386,0.958399,0,0,...,0,0,0,0,0,0,0,0,0,0
3,12PD4-6,NaCl,12.000000,2.00,0.169823,44.000000,0.052829,1.451549,0,0,...,0,0,0,0,0,0,0,0,0,0
4,12PD4-6,adipate,7.000000,2.00,0.180535,37.333333,-0.007204,0.961628,0,0,...,0,0,0,0,0,0,0,0,0,0
5,12PD4-6,butanol,1.000000,2.00,0.624626,47.166667,0.153901,1.326944,0,0,...,0,0,0,0,0,0,0,0,0,0
6,12PD4-6,coumarate,3.000000,2.00,0.383346,38.000000,0.082534,1.274372,0,0,...,0,0,0,0,0,0,0,0,0,0
7,12PD4-6,glutarate,2.000000,2.00,0.182738,38.416667,-0.002124,0.988512,0,0,...,0,0,0,0,0,0,0,0,0,0
8,12PD4-6,hexanoate,9.000000,2.00,0.602023,12.916667,0.174753,1.408998,0,0,...,0,0,0,0,0,0,0,0,0,0
9,12PD4-6,isobutyrate,8.000000,2.00,0.297726,48.750000,-0.020414,0.935833,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Number of gene indicator columns present in the frame. Selecting them by
# name avoids relying on the indicators starting at column position 8.
gwas_df[gwas_genes].shape[1]

123

In [16]:
# Define the regression model
#
# Stan program for a linear regression of growth rates on the gene indicator
# matrix:
#     beta ~ normal(0, exp(log_tau))
#     y    ~ normal(intercept + X * beta, exp(log_sigma))
# Both standard deviations are parameterised on the log scale, and the
# coefficient scale exp(log_tau) is itself a sampled parameter shared by all
# coefficients.
#
# NOTE(review): the model block contains no sampling statements for
# `intercept`, `log_sigma` or `log_tau`, so they carry no explicit prior.
# Presumably Stan then treats them as having flat priors -- confirm against
# the Stan reference manual, and consider whether a weakly-informative prior
# on `log_tau` is needed for a proper posterior.

stan_model = """
data {
    int N; // Number of observations
    int M; // Number of genes
    vector[N] y; // Growth rates
    matrix[N, M] X; // Mutation data
}

parameters {
    vector[M] beta; // coefficients
    real intercept; // intercept of the fit
    real log_sigma; // Measuring STD
    real log_tau; // Hyperparameter: STD of prior on betas
}

model {
    beta ~ normal(0, exp(log_tau));
    y ~ normal(intercept + X*beta, exp(log_sigma));
}

"""

In [17]:
compound_list = [
    "HMDA", "putrescine", "1,2-propanediol", "2,3-butanediol",
    "glutarate", "adipate", "hexanoate", "octanoate", "isobutyrate", "coumarate", "butanol"]

chains = 4
stan_seed = 42  # fixed so that re-running the notebook reproduces the same draws

# Compile the Stan program once and reuse it for every compound.
# pystan.stan(model_code=...) recompiles the identical model on each call,
# which is by far the slowest part of the loop.
compiled_model = pystan.StanModel(model_code=stan_model)

stan_data_list = []

fits = {}
for comp in compound_list:
    print(comp)
    compound_df = gwas_df[gwas_df["grown_compound"] == comp]

    y = compound_df["rel_growth_rate"].values
    # Centre every gene indicator column on its mean within this compound.
    X = compound_df[gwas_genes]
    X = (X - X.mean()).values

    stan_data = {
        "N": len(y),
        "M": X.shape[1],
        "y": y,
        "X": X
    }
    stan_data_list.append(stan_data)
    fit = compiled_model.sampling(
        data=stan_data, chains=chains, n_jobs=chains, iter=500, seed=stan_seed)
    fits[comp] = fit

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_3040f2a787fc9a0a7519a2ea4cb77896 NOW.


HMDA


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_3040f2a787fc9a0a7519a2ea4cb77896 NOW.


putrescine


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_3040f2a787fc9a0a7519a2ea4cb77896 NOW.


1,2-propanediol


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_3040f2a787fc9a0a7519a2ea4cb77896 NOW.


2,3-butanediol


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_3040f2a787fc9a0a7519a2ea4cb77896 NOW.


glutarate


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_3040f2a787fc9a0a7519a2ea4cb77896 NOW.


adipate


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_3040f2a787fc9a0a7519a2ea4cb77896 NOW.


hexanoate


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)


octanoate


INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_3040f2a787fc9a0a7519a2ea4cb77896 NOW.
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_3040f2a787fc9a0a7519a2ea4cb77896 NOW.


isobutyrate


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_3040f2a787fc9a0a7519a2ea4cb77896 NOW.


coumarate


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_3040f2a787fc9a0a7519a2ea4cb77896 NOW.


butanol


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)


In [18]:
def calc_significance(ar, b=0):
    """Return a two-sided empirical tail probability of ``b`` within ``ar``.

    The percentile rank of ``b`` among the values in ``ar`` is computed with
    ``scipy.stats.percentileofscore``; the smaller of the two tails is doubled
    and rescaled from percent to a fraction.

    Parameters
    ----------
    ar : sequence of numbers
        Sample values (e.g. posterior draws of one coefficient).
    b : number, optional
        Reference value whose position in ``ar`` is evaluated (default 0).

    Returns
    -------
    float
        The doubled smaller tail fraction. If that is exactly 0,
        ``1 / len(ar)`` is returned instead so the result is never zero.
    """
    rank_pct = stats.percentileofscore(ar, b)
    smaller_tail_pct = min(rank_pct, 100 - rank_pct)
    two_sided = (smaller_tail_pct * 2) / 100
    # Substitute 1 / sample size for an exact zero.
    return two_sided if two_sided != 0 else 1. / len(ar)

# Per compound: posterior "p-value" and posterior mean of every gene coefficient.
p_values = {}
coef_means = {}
for comp, fit in fits.items():
    samples = fit.extract()
    beta_draws = samples["beta"]

    # One significance value per coefficient column, labelled by gene.
    gene_pvals = [calc_significance(beta_draws[:, col]) for col in range(len(gwas_genes))]
    p_values[comp] = dict(pd.Series(gene_pvals, index=gwas_genes))

    # Mean over draws (axis 0) of each coefficient, labelled by gene.
    gene_means = beta_draws.mean(0)
    coef_means[comp] = dict(pd.Series(gene_means, index=gwas_genes))

# Genes as rows, compounds as columns in the order given by compound_list.
p_val_df = pd.DataFrame(p_values)[compound_list]
coef_mean_df = pd.DataFrame(coef_means)[compound_list]

In [30]:
significance_threshold = 0.15

# Text written to the "info" sheet. The p-value produced by calc_significance
# is twice the smaller posterior tail mass around 0, i.e. one minus the mass
# of the *smallest* central credible interval that still contains 0 (the
# previous wording said "largest", which would always be the full interval).
info = """Gene coefficients estimated using Bayesian regression, with a normal prior (centered on 0) on coefficients.
Posterior 'pvalues' are reported, calculated as 1 - smallest central credible interval that contains 0.
Significant genes are selected where the posterior p-value is < {}
""".format(significance_threshold)

# For every compound, collect the mean coefficient of each gene whose
# posterior p-value is below the threshold. This is done before the workbook
# is opened so a failure here does not leave a partial file behind.
significant_genes = {}
for comp in p_val_df:
    comp_ser = p_val_df[comp]
    comp_genes = list(comp_ser[comp_ser < significance_threshold].index)
    comp_gene_coefs = dict(coef_mean_df[comp][comp_genes])
    significant_genes[comp] = comp_gene_coefs

# The context manager saves and closes the workbook on exit, also when an
# exception is raised; ExcelWriter.save() is not available in current pandas.
with pd.ExcelWriter("../Data/GWAS/Relative_diff_Bayesian_ridge_regression_coefs.xlsx") as writer:
    p_val_df.to_excel(writer, sheet_name="posterior p_vals")
    coef_mean_df.to_excel(writer, sheet_name="coef means")
    pd.DataFrame(significant_genes).to_excel(writer, sheet_name="significant genes")
    pd.DataFrame({"Info": info.split("\n")}).to_excel(writer, sheet_name="info", index=False)

In [35]:
# Short codes for the compounds. Only referenced by the disabled "found"
# column below; kept so that the column can be re-enabled.
comp_conversion = {
    "1,2-propanediol": "12PD", "2,3-butanediol": "23BD", "putrescine": "PUTR", "HMDA": "HMDA", "glutarate": "GLUT",
    "adipate": "ADIP", "hexanoate": "HEXA", "octanoate": "OCTA", "coumarate": "COUM", "isobutyrate": "IBUA",
    "butanol": "BUT"
}

# One sheet per fitted compound, genes sorted by descending mean coefficient.
# The context manager saves and closes the workbook on exit, also when an
# exception is raised; ExcelWriter.save() is not available in current pandas.
with pd.ExcelWriter("../Data/GWAS/Coefficients_and_significance.xlsx") as writer:
    for comp in fits:
        comp_df = pd.DataFrame({"mean": coef_mean_df[comp], "significance": p_val_df[comp]})
        # Disabled: `keio_genes` is not defined in the visible part of this notebook.
        #comp_df["found"] = comp_df.index.map(keio_genes[comp_conversion[comp]].__contains__)
        # Number of strains listed for the gene.
        comp_df["count"] = [len(gene_to_strains[gene]) for gene in comp_df.index]
        # Comma-separated mutation identifiers mapped to the gene.
        comp_df["mutations"] = [", ".join(gene_to_muts[gene]) for gene in comp_df.index]
        comp_df = comp_df.sort_values("mean", ascending=False)
        # sheet_name is passed by keyword; positional use is deprecated in recent pandas.
        comp_df.to_excel(writer, sheet_name=comp)

In [36]:
# Spot check: show the mutation identifiers mapped to the gene "hlyE".
gene_to_muts["hlyE"]

['SNP-1230727-A']