This notebook contains code for doing GWAS regressions using the PyStan software.

In [1]:
import pandas as pd
import json
import pystan
from scipy import stats

In [12]:
def _invert_multimap(mapping):
    """Turn ``{key: [values, ...]}`` into ``{value: [keys, ...]}``.

    Keys are appended in the iteration order of ``mapping``.
    """
    inverted = {}
    for key, values in mapping.items():
        for value in values:
            inverted.setdefault(value, []).append(key)
    return inverted


# Growth-rate table; the first column of the TSV becomes the index.
growth_df = pd.read_csv(
    "../Data/Growth_data/Cross_tolerance/Processed_growth_rates.tsv",
    sep="\t",
    index_col=0,
)

# strain -> genes, inverted to gene -> strains.
with open("../Data/Mutation_data/Strain_to_genes.json") as infile:
    genotypes = json.load(infile)
gene_to_strains = _invert_multimap(genotypes)
gene_list = sorted(gene_to_strains)

# mutation -> genes, inverted to gene -> mutations.
with open("../Data/Mutation_data/Mutations_to_gene_names.json") as infile:
    mut_to_genes = json.load(infile)
gene_to_muts = _invert_multimap(mut_to_genes)

In [4]:
# Average all rows that share a (strain, grown_compound) pair.
# numeric_only=True keeps the historical behaviour of ignoring non-numeric
# columns; without it pandas >= 2.0 raises TypeError if any are present.
gwas_df = (
    growth_df
    .groupby(["strain", "grown_compound"])
    .mean(numeric_only=True)
    .reset_index()
)

# Drop rows without a growth rate. The explicit copy makes gwas_df an
# independent frame, so adding columns below cannot trigger
# SettingWithCopyWarning.
gwas_df = gwas_df[gwas_df["rel_growth_rate"].notnull()].copy()

# Only genes listed for more than one strain are used as regressors.
gwas_genes = [gene for gene in gene_list if len(gene_to_strains[gene]) > 1]

# One 0/1 indicator column per gene: 1 when the row's strain is listed for it.
for gene in gwas_genes:
    gwas_df[gene] = gwas_df["strain"].isin(gene_to_strains[gene]).astype("int64")

In [5]:
# Define the regression model
#
# Stan program for a linear regression of the growth rates y on the mutation
# matrix X:
#     y    ~ normal(intercept + X * beta, exp(log_sigma))
#     beta ~ normal(0, exp(log_tau))      (one shared scale for all genes)
# Both standard deviations are parameterised on the log scale.
# The two commented-out statements inside the model block are alternative
# priors on beta (a normal + double-exponential combination, and a pure
# double-exponential); only the normal prior is active.
# NOTE(review): the model block has no sampling statement for intercept,
# log_sigma or log_tau, which in Stan presumably means improper flat priors on
# them -- confirm this is intended.
# NOTE: the string is passed verbatim to PyStan as model_code, so any edit to
# it (including its comments) changes the program that gets compiled.

stan_model = """
data {
    int N; // Number of observations
    int M; // Number of genes
    vector[N] y; // Growth rates
    matrix[N, M] X; // Mutation data
}

parameters {
    vector[M] beta; // coefficients
    real intercept; // intercept of the fit
    real log_sigma; // Measuring STD
    real log_tau; // Hyperparameter: STD of prior on betas
}

model {
    // increment_log_prob(normal_log(beta, 0, exp(log_tau)) + double_exponential_log(beta, 0, exp(log_tau)));
    // beta ~ double_exponential(0, exp(log_tau));
    beta ~ normal(0, exp(log_tau));
    y ~ normal(intercept + X*beta, exp(log_sigma));
}

"""

In [6]:
# Compounds to fit; one independent regression is run per compound.
compound_list = [
    "HMDA", "putrescine", "1,2-propanediol", "2,3-butanediol",
    "glutarate", "adipate", "hexanoate", "octanoate", "isobutyrate", "coumarate", "butanol"]

chains = 4
# Fixed seed so that re-running the notebook reproduces the same posterior draws.
stan_seed = 42

stan_data_list = []

# Compile the Stan program once and reuse it for every compound.
# pystan.stan(model_code=...) would rebuild the identical C++ model on every
# loop iteration. Keeping a reference to the compiled model also keeps it alive
# for the fit objects stored in `fits`.
compiled_model = pystan.StanModel(model_code=stan_model)

fits = {}
for comp in compound_list:
    print(comp)
    compound_df = gwas_df[gwas_df["grown_compound"] == comp]

    y = compound_df["rel_growth_rate"].values
    # Centre each gene indicator column by subtracting its column mean.
    X = compound_df[gwas_genes]
    X = X - X.mean()
    X = X.values

    stan_data = {
        "N": len(y),
        "M": X.shape[1],
        "y": y,
        "X": X
    }
    stan_data_list.append(stan_data)
    fit = compiled_model.sampling(
        data=stan_data, chains=chains, n_jobs=chains, iter=500, seed=stan_seed
    )
    fits[comp] = fit

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_ad6b17b3e0394de93c979fff36836b1b NOW.


HMDA


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_ad6b17b3e0394de93c979fff36836b1b NOW.


putrescine


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_ad6b17b3e0394de93c979fff36836b1b NOW.


1,2-propanediol


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_ad6b17b3e0394de93c979fff36836b1b NOW.


2,3-butanediol


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_ad6b17b3e0394de93c979fff36836b1b NOW.


glutarate


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_ad6b17b3e0394de93c979fff36836b1b NOW.


adipate


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_ad6b17b3e0394de93c979fff36836b1b NOW.


hexanoate


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_ad6b17b3e0394de93c979fff36836b1b NOW.


octanoate


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_ad6b17b3e0394de93c979fff36836b1b NOW.


isobutyrate


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_ad6b17b3e0394de93c979fff36836b1b NOW.


coumarate


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_ad6b17b3e0394de93c979fff36836b1b NOW.


butanol


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  cls(buf, protocol).dump(obj)


In [7]:
def calc_significance(ar, b=0):
    """Two-sided empirical tail probability of ``b`` within the samples ``ar``.

    The percentile rank of ``b`` among ``ar`` is folded onto the smaller tail,
    doubled and expressed as a fraction. The result is never smaller than
    ``1 / len(ar)``, so a value that no sample reaches is reported at that
    floor instead of as zero.

    Parameters
    ----------
    ar : sequence of numbers
        Sample values (e.g. posterior draws of one coefficient).
    b : number, default 0
        Reference value whose position among the samples is assessed.

    Returns
    -------
    float
        A value in ``[1 / len(ar), 1]``.

    Raises
    ------
    ValueError
        If ``ar`` is empty.
    """
    n_samples = len(ar)
    if n_samples == 0:
        raise ValueError("calc_significance requires at least one sample")

    perc = stats.percentileofscore(ar, b)
    tail = min(perc, 100 - perc)
    p_value = tail * 2 / 100

    # The percentile is computed in floating point, so a sample lying entirely
    # on one side of b can give a tail a hair away from 0 (even slightly
    # negative). An exact `== 0` test misses that; max() applies the same floor
    # robustly. Any genuinely non-zero tail is already >= 1 / n_samples.
    return max(p_value, 1 / n_samples)

# Per-compound summaries of the posterior draws of beta:
#   p_values[comp][gene]   -> calc_significance of that gene's draws
#   coef_means[comp][gene] -> mean of that gene's draws
p_values = {}
coef_means = {}
n_genes = len(gwas_genes)
for comp, fit in fits.items():
    samples = fit.extract()
    beta_draws = samples["beta"]

    gene_pvals = pd.Series(
        (calc_significance(beta_draws[:, col]) for col in range(n_genes)),
        index=gwas_genes,
    )
    p_values[comp] = dict(gene_pvals)

    gene_means = pd.Series(beta_draws.mean(0), index=gwas_genes)
    coef_means[comp] = dict(gene_means)

# Genes as rows, compounds as columns, in compound_list order.
p_val_df = pd.DataFrame(p_values)[compound_list]
coef_mean_df = pd.DataFrame(coef_means)[compound_list]

In [8]:
# Free-text description written to the workbook's "info" sheet.
# The p-value wording was corrected: the reported value is one minus the mass
# of the largest central credible interval that does NOT contain 0.
info = """
Gene coefficients estimated using Bayesian regression, with a normal prior (centered on 0) on coefficients.
Posterior 'pvalues' are reported, calculated as 1 - largest central credible interval that does not contain 0.
"""

# The context manager writes and closes the workbook on exit, also when an
# error occurs part-way; ExcelWriter.save() no longer exists in pandas >= 2.0.
with pd.ExcelWriter("../Data/GWAS/Relative_diff_Bayesian_ridge_regression_coefs.xlsx") as writer:
    p_val_df.to_excel(writer, sheet_name="posterior p_vals")
    coef_mean_df.to_excel(writer, sheet_name="coef means")
    pd.DataFrame({"Info": [info]}).to_excel(writer, sheet_name="info")

In [14]:
# Short codes for each compound. Only referenced by the commented-out "found"
# column below; kept so that line can be re-enabled.
comp_conversion = {
    "1,2-propanediol": "12PD", "2,3-butanediol": "23BD", "putrescine": "PUTR", "HMDA": "HMDA", "glutarate": "GLUT",
    "adipate": "ADIP", "hexanoate": "HEXA", "octanoate": "OCTA", "coumarate": "COUM", "isobutyrate": "IBUA",
    "butanol": "BUT"
}

# One sheet per fitted compound, genes sorted by descending posterior mean.
# The context manager writes and closes the workbook on exit;
# ExcelWriter.save() no longer exists in pandas >= 2.0.
with pd.ExcelWriter("../Data/GWAS/Coefficients_and_significance.xlsx") as writer:
    for comp in fits:
        comp_df = pd.DataFrame({"mean": coef_mean_df[comp], "significance": p_val_df[comp]})
        #comp_df["found"] = comp_df.index.map(keio_genes[comp_conversion[comp]].__contains__)
        # Number of strains listed for the gene.
        comp_df["count"] = comp_df.index.map(lambda x: len(gene_to_strains[x]))
        # Comma-separated list of the mutations mapped to the gene.
        comp_df["mutations"] = comp_df.index.map(lambda x: ", ".join(gene_to_muts[x]))
        comp_df = comp_df.sort_values("mean", ascending=False)
        # sheet_name is passed by keyword; positional use is deprecated.
        comp_df.to_excel(writer, sheet_name=comp)