In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import submitit
import sys
from os.path import join

sys.path.append("../../src")
import simulate
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import subprocess
import itertools

/u/project/pasaniuc/kangchen/2021-h2gene/experiments/01-simulate


In [2]:
# Chromosome used throughout this notebook.
CHROM = 1
# Root directory of the shared genotype / LD data.
SUMSTATS_DIR = "/u/project/pasaniuc/pasaniucdata/UKBB_IMPUTED_LD_SUMSTATS"
# PLINK fileset prefix for the chromosome (extensions such as ".fam" are appended by users).
PLINK_PATH = join(SUMSTATS_DIR, "genotype", "raw", f"chr{CHROM}")
# LD directory; the trailing slash is part of the value.
LD_PATH = join(SUMSTATS_DIR, "ld/")

In [3]:
# Simulation grid: one row per parameter setting.
# Two families of settings are stacked: (5 body, 3 TSS) and (10 body, 6 TSS) causal
# SNPs per gene, each crossed with the number of causal genes and the background
# causal probability. The heritability components are the same for every row.
df_params = pd.DataFrame(
    list(
        itertools.chain(
            itertools.product([20, 50, 100], [5], [3], [0.001, 0.01]),
            itertools.product([20, 50, 100], [10], [6], [0.001, 0.01]),
        )
    ),
    columns=[
        "n_causal_gene",
        "n_body_causal_snp",
        "n_tss_causal_snp",
        "prob_background_causal_snp",
    ],
).assign(h2_total=0.05, h2_body=0.03, h2_tss=0.01)

In [4]:
# Show the parameter grid; the row index is the `param_i` used by the functions below.
df_params

Unnamed: 0,n_causal_gene,n_body_causal_snp,n_tss_causal_snp,prob_background_causal_snp,h2_total,h2_body,h2_tss
0,20,5,3,0.001,0.05,0.03,0.01
1,20,5,3,0.01,0.05,0.03,0.01
2,50,5,3,0.001,0.05,0.03,0.01
3,50,5,3,0.01,0.05,0.03,0.01
4,100,5,3,0.001,0.05,0.03,0.01
5,100,5,3,0.01,0.05,0.03,0.01
6,20,10,6,0.001,0.05,0.03,0.01
7,20,10,6,0.01,0.05,0.03,0.01
8,50,10,6,0.001,0.05,0.03,0.01
9,50,10,6,0.01,0.05,0.03,0.01


# Simulate GWAS

In [6]:
def submit_simulate(param_i, root_dir="out/simulated_gwas"):
    """Run the GWAS simulation for one parameter setting and save its arrays.

    Looks up row `param_i` of the notebook-level `df_params`, calls
    `simulate.simulate` on the PLINK fileset at `PLINK_PATH`, and writes the
    `beta`, `pheno` and `beta_hat` entries of the result as `.npy` files.

    Parameters
    ----------
    param_i : int
        Positional row index into `df_params`.
    root_dir : str
        Parent directory; results go to `<root_dir>/param_<param_i>/`.
    """
    # NOTE(review): these imports are kept inside the function, presumably so the
    # submitted job can locate `simulate` on its own `sys.path` — confirm.
    import sys
    from os.path import join

    sys.path.append("../../src")
    import simulate

    # A single row is returned as one Series, so integer columns share a dtype
    # with the float columns; cast every value back explicitly.
    params = df_params.iloc[param_i, :]
    sim = simulate.simulate(
        PLINK_PATH,
        df_gene="data/df_gene.tsv",
        n_causal_gene=int(params.n_causal_gene),
        n_body_causal_snp=int(params.n_body_causal_snp),
        n_tss_causal_snp=int(params.n_tss_causal_snp),
        prob_background_causal_snp=float(params.prob_background_causal_snp),
        h2_total=float(params.h2_total),
        h2_body=float(params.h2_body),
        h2_tss=float(params.h2_tss),
    )
    out_dir = join(root_dir, f"param_{param_i}")

    # exist_ok avoids the check-then-create race between concurrently running jobs.
    os.makedirs(out_dir, exist_ok=True)

    for key in ("beta", "pheno", "beta_hat"):
        np.save(join(out_dir, f"{key}.npy"), sim[key])

In [7]:
# Executor used to submit the simulation jobs.
# NOTE(review): `folder` is presumably where submitit keeps its job logs — confirm.
executor = submitit.SgeExecutor(folder="./submitit-logs")

# Per-job resources plus shell lines run as job setup.
# NOTE(review): `time_min` / `memory_g` look like minutes and gigabytes — confirm
# against the executor's documentation.
executor.update_parameters(
    time_min=60 * 4,
    memory_g=60,
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

# One `submit_simulate(param_i)` call per row of `df_params`.
jobs = executor.map_array(submit_simulate, np.arange(len(df_params)))

# Partition association files

In [6]:
def submit_partition_assoc(param_i, root_dir="out/simulated_gwas"):
    """Split the simulated association statistics of one setting by partition.

    For every simulation (column of `beta_hat.npy`) and every row of
    `data/partition.bed`, writes a gzipped TSV holding the SNP info, the sample
    size `N` and `Z = sqrt(N) * beta_hat` for the SNPs falling in that partition
    (`CHR` equal and `START <= BP < STOP`).

    Parameters
    ----------
    param_i : int
        Index of the parameter setting; selects `<root_dir>/param_<param_i>/`.
    root_dir : str
        Directory that holds the simulated GWAS results.

    Output files are named
    `<root_dir>/param_<param_i>/partitioned_assoc/sim_<sim_i>_par_<par_i>.tsv.gz`.
    """
    # Sample size = number of lines in the PLINK .fam file.
    with open(PLINK_PATH + ".fam") as f:
        n_indiv = sum(1 for _ in f)
    # `sep=r"\s+"` is the documented equivalent of the deprecated
    # `delim_whitespace=True`.
    partition = pd.read_csv("data/partition.bed", sep=r"\s+")
    snp_info = pd.read_csv("data/snp_info.tsv", sep=r"\s+")
    snp_info["N"] = n_indiv

    beta_hat = np.load(join(root_dir, f"param_{param_i}", "beta_hat.npy"))
    n_sim = beta_hat.shape[1]

    out_dir = join(root_dir, f"param_{param_i}", "partitioned_assoc")
    os.makedirs(out_dir, exist_ok=True)

    # Partition membership depends only on SNP positions, not on the simulation,
    # so the row indices are computed once and reused for every simulation.
    snp_chr = snp_info.CHR.values
    snp_bp = snp_info.BP.values
    partition_rows = [
        (
            par_i,
            np.where(
                (par.CHR == snp_chr) & (par.START <= snp_bp) & (snp_bp < par.STOP)
            )[0],
        )
        for par_i, par in partition.iterrows()
    ]

    for sim_i in range(n_sim):
        assoc = snp_info.copy()
        assoc["Z"] = np.sqrt(n_indiv) * beta_hat[:, sim_i]

        for par_i, par_snps in partition_rows:
            filename = join(out_dir, f"sim_{sim_i}_par_{par_i}.tsv.gz")
            assoc.iloc[par_snps].to_csv(
                filename, sep="\t", index=False, float_format="%.6f"
            )

In [7]:
# Executor used to submit the partitioning jobs.
# NOTE(review): `folder` is presumably where submitit keeps its job logs — confirm.
executor = submitit.SgeExecutor(folder="./submitit-logs")

# Per-job resources plus shell lines run as job setup.
# NOTE(review): `time_min` / `memory_g` look like minutes and gigabytes — confirm
# against the executor's documentation.
executor.update_parameters(
    time_min=60,
    memory_g=12,
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

# One `submit_partition_assoc(param_i)` call per row of `df_params`.
jobs = executor.map_array(submit_partition_assoc, np.arange(len(df_params)))

# Estimation

In [4]:
def submit_estimate(param_i, par_i, sim_i=None, n_sim=30, root_dir="out/estimate"):
    """Run `h2gene_cli_v2.R` on the partitioned summary statistics.

    Each simulation is processed by one Rscript invocation whose command line is
    printed before it is executed through the shell.

    Parameters
    ----------
    param_i : int
        Index of the parameter setting.
    par_i : int
        Index of the partition; selects both the LD prefix and the sumstats file.
    sim_i : int or None
        Single simulation to process. When None, simulations `0 .. n_sim - 1`
        are processed in sequence.
    n_sim : int
        Number of simulations to loop over when `sim_i` is None.
    root_dir : str
        Parent directory; results go to
        `<root_dir>/param_<param_i>/sim_<sim_i>_par_<par_i>.rds`.

    Raises
    ------
    subprocess.CalledProcessError
        If an Rscript invocation exits with a non-zero status.
    """
    gene_list = "data/df_gene.tsv"
    ld_prefix = join(LD_PATH, str(CHROM), f"par_{par_i}")
    out_dir = join(root_dir, f"param_{param_i}")

    # Jobs for different partitions / simulations of the same `param_i` share this
    # directory and may start at the same time; a separate exists() check followed
    # by makedirs() can then fail with FileExistsError.
    os.makedirs(out_dir, exist_ok=True)

    sim_list = np.arange(n_sim) if sim_i is None else [sim_i]

    # A distinct loop name keeps the `sim_i` argument from being overwritten.
    for cur_sim in sim_list:
        sumstats = f"out/simulated_gwas/param_{param_i}/partitioned_assoc/sim_{cur_sim}_par_{par_i}.tsv.gz"

        cmd = " ".join(
            [
                "/u/project/pasaniuc/kangchen/software/miniconda3/envs/r/bin/Rscript",
                "/u/project/pasaniuc/kangchen/2021-h2gene/src/h2gene_cli_v2.R",
                f"--ld_prefix {ld_prefix}",
                f"--gene_list {gene_list}",
                f"--sumstats {sumstats}",
                "--min_cor 0.0",
                "--out",
                join(out_dir, f"sim_{cur_sim}_par_{par_i}.rds"),
            ]
        )

        print(cmd)

        subprocess.check_output(cmd, shell=True)


# use GROUP_SIM = True for the first run.
# With GROUP_SIM = True one job handles every simulation of a (param, partition)
# pair; with GROUP_SIM = False one job is submitted per result file that is still
# missing, which makes this cell usable for re-running failed pieces.
GROUP_SIM = False
# Simulations per parameter setting scanned in the GROUP_SIM = False mode. Keep
# this equal to the `n_sim` default of `submit_estimate`, which the grouped mode uses.
N_SIM = 30

executor = submitit.SgeExecutor(folder="./submitit-logs")

# Number of partitions = line count of the partition file minus one
# (the file is read elsewhere in this notebook with a header row).
with open("data/partition.bed") as f:
    n_par = len(f.readlines()) - 1

# The two modes differ only in the requested time: grouped jobs loop over all
# simulations and therefore ask for more.
executor.update_parameters(
    time_min=120 if GROUP_SIM else 30,
    memory_g=12,
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)


def result_path(g):
    """Return the `.rds` path that `submit_estimate` writes for `g = (param_i, par_i, sim_i)`."""
    param_i, par_i, sim_i = g
    return join("out/estimate", f"param_{param_i}", f"sim_{sim_i}_par_{par_i}.rds")


if GROUP_SIM:
    ### first iteration, to calculate for group of results.
    # `n_par` partitions, N_SIM simulations handled inside each job
    iter_param, iter_par = zip(
        *itertools.product(np.arange(len(df_params)), np.arange(n_par))
    )

    jobs = executor.map_array(submit_estimate, iter_param, iter_par)
else:
    # Only (param, partition, simulation) triples without a result file are submitted.
    pending = [
        g
        for g in itertools.product(
            np.arange(len(df_params)), np.arange(n_par), np.arange(N_SIM)
        )
        if not os.path.exists(result_path(g))
    ]

    if pending:
        iter_param, iter_par, iter_sim = zip(*pending)
        jobs = executor.map_array(submit_estimate, iter_param, iter_par, iter_sim)
    else:
        # Unpacking `zip(*[])` into three names raises ValueError, so the
        # "everything is already done" case is handled explicitly.
        print("All estimates already exist; nothing to submit.")
        iter_param, iter_par, iter_sim = (), (), ()
        jobs = []

# Summarize the estimates

In [17]:
def submit_summary(param_i, root_dir="out"):
    """Summarize the estimates of one parameter setting with `sim_summary.R`.

    Builds the Rscript command line, prints it, and runs it through the shell.

    Parameters
    ----------
    param_i : int
        Index of the parameter setting; selects the `param_<param_i>` entries.
    root_dir : str
        Directory under which `simulated_gwas/`, `estimate/` and `summary/` are
        addressed.

    Raises
    ------
    subprocess.CalledProcessError
        If the R script exits with a non-zero status.
    """
    # `join` avoids the doubled slash that plain concatenation produces when
    # LD_PATH already ends with "/".
    ld_dir = join(LD_PATH, str(CHROM))

    cmd = " ".join(
        [
            "/u/project/pasaniuc/kangchen/software/miniconda3/envs/r/bin/Rscript",
            "/u/project/pasaniuc/kangchen/2021-h2gene/src/sim_summary.R",
            f"--sim_dir {root_dir}/simulated_gwas/param_{param_i}",
            f"--ld_dir {ld_dir}",
            # Follows `root_dir` like the other directories instead of a fixed "out".
            f"--estimate_dir {root_dir}/estimate/param_{param_i}",
            "--snp_info data/snp_info.tsv",
            "--partition data/partition.bed",
            "--gene_list data/df_gene.tsv",
            "--PI_prob 0.9",
            f"--out_prefix {root_dir}/summary/param_{param_i}",
        ]
    )

    print(cmd)

    subprocess.check_output(cmd, shell=True)

In [None]:
# Run the summary for parameter setting 1 directly in this process (not as a submitted job).
submit_summary(1)

/u/project/pasaniuc/kangchen/software/miniconda3/envs/r/bin/Rscript /u/project/pasaniuc/kangchen/2021-h2gene/src/sim_summary.R --sim_dir out/simulated_gwas/param_1 --ld_dir /u/project/pasaniuc/pasaniucdata/UKBB_IMPUTED_LD_SUMSTATS/ld//1 --estimate_dir out/estimate/param_1 --snp_info data/snp_info.tsv --partition data/partition.bed --gene_list data/df_gene.tsv --PI_prob 0.9 --out_prefix out/summary/param_1
