In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

import submitit
import sys
from os.path import join

sys.path.append("../../src")
import simulate
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import subprocess
import itertools

/u/project/pasaniuc/kangchen/2021-h2gene/experiments/01_simulate


In [2]:
# Chromosome used for every path built in this notebook.
CHROM = 1

# Root directory holding the genotype and LD resources.
SUMSTATS_DIR = "/u/project/pasaniuc/pasaniucdata/UKBB_IMPUTED_LD_SUMSTATS"

# PLINK file prefix for the chosen chromosome (".fam" etc. are appended later).
PLINK_PATH = os.path.join(SUMSTATS_DIR, "genotype", "raw", f"chr{CHROM}")

# Directory (with trailing separator) under which per-chromosome LD files live.
LD_PATH = os.path.join(SUMSTATS_DIR, "ld", "")

In [7]:
# Simulation grid: two causal-SNP configurations (5 body / 3 TSS and
# 10 body / 6 TSS), each crossed with the number of causal genes and the
# background causal-SNP probability. The heritability settings are constant
# across all rows.
df_params = pd.DataFrame(
    list(itertools.product([20, 50, 100], [5], [3], [0.001, 0.01]))
    + list(itertools.product([20, 50, 100], [10], [6], [0.001, 0.01])),
    columns=[
        "n_causal_gene",
        "n_body_causal_snp",
        "n_tss_causal_snp",
        "prob_background_causal_snp",
    ],
).assign(h2_total=0.05, h2_body=0.03, h2_tss=0.01)

# TEMPORARY: sub-sample (keep only the first two parameter settings)
df_params = df_params.head(2)

# Simulate GWAS

In [8]:
def submit_simulate(param_i, root_dir="out/simulated_gwas"):
    """Run one simulation setting and save its arrays to disk.

    Looks up row ``param_i`` of the notebook-level ``df_params`` table, calls
    ``simulate.simulate`` on the genotypes at ``PLINK_PATH`` with those
    settings, and writes the ``beta``, ``pheno`` and ``beta_hat`` entries of
    the result as ``.npy`` files under ``<root_dir>/param_<param_i>/``.

    Parameters
    ----------
    param_i : int
        Positional row index into ``df_params``.
    root_dir : str
        Directory under which the per-parameter output folder is created.
    """
    # Imports are repeated inside the function so that the body also works
    # when it is executed as a submitted job rather than in the notebook.
    import sys
    from os.path import join

    sys.path.append("../../src")
    import simulate

    params = df_params.iloc[param_i, :]
    # Selecting a row yields a single Series, so the values are cast back to
    # explicit Python int / float before being handed to the simulator.
    sim = simulate.simulate(
        PLINK_PATH,
        df_gene="data/df_gene.tsv",
        n_causal_gene=int(params.n_causal_gene),
        n_body_causal_snp=int(params.n_body_causal_snp),
        n_tss_causal_snp=int(params.n_tss_causal_snp),
        prob_background_causal_snp=float(params.prob_background_causal_snp),
        h2_total=float(params.h2_total),
        h2_body=float(params.h2_body),
        h2_tss=float(params.h2_tss),
    )
    out_dir = join(root_dir, f"param_{param_i}")

    # exist_ok avoids the check-then-create race and makes re-runs idempotent.
    os.makedirs(out_dir, exist_ok=True)

    for key in ("beta", "pheno", "beta_hat"):
        np.save(join(out_dir, f"{key}.npy"), sim[key])

In [9]:
# Array submission of the simulation step: one task per row of df_params.
executor = submitit.SgeExecutor(folder="./submitit-logs")

# Resource requests for each task, plus shell lines that put the project
# miniconda first on PATH and set PYTHONNOUSERSITE.
executor.update_parameters(
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
    memory_g=60,
    time_min=4 * 60,
)

jobs = executor.map_array(submit_simulate, np.arange(df_params.shape[0]))

# Partition association files

In [33]:
def submit_partition_assoc(param_i, root_dir="out/simulated_gwas"):
    """Split the simulated association statistics of one setting by partition.

    Loads ``beta_hat.npy`` from ``<root_dir>/param_<param_i>/``, and for every
    simulation (column of ``beta_hat``) and every row of
    ``data/partition.bed`` writes the SNPs of ``data/snp_info.tsv`` whose
    ``CHR`` matches and whose ``BP`` lies in ``[START, STOP)`` to
    ``<root_dir>/param_<param_i>/partitioned_assoc/sim_<sim_i>_par_<par_i>.tsv.gz``.
    Each output carries the SNP info columns plus ``N`` (number of lines in
    the PLINK ``.fam`` file) and ``Z = sqrt(N) * beta_hat``.

    Parameters
    ----------
    param_i : int
        Index of the parameter setting whose results are partitioned.
    root_dir : str
        Directory containing the ``param_<param_i>`` folders.
    """
    # Count individuals without materialising the whole file in memory.
    with open(PLINK_PATH + ".fam") as f:
        n_indiv = sum(1 for _ in f)

    # sep=r"\s+" is the non-deprecated spelling of delim_whitespace=True.
    partition = pd.read_csv("data/partition.bed", sep=r"\s+")
    snp_info = pd.read_csv("data/snp_info.tsv", sep=r"\s+")
    snp_info["N"] = n_indiv

    beta_hat = np.load(join(root_dir, f"param_{param_i}", "beta_hat.npy"))
    n_sim = beta_hat.shape[1]

    out_dir = join(root_dir, f"param_{param_i}", "partitioned_assoc")
    os.makedirs(out_dir, exist_ok=True)

    # Partition membership depends only on SNP positions, not on the
    # simulation, so the row indices are computed once and reused.
    snp_chr = snp_info.CHR.values
    snp_bp = snp_info.BP.values
    partition_snps = [
        (
            par_i,
            np.where(
                (par.CHR == snp_chr) & (par.START <= snp_bp) & (snp_bp < par.STOP)
            )[0],
        )
        for par_i, par in partition.iterrows()
    ]

    for sim_i in range(n_sim):
        assoc = snp_info.copy()
        assoc["Z"] = np.sqrt(n_indiv) * beta_hat[:, sim_i]

        for par_i, par_snps in partition_snps:
            filename = join(out_dir, f"sim_{sim_i}_par_{par_i}.tsv.gz")
            assoc.iloc[par_snps].to_csv(
                filename, sep="\t", index=False, float_format="%.6f"
            )

In [None]:
# Array submission of the partitioning step: one task per row of df_params.
executor = submitit.SgeExecutor(folder="./submitit-logs")

# Resource requests for each task, plus shell lines that put the project
# miniconda first on PATH and set PYTHONNOUSERSITE.
executor.update_parameters(
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
    memory_g=12,
    time_min=60,
)

jobs = executor.map_array(submit_partition_assoc, np.arange(df_params.shape[0]))

# Estimation

In [49]:
def submit_estimate(param_i, par_i, sim_i, root_dir="out/estimate"):
    """Run the h2gene R command-line script on one partitioned summary file.

    The script is given the LD prefix ``<LD_PATH>/<CHROM>/par_<par_i>``, the
    gene list ``data/df_gene.tsv`` and the summary statistics written for
    ``(param_i, sim_i, par_i)`` under ``out/simulated_gwas``; its result is
    written to ``<root_dir>/param_<param_i>/sim_<sim_i>_par_<par_i>.rds``.

    Parameters
    ----------
    param_i : int
        Index of the simulation parameter setting.
    par_i : int
        Index of the partition.
    sim_i : int
        Index of the simulation replicate.
    root_dir : str
        Directory under which the per-parameter output folder is created.

    Raises
    ------
    subprocess.CalledProcessError
        If the R script exits with a non-zero status.
    """
    gene_list = "data/df_gene.tsv"
    ld_prefix = join(LD_PATH, str(CHROM), f"par_{par_i}")
    sumstats = (
        f"out/simulated_gwas/param_{param_i}/partitioned_assoc/"
        f"sim_{sim_i}_par_{par_i}.tsv.gz"
    )

    # The output folder is keyed on the parameter setting (param_i), matching
    # the input layout. Keying it on par_i made different parameter settings
    # with the same (par_i, sim_i) overwrite each other's results.
    out_dir = join(root_dir, f"param_{param_i}")
    # Many tasks share this folder, so creation must tolerate it already
    # existing instead of checking first and then creating.
    os.makedirs(out_dir, exist_ok=True)

    # An argument list (no shell) keeps paths intact even if they contain
    # spaces or shell metacharacters.
    cmd = [
        "/u/project/pasaniuc/kangchen/software/miniconda3/envs/r/bin/Rscript",
        "/u/project/pasaniuc/kangchen/2021-h2gene/src/h2gene_cli.R",
        "--ld_prefix",
        ld_prefix,
        "--gene_list",
        gene_list,
        "--sumstats",
        sumstats,
        "--out",
        join(out_dir, f"sim_{sim_i}_par_{par_i}.rds"),
    ]

    subprocess.check_output(cmd)

In [None]:
# Executor for the estimation step; jobs are NOT submitted by this cell (the
# submission line at the bottom is commented out).
executor = submitit.SgeExecutor(folder="./submitit-logs")

executor.update_parameters(
    time_min=20,
    memory_g=8,
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

# Flatten the full (parameter, partition, simulation) grid into three parallel
# tuples, one element per job.
# NOTE(review): 100 and 30 are presumably the number of partitions and the
# number of simulation replicates — confirm against data/partition.bed and
# beta_hat.shape[1] rather than hard-coding.
iter_param, iter_par, iter_sim = list(
    zip(*itertools.product(np.arange(len(df_params)), np.arange(100), np.arange(30)))
)

# NOTE(review): submission is disabled. The previously commented-out call
# targeted submit_partition_assoc; for this section it presumably should be
# submit_estimate over the three index tuples — confirm before enabling.
# jobs = executor.map_array(submit_estimate, iter_param, iter_par, iter_sim)

In [50]:
# Run a single estimation directly in the notebook kernel (param 0,
# partition 0, simulation 0) rather than through the executor.
submit_estimate(0, 0, 0)

/u/project/pasaniuc/kangchen/software/miniconda3/envs/r/bin/Rscript /u/project/pasaniuc/kangchen/2021-h2gene/src/h2gene_cli.R --ld_prefix /u/project/pasaniuc/pasaniucdata/UKBB_IMPUTED_LD_SUMSTATS/ld/1/par_0 --gene_list data/df_gene.tsv --sumstats out/simulated_gwas/param_0/partitioned_assoc/sim_0_par_0.tsv.gz --out out/estimate/param_0/sim_0_par_0.rds
