# Phenotype simulation from real genotypes

In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

In [2]:
from os.path import join
import numpy as np
import pandas as pd
import admix
import matplotlib.pyplot as plt
from os.path import join
import submitit
import dapgen
from tqdm import tqdm
import dask.array as da
import os
import itertools
import utils
import admix_prs

In [3]:
# Directory holding one PLINK fileset per chromosome (chr1 ... chr22).
PLINK_DIR = "../00-compile-data/out/PLINK/all"
# Replicate count; forwarded as `n_sim` to utils.simulate_quant_pheno.
N_SIM = 100
# Individual lists used to split phenotypes into training / validation sets.
TRAIN_INDIV_PATH = "../00-compile-data/out/INDIVLIST/eur_train.indiv"
VAL_INDIV_PATH = "../00-compile-data/out/INDIVLIST/eur_val.indiv"
# PLINK prefixes for chromosomes 1-22, in chromosome order.
bfile_list = [join(PLINK_DIR, "chr" + str(chrom_idx)) for chrom_idx in range(1, 23)]

In [4]:
# Full factorial grid of simulation settings: the Cartesian product of the
# three value lists below, one row per combination.
df_params = pd.DataFrame(
    list(
        itertools.product(
            [0.05, 0.25],
            [0.001, 0.01],
            ["mafukb", "uniform", "gcta"],
        )
    ),
    columns=["hsq", "pcausal", "hermodel"],
)

# Human-readable label per setting; later used to name the output directories.
df_params["name"] = [
    f"hsq-{hsq_val}-pcausal-{pcausal_val}-hermodel-{model}"
    for hsq_val, pcausal_val, model in zip(
        df_params["hsq"], df_params["pcausal"], df_params["hermodel"]
    )
]

# Simulate phenotypes

In [78]:
# Executor used to submit the phenotype-simulation jobs; its working folder
# is ./submitit-logs. `submitit` is already imported in the notebook's import
# cell, so it is not re-imported here.
executor = submitit.SgeExecutor(folder="./submitit-logs")

executor.update_parameters(
    # NOTE(review): presumably a 600-minute time limit and a 32 GB memory
    # request per job -- confirm against the SgeExecutor implementation in use.
    time_min=600,
    memory_g=32,
    # Shell lines handed to the executor as job setup: put the miniconda3 bin
    # directory first on PATH and set PYTHONNOUSERSITE.
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)


def simulate_pheno(hsq, causal_prop, hermodel, name):
    """Simulate quantitative phenotypes for one parameter setting.

    Creates ``out/PHENO/{name}`` if needed and calls
    ``utils.simulate_quant_pheno`` on the notebook-level ``bfile_list`` with
    ``N_SIM`` replicates, writing results under the prefix
    ``out/PHENO/{name}/sim``.

    Parameters
    ----------
    hsq
        Forwarded as ``hsq`` to ``utils.simulate_quant_pheno``.
    causal_prop
        Forwarded as ``causal_prop`` to ``utils.simulate_quant_pheno``.
    hermodel
        Forwarded as ``hermodel`` to ``utils.simulate_quant_pheno``.
    name : str
        Label of the parameter setting; determines the output directory.
    """
    out_dir = f"out/PHENO/{name}"
    # exist_ok=True replaces the exists()-then-makedirs() pattern, which can
    # raise FileExistsError when the directory appears between the two calls.
    os.makedirs(out_dir, exist_ok=True)

    # NOTE(review): the same fixed seed is set for every parameter setting;
    # whether the simulation draws from NumPy's global RNG is not visible here.
    np.random.seed(42)
    utils.simulate_quant_pheno(
        bfile_list=bfile_list,
        hsq=hsq,
        causal_prop=causal_prop,
        out_prefix=out_dir + "/sim",
        hermodel=hermodel,
        n_sim=N_SIM,
    )


# Submit simulate_pheno over the parameter grid. The four df_params columns are
# passed positionally in the order of the function's parameters
# (hsq, causal_prop, hermodel, name).
jobs = executor.map_array(
    simulate_pheno,
    *(df_params[col] for col in ("hsq", "pcausal", "hermodel", "name")),
)

# Simulate GWAS

In [6]:
def prepare_ldpred2(name, sim_i):
    """Run the training-set GWAS for one simulated phenotype and write LDpred2 inputs.

    Reads ``out/PHENO/{name}/sim.pheno.tsv.gz``, runs
    ``admix.tools.plink2.gwas`` on chromosomes 1-22 for column ``SIM_{sim_i}``
    using the individuals listed in ``TRAIN_INDIV_PATH``, and writes the
    following files into ``out/LDPRED2/{name}``:

    - ``sim_{sim_i}.assoc.gz``: concatenated per-chromosome association results
    - ``sim_{sim_i}.assoc.ldpred2.tsv.gz``: output of
      ``admix_prs.plink2_assoc_to_ldpred2`` applied to the file above
    - ``sim_{sim_i}.eur_val.pheno.tsv.gz``: ``FID``, ``IID`` and the phenotype
      column for the individuals listed in ``VAL_INDIV_PATH``

    Parameters
    ----------
    name : str
        Label of the parameter setting; selects the input and output
        sub-directories.
    sim_i : int
        Index of the simulation replicate (phenotype column ``SIM_{sim_i}``).
    """
    pheno_dir = f"out/PHENO/{name}"
    out_dir = f"out/LDPRED2/{name}"

    # Several replicates of the same setting share this directory and may run
    # concurrently; exist_ok=True avoids the check-then-create race that would
    # raise FileExistsError in one of them.
    os.makedirs(out_dir, exist_ok=True)

    df_pheno = pd.read_csv(f"{pheno_dir}/sim.pheno.tsv.gz", sep="\t", index_col=0)

    # Build the individual identifiers as "<col0>_<col1>" from the first two
    # columns of each list file (presumably FID and IID -- confirm).
    dict_indiv = {}
    for group, path in zip(["train", "validate"], [TRAIN_INDIV_PATH, VAL_INDIV_PATH]):
        df_tmp = pd.read_csv(path, sep="\t", header=None).astype(str)
        dict_indiv[group] = (df_tmp.iloc[:, 0] + "_" + df_tmp.iloc[:, 1]).values

    # .loc with a list of labels raises KeyError if any individual is missing
    # from the phenotype table.
    df_train_pheno = df_pheno.loc[dict_indiv["train"]]
    df_validate_pheno = df_pheno.loc[dict_indiv["validate"]]

    # GWAS on the training individuals, one chromosome at a time; the
    # per-chromosome .assoc file is removed once it has been loaded.
    assoc_per_chrom = []
    for chrom in range(1, 23):
        out_prefix = f"{out_dir}/sim_{sim_i}.chr{chrom}"
        admix.tools.plink2.gwas(
            bfile=join(PLINK_DIR, f"chr{chrom}"),
            df_sample_info=df_train_pheno,
            pheno_col=f"SIM_{sim_i}",
            out_prefix=out_prefix,
            clean_tmp_file=True,
        )
        # sep=r"\s+" is the documented equivalent of the deprecated
        # delim_whitespace=True.
        assoc_per_chrom.append(pd.read_csv(out_prefix + ".assoc", sep=r"\s+"))
        os.remove(out_prefix + ".assoc")
    df_assoc = pd.concat(assoc_per_chrom)
    df_assoc.to_csv(f"{out_dir}/sim_{sim_i}.assoc.gz", sep="\t", index=False)

    admix_prs.plink2_assoc_to_ldpred2(f"{out_dir}/sim_{sim_i}.assoc.gz").to_csv(
        f"{out_dir}/sim_{sim_i}.assoc.ldpred2.tsv.gz", index=False, sep="\t"
    )

    # Separate out the validation phenotype
    df_validate_pheno[["FID", "IID", f"SIM_{sim_i}"]].to_csv(
        f"{out_dir}/sim_{sim_i}.eur_val.pheno.tsv.gz", index=False, sep="\t"
    )

In [74]:
# One row per (parameter setting, replicate index). The replicate count comes
# from N_SIM rather than a literal so it stays in sync with the number of
# replicates requested from the phenotype simulation.
df_ldpred2_params = df_params.merge(
    pd.DataFrame({"sim_i": np.arange(N_SIM)}), how="cross"
)

In [7]:
# Executor used to submit the GWAS / LDpred2-preparation jobs; its working
# folder is ./submitit-logs. `submitit` is already imported in the notebook's
# import cell, so it is not re-imported here.
executor = submitit.SgeExecutor(folder="./submitit-logs")

executor.update_parameters(
    # NOTE(review): presumably a 120-minute time limit and a 30 GB memory
    # request per job -- confirm against the SgeExecutor implementation in use.
    time_min=120,
    memory_g=30,
    # Shell lines handed to the executor as job setup: put the miniconda3 bin
    # directory first on PATH and set PYTHONNOUSERSITE.
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)

# NOTE(review): only a single setting and a single replicate (sim_i = 9) are
# submitted here, and df_ldpred2_params is never used. If the full grid is
# intended, the call would presumably be
# executor.map_array(prepare_ldpred2, df_ldpred2_params.name, df_ldpred2_params.sim_i)
# -- confirm before changing, since that submits one task per grid row.
name = "hsq-0.25-pcausal-0.01-hermodel-gcta"
jobs = executor.map_array(prepare_ldpred2, [name], np.arange(9, 10))