In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

In [2]:
from os.path import join
import numpy as np
import pandas as pd
import admix
import matplotlib.pyplot as plt
from os.path import join
import submitit
import dapgen
from tqdm import tqdm
import dask.array as da
import os
import itertools
import admix_prs
import statsmodels.api as sm

In [3]:
# Input locations, all relative to this notebook's working directory.
# Directory of per-chromosome PLINK filesets; the GWAS step below uses
# `<PLINK_DIR>/chr<N>` as the bfile prefix for N = 1..22.
PLINK_DIR = "../00-compile-data/out/PLINK/all"
# Directory holding `<trait>.raw.pheno` (one file per trait) and the shared
# `all.covar` covariate table; both are read as tab-separated files.
PHENO_DIR = "../00-compile-data/out/REAL-PHENO/"
# Individual lists: tab-separated, no header row; the first two columns are
# joined as "<col0>_<col1>" to match the "FID_IID" index of the phenotype table.
TRAIN_INDIV_PATH = "../00-compile-data/out/INDIVLIST/eur_train.indiv"
VAL_INDIV_PATH = "../00-compile-data/out/INDIVLIST/eur_val.indiv"

In [4]:
def prepare_ldpred2(trait):
    """Prepare LDpred2 inputs for one trait.

    Runs a per-chromosome GWAS on the training individuals and writes a
    covariate-residualized phenotype for the validation individuals.

    Parameters
    ----------
    trait : str
        Trait name. The phenotype is read from
        ``{PHENO_DIR}/{trait}.raw.pheno`` (tab-separated; must contain the
        columns ``FID``, ``IID`` and ``PHENO``) and all outputs are written
        under ``out/LDPRED2/{trait}``.

    Returns
    -------
    None
        The function is used for its side effects. It writes, under
        ``out/LDPRED2/{trait}``:

        * ``assoc.gz`` -- association results of chromosomes 1-22, concatenated.
        * ``assoc.ldpred2.tsv.gz`` -- the output of
          ``admix_prs.plink2_assoc_to_ldpred2`` applied to ``assoc.gz``.
        * ``eur_val.pheno.tsv.gz`` -- ``FID``, ``IID`` and ``PHENO-RESIDUAL``
          for the validation individuals.

    Notes
    -----
    Covariates are ``AGE``, ``SEX`` and ``PC1``..``PC10`` taken from
    ``{PHENO_DIR}/all.covar``. Phenotype and covariates are combined with an
    inner join on the ``"FID_IID"`` index.

    NOTE(review): individuals listed in the train/validation ``.indiv`` files
    but absent from the merged table become all-NaN rows after ``reindex``
    (``FID`` and ``IID`` included), and for the validation set these rows are
    written to ``eur_val.pheno.tsv.gz`` as-is. Confirm that downstream
    consumers expect this.
    """
    out_dir = f"out/LDPRED2/{trait}"
    # exist_ok=True makes this idempotent and avoids a check-then-create race.
    os.makedirs(out_dir, exist_ok=True)

    # Phenotype table, indexed by "FID_IID".
    df_pheno = pd.read_csv(f"{PHENO_DIR}/{trait}.raw.pheno", sep="\t").astype(
        {"FID": str, "IID": str}
    )
    df_pheno.index = df_pheno.FID + "_" + df_pheno.IID

    # Covariate table, indexed the same way so the two can be joined on index.
    df_covar = pd.read_csv(f"{PHENO_DIR}/all.covar", sep="\t").astype(
        {"FID": str, "IID": str}
    )
    df_covar.index = df_covar.FID + "_" + df_covar.IID
    covar_cols = ["AGE", "SEX"] + [f"PC{i}" for i in range(1, 11)]

    df_pheno = pd.merge(
        df_pheno, df_covar[covar_cols], left_index=True, right_index=True
    )

    # Load the "FID_IID" identifiers of the training and validation individuals.
    dict_indiv = {}
    for group, path in zip(["train", "validate"], [TRAIN_INDIV_PATH, VAL_INDIV_PATH]):
        df_tmp = pd.read_csv(path, sep="\t", header=None).astype(str)
        dict_indiv[group] = (df_tmp.iloc[:, 0] + "_" + df_tmp.iloc[:, 1]).values

    # Training set: covariates are not regressed out here; they are handed to
    # the GWAS below through `covar_cols`.
    df_train_pheno = df_pheno.reindex(dict_indiv["train"])

    # Validation set: regress the phenotype on the covariates (plus an
    # intercept) and keep the residuals. Rows with missing values are dropped
    # from the fit, so the residuals are re-aligned to the full validation
    # index and those rows receive NaN.
    df_validate_pheno = df_pheno.reindex(dict_indiv["validate"])
    resids = (
        sm.OLS(
            df_validate_pheno["PHENO"],
            sm.add_constant(df_validate_pheno[covar_cols]),
            missing="drop",
        )
        .fit()
        .resid
    )
    df_validate_pheno["PHENO-RESIDUAL"] = resids.reindex(df_validate_pheno.index)

    # Per-chromosome GWAS on the training set; each chromosome's `.assoc` file
    # is loaded and then deleted so only the combined table remains on disk.
    assoc_frames = []
    for chrom in range(1, 23):
        out_prefix = f"{out_dir}/chr{chrom}"
        admix.tools.plink2.gwas(
            bfile=join(PLINK_DIR, f"chr{chrom}"),
            df_sample_info=df_train_pheno,
            pheno_col="PHENO",
            covar_cols=covar_cols,
            out_prefix=out_prefix,
            # Temporary files are kept for chromosome 1 only
            # (presumably for inspection/debugging -- TODO confirm).
            clean_tmp_file=chrom != 1,
            pheno_quantile_normalize=True,
        )
        assoc_path = out_prefix + ".assoc"
        # sep=r"\s+" is the documented equivalent of the deprecated
        # delim_whitespace=True.
        assoc_frames.append(pd.read_csv(assoc_path, sep=r"\s+"))
        os.remove(assoc_path)
    df_assoc = pd.concat(assoc_frames)
    df_assoc.to_csv(f"{out_dir}/assoc.gz", sep="\t", index=False)

    # Convert the combined association table to the LDpred2 input format.
    admix_prs.plink2_assoc_to_ldpred2(f"{out_dir}/assoc.gz").to_csv(
        f"{out_dir}/assoc.ldpred2.tsv.gz", index=False, sep="\t"
    )

    # Write the validation phenotype (residuals only) to its own file.
    df_validate_pheno[["FID", "IID", "PHENO-RESIDUAL"]].to_csv(
        f"{out_dir}/eur_val.pheno.tsv.gz", index=False, sep="\t"
    )

In [5]:
# Submit one cluster job per trait; job logs are written to ./submitit-logs.
# NOTE(review): running this cell submits jobs immediately, so a
# "Restart & Run All" will resubmit every trait.
executor = submitit.SgeExecutor(folder="./submitit-logs")

executor.update_parameters(
    # presumably a wall-time limit in minutes (240 min = 4 h) -- TODO confirm
    time_min=240,
    # presumably the memory request in GB -- TODO confirm
    memory_g=30,
    # Shell lines, presumably executed in the job script before the Python
    # payload: put the project's miniconda first on PATH and disable the user
    # site-packages directory.
    # NOTE(review): the conda path is relative to the submitting user's home.
    setup=[
        "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
        "export PYTHONNOUSERSITE=True",
    ],
)
trait_list = ["height", "bmi", "cholesterol", "hdl_cholesterol", "ldl_direct"]
# One prepare_ldpred2(trait) call per entry of trait_list; `jobs` holds the
# returned job handles (nothing here waits for them to finish).
jobs = executor.map_array(prepare_ldpred2, trait_list)