# Perform prediction for all the individuals

In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# local packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Auto-format every executed cell with black (JupyterLab extension from nb_black).
%load_ext lab_black

In [2]:
import dapgen
from os.path import join
import pandas as pd
import numpy as np
import glob
import submitit
import subprocess

In [3]:
# Root of the shared UKBB PRS research data (absolute, cluster-specific path).
DATA_DIR = "/u/project/sgss/UKBB/PRS-RESEARCH/DATA"
# Genotype location handed to `dapgen score --plink`.
PLINK_DIR = join(DATA_DIR, "PLINK")
# Phenotype directory; not referenced by the cells visible in this notebook section.
PHENO_DIR = join(DATA_DIR, "REAL-PHENO")
# Per-trait PRS weights: <WEIGHTS_DIR>/<trait>/<trait>.weight.tsv.gz
# (relative to the notebook's working directory).
WEIGHTS_DIR = "out/PRS-WEIGHTS/"
# Destination for the per-trait score files and their summaries
# (relative to the notebook's working directory).
SCORE_DIR = "out/PRS-SCORE/"

In [4]:
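# NOTE: superseded in-process implementation (calls `dapgen.score` directly),
# kept for reference only. The active `submit_summarize` is defined in the next
# cell and runs the `dapgen score` command-line tool instead.
# def submit_summarize(trait):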

#     weights_path = join(WEIGHTS_DIR, trait, f"{trait}.weight.tsv.gz")
#     df_weights = pd.read_csv(weights_path, sep="\t").rename(
#         columns={"CHR": "CHROM", "A1": "ALT", "A2": "REF"}
#     )
#     weight_cols = [col for col in df_weights.columns if col.startswith("SAMPLE")]
#     df_score, df_snp = dapgen.score(
#         plink_path=PLINK_DIR, df_weight=df_weights, weight_cols=weight_cols, memory=50
#     )
#     df_summary = pd.DataFrame(
#         {"MEAN": df_score.mean(axis=1), "SD": df_score.std(axis=1)}
#     )
#     q_list = np.linspace(0.05, 0.95, 19)
#     df_quantile = df_score.quantile(q=q_list, axis=1).T
#     df_quantile.columns = [f"QUANTILE_{int(q * 100)}" for q in q_list]
#     df_summary = pd.merge(df_summary, df_quantile, left_index=True, right_index=True)
#     df_summary.to_csv(f"out/PREDICTION/{trait}.tsv.gz", sep="\t", float_format="%.5f")

In [5]:
def submit_summarize(trait):
    """Score one trait with the ``dapgen score`` CLI and write a summary table.

    The function

    1. runs ``dapgen score`` on ``WEIGHTS_DIR/<trait>/<trait>.weight.tsv.gz``
       against ``PLINK_DIR`` with output path ``SCORE_DIR/<trait>.score.tsv``;
    2. reads back ``SCORE_DIR/<trait>.score.tsv.gz`` and, for every row,
       computes the mean, the standard deviation and the 5%..95% quantiles
       (19 levels in 5% steps) across all score columns;
    3. writes the result to ``SCORE_DIR/<trait>.score_summary.tsv.gz``.

    Rows are presumably individuals and columns presumably one score per
    weight sample -- TODO confirm against the ``dapgen score`` output format.

    Parameters
    ----------
    trait : str
        Trait name; must match a sub-directory of ``WEIGHTS_DIR``.

    Raises
    ------
    subprocess.CalledProcessError
        If ``dapgen score`` exits with a non-zero status.
    """
    import os  # local import: only `os.path.join` is imported at file level

    weights_path = join(WEIGHTS_DIR, trait, f"{trait}.weight.tsv.gz")
    score_path = join(SCORE_DIR, f"{trait}.score.tsv")

    # Make sure the output directory exists; `exist_ok` keeps this safe when
    # several array tasks start at the same time.
    os.makedirs(SCORE_DIR, exist_ok=True)

    # Pass the command as an argument list (no shell), so a trait name or path
    # containing spaces or shell metacharacters cannot be mis-parsed.
    cmd = ["dapgen", "score"]
    cmd += ["--plink", PLINK_DIR]
    cmd += ["--weights", weights_path]
    cmd += ["--out", score_path]
    cmd += ["--chrom-col", "CHR", "--alt-col", "A1", "--ref-col", "A2"]
    cmd += ["--center", "True"]
    cmd += ["--memory", "40"]
    subprocess.check_call(cmd)

    # NOTE(review): `--out` is given without ".gz" while the file read back
    # carries it -- presumably dapgen gzips/appends the suffix; confirm.
    df_score = pd.read_csv(score_path + ".gz", sep="\t", index_col=0)

    df_summary = pd.DataFrame(
        {"MEAN": df_score.mean(axis=1), "SD": df_score.std(axis=1)}
    )

    q_list = np.linspace(0.05, 0.95, 19)
    df_quantile = df_score.quantile(q=q_list, axis=1).T
    # Round before converting: plain int() truncates, so a float such as
    # 28.999999999999996 would be labelled 28 instead of 29.
    df_quantile.columns = [f"QUANTILE_{int(round(q * 100))}" for q in q_list]

    df_summary = pd.merge(df_summary, df_quantile, left_index=True, right_index=True)
    df_summary.to_csv(
        join(SCORE_DIR, f"{trait}.score_summary.tsv.gz"),
        sep="\t",
        float_format="%.6f",
    )

In [6]:
# Every entry directly under WEIGHTS_DIR is treated as one trait; its final
# path component is the trait name. The pattern is built from WEIGHTS_DIR
# rather than repeating the literal path, so the two cannot drift apart.
trait_list = [p.split("/")[-1] for p in glob.glob(join(WEIGHTS_DIR, "*"))]
trait_list

['height', 'bmi', 'cholesterol', 'hdl_cholesterol', 'ldl_direct']

In [7]:
# Shell lines passed as `setup`; presumably executed in each job script before
# the Python worker starts (TODO confirm for the SGE executor).
job_setup = [
    "export PATH=~/project-pasaniuc/software/miniconda3/bin:$PATH",
    "export PYTHONNOUSERSITE=True",
]

# Per-job resource request and queue for the executor.
sge_resources = dict(
    time_min=240,
    memory_g=60,
    queue="highp",
    setup=job_setup,
)

executor = submitit.SgeExecutor(folder="./submitit-logs")
executor.update_parameters(**sge_resources)

# Submit `submit_summarize` once per entry of `trait_list` as a job array.
jobs = executor.map_array(submit_summarize, trait_list)