# Train PRS on European population and apply to admixed population

1. Divide into training / validation / testing for European
    - prs/pheno/eur_{train, val, test}.indiv
    - Admixed testing population from plink/admix.merged.fam
2. Extract phenotypes
    - Raw phenotypes: prs/pheno/{group}.{trait}.pheno 
    - Raw covariates: prs/covar/{group}.covar
    - Regressed phenotypes: prs/pheno/{group}.{trait}.residual_pheno
    - Regression model (train on European and apply to other populations): prs/pheno/eur_train.{trait}.model
3. Perform GWAS
    - Simple PLINK GWAS
4. Apply PRS uncertainty

# Divide into training / validation / testing for European

In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

In [2]:
import admix
import numpy as np
import pandas as pd
from os.path import join
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [3]:
import os

import numpy as np
import pandas as pd


def _read_ukb_fields(csv_path, field_to_name):
    """Load selected columns of a UKB CSV export, indexed by string ID.

    Only the columns named by the keys of ``field_to_name`` are read; they
    are then renamed to the corresponding values.  ``field_to_name`` must map
    one source column to ``"ID"``; that column is cast to ``str`` and becomes
    the index of the returned frame.
    """
    table = pd.read_csv(
        csv_path,
        usecols=field_to_name.keys(),
        encoding="unicode_escape",
    ).rename(columns=field_to_name)
    return table.astype({"ID": str}).set_index("ID")


# Covariates: source column name -> short name used in the rest of the notebook.
col_dict = {
    "eid": "ID",
    "21003-0.0": "AGE",
    "31-0.0": "SEX",
}
# Columns 22009-0.1 ... 22009-0.40 become PC1 ... PC40.
for i in range(1, 41):
    col_dict[f"22009-0.{i}"] = f"PC{i}"

pheno1 = _read_ukb_fields("/u/project/sriram/ukbiobank/33127/ukb21970.csv", col_dict)

# extract phenotype from another file (because in another UKB application)
col_dict = {
    "eid": "ID",
    "31-0.0": "SEX",
    "21000-0.0": "SIRE",
    "50-0.0": "height",
    "23104-0.0": "bmi",
    "30897-0.0": "dilution_factor",
    "30690-0.0": "cholesterol",
    "30780-0.0": "ldl_direct",
    "30760-0.0": "hdl_cholesterol",
}

pheno2 = _read_ukb_fields(
    "/u/project/sriram/ukbiobank/33127/ukb39967.enc_ukb.converted2.csv", col_dict
)

# Join the two tables on ID (pd.merge defaults to an inner join, so only
# individuals present in both files are kept).  SEX is already provided by
# pheno1, so it is dropped from pheno2 to avoid a duplicated column.
df_all = pd.merge(pheno1, pheno2.drop("SEX", axis=1), left_index=True, right_index=True)

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
all_pheno_path = "out/REAL-PHENO/all-pheno.csv"
os.makedirs(os.path.dirname(all_pheno_path), exist_ok=True)
df_all.to_csv(all_pheno_path)

In [6]:
# Directory that holds the merged phenotype table and the derived files.
OUT_DIR = "out/REAL-PHENO"

# Phenotype columns that are each exported to their own file.
trait_list = [
    "height",
    "bmi",
    "cholesterol",
    "hdl_cholesterol",
    "ldl_direct",
]

In [10]:
import pandas as pd

# Reload the merged phenotype table and keep individual IDs as strings.
df_pheno = pd.read_csv("out/REAL-PHENO/all-pheno.csv")
df_pheno = df_pheno.astype({"ID": str})

# Covariate file: the ID is written twice (as FID and IID), followed by
# SEX, AGE and the first 20 principal components.
covar_cols = ["SEX", "AGE", *(f"PC{i}" for i in range(1, 21))]
df_covar = df_pheno[["ID", "ID", *covar_cols]]
df_covar.columns = ["FID", "IID", *covar_cols]
covar_path = join(OUT_DIR, "all.covar")
df_covar.to_csv(covar_path, sep="\t", index=False, na_rep="NA")

# One tab-separated FID / IID / PHENO file per trait; missing values are
# written as "NA".
for trait in trait_list:
    df_trait = pd.DataFrame(
        {
            "FID": df_pheno["ID"],
            "IID": df_pheno["ID"],
            "PHENO": df_pheno[trait],
        }
    )
    pheno_path = join(OUT_DIR, f"{trait}.raw.pheno")
    df_trait.to_csv(pheno_path, sep="\t", index=False, na_rep="NA")