# Preprocessing and phenotype preparation

- `qsub 01_plink.sh`: basic filtering
- `qsub 01_plink_pca.sh`: perform in-sample PCA

In [1]:
import numpy as np
from os.path import join
import xarray as xr
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.preprocessing import quantile_transform

# Extract phenotypes

In [3]:
path_pheno = "00_data/trait_pheno.csv"
out_dir = "01_plink"

# Phenotype table; the first CSV column becomes the index and is reused
# below as both the FID and IID identifiers.
pheno = pd.read_csv(path_pheno, index_col=0)

# Write one tab-separated phenotype file (FID, IID, trait) per trait.
for col_trait in ("cholesterol", "ldl_direct"):
    plink_pheno = pheno[[col_trait]].copy()

    # Quantile normalization: map the trait onto a normal distribution,
    # using as many quantiles as there are rows in the phenotype table.
    plink_pheno[col_trait] = quantile_transform(
        plink_pheno[[col_trait]],
        output_distribution="normal",
        n_quantiles=pheno.shape[0],
    )

    # Both identifier columns are copies of the index.
    plink_pheno["FID"] = plink_pheno.index
    plink_pheno["IID"] = plink_pheno.index

    # Missing values are written as the literal string NA.
    plink_pheno[["FID", "IID", col_trait]].to_csv(
        join(out_dir, f"{col_trait}.pheno"),
        index=False,
        sep='\t',
        na_rep='NA',
        float_format="%.8f",
    )
    
# extract covariates
col_covar = ["AGE", "SEX", "dilution_factor"]
# Names assigned to the 20 principal-component columns of the eigenvec file.
col_pc = [f"PC{i + 1}" for i in range(20)]

plink_covar = pheno[col_covar].copy()
plink_covar["FID"] = plink_covar["IID"] = plink_covar.index

# The eigenvec file is read as header-less, whitespace-separated text:
# FID, IID, then the PC columns. `sep=r"\s+"` is the supported spelling of
# the deprecated `delim_whitespace=True` (deprecated since pandas 2.2).
df_pc = pd.read_csv("01_plink/pca.eigenvec", sep=r"\s+", header=None, names=["FID", "IID"] + col_pc)

# Default (inner) merge: only individuals present in both tables are kept.
plink_covar = pd.merge(plink_covar, df_pc, on=["FID", "IID"])

# impute missing covariates with median
# Only the covariate/PC columns are imputed. Taking the median over the whole
# frame would also include the FID/IID identifiers, which raises on
# pandas >= 2.0 when the identifiers are not numeric.
col_impute = col_covar + col_pc
plink_covar[col_impute] = plink_covar[col_impute].fillna(
    plink_covar[col_impute].median(numeric_only=True)
)

# Only the first 10 PCs are written to the covariate file.
plink_covar[["FID", "IID", *col_covar, *col_pc[:10]]].to_csv(join(out_dir, "covar.txt"), index=False, sep='\t', na_rep='NA', float_format="%.8f")