In [1]:
%load_ext lab_black

import numpy as np
import pandas as pd
import os
import pyreadr
from glob import glob
import matplotlib.pyplot as plt

In [2]:
# Directory holding the input tables read below (pheno.csv, covar.csv).
# Defaults to the original cluster location; set the PHENO_DIR environment
# variable to run the notebook against a copy of the data somewhere else.
PHENO_DIR = os.environ.get("PHENO_DIR", "/u/project/pasaniuc/ziqixu09/phenotype/")

In [3]:
# Sample IDs in the row order of pheno.csv, as strings. They are only used to
# verify that the covariate table is in the same row order.
pheno_eid = pd.read_csv(os.path.join(PHENO_DIR, "pheno.csv"), usecols=["eid"])[
    "eid"
].values.astype(str)

# Covariates to export: age, sex, deprivation index and PC1..PC16.
covar_cols = ["AGE", "SEX", "DEPRIVATION_INDEX"] + [f"PC{i}" for i in range(1, 17)]
df_covar = pd.read_csv(os.path.join(PHENO_DIR, "covar.csv"), index_col=0)[covar_cols]
df_covar.index = df_covar.index.astype(str)

# Validate BEFORE writing anything, so a misaligned table never reaches disk.
assert len(df_covar) == len(
    pheno_eid
), f"covar.csv has {len(df_covar)} rows but pheno.csv has {len(pheno_eid)}"
assert np.all(
    df_covar.index == pheno_eid
), "covar.csv and pheno.csv list samples in a different order"

# Prepend FID/IID (both set to the sample ID). A column-wise concat on the
# shared index is used instead of an index-on-index merge, because a merge
# would multiply rows if any sample ID were duplicated.
df_ids = pd.DataFrame(
    {"FID": df_covar.index, "IID": df_covar.index},
    index=df_covar.index,
)
df_covar = pd.concat([df_ids, df_covar], axis=1)

# to_csv does not create missing directories.
os.makedirs("out", exist_ok=True)
df_covar.to_csv(
    "out/covar.tsv", sep="\t", index=False, float_format="%.6g", na_rep="NA"
)

In [5]:
# Template frame reused for every trait: FID/IID taken from the covariate
# table, plus a PHENO column that is overwritten on each iteration.
df_pheno = df_covar[["FID", "IID"]].copy()
df_pheno["PHENO"] = np.nan

# to_csv does not create missing directories.
os.makedirs("out", exist_ok=True)

for trait_type in ["quant", "binary"]:
    trait_dir = os.path.join("data", f"ukbb-{trait_type}-pheno")
    # Sorted so the processing (and log) order is the same on every run.
    for trait_f in sorted(glob(os.path.join(trait_dir, "*.rds"))):
        # Trait name = file name without directory and without the .rds
        # extension; dots inside the name itself are preserved.
        trait = os.path.splitext(os.path.basename(trait_f))[0]

        # The single object stored in the file is looked up under the key None.
        trait_values = pyreadr.read_r(trait_f)[None].values.astype(float)
        # Flatten a single-column table to 1-D so the shape check below is
        # unambiguous.
        if trait_values.ndim == 2 and trait_values.shape[1] == 1:
            trait_values = trait_values[:, 0]
        # Values are assigned positionally, so they must be exactly one value
        # per sample. NOTE(review): this assumes the .rds rows are in the same
        # sample order as covar.csv -- nothing in the file lets us verify it.
        if trait_values.shape != (len(df_pheno),):
            raise ValueError(
                f"{trait_f}: expected {len(df_pheno)} values in a single column, "
                f"got array of shape {trait_values.shape}"
            )

        df_pheno["PHENO"] = trait_values
        print(
            f"Proportion of non-NaNs: {trait}: {1 - df_pheno['PHENO'].isna().mean():.2g}"
        )
        df_pheno.to_csv(
            f"out/{trait}.tsv",
            sep="\t",
            index=False,
            float_format="%.6g",
            na_rep="NA",
        )

Proportion of non-NaNs: hand_grip_strength: 0.99
Proportion of non-NaNs: log_waist_circ: 1
Proportion of non-NaNs: log_hip_circ: 1
Proportion of non-NaNs: height: 1
Proportion of non-NaNs: log_pulse_rate: 0.94
Proportion of non-NaNs: income: 0.85
Proportion of non-NaNs: geek_time: 0.98
Proportion of non-NaNs: log_sleep: 0.99
Proportion of non-NaNs: more_evening: 0.89
Proportion of non-NaNs: insomnia: 1
Proportion of non-NaNs: water_intake: 0.99
Proportion of non-NaNs: less_alcohol: 1
Proportion of non-NaNs: darker_skin: 0.98
Proportion of non-NaNs: darker_skin0: 0.98
Proportion of non-NaNs: less_tanned: 0.97
Proportion of non-NaNs: darker_hair: 0.94
Proportion of non-NaNs: darker_hair0: 0.94
Proportion of non-NaNs: log_age_first_sex: 0.86
Proportion of non-NaNs: poorer_health: 0.99
Proportion of non-NaNs: more_sunscreen: 0.99
Proportion of non-NaNs: M_less_hair: 0.45
Proportion of non-NaNs: F_menarche: 0.53
Proportion of non-NaNs: F_age_first_birth: 0.37
Proportion of non-NaNs: ankle_s