In [1]:
%load_ext lab_black

import numpy as np
import pandas as pd
import os
import pyreadr
from glob import glob
import matplotlib.pyplot as plt

In [2]:
# Directory containing the input tables read by the covariate-extraction cell
# (phecode.csv, pheno.csv and covar.csv).
PHENO_DIR = "/u/project/pasaniuc/ziqixu09/phenotype/"

In [3]:
# extract covariates
# Sample IDs from the phecode and phenotype tables, cast to str so they can be
# compared with the str-cast covariate index below.
phecode_eid = pd.read_csv(os.path.join(PHENO_DIR, "phecode.csv"), usecols=["eid"])[
    "eid"
].values.astype(str)

pheno_eid = pd.read_csv(os.path.join(PHENO_DIR, "pheno.csv"), usecols=["eid"])[
    "eid"
].values.astype(str)

# array_equal checks the shapes first, so tables of different lengths fail the
# assertion instead of broadcasting or raising inside an elementwise `==`.
assert np.array_equal(
    phecode_eid, pheno_eid
), "phecode.csv and pheno.csv do not list the same eids in the same order"

covar_cols = ["AGE", "SEX", "DEPRIVATION_INDEX"] + [f"PC{i}" for i in range(1, 17)]
df_covar = pd.read_csv(os.path.join(PHENO_DIR, "covar.csv"), index_col=0)[covar_cols]
df_covar.index = df_covar.index.astype(str)

# Validate row alignment before anything is written, so a misaligned table
# never reaches out/covar.tsv.
assert np.array_equal(
    df_covar.index.values, pheno_eid
), "covar.csv rows are not aligned with the eids in pheno.csv"

# Prepend FID and IID columns, both set to the sample ID. Assigning the columns
# directly (rather than self-merging on the index) cannot multiply rows if an
# ID were ever duplicated.
sample_ids = df_covar.index.values
df_covar = df_covar.assign(FID=sample_ids, IID=sample_ids)[
    ["FID", "IID"] + covar_cols
]

os.makedirs("out", exist_ok=True)  # to_csv does not create missing directories
df_covar.to_csv(
    "out/covar.tsv", sep="\t", index=False, float_format="%.6g", na_rep="NA"
)

In [4]:
# Start from the sample identifiers of the covariate table; the PHENO column is
# a placeholder that gets overwritten for every trait written below.
df_pheno = df_covar.loc[:, ["FID", "IID"]].copy()
df_pheno["PHENO"] = np.nan

# Shared formatting options for every per-trait table.
tsv_options = dict(sep="\t", index=False, float_format="%.6g", na_rep="NA")

print("Proportion of non-NaNs: ")
for trait_type in ["quant", "binary"]:
    trait_dir = f"data/ukbb-{trait_type}-pheno/"
    rds_files = glob(f"{trait_dir}/*.rds")
    for trait_f in rds_files:
        # Trait name = file name without its directory and 4-character suffix.
        file_name = trait_f.rsplit("/", 1)[-1]
        trait = file_name[:-4]

        # The .rds file is read under key None and its values are assigned
        # positionally to the PHENO column.
        rds_frame = pyreadr.read_r(trait_f)[None]
        df_pheno["PHENO"] = rds_frame.values.astype(float)

        frac_observed = 1 - df_pheno["PHENO"].isna().mean()
        print(f"{trait}: {frac_observed:.2g}")

        df_pheno.to_csv(f"out/{trait}.tsv", **tsv_options)

Proportion of non-NaNs: 
hand_grip_strength: 0.99
log_waist_circ: 1
log_hip_circ: 1
height: 1
log_pulse_rate: 0.94
income: 0.85
geek_time: 0.98
log_sleep: 0.99
more_evening: 0.89
insomnia: 1
water_intake: 0.99
less_alcohol: 1
darker_skin: 0.98
darker_skin0: 0.98
less_tanned: 0.97
darker_hair: 0.94
darker_hair0: 0.94
log_age_first_sex: 0.86
poorer_health: 0.99
more_sunscreen: 0.99
M_less_hair: 0.45
F_menarche: 0.53
F_age_first_birth: 0.37
ankle_spacing: 0.56
log_heel_BUA: 0.56
log_heel_SoS: 0.56
log_heel_BMD: 0.56
F_length_menstrual_cycle: 0.094
diastolic_BP: 0.94
systolic_BP: 0.94
logMAR: 0.26
years_of_edu: 0.82
sitting_height: 1
birth_weight: 0.55
neuroticism: 0.8
FEV1: 0.71
FVC: 0.71
fluid_intelligence: 0.25
avMSE: 0.23
less_happy: 0.31
less_happy_with_health: 0.31
log_BMI: 1
log_mean_carotid_IMT: 0.05
fat_perc: 0.98
log_fat_mass: 0.98
log_fat_free_mass: 0.98
log_water_mass: 0.98
log_impedance: 0.98
log_ventricular_rate: 0.075
ECG_P_duration: 0.071
log_ECG_QRS_duration: 0.075
ECG_PQ_

In [5]:
# check the consistency with old version of covar
# Previously written covariate table that the freshly built `df_covar` is
# compared against; its first column is used as the index.
df_covar_old = pd.read_csv(
    "/u/home/k/kangchen/admix-prs-uncertainty/experiments/00-compile-data/out/covar.tsv",
    sep="\t",
    index_col=0,
)
df_covar_old.index = df_covar_old.index.astype(str)
assert np.array_equal(
    df_covar_old.index.values, df_covar.index.values
), "old and new covariate tables do not list the same samples in the same order"

# Compare only the columns present in both tables, in a deterministic order.
shared_cols = sorted(set(df_covar_old.columns) & set(df_covar.columns))
# Without this guard an empty overlap would make the loop below pass vacuously.
assert shared_cols, "old and new covariate tables share no columns to compare"
for col in shared_cols:
    assert np.allclose(
        df_covar_old[col], df_covar[col], equal_nan=True
    ), f"covariate column {col!r} differs from the old version"

In [6]:
# check the consistency with old version of pheno

# Every trait that has a table under out/OLD/ (the covariate table is excluded)
# must have an identical table under out/.
trait_list = {
    os.path.splitext(os.path.basename(path))[0] for path in glob("out/OLD/*.tsv")
} - {"covar"}
for trait in sorted(trait_list):  # sorted: deterministic order, first failure is stable
    df_old = pd.read_csv(f"out/OLD/{trait}.tsv", sep="\t")
    df_new = pd.read_csv(f"out/{trait}.tsv", sep="\t")
    assert df_old.equals(
        df_new
    ), f"out/{trait}.tsv differs from out/OLD/{trait}.tsv"