# Prepare a lipids phenotype for a GWAS study

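In this notebook we take the merged _All of Us_ and UK Biobank lipids phenotype, pivot it to one row per participant, add the ancestry principal components as covariates, and normalize the lipid values to prepare a pooled lipids phenotype for a GWAS study.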

Note that this work is part of a larger project to [Demonstrate the Potential for Pooled Analysis of All of Us and UK Biobank Genomic Data](https://docs.google.com/document/d/19ZS0z_-7FEM37pNDAXaWaqBSLnqyd9MZEkiOmtF3n_0/edit#). Specifically, this notebook combines the results from `aou_workbench_siloed_analyses/3_aou_lipids_gwas_phenotype.ipynb` and `aou_workbench_pooled_analyses/6_ukb_lipids_gwas_phenotype.ipynb`.

TODO: Update the project description link to point to the bioRxiv paper after it is posted.

# Setup 

<div class="alert alert-block alert-warning">
    <b>Cloud Environment</b>: This notebook was written for use on the <i>All of Us</i> Workbench. It runs fine on the default Cloud Analysis Environment of type <i>Standard VM</i>.
</div>

In [None]:
# Install any required packages that are not already available in this environment.
# installed.packages() returns a character matrix; the package names are its row names.
# Matching against the whole matrix (as `%in% installed.packages()` does) also compares
# against versions, paths and dependency fields, which can produce a false "installed".
required_packages <- c('skimr')
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
    install.packages(missing_packages)
}

In [None]:
library(lubridate)
library(skimr)
library(tidyverse)

## Define constants

In [None]:
# Papermill parameters. See https://papermill.readthedocs.io/en/latest/usage-parameterize.html

#---[ Inputs ]---
# Merged AoU + UKB lipids phenotype (CSV).
# Created via aou_workbench_pooled_analyses/02_merge_lipids_phenotypes.ipynb
MERGED_PHENO <- "gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/pooled/pheno/20220214/aou_alpha3_ukb_lipids_phenotype.csv"
# Principal components file (tab-separated).
# Created via aou_workbench_pooled_analyses/5_plink_ld_and_pca.ipynb
# NOTE(review): this path is tagged alpha2/20210907 while the phenotype above is alpha3/20220214 -- confirm
# that these are the intended matching inputs.
PCS <- "gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/plink/20210907/aou_alpha2_ukb_lipids_plink_pca.eigenvec"

#---[ Outputs ]---
# Results are written to a folder named after today's date (YYYYMMDD) in the workspace bucket.
DATESTAMP <- format(now(), "%Y%m%d")
DESTINATION <- str_glue("{Sys.getenv('WORKSPACE_BUCKET')}/data/pooled/pheno/{DATESTAMP}/")
GWAS_PHENOTYPE_FILENAME <- "aou_alpha3_ukb_lipids_gwas_phenotype.tsv"

# Load data

## Retrieve PCs

In [None]:
# Stream the principal components file out of Cloud Storage with `gsutil cat`
# and parse it as tab-separated values.
pcs <- str_glue("gsutil cat {PCS}") %>%
    pipe() %>%
    read_tsv()

# Show the dimensions and the first few rows as a quick sanity check.
dim(pcs)
head(pcs)

## Retrieve AoU+UKB merged lipids phenotype

In [None]:
# Stream the merged phenotype CSV out of Cloud Storage with `gsutil cat` and parse it.
long_pooled_pheno <- str_glue("gsutil cat {MERGED_PHENO}") %>%
    pipe() %>%
    read_csv()

# Show the dimensions and the first few rows as a quick sanity check.
dim(long_pooled_pheno)
head(long_pooled_pheno)

In [None]:
# Print skimr's per-column summary of the merged phenotype as a sanity check of the loaded data.
skim(long_pooled_pheno)

# Pivot from long to wide

In [None]:
# Reshape to one column per lipid type, holding the `mg_dl` value for that lipid.
#
# Choice of id columns:
#  * UKB: age and statin use are the same for all of a participant's lipids.
#  * AoU: lipids measurements come from EHR and may have occurred on different days. Age was
#    already set to `max_age` in aou_workbench_siloed_analyses/01_aou_lipids_phenotype.ipynb, so
#    it can serve as an id column in the pivot. `statin_use` cannot, because in this dataframe
#    its value is still specific to the particular lipid measurement.
pooled_pheno <- pivot_wider(
    long_pooled_pheno,
    id_cols = c(id, sample_id, cohort, age, age2, sex_at_birth, IID, FID),
    names_from = lipid_type,
    values_from = mg_dl
)

dim(pooled_pheno)
head(pooled_pheno)

## Confirm that we have one row per participant

In [None]:
# Compare the number of rows with the number of distinct IIDs; they must match
# for the wide table to hold exactly one row per participant.
n_rows <- nrow(pooled_pheno)
n_participants <- n_distinct(pooled_pheno$IID)
n_rows
n_participants
stopifnot(n_rows == n_participants)

## Check the categorical fields

In [None]:
# Tabulate sex at birth, always including a count of missing values.
table(pooled_pheno[["sex_at_birth"]], useNA = "always")

In [None]:
# Tabulate cohort membership, always including a count of missing values.
table(pooled_pheno[["cohort"]], useNA = "always")

# Add the ancestry covariates

In [None]:
# Attach the principal components to each participant, matching `IID` in the phenotype
# to `#IID` in the PCs table. This is an inner join, so participants without a matching
# PCs row are dropped.
pooled_pheno <- pooled_pheno %>%
    inner_join(pcs, by = c("IID" = "#IID"))

dim(pooled_pheno)
head(pooled_pheno)

# Normalize lipids values

In [None]:
# Regress each lipid measure on the covariates and keep the residuals in `<lipid>_resid`.
# The covariate set (sex_at_birth, age, age2, PC1..PC10) is defined once so that all four
# models are guaranteed to use the same formula.
# `na.action = na.exclude` makes resid() return NA for rows dropped from the fit, so each
# residual vector has one entry per row of `pooled_pheno`.
covariates <- c('sex_at_birth', 'age', 'age2', paste0('PC', 1:10))
for (lipid in c('TC_adjusted', 'LDL_adjusted', 'HDL', 'TG_adjusted')) {
    fit <- lm(reformulate(covariates, response = lipid),
              data = pooled_pheno, na.action = na.exclude)
    pooled_pheno[[paste0(lipid, '_resid')]] <- resid(fit)
}

In [None]:
# Rank-based normal-quantile transform of a residual vector, rescaled to the residuals' own
# standard deviation. Missing values stay missing (`na.last = "keep"`) and are excluded from
# the denominator. Because scale() is used, the result is a one-column matrix, not a plain vector.
rank_normalize <- function(resid_values) {
    n_observed <- sum(!is.na(resid_values))
    quantile_positions <- (rank(resid_values, na.last = "keep") - 0.5) / n_observed
    sd(resid_values, na.rm = TRUE) * scale(qnorm(quantile_positions))
}

pooled_pheno$TC_adjusted_norm <- rank_normalize(pooled_pheno$TC_adjusted_resid)
pooled_pheno$LDL_adjusted_norm <- rank_normalize(pooled_pheno$LDL_adjusted_resid)
pooled_pheno$HDL_norm <- rank_normalize(pooled_pheno$HDL_resid)
pooled_pheno$TG_adjusted_norm <- rank_normalize(pooled_pheno$TG_adjusted_resid)

### Check that NAs were handled correctly

In [None]:
# Rows where LDL_adjusted is present: show the LDL* and TG* columns side by side.
pooled_pheno %>%
    filter(!is.na(LDL_adjusted)) %>%
    select(starts_with("LDL"), starts_with("TG")) %>%
    head()

In [None]:
# Rows where LDL_adjusted is missing: show the LDL* and TG* columns side by side.
pooled_pheno %>%
    filter(is.na(LDL_adjusted)) %>%
    select(starts_with("LDL"), starts_with("TG")) %>%
    head()

### Convert matrix columns to vectors

In [None]:
# Preview every column whose name contains "norm".
pooled_pheno %>%
    select(contains("norm")) %>%
    head()

In [None]:
# Inspect the type and shape of one normalized column as stored.
tc_norm <- pooled_pheno$TC_adjusted_norm
class(tc_norm)
dim(tc_norm)
length(tc_norm)

In [None]:
# Inspect the type and shape of the same column after extracting its first matrix column.
tc_norm_first_col <- pooled_pheno$TC_adjusted_norm[, 1]
class(tc_norm_first_col)
dim(tc_norm_first_col)
length(tc_norm_first_col)

In [None]:
# Replace each normalized matrix column with its first (only) column so that the
# dataframe holds plain vectors.
norm_columns <- c("TC_adjusted_norm", "LDL_adjusted_norm", "HDL_norm", "TG_adjusted_norm")
pooled_pheno <- pooled_pheno %>%
    mutate(across(all_of(norm_columns), function(norm_matrix) norm_matrix[, 1]))

head(pooled_pheno)

# Write phenotypes to workspace bucket

In [None]:
# Put FID and IID first, followed by all remaining columns, then write the
# dataframe to a local tab-separated file.
gwas_pheno_out <- select(pooled_pheno, FID, IID, everything())
write_tsv(gwas_pheno_out, GWAS_PHENOTYPE_FILENAME)

In [None]:
# Copy the file to the workspace bucket.
copy_output <- system(str_glue('gsutil cp {GWAS_PHENOTYPE_FILENAME} {DESTINATION}'), intern = TRUE)

# With intern = TRUE a non-zero exit code is only reported as a warning plus a "status"
# attribute on the result. Stop here so a failed upload cannot go unnoticed.
copy_status <- attr(copy_output, 'status')
if (!is.null(copy_status) && copy_status != 0) {
    stop('gsutil cp exited with status ', copy_status, call. = FALSE)
}
copy_output

In [None]:
# Check the destination: list its contents with sizes so the upload can be verified by eye.
# Use TRUE rather than T, since T is an ordinary variable that can be reassigned.
system(str_glue('gsutil ls -lh {DESTINATION}'), intern = TRUE)

# Provenance

In [None]:
# Record the R session details and package versions used for this run.
devtools::session_info()