# Normalize a lipids phenotype for a GWAS study

In this notebook we use the UK Biobank data to normalize a prepared lipids phenotype for use in a GWAS study.

  
Note that this work is part of a larger project to [Demonstrate the Potential for Pooled Analysis of All of Us and UK Biobank Genomic Data](https://docs.google.com/document/d/19ZS0z_-7FEM37pNDAXaWaqBSLnqyd9MZEkiOmtF3n_0/edit#). Specifically, this notebook belongs to the **siloed** analysis portion of that project.

# Setup

<div class="alert alert-block alert-warning">
    <b>Cloud Environment</b>: This notebook was written for use on the UK Biobank Research Analysis Platform.
    <ul>
        <li>Use compute type 'Single Node' with sufficient CPU and RAM (e.g. start with 4 CPUs and 15 GB RAM, increase if needed).</li>
        <li>This notebook runs quickly, but we still recommend running it in the background via <kbd>dx run dxjupyterlab</kbd> to capture provenance.</li>
    </ul>
</div>

```
dx run dxjupyterlab \
    --instance-type=mem2_ssd1_v2_x4 \
    -icmd="papermill 07_ukb_lipids_phenotype_for_gwas.ipynb 07_ukb_lipids_phenotype_for_gwas_$(date +%Y%m%d).ipynb" \
    -iin=07_ukb_lipids_phenotype_for_gwas.ipynb \
    --folder=outputs/r-prepare-phenotype-for-gwas/$(date +%Y%m%d)/
```
See also https://platform.dnanexus.com/app/dxjupyterlab

In [None]:
# Install any required package that is not yet available in this environment.
# Compare against the package names only (the row names of the matrix returned
# by installed.packages()); testing membership in the whole matrix also scans
# unrelated columns such as Imports and Suggests, which can make a package that
# is not installed look as if it were.
required_packages <- c('lubridate', 'skimr', 'tidyverse')
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

In [None]:
library(lubridate)
library(skimr)
library(tidyverse)

In [None]:
## Plot setup.
# Make theme_bw with base_size = 16 the session-wide default, so every plot
# drawn later in the notebook uses it without repeating the theme call.
theme_set(theme_bw(base_size = 16)) # Default theme for plots.

#' Builds the annotation used to label a ggplot boxplot with its sample size.
#'
#' @param df The values to summarize; `max()` and `length()` are applied to it
#'   directly.
#' @return A data frame with column `y` set to the maximum of `df` and column
#'   `label` set to the text 'N = ' followed by the length of `df`.
get_boxplot_fun_data <- function(df) {
  annotation_y <- max(df)
  annotation_label <- stringr::str_c('N = ', length(df))
  data.frame(y = annotation_y, label = annotation_label)
}

## Define constants

In [None]:
# Papermill parameters; the values below are the defaults used when nothing is
# overridden. See https://papermill.readthedocs.io/en/latest/usage-parameterize.html

#---[ Inputs ]---
# Lipids phenotype table, created via ukb_rap_siloed_analyses/02_ukb_lipids_phenotype.ipynb
PHENOTYPES <- '/mnt/project/outputs/r-prepare-phenotype/20220217/ukb_200kwes_lipids_phenotype.tsv'
# Principal components, created via ukb_rap_siloed_analyses/06_ukb_plink_ld_and_pca.ipynb
PCS <- '/mnt/project/outputs/plink-ld-pca/20220219/ukb_200kwes_lipids_plink_pca.eigenvec'

#---[ Outputs ]---
# Name of the TSV file this notebook writes.
GWAS_PHENOTYPE_FILENAME <- 'ukb_200kwes_lipids_gwas_phenotype.tsv'

# Load data 

In [None]:
# Stage the phenotype table in the working directory, then parse it.
# file.copy() replaces the shell `cp` so that paths containing spaces or shell
# metacharacters are handled, and stopifnot() halts the notebook when the copy
# fails instead of carrying on with a missing or stale local file.
stopifnot(file.copy(PHENOTYPES, '.', overwrite = TRUE))
pheno <- read_tsv(basename(PHENOTYPES))

In [None]:
# Summarize every phenotype column to sanity-check the loaded table.
skim(pheno)

In [None]:
# Stage and parse the principal components file the same way.
stopifnot(file.copy(PCS, '.', overwrite = TRUE))
pcs <- read_tsv(basename(PCS))

In [None]:
# Show the first rows of the principal components table.
head(pcs)

# Add the ancestry covariates

In [None]:
# Confirm that the id sets are identical.
# The lengths are compared explicitly because `==` recycles the shorter vector,
# and a zero-length side produces logical(0), which stopifnot() accepts as
# "all TRUE" - so an empty table would otherwise pass this check.
pcs_ids <- sort(pcs$IID)
pheno_ids <- sort(pheno$IID)
stopifnot(
  length(pcs_ids) == length(pheno_ids),
  all(pcs_ids == pheno_ids)
)

In [None]:
# Attach the principal components to each phenotype row; the family id column
# is named '#FID' in the PCs table and 'FID' in the phenotype table.
pheno <- left_join(pheno, pcs, by = c('FID' = '#FID', 'IID'))

# Normalize lipids values

In [None]:
# Regress each lipid trait on sex, age, age2 and PC1 through PC10, and store
# the residuals in a new '<trait>_resid' column. na.action = na.exclude makes
# resid() return NA for rows left out of the fit, so each residual vector
# lines up with the rows of pheno.
covariate_terms <- c('sex', 'age', 'age2', paste0('PC', 1:10))
for (trait in c('TC_adj_mg_dl', 'LDL_adj_mg_dl', 'HDL_mg_dl', 'TG_log_mg_dl')) {
  trait_model <- lm(
    reformulate(covariate_terms, response = trait),
    data = pheno,
    na.action = na.exclude
  )
  pheno[[paste0(trait, '_resid')]] <- resid(trait_model)
}

In [None]:
#' Rank-based inverse normal transform, rescaled to the input's standard deviation.
#'
#' @param values A numeric vector; NA entries keep their position and stay NA.
#' @return The one-column matrix produced by `scale()`, multiplied by the
#'   standard deviation of the non-missing `values`.
inverse_normalize <- function(values) {
  n_observed <- sum(!is.na(values))
  normal_scores <- qnorm((rank(values, na.last = 'keep') - 0.5) / n_observed)
  sd(values, na.rm = TRUE) * scale(normal_scores)
}

pheno$TC_adj_mg_dl_norm <- inverse_normalize(pheno$TC_adj_mg_dl_resid)
pheno$LDL_adj_mg_dl_norm <- inverse_normalize(pheno$LDL_adj_mg_dl_resid)
pheno$HDL_mg_dl_norm <- inverse_normalize(pheno$HDL_mg_dl_resid)
pheno$TG_log_mg_dl_norm <- inverse_normalize(pheno$TG_log_mg_dl_resid)

## Check that NAs were handled correctly

In [None]:
# Rows where LDL is present: show the LDL and TG columns side by side.
pheno %>%
    filter(!is.na(LDL_adj_mg_dl)) %>%
    select(starts_with('LDL'), starts_with('TG')) %>%
    head()

In [None]:
# Rows where LDL is missing: show the same columns for comparison.
pheno %>%
    filter(is.na(LDL_adj_mg_dl)) %>%
    select(starts_with('LDL'), starts_with('TG')) %>%
    head()

## Convert matrix columns to vectors

In [None]:
# The *_norm columns were built with scale(), which returns a matrix. Inspect
# the class and dimensions of a residual column and of the normalized columns.
class(pheno[['TC_adj_mg_dl_resid']])
dim(pheno[['TC_adj_mg_dl_resid']])
class(pheno[['TC_adj_mg_dl_norm']])
dim(pheno[['TC_adj_mg_dl_norm']])
dim(pheno[['LDL_adj_mg_dl_norm']])
dim(pheno[['HDL_mg_dl_norm']])
dim(pheno[['TG_log_mg_dl_norm']])

In [None]:
# Inspect what extracting the first matrix column yields.
class(pheno[['TC_adj_mg_dl_norm']][, 1])
dim(pheno[['TC_adj_mg_dl_norm']][, 1])
length(pheno[['TC_adj_mg_dl_norm']][, 1])

In [None]:
# Replace each normalized matrix column with its first (only) column.
normalized_columns <- c(
    'TC_adj_mg_dl_norm',
    'LDL_adj_mg_dl_norm',
    'HDL_mg_dl_norm',
    'TG_log_mg_dl_norm'
)
pheno <- pheno %>%
    mutate(across(all_of(normalized_columns), ~ .x[, 1]))

head(pheno)

# Write the prepared phenotype to TSV for regenie

In [None]:
# Summarize every column of the final table as a last check before export.
skim(pheno)

In [None]:
# Write the full phenotype table as a tab-separated file at the path named by
# GWAS_PHENOTYPE_FILENAME.
write_tsv(pheno, GWAS_PHENOTYPE_FILENAME)

# Provenance 

In [None]:
# Print the session details reported by devtools, so the environment used for
# this run is recorded in the executed notebook.
devtools::session_info()