# Normalize a lipids phenotype for a GWAS study

In this notebook we use the *All of Us* data to normalize a prepared lipids phenotype for use in a GWAS study.

Note that this work is part of a larger project to [Demonstrate the Potential for Pooled Analysis of All of Us and UK Biobank Genomic Data](https://docs.google.com/document/d/19ZS0z_-7FEM37pNDAXaWaqBSLnqyd9MZEkiOmtF3n_0/edit#). Specifically this is for the portion of the project that is the **siloed** analysis.

# Setup 

<div class="alert alert-block alert-warning">
    <b>Cloud Environment</b>: This notebook was written for use on the <i>All of Us</i> Workbench.
    <ul>
        <li>Use "Recommended Environment" <kbd><b>General Analysis</b></kbd> which creates compute type <kbd><b>Standard VM</b></kbd> with reasonable defaults for CPU, RAM, and disk.</li>
        <li>This notebook only takes a few minutes to run interactively. You can also run it in the background via <kbd>run_notebook_in_the_background</kbd> for the sake of provenance and reproducibility.</li>
    </ul>
</div>

In [None]:
# Install any required packages that are missing from this environment.
# The check is made against the installed package *names* (the row names of the
# installed.packages() matrix) rather than against every cell of that matrix, so
# a match on some other field (e.g. another package's dependency list) cannot
# hide the fact that the package itself is not installed.
for (pkg_name in c('skimr')) {
    if (!pkg_name %in% rownames(installed.packages())) {
        install.packages(pkg_name)
    }
}

In [None]:
library(bigrquery)
library(lubridate)
library(skimr)
library(tidyverse)

## Define constants

In [None]:
# Papermill parameters. See https://papermill.readthedocs.io/en/latest/usage-parameterize.html

#---[ Inputs ]---
# Cloud Storage path of the prepared lipids phenotype CSV.
# This was created via notebook aou_workbench_siloed_analyses/01_aou_lipids_phenotype.ipynb
AOU_PHENO <- 'gs://fc-secure-098ff3db-05c2-4426-8914-a26608668529/data/aou/pheno/20220307/aou_alpha3_lipids_phenotype.csv'
# Cloud Storage path of the PLINK PCA eigenvector file that supplies the ancestry covariates.
# Created via aou_workbench_siloed_analyses/04_plink_ld_and_pca.ipynb
PCS <- 'gs://fc-secure-471c1068-cd3d-4b43-9b5d-a618c85ceea5/data/aou/ld-pca/20220208/aou_alpha3_lipids_plink_pca.eigenvec'

#---[ Outputs ]---
# Create a timestamp for a folder of results generated today.
# The current time is formatted as YYYYMMDD.
DATESTAMP <- strftime(now(), '%Y%m%d')
# Dated output folder underneath the bucket named by the WORKSPACE_BUCKET environment variable.
DESTINATION <- str_glue('{Sys.getenv("WORKSPACE_BUCKET")}/data/aou/pheno/{DATESTAMP}/')
# Name of the TSV file that holds the normalized GWAS phenotype.
GWAS_PHENOTYPE_FILENAME <- 'aou_alpha3_lipids_gwas_phenotype.tsv'

# Retrieve PCs

In [None]:
# Stream the eigenvector file straight out of Cloud Storage and parse it as
# tab-separated values, without staging a local copy first.
pcs_stream <- pipe(str_glue('gsutil cat {PCS}'))
pcs <- read_tsv(pcs_stream)

# Sanity check: show the dimensions and the first few rows.
dim(pcs)
head(pcs)

# Retrieve AoU lipids phenotype

In [None]:
# Stream the phenotype CSV straight out of Cloud Storage and parse it, without
# staging a local copy first.
pheno_stream <- pipe(str_glue('gsutil cat {AOU_PHENO}'))
aou_pheno <- read_csv(pheno_stream)

# Sanity check: show the dimensions and the first few rows.
dim(aou_pheno)
head(aou_pheno)

In [None]:
# Summarize every column of the retrieved phenotype table.
skim(aou_pheno)

# Reshape the data for regenie

In [None]:
# Build the long-format table (one row per participant per lipid type) with the
# identifier columns that regenie expects.
long_siloed_pheno <- aou_pheno %>%
    # Keep only the columns needed downstream, renaming the identifier and the
    # measurement value.
    select(id = person_id, age, age2, sex_at_birth, lipid_type, mg_dl = value_as_number) %>%
    # In AoU 'id' and 'sample_id' are the same, but in other studies, such as UKB, 'sample_id' can be
    # different from 'id'.
    mutate(sample_id = id, cohort = 'AOU', .after = id) %>%
    # Individual and family identifiers are both set to the sample id.
    mutate(IID = sample_id, FID = IID)

dim(long_siloed_pheno)
head(long_siloed_pheno)

## Pivot from long to wide

In [None]:
# Spread the measurements out so that each lipid type becomes its own column.
# The result has one row per distinct combination of the key columns below.
participant_cols <- c('id', 'sample_id', 'cohort', 'age', 'age2', 'sex_at_birth', 'IID', 'FID')
siloed_pheno <- pivot_wider(
    long_siloed_pheno,
    id_cols = all_of(participant_cols),
    names_from = lipid_type,
    values_from = mg_dl
)

dim(siloed_pheno)
head(siloed_pheno)

In [None]:
# Summarize every column of the wide table, including missingness per lipid column.
skim(siloed_pheno)

## Confirm that we have one row per participant

In [None]:
# Show the row count next to the number of distinct IIDs, then halt the
# notebook if the two differ (i.e. some IID appears on more than one row).
nrow(siloed_pheno)
length(unique(siloed_pheno$IID))
stopifnot(nrow(siloed_pheno) == length(unique(siloed_pheno$IID)))

## Check the categorical fields

In [None]:
# Count participants per sex_at_birth value; the NA count is always shown, even when zero.
table(siloed_pheno$sex_at_birth, useNA = 'always')

In [None]:
# Count participants per cohort value; the NA count is always shown, even when zero.
table(siloed_pheno$cohort, useNA = 'always')

## Add the ancestry covariates

In [None]:
# Attach the principal components to each participant. Because this is an inner
# join, only participants present in both tables are retained. The PC file's
# family id column is named '#FID'.
siloed_pheno <- siloed_pheno %>%
    inner_join(pcs, by = c('IID', 'FID' = '#FID'))

dim(siloed_pheno)
head(siloed_pheno)

## Normalize lipids values

In [None]:
# Covariates shared by all four lipid models: sex, age, age2 and PC1 through PC10.
lipid_covariates <- c('sex_at_birth', 'age', 'age2', paste0('PC', 1:10))

# Fit `trait ~ <covariates>` on siloed_pheno and return the residuals.
# na.action = na.exclude makes resid() return NA for rows that were dropped from
# the fit, so the result lines up row-for-row with siloed_pheno.
covariate_residuals <- function(trait) {
    model <- lm(reformulate(lipid_covariates, response = trait),
                data = siloed_pheno, na.action = na.exclude)
    resid(model)
}

siloed_pheno$TC_adjusted_resid <- covariate_residuals('TC_adjusted')
siloed_pheno$LDL_adjusted_resid <- covariate_residuals('LDL_adjusted')
siloed_pheno$HDL_resid <- covariate_residuals('HDL')
siloed_pheno$TG_adjusted_resid <- covariate_residuals('TG_adjusted')

In [None]:
# Rank-based inverse normal transform of a residual vector, rescaled to the
# standard deviation of the original residuals. Missing values stay missing
# (they receive no rank and are excluded from the observation count).
# Note that scale() returns a one-column matrix, not a plain vector.
rank_inverse_normalize <- function(resid_values) {
    n_observed <- sum(!is.na(resid_values))
    quantile_positions <- (rank(resid_values, na.last = 'keep') - 0.5) / n_observed
    sd(resid_values, na.rm = TRUE) * scale(qnorm(quantile_positions))
}

siloed_pheno$TC_adjusted_norm <- rank_inverse_normalize(siloed_pheno$TC_adjusted_resid)
siloed_pheno$LDL_adjusted_norm <- rank_inverse_normalize(siloed_pheno$LDL_adjusted_resid)
siloed_pheno$HDL_norm <- rank_inverse_normalize(siloed_pheno$HDL_resid)
siloed_pheno$TG_adjusted_norm <- rank_inverse_normalize(siloed_pheno$TG_adjusted_resid)

### Check that NAs were handled correctly

In [None]:
# Rows that have an LDL measurement: the derived LDL columns should be populated.
siloed_pheno %>%
    filter(!is.na(LDL_adjusted)) %>%
    select(starts_with('LDL'), starts_with('TG')) %>%
    head()

In [None]:
# Rows without an LDL measurement: the derived LDL columns should be NA as well.
siloed_pheno %>%
    filter(is.na(LDL_adjusted)) %>%
    select(starts_with('LDL'), starts_with('TG')) %>%
    head()

### Convert matrix columns to vectors

In [None]:
# Preview every column whose name contains 'norm'.
siloed_pheno %>%
    select(contains('norm')) %>%
    head()

In [None]:
# Inspect the type and shape of one normalized column as currently stored.
tc_norm <- siloed_pheno$TC_adjusted_norm
class(tc_norm)
dim(tc_norm)
length(tc_norm)

In [None]:
# Inspect the same column after extracting its first (only) matrix column.
tc_norm_first_col <- siloed_pheno$TC_adjusted_norm[, 1]
class(tc_norm_first_col)
dim(tc_norm_first_col)
length(tc_norm_first_col)

In [None]:
# Replace each normalized matrix column with its first column, leaving a plain
# vector in the data frame under the same name and in the same position.
siloed_pheno <- siloed_pheno %>%
    mutate(
        across(
            c(TC_adjusted_norm, LDL_adjusted_norm, HDL_norm, TG_adjusted_norm),
            function(norm_matrix) norm_matrix[, 1]
        )
    )

head(siloed_pheno)

# Write phenotypes to workspace bucket

In [None]:
# Write the phenotype table to a local TSV file, with FID and IID moved to the
# front and every other column following in its existing order.
gwas_pheno_out <- siloed_pheno %>% relocate(FID, IID)
write_tsv(gwas_pheno_out, GWAS_PHENOTYPE_FILENAME)

In [None]:
# Copy the file to the workspace bucket.
# With intern = TRUE, a command that exits with a non-zero status is reported
# only as a warning plus a 'status' attribute on the returned value. Check that
# attribute explicitly so that a failed upload stops the notebook instead of
# passing unnoticed (particularly when it is run in the background).
copy_output <- system(str_glue('gsutil cp {GWAS_PHENOTYPE_FILENAME} {DESTINATION}'), intern = TRUE)
copy_status <- attr(copy_output, 'status')
if (!is.null(copy_status) && copy_status != 0) {
    stop(str_glue('gsutil cp exited with status {copy_status}'), call. = FALSE)
}
copy_output

In [None]:
# Check the destination by listing the output folder with sizes and timestamps.
# TRUE is spelled out because the shorthand T is an ordinary variable that can
# be reassigned.
system(str_glue('gsutil ls -lh {DESTINATION}'), intern = TRUE)

# Provenance

In [None]:
# Record the R session details and package versions used for this run.
devtools::session_info()