# Prepare a lipids phenotype for a GWAS study

In this notebook we combine the _All of Us_ lipids phenotype with the UK Biobank lipids phenotype to prepare a pooled lipids phenotype for a GWAS study.

Note that this work is part of a larger project to [Demonstrate the Potential for Pooled Analysis of All of Us and UK Biobank Genomic Data](https://docs.google.com/document/d/19ZS0z_-7FEM37pNDAXaWaqBSLnqyd9MZEkiOmtF3n_0/edit#). Specifically, this notebook combines the results from `aou_workbench_siloed_analyses/3_aou_lipids_gwas_phenotype.ipynb` and `aou_workbench_pooled_analyses/6_ukb_lipids_gwas_phenotype.ipynb`.

TODO: Update the project description link to point to the bioRxiv paper after it is posted.

# Setup 

<div class="alert alert-block alert-warning">
    <b>Cloud Environment</b>: This notebook was written for use on the All of Us Workbench. It runs fine on the default Cloud Environment for type <i>Standard VM</i>. 
</div>

In [None]:
# Install any required package that is not already installed.
# Compare against the package names only (the row names of the matrix returned
# by installed.packages()); matching against the whole matrix would also test
# the version, dependency, and library-path cells.
lapply(c('skimr'), function(pkg_name) {
    if (!pkg_name %in% rownames(installed.packages())) {
        install.packages(pkg_name)
    }
})

In [None]:
library(lubridate)
library(skimr)
library(tidyverse)

## Define constants

In [None]:
# ---- Inputs ----
# AoU lipids phenotype, produced by
# aou_workbench_siloed_analyses/3_aou_lipids_gwas_phenotype.ipynb
AOU_PHENO <- 'gs://fc-secure-440c511e-7fff-417c-9c86-f8ab51bfc618/data/phenotypes/20211006/AOU_lipids_phenotype.csv'
# UKB lipids phenotype, produced by
# aou_workbench_pooled_analyses/6_ukb_lipids_gwas_phenotype.ipynb
UKB_PHENO <- 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/ukb/phenotypes/20211221/UKB_lipids_phenotype.csv'
# PCA eigenvectors, produced by
# aou_workbench_pooled_analyses/5_plink_ld_and_pca.ipynb
PCS <- 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/plink/20210907/aou_alpha2_ukb_lipids_plink_pca.eigenvec'

# ---- Outputs ----
FILENAME <- 'alpha2_pooled_lipids_phenotype.tsv'
# Results are written to a folder named after today's date (YYYYMMDD).
DATESTAMP <- format(now(), '%Y%m%d')
DESTINATION <- str_glue('{Sys.getenv("WORKSPACE_BUCKET")}/data/pooled/phenotypes/{DATESTAMP}/')

# Load data

## Retrieve PCs

In [None]:
# Stream the eigenvector file from the bucket and parse it as TSV.
pcs <- str_glue('gsutil cat {PCS}') %>%
    pipe() %>%
    read_tsv()

# Show the table's dimensions and first rows.
dim(pcs)
head(pcs)

## Retrieve AoU lipids phenotype

In [None]:
# Stream the AoU phenotype file from the bucket and parse it as CSV.
aou_pheno <- str_glue('gsutil cat {AOU_PHENO}') %>%
    pipe() %>%
    read_csv()

# Show the table's dimensions and first rows.
dim(aou_pheno)
head(aou_pheno)

In [None]:
# Display a skimr summary of the AoU phenotype table as a sanity check of the loaded data.
skim(aou_pheno)

# TODO: move this section to the AoU notebook

Note: for the AoU participants, their lipids were not necessarily measured on the same day (at the same age).

For the GWAS age covariate, use the participant's age at their most recent lipid measurement, and discard any lipid measurements taken more than five years before that most recent measurement.

In [None]:
# For each participant, compute the spread between the youngest and oldest
# age among their rows, then count how many participants share each spread.
aou_pheno %>%
    group_by(person_id) %>%
    summarize(min_age = min(age), max_age = max(age)) %>%
    mutate(age_diff = max_age - min_age) %>%
    group_by(age_diff) %>%
    summarize(num_participants_with_this_age_diff = n())

In [None]:
# Print out a few person_ids to manually check in the measurements data.
aou_pheno %>%
    group_by(person_id) %>%
    summarize(
        min_age = min(age),
        max_age = max(age),
        age_diff = max_age - min_age
    ) %>%
    filter(age_diff > 12)

Remove the individual lipid measurements that were taken more than 5 years before the participant's most recent measurement.

In [None]:
# Attach each participant's maximum age to every one of their rows, keep only
# the rows within 5 years of that maximum, and then use the maximum age (and
# its square) as the age covariates for all remaining rows.
aou_pheno <- aou_pheno %>%
    group_by(person_id) %>%
    mutate(max_age = max(age)) %>%
    ungroup() %>%
    filter(max_age - age <= 5) %>%
    mutate(
        age = max_age,
        age2 = max_age ^ 2
    )

# Show the dimensions after filtering.
dim(aou_pheno)

## Retrieve UKB lipids phenotype

In [None]:
# Stream the UKB phenotype file from the bucket and parse it as CSV.
ukb_pheno <- str_glue('gsutil cat {UKB_PHENO}') %>%
    pipe() %>%
    read_csv()

# Show the table's dimensions and first rows.
dim(ukb_pheno)
head(ukb_pheno)

In [None]:
# Display a skimr summary of the UKB phenotype table as a sanity check of the loaded data.
skim(ukb_pheno)

# Pool the phenotypes

Add the `IID` and `FID` columns needed by regenie, along with the cohort covariate. Also keep in mind that the UKB data has a sample id that is different from the eid.

In [None]:
# Stack the two cohorts into one long table with a shared set of columns,
# then derive the IID/FID identifiers from the sample id and cohort label.
long_pooled_pheno <- bind_rows(
    # AoU: the sample id is the person_id itself.
    transmute(
        aou_pheno,
        id = person_id,
        sample_id = person_id,
        cohort = 'AOU',
        age, age2, sex_at_birth, lipid_type,
        mg_dl = value_as_number
    ),
    # UKB: the sample id comes from eid_31063, which differs from eid.
    transmute(
        ukb_pheno,
        id = eid,
        sample_id = eid_31063,
        cohort = 'UKB',
        age, age2, sex_at_birth, lipid_type, mg_dl
    )
) %>%
    mutate(
        IID = str_glue('{sample_id}_{cohort}'),
        FID = IID
    ) %>%
    # Put the identifier columns first.
    select(IID, FID, everything())

# Show the pooled table's dimensions and first rows.
dim(long_pooled_pheno)
head(long_pooled_pheno)

In [None]:
# Display a skimr summary of the pooled long-format table as a sanity check.
skim(long_pooled_pheno)

## Pivot from long to wide

In [None]:
# Spread the long table so that each lipid_type becomes its own column of
# mg_dl values; the listed id columns identify one output row.
pooled_pheno <- pivot_wider(
    long_pooled_pheno,
    id_cols = c(id, sample_id, cohort, age, age2, sex_at_birth, IID, FID),
    names_from = lipid_type,
    values_from = mg_dl
)

# Show the wide table's dimensions and first rows.
dim(pooled_pheno)
head(pooled_pheno)

## Confirm that we have one row per participant

In [None]:
# Display the row count and the number of distinct IIDs, then halt the
# notebook if they differ (i.e. if any IID appears on more than one row).
nrow(pooled_pheno)
length(unique(pooled_pheno$IID))
stopifnot(nrow(pooled_pheno) == length(unique(pooled_pheno$IID)))

## Add the ancestry covariates

In [None]:
# Attach the PCs by matching IID to the '#IID' column of the eigenvector
# table. This is an inner join, so rows without a matching PC row are dropped.
pooled_pheno <- pooled_pheno %>%
    inner_join(pcs, by = c('IID' = '#IID'))

# Show the joined table's dimensions and first rows.
dim(pooled_pheno)
head(pooled_pheno)

In [None]:
# Count the rows per cohort remaining after the join with the PCs.
table(pooled_pheno$cohort)

## Normalize lipids values

In [None]:
# Regress each lipid measure on sex at birth, age, age squared, and PC1-PC10,
# and store the residuals in a `<lipid>_resid` column.
#
# The covariate list is defined once so that all four models are guaranteed to
# use the same adjustment. na.action = na.exclude makes resid() return NA for
# rows that were left out of the fit, so each result has one value per row of
# pooled_pheno.
covariate_terms <- c('sex_at_birth', 'age', 'age2', paste0('PC', 1:10))
for (lipid in c('TC_adjusted', 'LDL_adjusted', 'HDL', 'TG_adjusted')) {
    pooled_pheno[[paste0(lipid, '_resid')]] <- resid(lm(
        reformulate(covariate_terms, response = lipid),
        data = pooled_pheno,
        na.action = na.exclude
    ))
}

In [None]:
# For each lipid, convert the residuals to rank-based normal scores and
# rescale them to the standard deviation of the residuals. The result of
# scale() is a one-column matrix, which is what gets stored in `<lipid>_norm`.
for (lipid in c('TC_adjusted', 'LDL_adjusted', 'HDL', 'TG_adjusted')) {
    lipid_resid <- pooled_pheno[[paste0(lipid, '_resid')]]
    # Ranks of the non-missing residuals mapped into (0, 1); NAs stay NA.
    rank_quantiles <- (rank(lipid_resid, na.last = 'keep') - 0.5) / sum(!is.na(lipid_resid))
    pooled_pheno[[paste0(lipid, '_norm')]] <-
        sd(lipid_resid, na.rm = TRUE) * scale(qnorm(rank_quantiles))
}

### Check that NAs were handled correctly

In [None]:
# Inspect the LDL and TG columns for rows where LDL_adjusted is present.
pooled_pheno %>%
    filter(!is.na(LDL_adjusted)) %>%
    select(starts_with('LDL'), starts_with('TG')) %>%
    head()

In [None]:
# Inspect the LDL and TG columns for rows where LDL_adjusted is missing.
pooled_pheno %>%
    filter(is.na(LDL_adjusted)) %>%
    select(starts_with('LDL'), starts_with('TG')) %>%
    head()

### Convert matrix columns to vectors

In [None]:
# Show the first rows of every column whose name contains 'norm'.
pooled_pheno %>%
    select(contains('norm')) %>%
    head()

In [None]:
# Inspect the stored normalized column as-is: its class, dimensions, and length.
class(pooled_pheno$TC_adjusted_norm)
dim(pooled_pheno$TC_adjusted_norm)
length(pooled_pheno$TC_adjusted_norm)

In [None]:
# Inspect the same column after extracting its first (only) matrix column.
class(pooled_pheno$TC_adjusted_norm[, 1])
dim(pooled_pheno$TC_adjusted_norm[, 1])
length(pooled_pheno$TC_adjusted_norm[, 1])

In [None]:
# Replace each normalized matrix column with its first (only) column so that
# the table holds plain vectors.
pooled_pheno <- mutate(
    pooled_pheno,
    TC_adjusted_norm = TC_adjusted_norm[, 1],
    LDL_adjusted_norm = LDL_adjusted_norm[, 1],
    HDL_norm = HDL_norm[, 1],
    TG_adjusted_norm = TG_adjusted_norm[, 1]
)

# Show the first rows of the final table.
head(pooled_pheno)

# Write phenotypes to workspace bucket

In [None]:
# Count the rows per cohort in the table about to be written.
table(pooled_pheno$cohort)

In [None]:
# Write a dataframe to a local file and copy that file to the bucket folder
# named by the notebook-level DESTINATION constant.
#
# Args:
#   my_dataframe: The dataframe to write.
#   destination_filename: Name of the file to create in the current working
#     directory. It is written tab-separated when the name ends in 'tsv' and
#     comma-separated otherwise.
#
# Returns:
#   The output of `gsutil ls` on DESTINATION (a character vector), so the
#   caller can confirm that the file is present.
#
# Stops with an error if `gsutil cp` reports a non-zero exit status.
write_phenotype_to_bucket <- function(my_dataframe, destination_filename) {
    message(str_glue('Phenotype files will be written to folder {DESTINATION}.'))

    # Store the dataframe in current workspace.
    if (str_ends(destination_filename, 'tsv')) {
        write_tsv(my_dataframe, destination_filename)
    } else {
        write_csv(my_dataframe, destination_filename)
    }

    # Copy the file from current workspace to the bucket. The source must be
    # './<filename>' with no space; a space would pass the current directory
    # to gsutil as a separate source argument.
    cp_output <- system(
        str_glue('gsutil cp -v ./{destination_filename} {DESTINATION}'),
        intern = TRUE
    )
    # With intern = TRUE, a non-zero exit status is reported via the 'status'
    # attribute of the captured output rather than as an error.
    cp_status <- attr(cp_output, 'status')
    if (!is.null(cp_status) && cp_status != 0) {
        stop(
            'gsutil cp failed with exit status ', cp_status,
            ' while copying ', destination_filename, ' to ', DESTINATION,
            call. = FALSE
        )
    }

    # Check if file is in the bucket.
    system(str_glue('gsutil ls {DESTINATION}'), intern = TRUE)
}

In [None]:
# Write the pooled phenotype table to FILENAME and copy it to the DESTINATION folder.
write_phenotype_to_bucket(pooled_pheno, FILENAME)

# TODO: compare this to the prior pooled phenotype

# Provenance

In [None]:
# Record the R session details (via devtools) for reproducibility of this run.
devtools::session_info()