# Pooled lipids phenotypes and covariates

In this notebook we review and explore the pooled All of Us and UK Biobank data for lipids phenotypes and covariates.

# Setup

<div class="alert alert-block alert-warning">
    <b>Cloud Environment</b>: This notebook was written for use on All of Us Workbench. It runs fine on the default Cloud Environment. 
</div>

In [None]:
# Install any required package that is not already available in this environment.
# `installed.packages()` returns a character matrix, so compare against its row names
# (the package names) rather than against every cell of the matrix.
required_pkgs <- c('hrbrthemes', 'skimr', 'tidyverse', 'qqman')
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
    install.packages(missing_pkgs)
}

In [None]:
library(hrbrthemes)
library(qqman)
library(skimr)
library(tidyverse)

In [None]:
# Set some visualization defaults.
theme_set(theme_ipsum(base_size = 16)) # Default theme for plots.

#' Returns a data frame with a y position and a label, for use annotating ggplot boxplots.
#'
#' Intended to be passed as the `fun.data` argument of `stat_summary()`.
#'
#' @param df The y values of one boxplot group. Despite the name, this is used as a
#'   vector (`max()` and `length()` are applied to it directly).
#' @return A one-row data frame with column `y` set to the maximum of `df` and
#'   column `label` set to 'N = ' followed by the number of values in `df`.
get_boxplot_fun_data <- function(df) {
  data.frame(y = max(df), label = paste0('N = ', length(df)))
}

# Constants

In [None]:
# Cloud Storage paths of the per-cohort phenotype CSV files read in the next section.
AOU_PHENO <- 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/AOU_Full_Data.csv'
UKB_PHENO <- 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/UKB_Full_Data.csv'

In [None]:
# Cloud Storage paths of the regenie result files to plot (one file per lipid trait).
REGENIE_RESULTS <- c(
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/regenie/20210816/aou_alpha1_ukb_lipids_regenie_part2_HDLnorm.regenie',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/regenie/20210816/aou_alpha1_ukb_lipids_regenie_part2_LDLnorm.regenie',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/regenie/20210816/aou_alpha1_ukb_lipids_regenie_part2_TCnorm.regenie',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/regenie/20210816/aou_alpha1_ukb_lipids_regenie_part2_TGnorm.regenie'
# Result files from the earlier 20210803 directory, disabled but kept for reference.
#    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/regenie/20210803/aou_alpha1_ukb_lipids_HDL_norm.regenie',
#    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/regenie/20210803/aou_alpha1_ukb_lipids_LDL_norm.regenie',
#    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/regenie/20210803/aou_alpha1_ukb_lipids_TC_norm.regenie',
#    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/regenie/20210803/aou_alpha1_ukb_lipids_TG_norm.regenie'
    )

In [None]:
# Data-source attribution text attached to the pooled-data and regenie plots.
PLOT_SUBTITLE <- 'Source: All of Us v5 and UK Biobank data'

# Join the phenotypes

In [None]:
# Stream the AoU phenotype CSV out of Cloud Storage with gsutil and parse it.
aou_cat_cmd <- str_glue('gsutil cat {AOU_PHENO}')
aou_pheno <- read_csv(pipe(aou_cat_cmd))

# Show the table dimensions and the column specification readr used.
dim(aou_pheno)
spec(aou_pheno)

In [None]:
# Number of AoU rows per value of the input cohort field.
count(aou_pheno, cohort, name = 'count')

## Possible problem for AoU: looks like we may be using gender identity instead of sex at birth

In [None]:
# Number of AoU rows per value of the sex field.
count(aou_pheno, sex, name = 'count')

In [None]:
# Number of AoU rows per value of the statin_use field.
count(aou_pheno, statin_use, name = 'count')

In [None]:
# Stream the UKB phenotype CSV out of Cloud Storage with gsutil and parse it.
ukb_cat_cmd <- str_glue('gsutil cat {UKB_PHENO}')
ukb_pheno <- read_csv(pipe(ukb_cat_cmd))

# Show the table dimensions and the column specification readr used.
dim(ukb_pheno)
spec(ukb_pheno)

In [None]:
# Number of UKB rows per value of the input cohort field.
count(ukb_pheno, cohort, name = 'count')

In [None]:
# Number of UKB rows per value of the numeric sex coding.
count(ukb_pheno, Sex_numeric, name = 'count')

In [None]:
# Number of UKB rows per value of the statin0 field.
count(ukb_pheno, statin0, name = 'count')

## Modify the input dataframes so that they can be combined.

In [None]:
# Stack the AoU and UKB phenotypes into one data frame with a common set of column
# names, then add an age bucket (15-year-wide intervals) for use in the plots.
pheno <- bind_rows(
    aou_pheno %>%
        mutate(
            # There were some NAs in the input cohort field for people in the alpha2 release. Retain those rows.
            cohort = 'AoU',
            # Consolidate the values in this field.
            # Anything that is not exactly 'Female' or 'Male', including NA, becomes 'other'.
            sex = case_when(
                sex == 'Female' ~ 'Female',
                sex == 'Male' ~ 'Male',
                TRUE ~ 'other'
            )
        ) %>%
        select(id = person_id, cohort, age, sex, statin_use,
               TC, TG, LDL, HDL, TCadjusted, TGadjusted, LDLadjusted),
    ukb_pheno %>%
        # There were some NAs in the input cohort field for people without WES. Drop those rows.
        filter(cohort == 'ukb') %>%
        mutate(
            # Upper case the cohort.
            cohort = 'UKB',
            # Convert numeric coding to strings per https://biobank.ndph.ox.ac.uk/showcase/coding.cgi?id=9.
            sex = case_when(
                Sex_numeric == 0 ~ 'Female',
                Sex_numeric == 1 ~ 'Male',
                TRUE ~ 'other'
            ),
            # Convert numeric coding to boolean.
            # NOTE(review): every value other than 1, including NA, becomes FALSE here;
            # confirm that a missing statin0 should be treated as "no statin use".
            statin_use = case_when(
                statin0 == 1 ~ TRUE,
                TRUE ~ FALSE
            )
        ) %>%
        # Rename the UKB columns to the names used by the AoU data.
        select(id = eid, cohort, age, sex, statin_use,
               TC = chol, TG = trig, LDL = ldl, HDL = hdl,
               TCadjusted = choladj, TGadjusted = trigadj, LDLadjusted = ldladj
              )
    ) %>%
    mutate(
        age_group = cut_width(age, width = 15, boundary = 0)
    )

In [None]:
# Number of pooled rows per cohort.
count(pheno, cohort, name = 'count')

In [None]:
# Number of pooled rows per consolidated sex value.
count(pheno, sex, name = 'count')

In [None]:
# Number of pooled rows per statin_use value.
count(pheno, statin_use, name = 'count')

# Plot lipids

In [None]:
#' Draws boxplots of one measurement, grouped along the x axis and split by fill colour.
#'
#' Rows where `yvar` is NA are dropped before plotting, and each box is annotated
#' with its number of values via `get_boxplot_fun_data`.
#'
#' @param data A data frame containing the columns named by `xvar`, `yvar` and `fillvar`.
#' @param xvar Name (string) of the column to place on the x axis.
#' @param yvar Name (string) of the column to plot on the y axis.
#' @param fillvar Name (string) of the column mapped to the fill aesthetic.
#' @param title_detail Optional text appended to the plot title.
#' @param log_scale If TRUE, the y axis uses a log10 scale.
#' @return A ggplot object.
plot_vars <- function(data, xvar, yvar, fillvar, title_detail = '', log_scale = FALSE) {
    xvar_sym <- sym(xvar)
    yvar_sym <- sym(yvar)
    fillvar_sym <- sym(fillvar)

    # Notebook display size. This is a session-wide option, so it stays in effect
    # for later plots until another cell changes it.
    options(repr.plot.width = 16, repr.plot.height = 8)

    p <- data %>%
        filter(!is.na(!!yvar_sym)) %>%
        ggplot(aes(x = !!xvar_sym, y = !!yvar_sym, fill = !!fillvar_sym)) +
        geom_boxplot() +
        stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 5,
                     position = position_dodge(width = 0.9), vjust = -0.8) +
        theme(
            axis.title.x = element_text(size = 14),
            axis.title.y = element_text(size = 14)
        ) +
        labs(title = str_glue('{yvar} mg/dL per person by {xvar} and {fillvar} {title_detail}'),
             caption = PLOT_SUBTITLE)

    if (log_scale) {
        p <- p + scale_y_log10()
    }

    p
}

## By age group

In [None]:
# One boxplot figure per lipid, by age group and cohort.
for (lipid_name in c('LDL', 'TC', 'HDL', 'TG')) {
    lipid_plot <- plot_vars(data = pheno, xvar = 'age_group', yvar = lipid_name, fillvar = 'cohort')
    print(lipid_plot)
}

In [None]:
# Filter high outliers from TG and plot again.
plot_vars(data = pheno %>% filter(TG < 750), xvar = 'age_group', yvar = 'TG', fillvar = 'cohort',
         title_detail = '[high outliers removed]')

In [None]:
plot_vars(data = pheno, xvar = 'age_group', yvar = 'TG', fillvar = 'cohort',
          log_scale = TRUE, title_detail = '[log scale]')

## By statin use

In [None]:
# One boxplot figure per lipid, by cohort and statin use.
for (lipid_name in c('LDL', 'TC', 'HDL', 'TG')) {
    lipid_plot <- plot_vars(data = pheno, xvar = 'cohort', yvar = lipid_name, fillvar = 'statin_use')
    print(lipid_plot)
}

In [None]:
# Filter high outliers from TG and plot again.
plot_vars(data = pheno %>% filter(TG < 750), xvar = 'cohort', yvar = 'TG', fillvar = 'statin_use',
          title_detail = '[high outliers removed]')

In [None]:
plot_vars(data = pheno, xvar = 'cohort', yvar = 'TG', fillvar = 'statin_use',
          log_scale = TRUE, title_detail = '[log scale]')

## By statin use and adjusted

In [None]:
# One boxplot figure per adjusted lipid value, by cohort and statin use.
for (lipid_name in c('LDLadjusted', 'TCadjusted', 'TGadjusted')) {
    lipid_plot <- plot_vars(data = pheno, xvar = 'cohort', yvar = lipid_name, fillvar = 'statin_use')
    print(lipid_plot)
}

In [None]:
# Filter high outliers from TGadjusted and plot again.
plot_vars(data = pheno %>% filter(TGadjusted < 750), xvar = 'cohort', yvar = 'TGadjusted', fillvar = 'statin_use',
          title_detail = '[high outliers removed]')

In [None]:
plot_vars(data = pheno, xvar = 'cohort', yvar = 'TGadjusted', fillvar = 'statin_use',
          log_scale = TRUE, title_detail = '[log scale]')

## By sex at birth

In [None]:
# One boxplot figure per lipid, by cohort and sex.
for (lipid_name in c('LDL', 'TC', 'HDL', 'TG')) {
    lipid_plot <- plot_vars(data = pheno, xvar = 'cohort', yvar = lipid_name, fillvar = 'sex')
    print(lipid_plot)
}

In [None]:
# Filter high outliers from TG and plot again.
plot_vars(data = pheno %>% filter(TG < 750), xvar = 'cohort', yvar = 'TG', fillvar = 'sex',
          title_detail = '[high outliers removed]')

In [None]:
plot_vars(data = pheno, xvar = 'cohort', yvar = 'TG', fillvar = 'sex',
          log_scale = TRUE, title_detail = '[log scale]')

# Plot regenie results

This code was inspired by [regenie.wdl](https://github.com/briansha/Regenie_WDL/blob/master/regenie.wdl#L515).

In [None]:
# For each regenie result file: load the association results, report the row/column
# counts, the genomic-control (GC) inflation factor and the per-test row counts, then
# draw a Manhattan plot and a QQ plot.
retval <- lapply(REGENIE_RESULTS, function(lipid_result) {
    regenie_results <- read_delim(pipe(str_glue('gsutil cat {lipid_result}')), delim = ' ')

    # LOG10P holds -log10(p), so convert it back to p-values before treating it as
    # a probability (qchisq() requires a value in [0, 1]).
    p_values <- 10 ^ (-1 * regenie_results$LOG10P)
    # GC lambda: the 1-df chi-square statistic at the median p-value divided by the
    # median of the 1-df chi-square distribution.
    gc_score <- qchisq(median(p_values, na.rm = TRUE), df = 1, lower.tail = FALSE) /
        qchisq(0.5, df = 1)

    message(str_glue('nrow: {nrow(regenie_results)} ncol: {ncol(regenie_results)} in {lipid_result}'))
    message(str_glue('GC: {gc_score}'))

    test_counts <- regenie_results %>%
        group_by(TEST) %>%
        summarize(count = n())
    # message() pastes a tibble's columns into one unreadable string, so render the
    # printed table first and emit that.
    message(paste(capture.output(print(test_counts)), collapse = '\n'))

    options(repr.plot.width = 14, repr.plot.height = 14)
    # logp = FALSE because the LOG10P column is already on the -log10 scale.
    # NOTE(review): annotatePval = 1E-5 is written as a raw p-value while the plotted
    # column is -log10(p); confirm how the installed qqman version interprets it.
    manhattan(regenie_results,
              chr = "CHROM",
              bp = "GENPOS",
              snp = "ID",
              p = "LOG10P",
              logp = FALSE,
              annotatePval = 1E-5,
              main = str_glue('{basename(lipid_result)} results\nfrom {dirname(lipid_result)}'),
              sub = PLOT_SUBTITLE
             )

    qq(p_values,
       main = str_glue('{basename(lipid_result)} results\nfrom {dirname(lipid_result)}\n GC: {gc_score}'),
       sub = PLOT_SUBTITLE)

})

# Appendix - plot AoU data

# Appendix - plot UKB data

## Connect to the data

In [None]:
# Google Cloud project passed as the billing project for the BigQuery sources below,
# read from the GOOGLE_PROJECT environment variable.
BILLING_PROJECT_ID <- Sys.getenv('GOOGLE_PROJECT')

# Names of the Haas phenotype tables read later in this appendix.
HAAS_PHENO_TABLE <- 'single_values_table_ukb9222_20210111'
HAAS_MULTI_INSTANCE_PHENO_TABLE <- 'instance_values_table_ukb9222_20210111'
HAAS_ARRAY_PHENO_TABLE <- 'array_values_table_ukb9222_20210111'

### Natarajan lipids data

In [None]:
# Create a 'virtual dataframe' backed by a BigQuery table.
natarajan_pheno_raw_tbl <- dplyr::tbl(
    bigrquery::src_bigquery(project = 'uk-biobank-sek-data',
                            dataset = 'raw_phenotypes',
                            billing = BILLING_PROJECT_ID),
    'lipids_pheno_raw')

head(colnames(natarajan_pheno_raw_tbl))

In [None]:
dim(natarajan_pheno_raw_tbl)

In [None]:
str_subset(colnames(natarajan_pheno_raw_tbl), '(?i)age|ldl|hdl|tc|statin|choles|trigly')

In [None]:
# Create a 'virtual dataframe' backed by a BigQuery table.
natarajan_pheno_qced_tbl <- dplyr::tbl(
    bigrquery::src_bigquery(project = 'uk-biobank-sek-data',
                            dataset = 'phenotypes',
                            billing = BILLING_PROJECT_ID),
    'lipids_pheno_qced')

head(colnames(natarajan_pheno_qced_tbl))

In [None]:
# Dimensions of the QC'ed phenotype table created in the previous cell.
dim(natarajan_pheno_qced_tbl)

In [None]:
# Create a 'virtual dataframe' backed by a BigQuery table.
natarajan_lipids_tbl <- dplyr::tbl(
    bigrquery::src_bigquery(project = 'uk-biobank-sek-data',
                            dataset = 'phenotypes',
                            billing = BILLING_PROJECT_ID),
    'lipids')

head(colnames(natarajan_lipids_tbl))

In [None]:
dim(natarajan_lipids_tbl)

In [None]:
str_subset(colnames(natarajan_lipids_tbl), '(?i)age|ldl|hdl|tc|statin|choles|trigly')

### Haas albuminuria data

In [None]:
# Create the BigQuery source for the 'pivoted_phenotypes' dataset; the individual
# Haas tables are attached to it in the following cells.
haas_pheno_dbcon <- bigrquery::src_bigquery(project = 'uk-biobank-sek-data',
                                 dataset = 'pivoted_phenotypes',
                                 billing = BILLING_PROJECT_ID)

In [None]:
haas_pheno_tbl <- dplyr::tbl(haas_pheno_dbcon, HAAS_PHENO_TABLE)

head(colnames(haas_pheno_tbl))

In [None]:
str_subset(colnames(haas_pheno_tbl), '(?i)birth|medication|statin|cholesterol')

In [None]:
haas_instanced_pheno_tbl <- dplyr::tbl(haas_pheno_dbcon, HAAS_MULTI_INSTANCE_PHENO_TABLE)

head(colnames(haas_instanced_pheno_tbl))

In [None]:
str_subset(colnames(haas_instanced_pheno_tbl), '(?i)f_21003_|medication|statin|cholesterol')

In [None]:
haas_arrayed_pheno_tbl <- dplyr::tbl(haas_pheno_dbcon, HAAS_ARRAY_PHENO_TABLE)

head(colnames(haas_arrayed_pheno_tbl))

In [None]:
str_subset(colnames(haas_arrayed_pheno_tbl), '(?i)medication|statin|cholesterol|f_20003_')

## Retrieve the data

In [None]:
colnames(natarajan_lipids_tbl)

In [None]:
# Pull the lipid measurements and the instance-0 age at assessment into memory and
# combine them into one row per participant (`eid`).
phenotypes <- natarajan_lipids_tbl %>%
    select(eid, ldl, hdl, trig, chol) %>%
    collect() %>%    # <--- the collect() operation is what transfers the data from BigQuery to memory
    inner_join(  # <--- do an inner join so that we drop the withdrawn samples from the Natarajan data
        haas_instanced_pheno_tbl %>%
            select(eid, instanceId, f_21003_Age_when_attended_assessment_centre_years) %>%
            filter(instanceId == 0) %>%
            collect(),  # <--- the collect() operation is what transfers the data from BigQuery to memory
        # State the join key explicitly instead of relying on the implicit natural join.
        by = 'eid'
    )

dim(phenotypes)

In [None]:
head(phenotypes)

In [None]:
summary(phenotypes$hdl)

## Pivot and plot the data 

In [None]:
assay <- phenotypes %>%
    pivot_longer(
        cols = c(ldl, hdl, trig, chol),
        names_to = 'measurement',
        values_to = 'mg_dl')

In [None]:
# Check the result of the pivot: four measurement columns were stacked, so the long
# table should have exactly four rows for every row of `phenotypes`.
(dim(assay))
(nrow(phenotypes) * 4)
stopifnot(nrow(assay) == nrow(phenotypes) * 4)

In [None]:
head(assay)

In [None]:
assay %>%
    group_by(measurement) %>%
    summarize(
        count = n(),
        missing = sum(is.na(mg_dl)),
        median = median(mg_dl, na.rm = TRUE),
        mean = mean(mg_dl, na.rm = TRUE),
        stddev = sd(mg_dl, na.rm = TRUE)
    )

In [None]:
options(repr.plot.height = 18, repr.plot.width = 16)

assay %>%
    ggplot(aes(x = cut_width(f_21003_Age_when_attended_assessment_centre_years, width = 15, boundary = 0), y = mg_dl)) +
    geom_boxplot() +
    stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 4,
                 position = position_dodge(width = 0.9), vjust = -0.8) +
#    scale_y_log10() +  # Uncomment if the data looks skewed.
    coord_flip() +
    facet_wrap(~ measurement, nrow = length(unique(assay$measurement)), scales = 'free_x') +
    xlab('age') +
    labs(title = str_glue('Instance 0 measurement per person, by age'),
         caption = 'Source: UK Biobank data')

# Provenance 

In [None]:
devtools::session_info()