# Pooled lipids phenotypes and covariates

In this notebook we review and explore the pooled All of Us and UK Biobank data for lipids phenotypes and covariates.

# Setup

<div class="alert alert-block alert-warning">
    <b>Cloud Environment</b>: This notebook was written for use on All of Us Workbench. It runs fine on the default Cloud Environment. 
</div>

In [None]:
# Install whichever of the notebook's packages are not installed yet.
# installed.packages() returns a character matrix with one row per package and
# many columns (Version, Depends, LibPath, ...). Testing `pkg %in%` the whole
# matrix compares against every cell, so match on the row names (the package
# names) only.
for (pkg in setdiff(c('hrbrthemes', 'skimr', 'tidyverse', 'qqman'),
                    rownames(installed.packages()))) {
    install.packages(pkg)
}

In [None]:
library(hrbrthemes)
library(qqman)
library(skimr)
library(tidyverse)

In [None]:
# Set some visualization defaults.
theme_set(theme_ipsum(base_size = 16)) # Default theme for plots.

#' Builds the position and text of the sample-size label drawn above a boxplot.
#'
#' Intended for use as `stat_summary(fun.data = get_boxplot_fun_data, geom = 'text')`.
#'
#' @param df The values belonging to one box. Despite the name, this is used as
#'   a plain vector (only `max()` and `length()` are applied to it), which is
#'   what `stat_summary()` passes to a `fun.data` function.
#' @return A one-row data frame with column `y` set to the maximum of `df` and
#'   column `label` set to the text 'N = <number of values in df>'.
get_boxplot_fun_data <- function(df) {
  data.frame(y = max(df), label = paste0('N = ', length(df)))
}

# Constants

In [None]:
# Cloud Storage paths of the per-cohort phenotype CSVs (All of Us and UK Biobank).
# Each is streamed with `gsutil cat` and parsed with read_csv() in the "Join the phenotypes" section.
AOU_PHENO <- 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/AOU_Full_Data.csv'
UKB_PHENO <- 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/UKB_Full_Data.csv'

In [None]:
# Merged GWAS phenotype CSVs, one file per lipid (HDL, LDL, TC, TG).
GWAS_PHENO <- c(
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/MergedData_HDL_Iteration2_ForGWAS.csv',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/MergedData_LDL_Iteration2_ForGWAS.csv',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/MergedData_TC_Iteration2_ForGWAS.csv',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/MergedData_TG_Iteration2_ForGWAS.csv'
)

# Tab-separated phenotype and covariate table in the format given to regenie.
REGENIE_PHENO <- 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/regenie/20210921/aou_alpha2_ukb_lipids_phenotypes_and_covariates.tsv'

# regenie result files, one per normalized lipid phenotype (HDLnorm, LDLnorm, TCnorm, TGnorm).
# They are read below as space-delimited text.
REGENIE_RESULTS <- c(
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/regenie/20210921/aou_alpha2_ukb_lipids_regenie_part2_HDLnorm.regenie',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/regenie/20210921/aou_alpha2_ukb_lipids_regenie_part2_LDLnorm.regenie',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/regenie/20210921/aou_alpha2_ukb_lipids_regenie_part2_TCnorm.regenie',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/regenie/20210921/aou_alpha2_ukb_lipids_regenie_part2_TGnorm.regenie'
)

In [None]:
# List of variant IDs, one per line, used below to keep only LD-pruned variants
# (presumably the `.prune.in` output of a plink LD-pruning run - confirm).
LD_PRUNED_VARIANTS <- 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/plink/20210907/aou_alpha2_ukb_lipids_plink_ld.prune.in'

In [None]:
# Data-source note added to every figure: used as the `caption` of the ggplot
# figures and as the `sub` text of the qqman Manhattan and QQ plots.
PLOT_SUBTITLE <- 'Source: All of Us v5 and UK Biobank data'

# Join the phenotypes

In [None]:
# Stream the All of Us phenotype CSV out of Cloud Storage and parse it.
aou_pheno <- str_glue('gsutil cat {AOU_PHENO}') %>%
    pipe() %>%
    read_csv()

# Show the table dimensions and the column specification readr used.
dim(aou_pheno)
spec(aou_pheno)

## Possible problem for AoU: why do some participants have an NA Cohort?

In [None]:
# Number of All of Us rows per Cohort value.
count(aou_pheno, Cohort, name = 'count')

In [None]:
# Number of All of Us rows per sex value.
count(aou_pheno, sex, name = 'count')

In [None]:
# Number of All of Us rows per statin_use value.
count(aou_pheno, statin_use, name = 'count')

In [None]:
# Stream the UK Biobank phenotype CSV out of Cloud Storage and parse it.
ukb_pheno <- str_glue('gsutil cat {UKB_PHENO}') %>%
    pipe() %>%
    read_csv()

# Show the table dimensions and the column specification readr used.
dim(ukb_pheno)
spec(ukb_pheno)

## Possible problem for UKB: why do some participants have an NA Cohort?

Are the NA participants those without whole exome sequencing (WES) data?

In [None]:
# Number of UK Biobank rows per Cohort value.
count(ukb_pheno, Cohort, name = 'count')

In [None]:
# Number of UK Biobank rows per numeric sex code.
count(ukb_pheno, Sex_numeric, name = 'count')

In [None]:
# Number of UK Biobank rows per statin0 value.
count(ukb_pheno, statin0, name = 'count')

## Modify the input dataframes so that they can be combined.

In [None]:
# Reduce each cohort to the same set of columns, stack the two tables, then add age bins.
pheno <- bind_rows(
    # All of Us: column names already match the target names, apart from the id.
    aou_pheno %>%
        transmute(
            id = person_id,
            # There were some NAs in the input cohort field for people in the alpha2 release.
            # Retain those rows by labelling every row as AoU.
            cohort = 'AoU',
            age,
            # Consolidate the values in this field.
            sex = case_when(
                sex == 'Female' ~ 'Female',
                sex == 'Male' ~ 'Male',
                TRUE ~ 'not specified'
            ),
            statin_use,
            TC, TG, LDL, HDL,
            TCadjusted, TGadjusted, LDLadjusted
        ),
    # UK Biobank: recode sex and statin use, and rename the lipid columns.
    ukb_pheno %>%
        # There were some NAs in the input cohort field for people without WES. Drop those rows.
        filter(Cohort == 'UKB') %>%
        transmute(
            id = eid,
            cohort = Cohort,
            age,
            # Convert numeric coding to strings per https://biobank.ndph.ox.ac.uk/showcase/coding.cgi?id=9.
            sex = case_when(
                Sex_numeric == 0 ~ 'Female',
                Sex_numeric == 1 ~ 'Male',
                TRUE ~ 'not specified'
            ),
            # Convert numeric coding to boolean; anything other than 1 (including NA) becomes FALSE.
            statin_use = coalesce(statin0 == 1, FALSE),
            TC = chol,
            TG = trig,
            LDL = ldl,
            HDL = hdl,
            TCadjusted = choladj,
            TGadjusted = TG_adjusted_log,
            LDLadjusted = ldladj
        )
    ) %>%
    mutate(
        # 10-year and 20-year wide age bins, with bin edges aligned to 0.
        age_group_smaller_bins = cut_width(age, width = 10, boundary = 0),
        age_group = cut_width(age, width = 20, boundary = 0)
    )

In [None]:
# Number of pooled rows per cohort.
count(pheno, cohort, name = 'count')

In [None]:
# Number of pooled rows per consolidated sex value.
count(pheno, sex, name = 'count')

In [None]:
# Number of pooled rows per statin_use value.
count(pheno, statin_use, name = 'count')

# Plot lipids

In [None]:
#' Draws boxplots of one measurement, grouped along the x axis and by fill colour.
#'
#' Rows where `yvar` is NA are dropped before plotting. Each box is labelled
#' with its number of observations via `get_boxplot_fun_data()`.
#'
#' @param data A data frame holding the columns named by `xvar`, `yvar` and `fillvar`.
#' @param xvar Name (string) of the column mapped to the x axis.
#' @param yvar Name (string) of the column mapped to the y axis.
#' @param fillvar Name (string) of the column mapped to the fill aesthetic.
#' @param title_detail Optional text appended to the plot title.
#' @param log_scale If TRUE, the y axis uses a log10 scale.
#' @param plot_width,plot_height Values stored in the `repr.plot.width` and
#'   `repr.plot.height` options. The defaults are the sizes that were
#'   previously hard-coded.
#' @return The ggplot object; the caller prints it.
plot_vars <- function(data, xvar, yvar, fillvar, title_detail = '', log_scale = FALSE,
                      plot_width = 16, plot_height = 8) {
    # Fail early, naming the column, rather than part-way through plot construction.
    missing_cols <- setdiff(c(xvar, yvar, fillvar), colnames(data))
    if (length(missing_cols) > 0) {
        stop('Column(s) not found in data: ', paste(missing_cols, collapse = ', '), call. = FALSE)
    }

    xvar_sym <- sym(xvar)
    yvar_sym <- sym(yvar)
    fillvar_sym <- sym(fillvar)

    # Deliberately not restored on exit: the plot is printed by the caller after
    # this function has returned, so the size must still be in effect then.
    options(repr.plot.width = plot_width, repr.plot.height = plot_height)

    p <- data %>%
        filter(!is.na(!!yvar_sym)) %>%
        ggplot(aes(x = !!xvar_sym, y = !!yvar_sym, fill = !!fillvar_sym)) +
        geom_boxplot() +
        stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 5,
                     position = position_dodge(width = 0.9), vjust = -0.8) +
        theme(
            axis.title.x = element_text(size = 14),
            axis.title.y = element_text(size = 14)
        ) +
        # NOTE: the title always says mg/dL; callers plotting other units add a
        # clarification through `title_detail`.
        labs(title = str_glue('{yvar} mg/dL per person by {xvar} and {fillvar} {title_detail}'),
             caption = PLOT_SUBTITLE)

    if (log_scale) {
        p <- p + scale_y_log10()
    }

    p
}

## By age group - bin size 10 [not okay to share, groups too small]

In [None]:
# One boxplot figure per lipid, by 10-year age bin and cohort.
for (lipid in c('LDL', 'TC', 'HDL', 'TG')) {
    pheno %>%
        plot_vars(xvar = 'age_group_smaller_bins', yvar = lipid, fillvar = 'cohort') %>%
        print()
}

## By age group - larger bins [okay to share]

In [None]:
# One boxplot figure per lipid, by 20-year age bin and cohort.
for (lipid in c('LDL', 'TC', 'HDL', 'TG')) {
    pheno %>%
        plot_vars(xvar = 'age_group', yvar = lipid, fillvar = 'cohort') %>%
        print()
}

In [None]:
# Replot TG by age group using only the rows with TG below 750.
pheno %>%
    filter(TG < 750) %>%
    plot_vars(xvar = 'age_group', yvar = 'TG', fillvar = 'cohort',
              title_detail = '[high outliers removed]')

In [None]:
# TG by age group and cohort, with all rows kept and a log10 y axis.
pheno %>%
    plot_vars(xvar = 'age_group', yvar = 'TG', fillvar = 'cohort',
              title_detail = '[log scale y-axis]', log_scale = TRUE)

## By statin use

In [None]:
# One boxplot figure per lipid, by cohort and statin use.
for (lipid in c('LDL', 'TC', 'HDL', 'TG')) {
    pheno %>%
        plot_vars(xvar = 'cohort', yvar = lipid, fillvar = 'statin_use') %>%
        print()
}

In [None]:
# Replot TG by cohort and statin use using only the rows with TG below 750.
pheno %>%
    filter(TG < 750) %>%
    plot_vars(xvar = 'cohort', yvar = 'TG', fillvar = 'statin_use',
              title_detail = '[high outliers removed]')

In [None]:
# TG by cohort and statin use, with all rows kept and a log10 y axis.
pheno %>%
    plot_vars(xvar = 'cohort', yvar = 'TG', fillvar = 'statin_use',
              title_detail = '[log scale y-axis]', log_scale = TRUE)

## By statin use and adjusted

In [None]:
# One boxplot figure per adjusted lipid column, by cohort and statin use.
for (lipid in c('LDLadjusted', 'TCadjusted', 'TGadjusted')) {
    pheno %>%
        plot_vars(xvar = 'cohort', yvar = lipid, fillvar = 'statin_use') %>%
        print()
}

In [None]:
# Redraw the TGadjusted figure with a title note saying the values are in log space.
pheno %>%
    plot_vars(xvar = 'cohort', yvar = 'TGadjusted', fillvar = 'statin_use',
              title_detail = '[adjusted data is in log space]', log_scale = FALSE)

## By sex at birth

In [None]:
# One boxplot figure per lipid, by cohort and sex.
for (lipid in c('LDL', 'TC', 'HDL', 'TG')) {
    pheno %>%
        plot_vars(xvar = 'cohort', yvar = lipid, fillvar = 'sex') %>%
        print()
}

In [None]:
# Replot TG by cohort and sex using only the rows with TG below 750.
pheno %>%
    filter(TG < 750) %>%
    plot_vars(xvar = 'cohort', yvar = 'TG', fillvar = 'sex',
              title_detail = '[high outliers removed]')

In [None]:
# TG by cohort and sex, with all rows kept and a log10 y axis.
pheno %>%
    plot_vars(xvar = 'cohort', yvar = 'TG', fillvar = 'sex',
              title_detail = '[log scale y-axis]', log_scale = TRUE)

# Plot GWAS phenotypes

<div class="alert alert-block alert-success">
    See notebooks <kbd>AOU_UKB_phenotypes.ipynb</kbd> and <kbd>AOU_UKB_phenotype_refined.ipynb</kbd> for the code that wrangles these phenotypes and covariates.
</div>

In [None]:
#' Draws a 30-bin histogram of one column, with one panel per value of a facet column.
#'
#' Rows where `xvar` is NA are dropped before plotting. Panels are laid out in
#' two columns and each panel has its own axis ranges (`scales = 'free'`).
#'
#' @param data A data frame holding the columns named by `xvar` and `facetvar`.
#' @param xvar Name (string) of the column to histogram.
#' @param facetvar Name (string) of the column that defines the panels.
#' @param title_detail Optional text appended to the plot title.
#' @param log_scale If TRUE, the y axis (the counts) uses a log10 scale.
#' @param plot_width,plot_height Values stored in the `repr.plot.width` and
#'   `repr.plot.height` options. The defaults are the sizes that were
#'   previously hard-coded; pass other values to change the figure size, since
#'   an `options()` call made before calling this function is overwritten here.
#' @return The ggplot object; the caller prints it.
plot_var_histograms <- function(data, xvar, facetvar, title_detail = '', log_scale = FALSE,
                                plot_width = 16, plot_height = 8) {
    # Fail early, naming the column, rather than part-way through plot construction.
    missing_cols <- setdiff(c(xvar, facetvar), colnames(data))
    if (length(missing_cols) > 0) {
        stop('Column(s) not found in data: ', paste(missing_cols, collapse = ', '), call. = FALSE)
    }

    xvar_sym <- sym(xvar)
    facetvar_sym <- sym(facetvar)

    # Deliberately not restored on exit: the plot is printed by the caller after
    # this function has returned, so the size must still be in effect then.
    options(repr.plot.width = plot_width, repr.plot.height = plot_height)

    p <- data %>%
        filter(!is.na(!!xvar_sym)) %>%
        ggplot(aes(x = !!xvar_sym)) +
        geom_histogram(bins = 30) +
        facet_wrap(vars(!!facetvar_sym), ncol = 2, scales = 'free') +
        theme(
            axis.title.x = element_text(size = 14),
            axis.title.y = element_text(size = 14)
        ) +
        labs(title = str_glue('{xvar} {title_detail}'),
             caption = PLOT_SUBTITLE)

    if (log_scale) {
        p <- p + scale_y_log10()
    }

    p
}

In [None]:
# For each per-lipid GWAS phenotype file, draw four histograms faceted by cohort:
# the first column whose name contains 'raw', then 'adj', then 'resid', then 'norm'.
retval <- lapply(GWAS_PHENO, function(gwas_pheno) {
    gwas_pheno_data <- read_csv(pipe(str_glue('gsutil cat {gwas_pheno}')), col_types = cols())
    column_names <- colnames(gwas_pheno_data)

    drawn <- lapply(c('raw', 'adj', 'resid', 'norm'), function(name_fragment) {
        # Several columns may contain the fragment; the first one is plotted.
        matching_columns <- str_subset(column_names, name_fragment)
        print(plot_var_histograms(data = gwas_pheno_data,
                                  xvar = matching_columns[[1]],
                                  facetvar = 'cohort'))
    })

    # The value for this file is that of the final print() call (the 'norm' plot).
    drawn[[length(drawn)]]
})

# Plot final regenie phenotypes

<div class="alert alert-block alert-success">
    See notebook <kbd>regenie_gwas.ipynb</kbd> for the code that transforms the GWAS phenotypes to regenie's specific format. 
</div>

In [None]:
# Stream the regenie phenotype/covariate TSV out of Cloud Storage and parse it.
regenie_pheno <- str_glue('gsutil cat {REGENIE_PHENO}') %>%
    pipe() %>%
    read_tsv()

# Show the table dimensions and the column names.
dim(regenie_pheno)
colnames(regenie_pheno)

In [None]:
# Number of regenie phenotype rows per cohort.
count(regenie_pheno, cohort, name = 'count')

In [None]:
# Histogram of each normalized lipid phenotype, one panel per cohort.
# NOTE(review): plot_var_histograms() sets repr.plot.width/height itself on
# every call, so the 8x8 size requested on the next line appears to be
# overridden before the first plot is drawn - confirm the intended size.
options(repr.plot.width = 8, repr.plot.height = 8)
for (lipid in c('HDLnorm', 'LDLnorm', 'TCnorm', 'TGnorm')) {
    print(plot_var_histograms(data = regenie_pheno, xvar = lipid, facetvar = 'cohort'))
}

# Plot regenie results

This code was inspired by [regenie.wdl](https://github.com/briansha/Regenie_WDL/blob/master/regenie.wdl#L515).

## GC fix

In [None]:
# Load the first results file (HDLnorm) to inspect the range of its values.
regenie_results <- str_glue('gsutil cat {REGENIE_RESULTS[[1]]}') %>%
    pipe() %>%
    read_delim(delim = ' ')

# LOG10P is treated as -log10(p); undo the transform to get the p-value.
regenie_results[['p_value']] <- 10 ^ -regenie_results[['LOG10P']]

In [None]:
# Smallest p-value and smallest LOG10P in the first results file.
min(regenie_results[['p_value']])
min(regenie_results[['LOG10P']])

In [None]:
# Largest p-value and largest LOG10P in the first results file.
max(regenie_results[['p_value']])
max(regenie_results[['LOG10P']])

In [None]:
# For every regenie results file: estimate genomic-control (GC) inflation from
# all variants, log a short summary, then draw a Manhattan plot and a QQ plot.
retval <- lapply(REGENIE_RESULTS, function(lipid_result) {
    regenie_results <- read_delim(pipe(str_glue('gsutil cat {lipid_result}')), delim = ' ')

    # LOG10P is treated as -log10(p); undo the transform because qqman is given p-values.
    regenie_results$p_value <- 10 ^ (-1 * regenie_results$LOG10P)

    # Two GC estimates, reported side by side for comparison:
    #   A: 1-df chi-square quantile of the median p-value, over the rounded constant 0.456.
    #   B: median 1-df chi-square statistic, over the exact null median qchisq(0.5, 1).
    gc_score_A <- qchisq(median(regenie_results$p_value), 1, lower.tail=FALSE) / 0.456
    gc_score_B <- median(qchisq(1 - regenie_results$p_value, 1)) / qchisq(0.5, 1)

    message(str_glue('nrow: {nrow(regenie_results)} ncol: {ncol(regenie_results)} in {lipid_result}'))
    message(str_glue('GC: {gc_score_A} {gc_score_B}'))

    # Number of results per TEST value. message() coerces each argument with
    # as.character() and pastes the pieces together, which turns a tibble into
    # an unreadable run of text, so render the table to lines of text first.
    test_counts <- regenie_results %>%
        group_by(TEST) %>%
        summarize(count = n())
    message(paste(capture.output(print(test_counts)), collapse = '\n'))

    options(repr.plot.width = 10, repr.plot.height = 10)
    manhattan(regenie_results,
              chr="CHROM",
              bp="GENPOS",
              snp="ID",
              p="p_value",
              logp=TRUE,
              annotateTop = FALSE,
              # Omit the high outliers to make the plot more readable.
              # TODO(deflaux) log scale the y axis instead.
              ylim = c(0, 200),
              cex = 1.25,
              cex.axis = 1.25,
              cex.lab = 1.25,
              main = str_glue('{basename(lipid_result)} results\nfrom {dirname(lipid_result)}'),
              sub = PLOT_SUBTITLE
             )

    qq(regenie_results$p_value,
       cex = 1.25,
       cex.axis = 1.25,
       cex.lab = 1.25,
       main = str_glue('{basename(lipid_result)} results\nfrom {dirname(lipid_result)}\n GC: {gc_score_A} {gc_score_B}'),
       sub = PLOT_SUBTITLE)

})

## GC fix and filter to common variants

In [None]:
# Reload the first results file (HDLnorm) to inspect its allele frequencies.
regenie_results <- str_glue('gsutil cat {REGENIE_RESULTS[[1]]}') %>%
    pipe() %>%
    read_delim(delim = ' ')

In [None]:
# Smallest A1FREQ value in the first results file.
min(regenie_results[['A1FREQ']])

In [None]:
# Largest A1FREQ value in the first results file.
max(regenie_results[['A1FREQ']])

In [None]:
# For every regenie results file: keep the common variants, estimate
# genomic-control (GC) inflation from them, log a short summary, then draw a
# Manhattan plot and a QQ plot of the common variants.
retval <- lapply(REGENIE_RESULTS, function(lipid_result) {
    regenie_results <- read_delim(pipe(str_glue('gsutil cat {lipid_result}')), delim = ' ')

    # LOG10P is treated as -log10(p); undo the transform because qqman is given p-values.
    regenie_results$p_value <- 10 ^ (-1 * regenie_results$LOG10P)

    # Use only common variants in GC and the plots.
    common_regenie_results <- regenie_results %>%
        filter(A1FREQ > 0.01 & A1FREQ < 0.99)

    # Two GC estimates, reported side by side for comparison:
    #   A: 1-df chi-square quantile of the median p-value, over the rounded constant 0.456.
    #   B: median 1-df chi-square statistic, over the exact null median qchisq(0.5, 1).
    gc_score_A <- qchisq(median(common_regenie_results$p_value), 1, lower.tail=FALSE) / 0.456
    gc_score_B <- median(qchisq(1 - common_regenie_results$p_value, 1)) / qchisq(0.5, 1)

    # The row/column counts and the TEST counts below describe the unfiltered results.
    message(str_glue('nrow: {nrow(regenie_results)} ncol: {ncol(regenie_results)} in {lipid_result}'))
    message(str_glue('GC: {gc_score_A} {gc_score_B}'))

    # message() coerces each argument with as.character() and pastes the pieces
    # together, which turns a tibble into an unreadable run of text, so render
    # the table to lines of text first.
    test_counts <- regenie_results %>%
        group_by(TEST) %>%
        summarize(count = n())
    message(paste(capture.output(print(test_counts)), collapse = '\n'))

    options(repr.plot.width = 10, repr.plot.height = 10)
    manhattan(common_regenie_results,
              chr="CHROM",
              bp="GENPOS",
              snp="ID",
              p="p_value",
              logp=TRUE,
              annotateTop = FALSE,
              # Omit the high outliers to make the plot more readable.
              # TODO(deflaux) log scale the y axis instead.
              ylim = c(0, 200),
              cex = 1.25,
              cex.axis = 1.25,
              cex.lab = 1.25,
              main = str_glue('{basename(lipid_result)} results\nfrom {dirname(lipid_result)}'),
              sub = PLOT_SUBTITLE
             )

    qq(common_regenie_results$p_value,
       cex = 1.25,
       cex.axis = 1.25,
       cex.lab = 1.25,
       main = str_glue('{basename(lipid_result)} results\nfrom {dirname(lipid_result)}\n GC: {gc_score_A} {gc_score_B}'),
       sub = PLOT_SUBTITLE)

})

## GC fix and prune variants in LD

In [None]:
# Read the list of LD-pruned variant IDs into a one-column table named `variant_id`.
ld_pruned_variants <- str_glue('gsutil cat {LD_PRUNED_VARIANTS}') %>%
    pipe() %>%
    read_tsv(col_names = 'variant_id')

head(ld_pruned_variants)

In [None]:
# For every regenie results file: estimate genomic-control (GC) inflation from
# the LD-pruned variants, log a short summary, then draw a Manhattan plot of
# all variants and a QQ plot of the LD-pruned variants.
retval <- lapply(REGENIE_RESULTS, function(lipid_result) {
    regenie_results <- read_delim(pipe(str_glue('gsutil cat {lipid_result}')), delim = ' ')

    # LOG10P is treated as -log10(p); undo the transform because qqman is given p-values.
    regenie_results$p_value <- 10 ^ (-1 * regenie_results$LOG10P)

    # Use LD pruned results in GC and the QQ plot.
    ld_pruned_regenie_results <- regenie_results %>%
        filter(ID %in% ld_pruned_variants$variant_id)

    # Two GC estimates, reported side by side for comparison:
    #   A: 1-df chi-square quantile of the median p-value, over the rounded constant 0.456.
    #   B: median 1-df chi-square statistic, over the exact null median qchisq(0.5, 1).
    gc_score_A <- qchisq(median(ld_pruned_regenie_results$p_value), 1, lower.tail=FALSE) / 0.456
    gc_score_B <- median(qchisq(1 - ld_pruned_regenie_results$p_value, 1)) / qchisq(0.5, 1)

    # The row/column counts and the TEST counts below describe the unfiltered results.
    message(str_glue('nrow: {nrow(regenie_results)} ncol: {ncol(regenie_results)} in {lipid_result}'))
    message(str_glue('GC: {gc_score_A} {gc_score_B}'))

    # message() coerces each argument with as.character() and pastes the pieces
    # together, which turns a tibble into an unreadable run of text, so render
    # the table to lines of text first.
    test_counts <- regenie_results %>%
        group_by(TEST) %>%
        summarize(count = n())
    message(paste(capture.output(print(test_counts)), collapse = '\n'))

    options(repr.plot.width = 10, repr.plot.height = 10)
    # The Manhattan plot shows every variant; only the QQ plot is LD pruned.
    manhattan(regenie_results,
              chr="CHROM",
              bp="GENPOS",
              snp="ID",
              p="p_value",
              logp=TRUE,
              annotateTop = FALSE,
              # Omit the high outliers to make the plot more readable.
              # TODO(deflaux) log scale the y axis instead.
              ylim = c(0, 200),
              cex = 1.25,
              cex.axis = 1.25,
              cex.lab = 1.25,
              main = str_glue('{basename(lipid_result)} results\nfrom {dirname(lipid_result)}'),
              sub = PLOT_SUBTITLE
             )

    qq(ld_pruned_regenie_results$p_value,
       cex = 1.25,
       cex.axis = 1.25,
       cex.lab = 1.25,
       main = str_glue('{basename(lipid_result)} results\nfrom {dirname(lipid_result)}\n GC: {gc_score_A} {gc_score_B}'),
       sub = PLOT_SUBTITLE)

})

## GC fix, LD pruning, and common variants

In [None]:
# Read the list of LD-pruned variant IDs into a one-column table named `variant_id`.
ld_pruned_variants <- str_glue('gsutil cat {LD_PRUNED_VARIANTS}') %>%
    pipe() %>%
    read_tsv(col_names = 'variant_id')

head(ld_pruned_variants)

In [None]:
# For every regenie results file: keep the common variants, estimate
# genomic-control (GC) inflation from those that are also LD pruned, log a
# short summary, then draw a Manhattan plot of the common variants and a QQ
# plot of the common, LD-pruned variants.
retval <- lapply(REGENIE_RESULTS, function(lipid_result) {
    regenie_results <- read_delim(pipe(str_glue('gsutil cat {lipid_result}')), delim = ' ')

    # LOG10P is treated as -log10(p); undo the transform because qqman is given p-values.
    regenie_results$p_value <- 10 ^ (-1 * regenie_results$LOG10P)

    # Use only common variants in GC and the plots.
    common_regenie_results <- regenie_results %>%
        filter(A1FREQ > 0.01 & A1FREQ < 0.99)

    # Use LD pruned results in GC and the QQ plot.
    ld_pruned_common_regenie_results <- common_regenie_results %>%
        filter(ID %in% ld_pruned_variants$variant_id)

    # Two GC estimates, reported side by side for comparison:
    #   A: 1-df chi-square quantile of the median p-value, over the rounded constant 0.456.
    #   B: median 1-df chi-square statistic, over the exact null median qchisq(0.5, 1).
    gc_score_A <- qchisq(median(ld_pruned_common_regenie_results$p_value), 1, lower.tail=FALSE) / 0.456
    gc_score_B <- median(qchisq(1 - ld_pruned_common_regenie_results$p_value, 1)) / qchisq(0.5, 1)

    # The row/column counts and the TEST counts below describe the unfiltered results.
    message(str_glue('nrow: {nrow(regenie_results)} ncol: {ncol(regenie_results)} in {lipid_result}'))
    message(str_glue('GC: {gc_score_A} {gc_score_B}'))

    # message() coerces each argument with as.character() and pastes the pieces
    # together, which turns a tibble into an unreadable run of text, so render
    # the table to lines of text first.
    test_counts <- regenie_results %>%
        group_by(TEST) %>%
        summarize(count = n())
    message(paste(capture.output(print(test_counts)), collapse = '\n'))

    options(repr.plot.width = 10, repr.plot.height = 10)
    # The Manhattan plot shows all common variants; only the QQ plot is also LD pruned.
    manhattan(common_regenie_results,
              chr="CHROM",
              bp="GENPOS",
              snp="ID",
              p="p_value",
              logp=TRUE,
              annotateTop = FALSE,
              # Omit the high outliers to make the plot more readable.
              # TODO(deflaux) log scale the y axis instead.
              ylim = c(0, 200),
              cex = 1.25,
              cex.axis = 1.25,
              cex.lab = 1.25,
              main = str_glue('{basename(lipid_result)} results\nfrom {dirname(lipid_result)}'),
              sub = PLOT_SUBTITLE
             )

    qq(ld_pruned_common_regenie_results$p_value,
       cex = 1.25,
       cex.axis = 1.25,
       cex.lab = 1.25,
       main = str_glue('{basename(lipid_result)} results\nfrom {dirname(lipid_result)}\n GC: {gc_score_A} {gc_score_B}'),
       sub = PLOT_SUBTITLE)

})

# Provenance 

In [None]:
# Print session details for provenance.
# NOTE(review): devtools is not among the packages the Setup section installs;
# this assumes it is already available in the environment - confirm.
devtools::session_info()