# Plot results for pooled analyses

In this notebook we review and explore the *All of Us* and UK Biobank data for lipids phenotypes and covariates and GWAS results.

# Setup

<div class="alert alert-block alert-warning">
    <b>Cloud Environment</b>: This notebook was written for use on All of Us Workbench. It runs fine on the default Cloud Environment. 
</div>

In [None]:
# Install any required packages that are not yet installed.
# The check compares against the package names only (the row names of the
# installed.packages() matrix) rather than against every cell of that matrix, and the
# library is scanned once instead of once per package.
required_pkgs <- c('hrbrthemes', 'skimr', 'tidyverse', 'qqman')
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
    install.packages(missing_pkgs)
}

In [None]:
library(grid)
library(gridExtra)
library(hrbrthemes)
library(qqman)
library(readxl)
library(scales)
library(skimr)
library(tidyverse)

In [None]:
# Set some visualization defaults.
theme_set(theme_ipsum(base_size = 16)) # Default theme for plots.

#' Builds the y position and text label used to annotate one box of a ggplot boxplot.
#'
#' @param df The values belonging to one box. Despite the name, the body treats it as a
#'   plain vector: it only takes max() and length() of it.
#' @return A one-row data frame with column y (the maximum value) and column label
#'   (the string 'N = ' followed by the number of values).
get_boxplot_fun_data <- function(df) {
  n_label <- stringr::str_c('N = ', length(df))
  top_value <- max(df)
  data.frame(y = top_value, label = n_label)
}

# Constants

In [None]:
# Papermill parameters. See https://papermill.readthedocs.io/en/latest/usage-parameterize.html

# Inputs
# Per-cohort phenotype tables (CSV), read below with `gsutil cat`.
AOU_PHENO <- 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/AOU_Full_Data.csv'
UKB_PHENO <- 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/UKB_Full_Data.csv'

# Merged GWAS phenotype tables (CSV), one file per lipid (HDL, LDL, TC, TG).
GWAS_PHENO <- c(
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/MergedData_HDL_Iteration2_ForGWAS.csv',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/MergedData_LDL_Iteration2_ForGWAS.csv',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/MergedData_TC_Iteration2_ForGWAS.csv',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/MergedData_TG_Iteration2_ForGWAS.csv'
)

# Phenotypes and covariates in the format passed to regenie (TSV).
REGENIE_PHENO <- 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/regenie/20210921/aou_alpha2_ukb_lipids_phenotypes_and_covariates.tsv'

# regenie result files, named by lipid. The names are reused as the lipid labels (see LIPIDS).
REGENIE_RESULTS <- c(
    HDL='gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/regenie/20210921/aou_alpha2_ukb_lipids_regenie_part2_HDLnorm.regenie',
    LDL='gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/regenie/20210921/aou_alpha2_ukb_lipids_regenie_part2_LDLnorm.regenie',
    TC='gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/regenie/20210921/aou_alpha2_ukb_lipids_regenie_part2_TCnorm.regenie',
    TG='gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/regenie/20210921/aou_alpha2_ukb_lipids_regenie_part2_TGnorm.regenie'
)

# The lipid labels, in the order of REGENIE_RESULTS: HDL, LDL, TC, TG.
LIPIDS <- names(REGENIE_RESULTS)

# List of variant ids retained by plink LD pruning, one id per line.
LD_PRUNED_VARIANTS <- 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/plink/20210907/aou_alpha2_ukb_lipids_plink_ld.prune.in'

# Allele frequency tables (CSV) for the merged, AoU-only and UKB-only variant sets.
MERGED_AF <- 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20211103/merged_allele_freq.csv'
AOU_ONLY_AF <- 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20211102/aou_only_allele_freq.csv'
UKB_ONLY_AF <- 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20211102/ukb_only_allele_freq.csv'

# Text attached to the plots as their caption / subtitle.
PLOT_SUBTITLE <- 'Source: All of Us v5 and UK Biobank data'

# Join the phenotypes

In [None]:
# Stream the AoU phenotype CSV from Cloud Storage, then show its shape and column spec.
aou_pheno <- str_glue('gsutil cat {AOU_PHENO}') %>%
    pipe() %>%
    read_csv()

dim(aou_pheno)
spec(aou_pheno)

Participants have an NA cohort when they do not have genomic data.

This is because PCs are available for only a subset of the 500K UKB samples (e.g., those with WES), and likewise a few of the samples in the AoU phenotype files are not present in the PC matrix. Please see sections 38 and 39 of the "AOU_UKB_phenotype_refined" notebook for more detail.

In [None]:
# Number of AoU rows per Cohort value (NA is reported as its own group).
count(aou_pheno, Cohort, name = 'count')

In [None]:
# Number of AoU rows per sex value.
count(aou_pheno, sex, name = 'count')

In [None]:
# Number of AoU rows per statin_use value.
count(aou_pheno, statin_use, name = 'count')

In [None]:
# Stream the UKB phenotype CSV from Cloud Storage, then show its shape and column spec.
ukb_pheno <- str_glue('gsutil cat {UKB_PHENO}') %>%
    pipe() %>%
    read_csv()

dim(ukb_pheno)
spec(ukb_pheno)

Participants have an NA cohort when they do not have genomic data.

This is because PCs are available for only a subset of the 500K UKB samples (e.g., those with WES), and likewise a few of the samples in the AoU phenotype files are not present in the PC matrix. Please see sections 38 and 39 of the "AOU_UKB_phenotype_refined" notebook for more detail.

In [None]:
# Number of UKB rows per Cohort value (NA is reported as its own group).
count(ukb_pheno, Cohort, name = 'count')

In [None]:
# Number of UKB rows per Sex_numeric code.
count(ukb_pheno, Sex_numeric, name = 'count')

In [None]:
# Number of UKB rows per statin0 code.
count(ukb_pheno, statin0, name = 'count')

## Modify the input dataframes so that they can be combined.

In [None]:
# Harmonize each cohort's phenotype table to one shared set of columns, stack the two
# tables, and then derive the age bins used by the plots.
pheno <- list(
    # AoU: every row is kept. Some alpha2 release participants had NA in the input
    # cohort field, so the field is overwritten rather than filtered on.
    aou_pheno %>%
        mutate(
            Cohort = 'AoU',
            # Collapse every value other than Female/Male into one bucket.
            sex = case_when(
                sex == 'Female' ~ 'Female',
                sex == 'Male' ~ 'Male',
                TRUE ~ 'not specified'
            )
        ) %>%
        select(id = person_id, cohort = Cohort, age, sex, statin_use,
               TC, TG, LDL, HDL, TCadjusted, TGadjusted, LDLadjusted),
    # UKB: rows with an NA cohort (people without WES) are dropped by the filter.
    ukb_pheno %>%
        filter(Cohort == 'UKB') %>%
        mutate(
            # Numeric coding per https://biobank.ndph.ox.ac.uk/showcase/coding.cgi?id=9.
            sex = case_when(
                Sex_numeric == 0 ~ 'Female',
                Sex_numeric == 1 ~ 'Male',
                TRUE ~ 'not specified'
            ),
            # TRUE only when the code is 1; any other value, including NA, becomes FALSE.
            statin_use = coalesce(statin0 == 1, FALSE)
        ) %>%
        select(id = eid, cohort = Cohort, age, sex, statin_use,
               TC = chol, TG = trig, LDL = ldl, HDL = hdl,
               TCadjusted = choladj, TGadjusted = TG_adjusted_log, LDLadjusted = ldladj)
) %>%
    bind_rows() %>%
    mutate(
        age_group_smaller_bins = cut_width(age, width = 10, boundary = 0),
        age_group = cut_width(age, width = 20, boundary = 0)
    )

In [None]:
# Number of combined rows per cohort.
count(pheno, cohort, name = 'count')

In [None]:
# Number of combined rows per harmonized sex value.
count(pheno, sex, name = 'count')

In [None]:
# Number of combined rows per harmonized statin_use value.
count(pheno, statin_use, name = 'count')

# Plot lipids

In [None]:
#' Draws grouped boxplots of one column, each box annotated with its sample size.
#'
#' Rows where the y column is NA are dropped before plotting. As a side effect the
#' notebook plot size options (repr.plot.width / repr.plot.height) are set to 16 x 8.
#'
#' @param data A data frame containing the columns named by xvar, yvar and fillvar.
#' @param xvar Name (string) of the column shown on the x-axis.
#' @param yvar Name (string) of the column shown on the y-axis.
#' @param fillvar Name (string) of the column used for the box fill.
#' @param title_detail Optional text appended to the plot title.
#' @param log_scale If TRUE, the y-axis uses a log10 scale.
#' @return A ggplot object.
plot_vars <- function(data, xvar, yvar, fillvar, title_detail = '', log_scale = FALSE) {
    # Column names arrive as strings; convert them to symbols for tidy evaluation.
    x_col <- sym(xvar)
    y_col <- sym(yvar)
    fill_col <- sym(fillvar)

    # Names interpolated into the title below.
    xvar_name <- xvar
    yvar_name <- yvar
    fillvar_name <- fillvar

    options(repr.plot.width = 16, repr.plot.height = 8)

    observed <- filter(data, !is.na(!!y_col))

    boxplots <- ggplot(observed, aes(x = !!x_col, y = !!y_col, fill = !!fill_col)) +
        geom_boxplot() +
        # Print 'N = <count>' above each box.
        stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 5,
                     position = position_dodge(width = 0.9), vjust = -0.8) +
        theme(
            axis.title.x = element_text(size = 14),
            axis.title.y = element_text(size = 14)
        ) +
        labs(title = str_glue('{yvar_name} mg/dL per person by {xvar_name} and {fillvar_name} {title_detail}'),
             caption = PLOT_SUBTITLE)

    if (!log_scale) {
        return(boxplots)
    }

    boxplots + scale_y_log10()
}

## By age group - bin size 10 [not okay to share, groups too small]

In [None]:
# One boxplot per lipid, split by 10-year age bin and coloured by cohort.
for (lipid in c('LDL', 'TC', 'HDL', 'TG')) {
    pheno %>%
        plot_vars(xvar = 'age_group_smaller_bins', yvar = lipid, fillvar = 'cohort') %>%
        print()
}

## By age group - larger bins [okay to share]

In [None]:
# One boxplot per lipid, split by 20-year age bin and coloured by cohort.
for (lipid in c('LDL', 'TC', 'HDL', 'TG')) {
    pheno %>%
        plot_vars(xvar = 'age_group', yvar = lipid, fillvar = 'cohort') %>%
        print()
}

In [None]:
# Replot TG by age group after dropping the high outliers (TG of 750 or more).
pheno %>%
    filter(TG < 750) %>%
    plot_vars(xvar = 'age_group', yvar = 'TG', fillvar = 'cohort',
              title_detail = '[high outliers removed]')

In [None]:
# TG by age group again, this time with a log10 y-axis instead of removing outliers.
pheno %>%
    plot_vars(xvar = 'age_group', yvar = 'TG', fillvar = 'cohort',
              title_detail = '[log scale y-axis]', log_scale = TRUE)

## By statin use

In [None]:
# One boxplot per lipid, split by cohort and coloured by statin use.
for (lipid in c('LDL', 'TC', 'HDL', 'TG')) {
    pheno %>%
        plot_vars(xvar = 'cohort', yvar = lipid, fillvar = 'statin_use') %>%
        print()
}

In [None]:
# Replot TG by statin use after dropping the high outliers (TG of 750 or more).
pheno %>%
    filter(TG < 750) %>%
    plot_vars(xvar = 'cohort', yvar = 'TG', fillvar = 'statin_use',
              title_detail = '[high outliers removed]')

In [None]:
# TG by statin use again, this time with a log10 y-axis instead of removing outliers.
pheno %>%
    plot_vars(xvar = 'cohort', yvar = 'TG', fillvar = 'statin_use',
              title_detail = '[log scale y-axis]', log_scale = TRUE)

## By statin use and adjusted

In [None]:
# One boxplot per adjusted lipid measure, split by cohort and coloured by statin use.
for (lipid in c('LDLadjusted', 'TCadjusted', 'TGadjusted')) {
    pheno %>%
        plot_vars(xvar = 'cohort', yvar = lipid, fillvar = 'statin_use') %>%
        print()
}

In [None]:
# Repeat the TGadjusted plot with a title note saying that the values are in log space.
pheno %>%
    plot_vars(xvar = 'cohort', yvar = 'TGadjusted', fillvar = 'statin_use',
              title_detail = '[adjusted data is in log space]', log_scale = FALSE)

## By sex at birth

In [None]:
# One boxplot per lipid, split by cohort and coloured by sex.
for (lipid in c('LDL', 'TC', 'HDL', 'TG')) {
    pheno %>%
        plot_vars(xvar = 'cohort', yvar = lipid, fillvar = 'sex') %>%
        print()
}

In [None]:
# Replot TG by sex after dropping the high outliers (TG of 750 or more).
pheno %>%
    filter(TG < 750) %>%
    plot_vars(xvar = 'cohort', yvar = 'TG', fillvar = 'sex',
              title_detail = '[high outliers removed]')

In [None]:
# TG by sex again, this time with a log10 y-axis instead of removing outliers.
pheno %>%
    plot_vars(xvar = 'cohort', yvar = 'TG', fillvar = 'sex',
              title_detail = '[log scale y-axis]', log_scale = TRUE)

# Plot GWAS phenotypes

<div class="alert alert-block alert-success">
    See notebooks <kbd>AOU_UKB_phenotypes.ipynb</kbd> and <kbd>AOU_UKB_phenotype_refined.ipynb</kbd> for the code that wrangles these phenotypes and covariates.
</div>

In [None]:
#' Draws a 30-bin histogram of one column, with one facet per value of another column.
#'
#' Rows where the x column is NA are dropped before plotting. Facets are laid out in
#' two columns with free scales. As a side effect the notebook plot size options
#' (repr.plot.width / repr.plot.height) are set to 16 x 8.
#'
#' @param data A data frame containing the columns named by xvar and facetvar.
#' @param xvar Name (string) of the column to histogram.
#' @param facetvar Name (string) of the column to facet by.
#' @param title_detail Optional text appended to the plot title.
#' @param log_scale If TRUE, the y-axis (the counts) uses a log10 scale.
#' @return A ggplot object.
plot_var_histograms <- function(data, xvar, facetvar, title_detail = '', log_scale = FALSE) {
    # Column names arrive as strings; convert them to symbols for tidy evaluation.
    x_col <- sym(xvar)
    facet_col <- sym(facetvar)

    # Name interpolated into the title below.
    xvar_name <- xvar

    options(repr.plot.width = 16, repr.plot.height = 8)

    observed <- filter(data, !is.na(!!x_col))

    histograms <- ggplot(observed, aes(x = !!x_col)) +
        geom_histogram(bins = 30) +
        facet_wrap(vars(!!facet_col), ncol = 2, scales = 'free') +
        theme(
            axis.title.x = element_text(size = 14),
            axis.title.y = element_text(size = 14)
        ) +
        labs(title = str_glue('{xvar_name} {title_detail}'),
             caption = PLOT_SUBTITLE)

    if (!log_scale) {
        return(histograms)
    }

    histograms + scale_y_log10()
}

In [None]:
# For each GWAS phenotype file, plot per-cohort histograms of four columns: the first
# column whose name contains 'raw', then 'adj', then 'resid', then 'norm'.
# The result is assigned so that the notebook does not display the returned list.
retval <- lapply(GWAS_PHENO, function(gwas_pheno) {
    gwas_pheno_data <- read_csv(pipe(str_glue('gsutil cat {gwas_pheno}')), col_types = cols())
    gwas_columns <- colnames(gwas_pheno_data)

    last_printed <- NULL
    for (name_fragment in c('raw', 'adj', 'resid', 'norm')) {
        matching_cols <- str_which(gwas_columns, name_fragment)
        last_printed <- print(plot_var_histograms(data = gwas_pheno_data,
                                                  xvar = gwas_columns[[matching_cols[[1]]]],
                                                  facetvar = 'cohort'))
    }

    # As before, the value for each file is the result of the final print() call.
    last_printed
})

# Plot final regenie phenotypes

<div class="alert alert-block alert-success">
    See notebook <kbd>regenie_gwas.ipynb</kbd> for the code that transforms the GWAS phenotypes to regenie's specific format. 
</div>

In [None]:
# Stream the regenie phenotype/covariate TSV from Cloud Storage, then show its shape and columns.
regenie_pheno <- str_glue('gsutil cat {REGENIE_PHENO}') %>%
    pipe() %>%
    read_tsv()

dim(regenie_pheno)
colnames(regenie_pheno)

In [None]:
# Number of regenie phenotype rows per cohort.
count(regenie_pheno, cohort, name = 'count')

In [None]:
# Per-cohort histograms of each normalized lipid phenotype.
# NOTE(review): plot_var_histograms() sets repr.plot.width/height to 16 x 8 itself, so the
# 8 x 8 size requested here looks like it is overridden before the plots print - confirm intent.
options(repr.plot.width = 8, repr.plot.height = 8)
for (lipid in c('HDLnorm', 'LDLnorm', 'TCnorm', 'TGnorm')) {
    regenie_pheno %>%
        plot_var_histograms(xvar = lipid, facetvar = 'cohort') %>%
        print()
}

# Load the regenie GWAS results

Bring our results into a single dataframe with a lipid type column.

In [None]:
# Read each lipid's regenie output, tag its rows with the lipid name and stack the tables.
# Then add a plain p-value (regenie reports -log10 p in LOG10P) and an identifier of the
# form CHROM:GENPOS:ALLELE0:ALLELE1.
combined_regenie_results <- LIPIDS %>%
    lapply(function(lipid) {
        file <- REGENIE_RESULTS[lipid]
        lipid_results <- read_delim(pipe(str_glue('gsutil cat {file}')), delim = ' ')
        mutate(lipid_results, lipid_type = lipid)
    }) %>%
    bind_rows() %>%
    mutate(
        p_value = 10 ^ (-LOG10P),
        RSID = paste(CHROM, GENPOS, ALLELE0, ALLELE1, sep = ':')
    )

dim(combined_regenie_results)

In [None]:
# Preview the first rows of the combined regenie results.
combined_regenie_results %>% head()

In [None]:
# Per-lipid range check of the key result columns.
summarize(
    group_by(combined_regenie_results, lipid_type),
    min_p_value = min(p_value),
    max_p_value = max(p_value),
    min_LOG10P = min(LOG10P),
    max_LOG10P = max(LOG10P),
    min_A1FREQ = min(A1FREQ),
    max_A1FREQ = max(A1FREQ),
    min_N = min(N),
    max_N = max(N)
)

# Plot regenie results

In [None]:
#' Draws a Manhattan plot followed by a QQ plot for one set of regenie results.
#'
#' Both plots are produced with qqman and carry PLOT_SUBTITLE as their subtitle. The
#' Manhattan y-axis is fixed to the range 0 to 200. As a side effect the notebook plot
#' size options (repr.plot.width / repr.plot.height) are set to 10 x 10.
#'
#' @param regenie_results A data frame with columns CHROM, GENPOS, ID and p_value.
#' @param manhattan_title Main title of the Manhattan plot.
#' @param qq_title Main title of the QQ plot.
#' @return The value returned by qqman::qq(); the function is called for its plots.
plot_manhattan_and_qq <- function(regenie_results, manhattan_title, qq_title) {
    options(repr.plot.width = 10, repr.plot.height = 10)

    # One magnification factor shared by the points, axis annotation and axis labels.
    text_scale <- 1.25

    manhattan(regenie_results,
              main = manhattan_title,
              sub = PLOT_SUBTITLE,
              chr = 'CHROM',
              bp = 'GENPOS',
              snp = 'ID',
              p = 'p_value',
              logp = TRUE,
              annotateTop = FALSE,
              ylim = c(0, 200),
              cex = text_scale,
              cex.axis = text_scale,
              cex.lab = text_scale)

    qq(regenie_results$p_value,
       main = qq_title,
       sub = PLOT_SUBTITLE,
       cex = text_scale,
       cex.axis = text_scale,
       cex.lab = text_scale)
}

In [None]:
library('stats')

#' QQ plot of observed vs. expected -log10(p), with one facet per allele-frequency bin.
#'
#' Expected values come from ppoints() over all rows of the input, so ranks are computed
#' across the whole input and not within each facet. Both axes are limited to 0-20, so
#' points outside that range are not drawn. As a side effect the notebook plot size
#' options (repr.plot.width / repr.plot.height) are set to 21 x 7.
#'
#' @param regenie_results A data frame with columns A1FREQ and LOG10P.
#' @param qq_title Title shown on the plot.
#' @return A ggplot object.
alternate_qq <- function(regenie_results, qq_title) {
    options(repr.plot.width = 21, repr.plot.height = 7)

    regenie_results %>%
        mutate(
            # Frequency bin of the rarer allele. Rows matching no bin (A1FREQ of
            # exactly 0.5, or NA) are labelled 'other'.
            percentile = case_when(
                A1FREQ < .01 | (1 - A1FREQ) < .01 ~ '0 <= MAF < 0.01',
                A1FREQ < .1 | (1 - A1FREQ) < .1 ~ '0.01 <= MAF < 0.1',
                A1FREQ < .5 | (1 - A1FREQ) < .5 ~ '0.1 <= MAF < 0.5',
                TRUE ~ 'other'
            )
        ) %>%
        # Most significant first, to line up with the descending expected values.
        # Sorting on LOG10P (the plotted column) avoids ties created when a derived
        # p-value underflows to zero.
        arrange(desc(LOG10P)) %>%
        bind_cols(tibble(expected = -log10(ppoints(nrow(regenie_results))))) %>%
        ggplot(aes(x = expected, y = LOG10P, color = percentile)) +
            geom_point(alpha = 0.5) +
            geom_abline() +
            xlim(0, 20) +
            ylim(0, 20) +
            facet_grid(cols = vars(percentile)) +
            # Previously the qq_title argument was accepted but never shown.
            labs(title = qq_title)
}

# Try the alternate QQ plot on the LDL results.
combined_regenie_results %>%
    filter(lipid_type == 'LDL') %>%
    alternate_qq('test plot')

## All GWAS results

In [None]:
# For each lipid: report the table size, the genomic-control (GC) score and the row count
# per TEST value, then draw Manhattan and QQ plots of all results.
# walk() replaces map() because the body runs only for its side effects; map() made the
# notebook display a list of the functions' return values.
walk(LIPIDS, function(lipid) {
    regenie_results <- combined_regenie_results %>% filter(lipid_type == lipid)
    file <- REGENIE_RESULTS[lipid]

    # GC score: median observed chi-square over the median of a 1-df chi-square.
    gc_score <- median(regenie_results$CHISQ) / qchisq(0.5, 1)

    message(str_glue('nrow: {nrow(regenie_results)} ncol: {ncol(regenie_results)} in {file}'))
    message(str_glue('GC: {round(gc_score, 3)}'))

    # message() pastes its arguments into one string, which garbles a data frame.
    # Render the table with print() and emit the captured text instead.
    test_counts <- regenie_results %>%
        group_by(TEST) %>%
        summarize(count = n())
    message(paste(capture.output(print(test_counts)), collapse = '\n'))

    plot_manhattan_and_qq(
        regenie_results,
        manhattan_title = str_glue('{basename(file)} results\nfrom {dirname(file)}'),
        qq_title = str_glue('{basename(file)} results\nfrom {dirname(file)}\n GC: {round(gc_score, 3)}')
    )
})

## Filter to common variants

In [None]:
# For each lipid: restrict to common variants, report row and significant-hit counts
# before and after the filter, the GC score and the row count per TEST value, then draw
# Manhattan and QQ plots of the common variants.
# walk() replaces map() because the body runs only for its side effects; map() made the
# notebook display a list of the functions' return values.
walk(LIPIDS, function(lipid) {
    regenie_results <- combined_regenie_results %>% filter(lipid_type == lipid)
    file <- REGENIE_RESULTS[lipid]

    # Use only common variants in GC and the plots.
    common_regenie_results <- regenie_results %>%
        filter(A1FREQ > 0.01 & A1FREQ < 0.99)

    # GC score: median observed chi-square over the median of a 1-df chi-square.
    gc_score <- median(common_regenie_results$CHISQ) / qchisq(0.5, 1)

    message(str_glue('nrow: {nrow(regenie_results)} n_sig: {nrow(regenie_results %>% filter(LOG10P > -log10(5e-08)))} in {file}'))
    message(str_glue('nrow: {nrow(common_regenie_results)} n_sig: {nrow(common_regenie_results %>% filter(LOG10P > -log10(5e-08)))} after filtering to common variants'))
    message(str_glue('GC: {round(gc_score, 3)}'))

    # message() pastes its arguments into one string, which garbles a data frame.
    # Render the table with print() and emit the captured text instead.
    # The counts are taken over the unfiltered results, as before.
    test_counts <- regenie_results %>%
        group_by(TEST) %>%
        summarize(count = n())
    message(paste(capture.output(print(test_counts)), collapse = '\n'))

    plot_manhattan_and_qq(
        common_regenie_results,
        manhattan_title = str_glue('{basename(file)} results\nfrom {dirname(file)}\ncommon variants only'),
        qq_title = str_glue('{basename(file)} results\nfrom {dirname(file)}\ncommon variants only\tGC: {round(gc_score, 3)}')
    )
})

## Prune variants in LD

In [None]:
# Load the LD-pruned variant ids; the file has no header row, so the column is named here.
ld_pruned_variants <- str_glue('gsutil cat {LD_PRUNED_VARIANTS}') %>%
    pipe() %>%
    read_tsv(col_names = 'variant_id')

head(ld_pruned_variants)

In [None]:
# For each lipid: keep only the LD-pruned variants, report row and significant-hit counts
# before and after pruning, the GC score and the row count per TEST value, then draw
# Manhattan and QQ plots of the pruned results.
# walk() replaces map() because the body runs only for its side effects; map() made the
# notebook display a list of the functions' return values.
walk(LIPIDS, function(lipid) {
    regenie_results <- combined_regenie_results %>% filter(lipid_type == lipid)
    file <- REGENIE_RESULTS[lipid]

    # Use LD pruned results in GC and the QQ plot.
    ld_pruned_regenie_results <- regenie_results %>%
        filter(ID %in% ld_pruned_variants$variant_id)

    # GC score: median observed chi-square over the median of a 1-df chi-square.
    gc_score <- median(ld_pruned_regenie_results$CHISQ) / qchisq(0.5, 1)

    message(str_glue('nrow: {nrow(regenie_results)} n_sig: {nrow(regenie_results %>% filter(LOG10P > -log10(5e-08)))} in {file}'))
    message(str_glue('nrow: {nrow(ld_pruned_regenie_results)} n_sig: {nrow(ld_pruned_regenie_results %>% filter(LOG10P > -log10(5e-08)))} after pruning variants in LD'))
    message(str_glue('GC: {round(gc_score, 3)}'))

    # message() pastes its arguments into one string, which garbles a data frame.
    # Render the table with print() and emit the captured text instead.
    # The counts are taken over the unpruned results, as before.
    test_counts <- regenie_results %>%
        group_by(TEST) %>%
        summarize(count = n())
    message(paste(capture.output(print(test_counts)), collapse = '\n'))

    plot_manhattan_and_qq(
        ld_pruned_regenie_results,
        manhattan_title = str_glue('{basename(file)} results\nfrom {dirname(file)}\nVariants in LD pruned out.'),
        qq_title = str_glue('{basename(file)} results\nfrom {dirname(file)}\nVariants in LD pruned out.\tGC: {round(gc_score, 3)}')
    )
})

## Prune variants in LD and filter to common variants

In [None]:
# For each lipid: restrict to common variants, then to LD-pruned variants; report row and
# significant-hit counts at each stage, the GC score and the row count per TEST value,
# then draw Manhattan and QQ plots of the remaining variants.
# walk() replaces map() because the body runs only for its side effects; map() made the
# notebook display a list of the functions' return values.
walk(LIPIDS, function(lipid) {
    regenie_results <- combined_regenie_results %>% filter(lipid_type == lipid)
    file <- REGENIE_RESULTS[lipid]

    # Use only common variants in GC and the plots.
    common_regenie_results <- regenie_results %>%
        filter(A1FREQ > 0.01 & A1FREQ < 0.99)

    # Use LD pruned results in GC and the QQ plot.
    ld_pruned_common_regenie_results <- common_regenie_results %>%
        filter(ID %in% ld_pruned_variants$variant_id)

    # GC score: median observed chi-square over the median of a 1-df chi-square.
    gc_score <- median(ld_pruned_common_regenie_results$CHISQ) / qchisq(0.5, 1)

    message(str_glue('nrow: {nrow(regenie_results)} n_sig: {nrow(regenie_results %>% filter(LOG10P > -log10(5e-08)))} in {file}'))
    message(str_glue('nrow: {nrow(common_regenie_results)} n_sig: {nrow(common_regenie_results %>% filter(LOG10P > -log10(5e-08)))} after filtering to common variants only'))
    message(str_glue('nrow: {nrow(ld_pruned_common_regenie_results)} n_sig: {nrow(ld_pruned_common_regenie_results %>% filter(LOG10P > -log10(5e-08)))} after pruning variants in LD'))
    message(str_glue('GC: {round(gc_score, 3)}'))

    # message() pastes its arguments into one string, which garbles a data frame.
    # Render the table with print() and emit the captured text instead.
    # The counts are taken over the unfiltered results, as before.
    test_counts <- regenie_results %>%
        group_by(TEST) %>%
        summarize(count = n())
    message(paste(capture.output(print(test_counts)), collapse = '\n'))

    plot_manhattan_and_qq(
        ld_pruned_common_regenie_results,
        manhattan_title = str_glue('{basename(file)} results\nfrom {dirname(file)}\nVariants in LD pruned out and common variants only.'),
        qq_title = str_glue('{basename(file)} results\nfrom {dirname(file)}\nVariants in LD pruned out and common variants only.\tGC: {round(gc_score, 3)}')
    )
})

# Comparisons against other lipids studies 

## Comparison with UKB published GWAS summary

##### Rare coding variants in 35 genes associate with circulating lipid levels – a multi-ancestry analysis of 170,000 exomes. Hindy et al 2021

In [None]:
# Download the Hindy et al. supplementary workbook. An .xlsx file is binary, so binary
# mode is requested explicitly; the default text mode can corrupt it on Windows.
download.file('https://www.biorxiv.org/content/biorxiv/early/2021/09/01/2020.12.22.423783/DC2/embed/media-2.xlsx?download=true', 'hindy.xlsx', mode = 'wb')

Bring the Hindy results into a single dataframe with a lipid type column.

In [None]:
# Load sheet Table_S11 of the Hindy workbook (skipping its first row), keep the rows for
# the 'Overall' ancestry, and add a lipid_type column: LDL_ADJ becomes LDL, TOTAL_ADJ
# becomes TC, and every other Trait value is used unchanged.
combined_hindy_results <- 'hindy.xlsx' %>%
    read_xlsx(sheet = 'Table_S11', skip = 1, na = 'NA') %>%
    filter(Ancestry == 'Overall') %>%
    mutate(lipid_type = case_when(
        Trait == 'LDL_ADJ' ~ 'LDL',
        Trait == 'TOTAL_ADJ' ~ 'TC',
        TRUE ~ Trait
    ))

dim(combined_hindy_results)

In [None]:
# Preview the first rows of the Hindy results.
combined_hindy_results %>% head()

In [None]:
# For each lipid, scatter our regenie effect sizes against the Hindy et al. effect sizes
# for the variants present in both, annotated with their correlation and count.
map(LIPIDS, function(lipid) {
    hindy_results <- combined_hindy_results %>%
        filter(lipid_type == lipid) %>%
        select(RSID, beta_Hindy = BETA_FE)

    regenie_betas <- combined_regenie_results %>%
        filter(lipid_type == lipid) %>%
        select(RSID, beta_AoU_siloed = BETA)

    # No `by` is given, so the join uses the columns the two tables share (RSID).
    in_common_results <- inner_join(hindy_results, regenie_betas)

    num_hindy_results <- nrow(hindy_results)
    num_in_common_results <- nrow(in_common_results)
    result_cor <- cor(in_common_results$beta_AoU_siloed, in_common_results$beta_Hindy)

    options(repr.plot.width = 10, repr.plot.height = 10)

    ggplot(in_common_results, aes(x = beta_Hindy, y = beta_AoU_siloed)) +
        geom_point(alpha = .5) +
        # Anchor the summary text at the bottom-right corner of the data.
        annotate(geom = 'text',
                 label = c(str_glue('correlation: {round(result_cor, digits = 3)}\nN = {num_in_common_results}')),
                 x = max(in_common_results$beta_Hindy),
                 y = min(in_common_results$beta_AoU_siloed),
                 hjust = 'right',
                 vjust = -1,
                 size = 6,
                 color = 'dark blue') +
        geom_abline() +
        theme(
            axis.title.x = element_text(size = 14),
            axis.title.y = element_text(size = 14)
        ) +
        labs(title = str_glue('{lipid} GWAS result comparison to {num_hindy_results}\nsignificant RSID from Hindy et al. 2021'),
             caption = PLOT_SUBTITLE)
})

# Comparison with TOPMed (Freeze8) Lipid GWAS

Whole genome sequence analysis of blood lipid levels in >66,000 individuals. [Selvaraj et al 2021](https://www.biorxiv.org/content/10.1101/2021.10.11.463514v1.supplementary-material)

In [None]:
# Download the Selvaraj et al. supplementary workbook. An .xlsx file is binary, so binary
# mode is requested explicitly; the default text mode can corrupt it on Windows.
download.file('https://www.biorxiv.org/content/biorxiv/early/2021/10/12/2021.10.11.463514/DC1/embed/media-1.xlsx?download=true', 'selvaraj.xlsx', mode = 'wb')

Bring the Selvaraj results into a single dataframe with a lipid type column.

In [None]:
# Cell range of each lipid's table within sheet 'Supplementary Table 3'.
selvaraj_tables <- c(HDL = 'A4:L361', LDL = 'A363:L701', TC = 'A703:L1027', TG = 'A1029:L1318')

# Read each lipid's table, add an RSID of the form CHR:POS:Allele1:Allele2 and the lipid
# name, and stack the tables. Each range is now read once instead of twice.
combined_selvaraj_results <- bind_rows(
    lapply(LIPIDS, function(lipid) {
        table_range <- selvaraj_tables[[lipid]]

        # Print some metadata for an eyeball check that we are associating the data with the correct lipid type.
        print(str_glue('{lipid} {table_range}'))
        first_row <- as.integer(str_extract(table_range, '\\d+'))
        # Show the cell just above the table together with the table's first cell.
        print(read_xlsx('selvaraj.xlsx', sheet = 'Supplementary Table 3', range = str_glue('A{first_row - 1}:A{first_row}')))

        # Retrieve the data.
        lipid_table <- read_xlsx('selvaraj.xlsx', sheet = 'Supplementary Table 3', range = table_range)
        print(nrow(lipid_table))

        lipid_table %>%
            mutate(
                # Work around a bad entry in the data causing the p.value column to be of type character.
                p.value = as.numeric(p.value),
                RSID = paste0(CHR, ':' , POS, ':', Allele1, ':', Allele2),
                lipid_type = lipid
            )
    }))

dim(combined_selvaraj_results)

In [None]:
# Preview the first rows of the Selvaraj results.
combined_selvaraj_results %>% head()

In [None]:
# For each lipid, scatter our regenie effect sizes against the Selvaraj et al. effect
# sizes for the variants present in both, annotated with their correlation and count.
map(LIPIDS, function(lipid) {
    selvaraj_results <- combined_selvaraj_results %>%
        filter(lipid_type == lipid) %>%
        select(RSID, beta_selvaraj = BETA)

    regenie_betas <- combined_regenie_results %>%
        filter(lipid_type == lipid) %>%
        select(RSID, beta_AoU_siloed = BETA)

    # No `by` is given, so the join uses the columns the two tables share (RSID).
    in_common_results <- inner_join(selvaraj_results, regenie_betas)

    num_selvaraj_results <- nrow(selvaraj_results)
    num_in_common_results <- nrow(in_common_results)
    result_cor <- cor(in_common_results$beta_AoU_siloed, in_common_results$beta_selvaraj)

    options(repr.plot.width = 10, repr.plot.height = 10)

    ggplot(in_common_results, aes(x = beta_selvaraj, y = beta_AoU_siloed)) +
        geom_point(alpha = .5) +
        # Anchor the summary text at the bottom-right corner of the data.
        annotate(geom = 'text',
                 label = c(str_glue('correlation: {round(result_cor, digits = 3)}\nN = {num_in_common_results}')),
                 x = max(in_common_results$beta_selvaraj),
                 y = min(in_common_results$beta_AoU_siloed),
                 hjust = 'right',
                 vjust = -1,
                 size = 6,
                 color = 'dark blue') +
        geom_abline() +
        theme(
            axis.title.x = element_text(size = 14),
            axis.title.y = element_text(size = 14)
        ) +
        labs(title = str_glue('{lipid} GWAS result comparison to {num_selvaraj_results}\nsignificant RSID from Selvaraj et al. 2021'),
             caption = PLOT_SUBTITLE)
})

# Comparison Hindy vs. Selvaraj

In [None]:
# For each lipid, scatter the Selvaraj et al. effect sizes against the Hindy et al. effect
# sizes for the variants present in both, annotated with their correlation and count.
map(LIPIDS, function(lipid) {
    hindy_results <- combined_hindy_results %>%
        filter(lipid_type == lipid) %>%
        select(RSID, beta_Hindy = BETA_FE)

    selvaraj_results <- combined_selvaraj_results %>%
        filter(lipid_type == lipid) %>%
        select(RSID, beta_selvaraj = BETA)

    # No `by` is given, so the join uses the columns the two tables share (RSID).
    in_common_results <- inner_join(hindy_results, selvaraj_results)

    num_hindy_results <- nrow(hindy_results)
    num_selvaraj_results <- nrow(selvaraj_results)
    num_in_common_results <- nrow(in_common_results)
    result_cor <- cor(in_common_results$beta_selvaraj, in_common_results$beta_Hindy)

    options(repr.plot.width = 10, repr.plot.height = 10)

    ggplot(in_common_results, aes(x = beta_Hindy, y = beta_selvaraj)) +
        geom_point(alpha = .5) +
        # Anchor the summary text at the bottom-right corner of the data.
        annotate(geom = 'text',
                 label = c(str_glue('correlation: {round(result_cor, digits = 3)}\nN = {num_in_common_results}')),
                 x = max(in_common_results$beta_Hindy),
                 y = min(in_common_results$beta_selvaraj),
                 hjust = 'right',
                 vjust = -1,
                 size = 6,
                 color = 'dark blue') +
        geom_abline() +
        theme(
            axis.title.x = element_text(size = 14),
            axis.title.y = element_text(size = 14)
        ) +
        labs(title = str_glue('{lipid} GWAS result comparison between {num_hindy_results} significant RSID
from Hindy et al. 2021 and {num_selvaraj_results} significant RSID from
Selvaraj et al. 2021'),
             caption = PLOT_SUBTITLE)
})

# Plot allele frequencies

In [None]:
# Stream the merged-set allele frequency CSV from Cloud Storage and show its shape.
merged_af <- str_glue('gsutil cat {MERGED_AF}') %>%
    pipe() %>%
    read_csv()

dim(merged_af)

In [None]:
# Stream the AoU-only allele frequency CSV from Cloud Storage and show its shape.
aou_only_af <- str_glue('gsutil cat {AOU_ONLY_AF}') %>%
    pipe() %>%
    read_csv()

dim(aou_only_af)

In [None]:
# Stream the UKB-only allele frequency CSV from Cloud Storage and show its shape.
ukb_only_af <- str_glue('gsutil cat {UKB_ONLY_AF}') %>%
    pipe() %>%
    read_csv()

dim(ukb_only_af)

In [None]:
# Label each allele frequency table with its variant set and stack them. The factor is
# releveled so that 'merged' is the last level, and AF gets a descriptive name.
allele_freq <- list(
    mutate(merged_af, variant_set = 'merged'),
    mutate(aou_only_af, variant_set = 'aou_only'),
    mutate(ukb_only_af, variant_set = 'ukb_only')
) %>%
    bind_rows() %>%
    mutate(variant_set = fct_relevel(variant_set, 'merged', after = Inf)) %>%
    rename(allele_frequency = AF)

dim(allele_freq)

In [None]:
# Print summary statistics for each column of the combined allele frequency table.
skim(allele_freq)

In [None]:
# Show the rows that have no allele frequency value.
filter(allele_freq, is.na(allele_frequency))

In [None]:
# Histogram of allele frequency, one facet per variant set, with log10 counts.
options(repr.plot.width = 16, repr.plot.height = 8)

ggplot(allele_freq, aes(x = allele_frequency)) +
    geom_histogram() +
    facet_grid(cols = vars(variant_set)) +
    scale_y_log10(labels = comma) +
    theme(
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14)
    ) +
    labs(title = str_glue('Histogram of allele frequencies [y-axis in log scale]'),
         caption = PLOT_SUBTITLE)

In [None]:
# Same histogram restricted to allele frequencies below 0.01, with log10 counts.
options(repr.plot.width = 16, repr.plot.height = 8)

ggplot(filter(allele_freq, allele_frequency < .01), aes(x = allele_frequency)) +
    geom_histogram() +
    facet_grid(cols = vars(variant_set)) +
    scale_y_log10(labels = comma) +
    theme(
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14)
    ) +
    labs(title = str_glue('Histogram of allele frequencies MAF < 1% [y-axis in log scale]'),
         caption = PLOT_SUBTITLE)

In [None]:
# Histogram of allele frequency, one facet per variant set, with linear counts.
options(repr.plot.width = 16, repr.plot.height = 8)

ggplot(allele_freq, aes(x = allele_frequency)) +
    geom_histogram() +
    facet_grid(cols = vars(variant_set)) +
    scale_y_continuous(labels = comma)

In [None]:
# Boxplot of allele frequency per variant set, on a log10 y-axis.
options(repr.plot.width = 16, repr.plot.height = 8)

ggplot(allele_freq, aes(x = variant_set, y = allele_frequency)) +
    geom_boxplot() +
    scale_y_log10()

# Provenance 

In [None]:
# Record the R session details (R version, platform and package versions) for reproducibility.
devtools::session_info()