# Plot results for pooled analyses

In this notebook we review and explore the pooled *All of Us* and UK Biobank data for lipid phenotypes, covariates, and GWAS results.

Note that this work is part of a larger project to [Demonstrate the Potential for Pooled Analysis of All of Us and UK Biobank Genomic Data](https://github.com/all-of-us/ukb-cross-analysis-demo-project). Specifically this is for the portion of the project that is the **pooled** analysis.

# Setup

<div class="alert alert-block alert-warning">
    <b>Cloud Environment</b>: This notebook was written for use on the <i>All of Us</i> Workbench.
    <ul>
        <li>Use "Recommended Environment" <kbd><b>General Analysis</b></kbd> which creates compute type <kbd><b>Standard VM</b></kbd> with reasonable defaults for CPU, RAM, and disk.</li>
        <li>This notebook takes several minutes to run interactively. You can also run it in the background via <kbd>run_notebook_in_the_background</kbd> for the sake of provenance and reproducibility.</li>
    </ul>
</div>

In [None]:
# Install only the packages that are not already present. Comparing against
# rownames(installed.packages()) checks package names specifically, rather than
# matching against every cell of the installed-packages matrix.
missing_pkgs <- setdiff(
    c('ggExtra', 'hexbin', 'hrbrthemes', 'skimr', 'qqman', 'viridis'),
    rownames(installed.packages())
)
if (length(missing_pkgs) > 0) {
    install.packages(missing_pkgs)
}

In [None]:
library(ggExtra)
library(grid)
library(gridExtra)
library(hrbrthemes)
library(qqman)
library(readxl)
library(scales)
library(skimr)
library(tidyverse)
library(viridis)

In [None]:
# Set some visualization defaults.
theme_set(theme_ipsum(base_size = 16)) # Default theme for plots.

#' Returns a one-row data frame with a y position and a label, for use annotating ggplot boxplots.
#'
#' Intended to be passed as `fun.data` to `ggplot2::stat_summary()`, which
#' supplies the y values of one group at a time.
#'
#' @param df The values of one boxplot group. Despite the name, `max()` and
#'   `length()` are applied to it directly, so it is treated as a vector.
#' @return A data frame with column `y` holding the maximum of `df` and column
#'   `label` holding the text 'N = <number of values in df>'.
get_boxplot_fun_data <- function(df) {
  data.frame(y = max(df), label = paste0('N = ', length(df)))
}

## Define constants

In [None]:
# Papermill parameters. See https://papermill.readthedocs.io/en/latest/usage-parameterize.html

#---[ Inputs ]---
# Created via aou_workbench_pooled_analyses/02_merge_lipids_phenotypes.ipynb
MERGED_PHENO <- 'gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/pooled/pheno/20220310/aou_alpha3_ukb_lipids_phenotype.csv'
# Created via aou_workbench_pooled_analyses/08_pooled_phenotype_for_gwas.ipynb
REGENIE_PHENO <- 'gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/pooled/pheno/20220315/aou_alpha3_ukb_lipids_gwas_phenotype.tsv'

# Created via notebook aou_workbench_pooled_analyses/09_pooled_regenie_gwas.ipynb
REGENIE_RESULTS <- c(
    HDL='gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/pooled/regenie/20220315/aou_alpha3_ukb_lipids_regenie_step2_HDL_norm.regenie',
    LDL='gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/pooled/regenie/20220315/aou_alpha3_ukb_lipids_regenie_step2_LDL_adjusted_norm.regenie',
    TC='gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/pooled/regenie/20220315/aou_alpha3_ukb_lipids_regenie_step2_TC_adjusted_norm.regenie',
    TG='gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/pooled/regenie/20220315/aou_alpha3_ukb_lipids_regenie_step2_TG_adjusted_norm.regenie'
)

LIPIDS <- names(REGENIE_RESULTS)

# Created via notebook aou_workbench_pooled_analyses/04_pooled_plink_ld_and_pca.ipynb
LD_PRUNED_VARIANTS <- 'gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/pooled/ld-pca/20220314/aou_alpha3_ukb_lipids_plink_ld.prune.in'

# Created via notebook aou_workbench_pooled_analyses/04_examine_variant_merge_results.ipynb
POOLED_AF <- 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/pooled/examine-geno/20220512/merged_allele_freq-chr1-chr22.tsv'
AOU_ONLY_AF <- 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/pooled/examine-geno/20220512/aou_only_allele_freq-chr1-chr22.tsv'
UKB_ONLY_AF <- 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/pooled/examine-geno/20220512/ukb_only_allele_freq-chr1-chr22.tsv'

PLOT_SUBTITLE <- 'Source: All of Us v5 alpha3 and UK Biobank data'

# Load phenotypes

In [None]:
pheno <- read_csv(pipe(str_glue('gsutil cat {MERGED_PHENO}')))

dim(pheno)
spec(pheno)

## Add age group categorical variable

In [None]:
pheno <- pheno %>%
    mutate(
        age_group = cut_width(age, width = 20, boundary = 0)
    )

## Improve `sex_at_birth` labels

In [None]:
unique(pheno$sex_at_birth)

In [None]:
pheno <- pheno %>%
    mutate(
        sex_at_birth = ifelse(is.na(sex_at_birth),
                              'Not male, not female, prefer not to answer, or skipped',
                              sex_at_birth)
    )

In [None]:
unique(pheno$sex_at_birth)

## Improve `cohort` labels

In [None]:
unique(pheno$cohort)

In [None]:
# Replace the short cohort codes with display names. Any value other than the
# two known codes (including labels already converted by an earlier run of this
# cell) is kept unchanged rather than silently becoming NA.
pheno <- pheno %>%
    mutate(
        cohort = case_when(
            cohort == 'AOU' ~ 'All of Us',
            cohort == 'UKB' ~ 'UK Biobank',
            TRUE ~ cohort
        )
    )

In [None]:
unique(pheno$cohort)

## Consolidate `race` labels

In [None]:
unique(pheno$race)

In [None]:
pheno %>%
    select(id, cohort, race) %>%
    distinct() %>%
    group_by(cohort, race) %>%
    summarize(count = n())

In [None]:
# Collapse the `race` labels into a smaller set of categories stored in a new
# `ancestry` column. Several of the replacement labels contain '\n' so that
# they wrap onto two lines when used as axis labels.
pheno <- pheno %>%
    mutate(
        ancestry = case_when(
            # Missing, declined, unknown and small "other" categories are pooled together.
            race %in% c('Native Hawaiian or Other Pacific Islander',
                        'I prefer not to answer',
                        'None Indicated',
                        'None of these',
                        'PMI: Skip',
                        'Do not know',
                        'Other ethnic group',
                        'Prefer not to answer') | is.na(race) ~ 'Other or\nnot specified',
            race %in% c('More than one population', 'Mixed') ~ 'More than\none population',
            race %in% c('Black or African American', 'Black or Black British') ~ 'Black',
            race %in% c('Asian', 'Asian or Asian British', 'Chinese') ~ 'Asian',
            race == 'Middle Eastern or North African' ~ 'Middle Eastern\nor North African',
            # Any label not matched above is carried over unchanged.
            TRUE ~ race
        )
    )

In [None]:
pheno %>%
    select(id, cohort, ancestry) %>%
    distinct() %>%
    group_by(cohort, ancestry) %>%
    summarize(count = n())

In [None]:
pheno %>%
    select(id, ancestry) %>%
    distinct() %>%
    group_by(ancestry) %>%
    summarize(count = n())

## Distinguish between adjusted and unadjusted values

In [None]:
# Split lipid_type (e.g. 'LDL' or 'LDL_adjusted') into the lipid name and an
# adjustment marker, keeping the original lipid_type column.
pheno <- pheno %>%
    separate(
        col = lipid_type,
        into = c('lipid', 'is_adjusted'),
        sep = '_',
        remove = FALSE,
        # Values without a '_' have no second piece; fill it with NA explicitly
        # instead of relying on the default, which does the same but warns.
        fill = 'right'
    ) %>%
    mutate(
        # Rows with no adjustment marker are the raw measurements.
        is_adjusted_for_statin_use = ifelse(is.na(is_adjusted), 'raw', is_adjusted),
        # Order the levels so that 'adjusted' comes last.
        is_adjusted_for_statin_use = fct_relevel(is_adjusted_for_statin_use, 'adjusted', after = Inf)
    )

## Check categorical variables

In [None]:
pheno %>%
    select(id, cohort) %>%
    distinct() %>%
    group_by(cohort) %>%
    summarize(count = n())

In [None]:
pheno %>%
    select(id, sex_at_birth) %>%
    distinct() %>%
    group_by(sex_at_birth) %>%
    summarize(count = n())

In [None]:
pheno %>%
    select(id, statin_use) %>%
    distinct() %>%
    group_by(statin_use) %>%
    summarize(count = n())

In [None]:
pheno %>%
    group_by(lipid_type, lipid, is_adjusted_for_statin_use) %>%
    summarize(count = n())

# Plot lipids

In [None]:
#' Boxplot of one lipid measurement, grouped along the x-axis and split by fill color.
#'
#' Each box is annotated with its number of observations via get_boxplot_fun_data().
#'
#' @param data Data frame with columns `lipid_type` and `mg_dl`, plus the
#'   columns named by `xvar` and `fillvar`.
#' @param xvar Name (string) of the column mapped to the x-axis.
#' @param yvar Value of `lipid_type` to plot; also used as the y-axis label and in the title.
#' @param fillvar Name (string) of the column mapped to the fill color.
#' @param title_detail Optional text appended to the plot title.
#' @param log_scale If TRUE, draw the y-axis on a log10 scale.
#' @param long_labels If TRUE, rotate the x-axis labels by 45 degrees and use a taller plot.
#' @return A ggplot object. As a side effect, sets the `repr.plot.width` and
#'   `repr.plot.height` options used for notebook display.
plot_vars <- function(data, xvar, yvar, fillvar, title_detail = '', log_scale = FALSE, long_labels = FALSE) {
    p <- data %>%
        # `!!yvar` forces the argument's value, so a data column that happened
        # to be called `yvar` could not shadow it.
        filter(lipid_type == !!yvar) %>%
        filter(!is.na(mg_dl)) %>%
        ggplot(aes(x = !!sym(xvar), y = mg_dl, fill = !!sym(fillvar))) +
        geom_boxplot() +
        stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 5,
                     position = position_dodge(width = 0.9), vjust = -0.8) +
        theme(
            axis.title.x = element_text(size = 14),
            axis.title.y = element_text(size = 14)
        ) +
        ylab(yvar) +
        labs(title = str_glue('{yvar} mg/dL per person by {xvar} and {fillvar} {title_detail}'),
             caption = PLOT_SUBTITLE)

    if (log_scale) {
        p <- p + scale_y_log10()
    }

    if (long_labels) {
        p <- p + theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1))
        # Extra height leaves room for the rotated labels.
        options(repr.plot.width = 16, repr.plot.height = 10)
    } else {
        options(repr.plot.width = 16, repr.plot.height = 8)
    }

    p
}

## By age group

In [None]:
for (lipid in c('LDL', 'TC', 'HDL', 'TG')) {
    print(plot_vars(data = pheno, xvar = 'age_group', yvar = lipid, fillvar = 'cohort'))
}

In [None]:
plot_vars(data = pheno, xvar = 'age_group', yvar = 'TG', fillvar = 'cohort',
          log_scale = TRUE, title_detail = '[log scale y-axis]')

## By ancestry

In [None]:
for (lipid in c('LDL', 'TC', 'HDL', 'TG')) {
    print(plot_vars(data = pheno, xvar = 'ancestry', yvar = lipid, fillvar = 'cohort', long_labels = TRUE))
}

In [None]:
plot_vars(data = pheno, xvar = 'ancestry', yvar = 'TG', fillvar = 'cohort',
          log_scale = TRUE, title_detail = '[log scale y-axis]', long_labels = TRUE)

## By statin use

In [None]:
for (lipid in c('LDL', 'TC', 'HDL', 'TG')) {
    print(plot_vars(data = pheno, xvar = 'cohort', yvar = lipid, fillvar = 'statin_use'))
}

In [None]:
plot_vars(data = pheno, xvar = 'cohort', yvar = 'TG', fillvar = 'statin_use',
          log_scale = TRUE, title_detail = '[log scale y-axis]')

## By statin use and adjusted

In [None]:
for (lipid in c('LDL_adjusted', 'TC_adjusted', 'TG_adjusted')) {
    print(plot_vars(data = pheno, xvar = 'cohort', yvar = lipid, fillvar = 'statin_use'))
}

In [None]:
# Special case the title for this plot.
plot_vars(data = pheno, xvar = 'cohort', yvar = 'TG_adjusted', fillvar = 'statin_use',
          log_scale = FALSE, title_detail = '[adjusted data is in log space]')

## By sex at birth

In [None]:
for (lipid in c('LDL', 'TC', 'HDL', 'TG')) {
    print(plot_vars(data = pheno, xvar = 'cohort', yvar = lipid, fillvar = 'sex_at_birth'))
}

In [None]:
plot_vars(data = pheno, xvar = 'cohort', yvar = 'TG', fillvar = 'sex_at_birth',
          log_scale = TRUE, title_detail = '[log scale y-axis]')

## Faceted plots for paper

In [None]:
#' Boxplot of one lipid, faceted into columns by `is_adjusted_for_statin_use`.
#'
#' Unlike plot_vars(), rows are selected on the `lipid` column, so every
#' `is_adjusted_for_statin_use` level present for that lipid gets its own panel.
#' Each box is annotated with its number of observations via get_boxplot_fun_data().
#'
#' @param data Data frame with columns `lipid`, `mg_dl` and
#'   `is_adjusted_for_statin_use`, plus the columns named by `xvar` and `fillvar`.
#' @param xvar Name (string) of the column mapped to the x-axis.
#' @param yvar Value of `lipid` to plot; also used as the y-axis label and in the title.
#' @param fillvar Name (string) of the column mapped to the fill color.
#' @param title_detail Optional text appended to the plot title.
#' @param log_scale If TRUE, draw the y-axis on a log10 scale.
#' @param long_labels If TRUE, rotate the x-axis labels by 45 degrees and use a taller plot.
#' @return A ggplot object. As a side effect, sets the `repr.plot.width` and
#'   `repr.plot.height` options used for notebook display.
plot_multiple_lipids <- function(data, xvar, yvar, fillvar, title_detail = '', log_scale = FALSE, long_labels = FALSE) {
    # NOTE(review): facet_grid() shares one y-axis across the panels, while
    # another cell in this notebook labels TG_adjusted as being in log space --
    # confirm that a shared axis is appropriate for every lipid plotted here.
    p <- data %>%
        # `!!yvar` forces the argument's value, so a data column that happened
        # to be called `yvar` could not shadow it.
        filter(lipid == !!yvar) %>%
        filter(!is.na(mg_dl)) %>%
        ggplot(aes(x = !!sym(xvar), y = mg_dl, fill = !!sym(fillvar))) +
        geom_boxplot() +
        stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 5,
                     position = position_dodge(width = 0.9), vjust = -0.8) +
        facet_grid(cols = vars(is_adjusted_for_statin_use)) +
        theme(
            axis.title.x = element_text(size = 14),
            axis.title.y = element_text(size = 14)
        ) +
        ylab(yvar) +
        labs(title = str_glue('{yvar} mg/dL per person by {xvar} and {fillvar} {title_detail}'),
             caption = PLOT_SUBTITLE) +
        theme(strip.text.x = element_text(size = 20))

    if (log_scale) {
        p <- p + scale_y_log10()
    }

    if (long_labels) {
        p <- p + theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1))
        # Extra height leaves room for the rotated labels.
        options(repr.plot.width = 16, repr.plot.height = 10)
    } else {
        options(repr.plot.width = 16, repr.plot.height = 8)
    }

    p
}

In [None]:
# For each lipid, draw the before/after-adjustment figure (restricted to rows
# with mg_dl below 700), display it, and write it to a PNG in the working directory.
for (lipid in c('LDL', 'TC', 'HDL', 'TG')) {
    fig <- plot_multiple_lipids(
        data = filter(pheno, mg_dl < 700),
        xvar = 'cohort',
        yvar = lipid,
        fillvar = 'statin_use'
    )
    # Override the generic y label and title set by plot_multiple_lipids().
    fig <- fig +
        ylab(str_glue('{lipid} mg/dL')) +
        labs(
            title = str_glue('{lipid} per person by cohort, before and after adjustment for statin use'),
            caption = PLOT_SUBTITLE
        )

    print(fig)
    ggsave(
        str_glue('{lipid}_before_and_after_adjustment_for_statin_use.png'),
        plot = fig,
        device = 'png',
        width = 18,
        height = 8,
        units = 'in'
    )
}

# Load GWAS phenotypes

In [None]:
regenie_pheno = read_tsv(pipe(str_glue('gsutil cat {REGENIE_PHENO}')))

dim(regenie_pheno)
colnames(regenie_pheno)

In [None]:
regenie_pheno %>%
    group_by(cohort) %>%
    summarize(
        count = n()
    )

In [None]:
sort(colnames(regenie_pheno))

# Plot GWAS phenotypes

In [None]:
#' Histogram of one variable, with one free-scaled panel per value of a facet variable.
#'
#' Rows where the plotted variable is NA are dropped before plotting.
#'
#' @param data Data frame containing the columns named by `xvar` and `facetvar`.
#' @param xvar Name (string) of the column to histogram; also used in the title.
#' @param facetvar Name (string) of the column used to split the plot into panels.
#' @param title_detail Optional text appended to the plot title.
#' @param log_scale If TRUE, draw the y-axis (counts) on a log10 scale.
#' @return A ggplot object. As a side effect, sets the `repr.plot.width` and
#'   `repr.plot.height` options used for notebook display.
plot_var_histograms <- function(data, xvar, facetvar, title_detail = '', log_scale = FALSE) {
    xvar_sym <- sym(xvar)

    options(repr.plot.width = 16, repr.plot.height = 8)

    p <- data %>%
        filter(!is.na(!!xvar_sym)) %>%
        ggplot(aes(x = !!xvar_sym)) +
        geom_histogram(bins = 30) +
        facet_wrap(vars(!!sym(facetvar)), ncol = 2, scales = 'free') +
        theme(
            axis.title.x = element_text(size = 14),
            axis.title.y = element_text(size = 14)
        ) +
        labs(title = str_glue('{xvar} {title_detail}'),
             caption = PLOT_SUBTITLE)

    if (log_scale) {
        p <- p + scale_y_log10()
    }

    p
}

In [None]:
options(repr.plot.width = 8, repr.plot.height = 8)
for (lipid in c('HDL', 'HDL_resid', 'HDL_norm',
                'LDL', 'LDL_adjusted', 'LDL_adjusted_resid', 'LDL_adjusted_norm',
                'TC', 'TC_adjusted', 'TC_adjusted_resid', 'TC_adjusted_norm',
                'TG', 'TG_adjusted', 'TG_adjusted_resid', 'TG_adjusted_norm')) {
    print(plot_var_histograms(data = regenie_pheno, xvar = lipid, facetvar = 'cohort'))
}

# Load the regenie GWAS results

Bring our results into a single dataframe with a lipid type column.

In [None]:
# Read the regenie output for every lipid, tag each row with its lipid, and
# stack everything into one data frame.
combined_regenie_results <- LIPIDS %>%
    map(function(lipid_name) {
        file <- REGENIE_RESULTS[lipid_name]
        per_lipid <- read_delim(pipe(str_glue('gsutil cat {file}')), delim = ' ')
        mutate(per_lipid, lipid_type = lipid_name)
    }) %>%
    bind_rows() %>%
    # regenie reports -log10(p); convert it back to a plain p-value.
    mutate(p_value = 10 ^ (-LOG10P))

dim(combined_regenie_results)

In [None]:
head(combined_regenie_results)

In [None]:
# Per-lipid sanity summary of the GWAS results: row counts, number of
# significant hits, and the ranges of the key numeric columns.
combined_regenie_results %>%
    mutate(
        # Significance threshold of p < 5e-08, expressed on the -log10 scale.
        significant = LOG10P > -log10(5e-08)
    ) %>%
    group_by(lipid_type) %>%
    summarize(
        num_results = n(),
        num_significant_results = sum(significant),
        min_p_value = min(p_value),
        max_p_value = max(p_value),
        min_LOG10P = min(LOG10P),
        max_LOG10P = max(LOG10P),
        min_A1FREQ = min(A1FREQ),
        max_A1FREQ = max(A1FREQ),
        min_N = min(N),
        max_N = max(N)
    )

# Plot regenie results

In [None]:
#' Draw a Manhattan plot followed by a QQ plot for one set of regenie results.
#'
#' Both plots are drawn with qqman and carry PLOT_SUBTITLE as their subtitle.
#'
#' @param regenie_results Data frame with columns `CHROM`, `GENPOS`, `ID` and `p_value`.
#' @param manhattan_title Title for the Manhattan plot.
#' @param qq_title Title for the QQ plot.
#' @param ylim y-axis limits for the Manhattan plot. The default, c(0, 200),
#'   is the value that was previously hard-coded.
#' @return The value of the final `qq()` call; the function is used for its
#'   plotting side effects. It also sets the `repr.plot.width` and
#'   `repr.plot.height` options used for notebook display.
plot_manhattan_and_qq <- function(regenie_results, manhattan_title, qq_title, ylim = c(0, 200)) {
    options(repr.plot.width = 10, repr.plot.height = 10)
    manhattan(regenie_results,
              chr = 'CHROM',
              bp = 'GENPOS',
              snp = 'ID',
              p = 'p_value',
              logp = TRUE,
              annotateTop = FALSE,
              ylim = ylim,
              cex = 1.25,
              cex.axis = 1.25,
              cex.lab = 1.25,
              main = manhattan_title,
              sub = PLOT_SUBTITLE
             )

    qq(regenie_results$p_value,
       cex = 1.25,
       cex.axis = 1.25,
       cex.lab = 1.25,
       main = qq_title,
       sub = PLOT_SUBTITLE)
}

## All GWAS results

In [None]:
# For each lipid: report the size of its result set, the GC score and the
# number of rows per TEST value, then draw the Manhattan and QQ plots.
# walk() is used because the function is called only for its side effects.
walk(LIPIDS, function(lipid) {
    regenie_results <- combined_regenie_results %>% filter(lipid_type == lipid)
    file <- REGENIE_RESULTS[lipid]

    # Ratio of the median observed chi-square statistic to the median of a
    # chi-square distribution with 1 degree of freedom.
    gc_score <- median(regenie_results$CHISQ) / qchisq(0.5, 1, lower.tail = FALSE)

    message(str_glue('nrow: {nrow(regenie_results)} ncol: {ncol(regenie_results)} in {file}'))
    message(str_glue('GC: {round(gc_score, 3)}'))

    # message() pastes its arguments together, which would mangle a data frame;
    # capture the printed table instead and emit that text.
    test_counts <- regenie_results %>%
        group_by(TEST) %>%
        summarize(count = n())
    message(paste(capture.output(print(test_counts)), collapse = '\n'))

    plot_manhattan_and_qq(
        regenie_results,
        manhattan_title = str_glue('{basename(file)} results\nfrom {dirname(file)}'),
        qq_title = str_glue('{basename(file)} results\nfrom {dirname(file)}\n GC: {round(gc_score, 3)}')
    )
})

## Filter to common variants

## Prune variants in LD

In [None]:
ld_pruned_variants  <- read_tsv(pipe(str_glue('gsutil cat {LD_PRUNED_VARIANTS}')), col_names = 'variant_id')

head(ld_pruned_variants)

## Prune variants in LD and filter to common variants

# Comparisons against other lipids studies 

## Comparison with UKB published GWAS summary

##### Rare coding variants in 35 genes associate with circulating lipid levels – a multi-ancestry analysis of 170,000 exomes. Hindy et al 2021

In [None]:
download.file('https://www.biorxiv.org/content/biorxiv/early/2021/09/01/2020.12.22.423783/DC2/embed/media-2.xlsx?download=true', 'hindy.xlsx')

Bring the Hindy results into a single dataframe with a lipid type column.

In [None]:
# Load sheet 'Table_S11' of the downloaded Hindy et al. workbook (skipping its
# first row and reading 'NA' as missing), keep only the rows whose Ancestry is
# 'Overall', and add the ID and lipid_type columns used later in this notebook.
combined_hindy_results <- read_xlsx('hindy.xlsx', sheet = 'Table_S11', skip = 1, na = 'NA') %>%
    filter(Ancestry == 'Overall') %>%
    mutate(
        # 'chr' followed by RSID with every ':' replaced by '_'.
        # NOTE(review): presumably this mirrors the ID format of the regenie results -- confirm.
        ID = str_c('chr', str_replace_all(RSID, ':', '_')),
        # Rename the two adjusted traits to the lipid names used in this
        # notebook; every other Trait value is kept as-is.
        lipid_type = case_when(
            Trait == 'LDL_ADJ' ~ 'LDL',
            Trait == 'TOTAL_ADJ' ~ 'TC',
            TRUE ~ Trait
        )
    )

dim(combined_hindy_results)

In [None]:
head(combined_hindy_results)

In [None]:
# For each lipid, scatter the pooled GWAS effect sizes against those reported
# by Hindy et al. for the variants present in both, annotated with the number
# of shared variants, R-squared and the correlation test p-value.
map(LIPIDS, function(lipid) {
    hindy_results <- combined_hindy_results %>%
        filter(lipid_type == lipid) %>%
        select(ID, beta_Hindy = BETA_FE)

    in_common_results <- inner_join(
        hindy_results,
        combined_regenie_results %>%
            filter(lipid_type == lipid) %>%
            select(ID, beta_pooled = BETA),
        # ID is the only shared column; naming it makes the key explicit.
        by = 'ID'
    )

    num_hindy_results <- nrow(hindy_results)
    num_in_common_results <- nrow(in_common_results)
    # Keep R-squared at full precision here; it is rounded once, to 3 digits,
    # when the label is built (rounding to 2 first would waste the third digit).
    result_r_squared <- cor(in_common_results$beta_pooled, in_common_results$beta_Hindy)^2
    result_cor_test <- cor.test(in_common_results$beta_pooled, in_common_results$beta_Hindy)
    result_cor_test_p <- scientific(result_cor_test$p.value, digits = 2)

    options(repr.plot.width = 10, repr.plot.height = 10)

    in_common_results %>%
        ggplot(aes(x = beta_Hindy, y = beta_pooled)) +
        geom_point(alpha = .5) +
        # Place the statistics in the bottom-right corner of the panel.
        annotate(geom = 'text',
                 x = max(in_common_results$beta_Hindy),
                 y = min(in_common_results$beta_pooled),
                 hjust = 'right',
                 vjust = -1,
                 color = 'dark blue',
                 size = 6,
                 label = str_glue('N = {num_in_common_results}\nR-square: {round(result_r_squared, digits = 3)}\nP-value= {result_cor_test_p}')) +
        geom_abline() +
        theme(
            axis.title.x = element_text(size = 14),
            axis.title.y = element_text(size = 14)
        ) +
        labs(title = str_glue('{lipid} pooled GWAS result comparison to {num_hindy_results}\nsignificant RSID from Hindy et al. 2021'),
             caption = PLOT_SUBTITLE)

})

## Comparison with TOPMed (Freeze8) Lipid GWAS

Whole genome sequence analysis of blood lipid levels in >66,000 individuals. [Selvaraj et al 2021](https://www.biorxiv.org/content/10.1101/2021.10.11.463514v1.supplementary-material)

In [None]:
download.file('https://www.biorxiv.org/content/biorxiv/early/2021/10/12/2021.10.11.463514/DC1/embed/media-1.xlsx?download=true', 'selvaraj.xlsx')

Bring the Selvaraj results into a single dataframe with a lipid type column.

In [None]:
# Cell ranges of the per-lipid tables within sheet 'Supplementary Table 3'.
selvaraj_tables <- c(HDL = 'A4:L361', LDL = 'A363:L701', TC = 'A703:L1027', TG = 'A1029:L1318')

# Read each per-lipid table, add ID, RSID and lipid_type columns, and stack the tables.
combined_selvaraj_results <- bind_rows(
    lapply(LIPIDS, function(lipid) {
        # Print some metadata for an eyeball check that we are associating the data with the correct lipid type.
        print(str_glue('{lipid} {selvaraj_tables[lipid]}'))
        first_row <- as.integer(str_extract(selvaraj_tables[lipid], '\\d+'))
        print(read_xlsx('selvaraj.xlsx', sheet = 'Supplementary Table 3', range = str_glue('A{first_row - 1}:A{first_row}')))

        # Read the table once and reuse it for both the row-count check and the data.
        lipid_table <- read_xlsx('selvaraj.xlsx', sheet = 'Supplementary Table 3', range = selvaraj_tables[lipid])
        print(nrow(lipid_table))

        lipid_table %>%
        mutate(
            # Work around a bad entry in the data causing the p.value column to be of type character.
            p.value = as.numeric(p.value),
            ID = str_glue('chr{CHR}_{POS}_{Allele1}_{Allele2}'),
            RSID = str_glue('{CHR}:{POS}:{Allele1}:{Allele2}'),
            lipid_type = lipid
        )
    }))

dim(combined_selvaraj_results)

In [None]:
head(combined_selvaraj_results)

In [None]:
# For each lipid, scatter the pooled GWAS effect sizes against those reported
# by Selvaraj et al. for the variants present in both, annotated with the
# number of shared variants, R-squared and the correlation test p-value.
map(LIPIDS, function(lipid) {
    selvaraj_results <- combined_selvaraj_results %>%
        filter(lipid_type == lipid) %>%
        select(ID, beta_selvaraj = BETA)

    in_common_results <- inner_join(
        selvaraj_results,
        combined_regenie_results %>%
            filter(lipid_type == lipid) %>%
            select(ID, beta_pooled = BETA),
        # ID is the only shared column; naming it makes the key explicit.
        by = 'ID'
    )

    num_selvaraj_results <- nrow(selvaraj_results)
    num_in_common_results <- nrow(in_common_results)
    # Keep R-squared at full precision here; it is rounded once, to 3 digits,
    # when the label is built (rounding to 2 first would waste the third digit).
    result_r_squared <- cor(in_common_results$beta_pooled, in_common_results$beta_selvaraj)^2
    result_cor_test <- cor.test(in_common_results$beta_pooled, in_common_results$beta_selvaraj)
    result_cor_test_p <- scientific(result_cor_test$p.value, digits = 2)

    options(repr.plot.width = 8, repr.plot.height = 8)

    in_common_results %>%
        ggplot(aes(x = beta_selvaraj, y = beta_pooled)) +
        geom_point(alpha = .5) +
        # Place the statistics in the bottom-right corner of the panel.
        annotate(geom = 'text',
                 x = max(in_common_results$beta_selvaraj),
                 y = min(in_common_results$beta_pooled),
                 hjust = 'right',
                 vjust = -1,
                 color = 'dark blue',
                 size = 6,
                 label = str_glue('N = {num_in_common_results}\nR-square: {round(result_r_squared, digits = 3)}\nP-value= {result_cor_test_p}')) +
        geom_abline() +
        theme(
            axis.title.x = element_text(size = 14),
            axis.title.y = element_text(size = 14)
        ) +
        labs(title = str_glue('{lipid} pooled GWAS result comparison to {num_selvaraj_results}\nsignificant RSID from Selvaraj et al. 2021'),
             caption = PLOT_SUBTITLE)

})

## Comparison Hindy vs. Selvaraj

In [None]:
# For each lipid, scatter the Selvaraj et al. effect sizes against the Hindy
# et al. effect sizes for the variants (matched on RSID) reported by both,
# annotated with the correlation and the number of shared variants.
map(LIPIDS, function(lipid) {
    hindy_results <- combined_hindy_results %>%
        filter(lipid_type == lipid) %>%
        select(RSID, beta_Hindy = BETA_FE)

    selvaraj_results <- combined_selvaraj_results %>%
        filter(lipid_type == lipid) %>%
        select(RSID, beta_selvaraj = BETA)

    in_common_results <- inner_join(
        hindy_results,
        selvaraj_results,
        # RSID is the only shared column; naming it makes the key explicit.
        by = 'RSID'
    )

    num_hindy_results <- nrow(hindy_results)
    num_selvaraj_results <- nrow(selvaraj_results)
    num_in_common_results <- nrow(in_common_results)
    result_cor <- cor(in_common_results$beta_selvaraj, in_common_results$beta_Hindy)

    options(repr.plot.width = 8, repr.plot.height = 8)

    in_common_results %>%
        ggplot(aes(x = beta_Hindy, y = beta_selvaraj)) +
        geom_point(alpha = .5) +
        # Place the statistics in the bottom-right corner of the panel.
        annotate(geom = 'text',
                 x = max(in_common_results$beta_Hindy),
                 y = min(in_common_results$beta_selvaraj),
                 hjust = 'right',
                 vjust = -1,
                 color = 'dark blue',
                 size = 6,
                 label = str_glue('correlation: {round(result_cor, digits = 3)}\nN = {num_in_common_results}')) +
        geom_abline() +
        theme(
            axis.title.x = element_text(size = 14),
            axis.title.y = element_text(size = 14)
        ) +
        labs(title = str_glue('{lipid} GWAS result comparison between {num_hindy_results} significant RSID\nfrom Hindy et al. 2021 and {num_selvaraj_results} significant RSID from\nSelvaraj et al. 2021'),
             caption = PLOT_SUBTITLE)

})

# Plot allele frequencies

In [None]:
pooled_af <- read_tsv(pipe(str_glue('gsutil cat {POOLED_AF}')))

dim(pooled_af)

In [None]:
aou_only_af <- read_tsv(pipe(str_glue('gsutil cat {AOU_ONLY_AF}')))

dim(aou_only_af)

In [None]:
ukb_only_af <- read_tsv(pipe(str_glue('gsutil cat {UKB_ONLY_AF}')))

dim(ukb_only_af)

In [None]:
# Accepted chromosome labels: 'chr1'..'chr22' followed by '1'..'22'.
AUTOSOME_LEVELS <- c(paste0('chr', 1:22), as.character(1:22))
VARIANT_SET_LEVELS <- c('pooled', 'ukb_only', 'aou_only')

# Stack the three allele-frequency tables, recording which table each row came from.
allele_freq <- bind_rows(
    mutate(pooled_af, variant_set = 'pooled'),
    mutate(aou_only_af, variant_set = 'aou_only'),
    mutate(ukb_only_af, variant_set = 'ukb_only')
)

# Derive the chromosome from the start of `locus` and turn both grouping
# columns into factors with a fixed level order.
allele_freq <- allele_freq %>%
    mutate(
        chrom = parse_factor(str_extract(locus, '^chr\\d+'), levels = AUTOSOME_LEVELS),
        variant_set = parse_factor(variant_set, levels = VARIANT_SET_LEVELS)
    )

dim(allele_freq)

In [None]:
skim(allele_freq)

In [None]:
allele_freq %>% filter(is.na(AF)) %>% group_by(variant_set) %>% summarize(count = n())

In [None]:
allele_freq %>% filter(is.na(AF)) %>% head()

In [None]:
options(repr.plot.width = 16, repr.plot.height = 8)

allele_freq %>%
    ggplot(aes(x = AF)) +
    geom_histogram() +
    facet_grid(cols = vars(variant_set)) +
    scale_y_log10(labels=comma) +
    theme(
        axis.title.x=element_text(size=14),
        axis.title.y=element_text(size=14),
    ) +
    labs(title = str_glue('Histogram of allele frequencies [y-axis in log scale]'),
         caption = PLOT_SUBTITLE)

In [None]:
# Histogram of allele frequencies below 1%, one panel per variant set, with
# counts on a log10 y-axis.
options(repr.plot.width = 16, repr.plot.height = 8)

allele_freq %>%
    # NOTE(review): the title says "MAF < 1%" but the filter is on AF; if AF can
    # exceed 0.5, variants with AF > 0.99 also have a minor allele frequency
    # below 1% and are not included here -- confirm which is intended.
    filter(AF < .01) %>%
    ggplot(aes(x = AF)) +
    geom_histogram() +
    facet_grid(cols = vars(variant_set)) +
    scale_y_log10(labels=comma) +
    theme(
        axis.title.x=element_text(size=14),
        axis.title.y=element_text(size=14),
    ) +
    labs(title = str_glue('Histogram of allele frequencies MAF < 1% [y-axis in log scale]'),
         caption = PLOT_SUBTITLE)

In [None]:
options(repr.plot.width = 16, repr.plot.height = 8)

allele_freq %>%
#    filter(chrom == 'chr21') %>%
    ggplot(aes(x = AF, y = AN)) +
    geom_hex(bins = 100) +
    facet_wrap(~variant_set, ncol = 3) +
    scale_y_continuous(labels=comma) +
    scale_fill_viridis(direction = -1) +
    theme(
        axis.title.x=element_text(size=14),
        axis.title.y=element_text(size=14),
    ) +
    xlab('allele frequency (AF)') +
    ylab('allele number (AN)') +
    labs(title = str_glue('Characteristics of variants found in both datasets, versus those found only in',
                          ' UK Biobank or All of Us.'),
         caption = PLOT_SUBTITLE)

In [None]:
options(repr.plot.width = 16, repr.plot.height = 8)

allele_freq %>%
#    filter(chrom == 'chr21') %>%
    ggplot(aes(x = AF, y = AN)) +
    geom_point(alpha = 0.3) +
    facet_wrap(~variant_set, ncol = 3) +
    scale_y_continuous(labels=comma) +
    theme(
        axis.title.x=element_text(size=14),
        axis.title.y=element_text(size=14),
    ) +
    xlab('allele frequency (AF)') +
    ylab('allele number (AN)') +
    labs(title = str_glue('Characteristics of variants found in both datasets, versus those found only in',
                          ' UK Biobank or All of Us.'),
         caption = PLOT_SUBTITLE)

In [None]:
options(repr.plot.width = 10, repr.plot.height = 8)

p <- aou_only_af %>%
    ggplot(aes(x = AF, y = AN)) +
    geom_point(alpha = 0.5) +
#    facet_wrap(~gnomad_popmax_pop, ncol = 2) +
    scale_y_continuous(labels=comma) +
    scale_fill_viridis() +
    theme(
        axis.title.x=element_text(size=14),
        axis.title.y=element_text(size=14),
    )  +
    labs(title = str_glue('Allele information for variants found in AoU only'),
         caption = PLOT_SUBTITLE)

ggMarginal(p, type = 'histogram', size = 10, fill = 'green', xparams = list(bins=100))

In [None]:
options(repr.plot.width = 16, repr.plot.height = 8)

# Scatter each variant's allele frequency against its gnomAD popmax allele
# frequency, one panel per variant set. Point transparency is set on
# geom_point(); an `alpha` argument passed to ggplot() itself has no effect.
allele_freq %>%
    ggplot(aes(x = AF, y = gnomad_popmax_AF)) +
    geom_point(alpha = 0.3) +
    facet_wrap(~variant_set, ncol = 3) +
    scale_y_continuous(labels = comma) +
    theme(
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14)
    ) +
    xlab('allele frequency (AF)') +
    ylab('gnomAD popmax allele frequency') +
    labs(title = str_glue('gnomAD popmax allele frequencies of variants found in both datasets, versus those found\nonly in',
                          ' UK Biobank or All of Us.'),
         caption = PLOT_SUBTITLE)

# Provenance 

In [None]:
devtools::session_info()