# Examine significant GWAS results

In this notebook we review and explore the significant lipids GWAS results from *All of Us* and UK Biobank data.

Note that this work is part of a larger project to [Demonstrate the Potential for Pooled Analysis of All of Us and UK Biobank Genomic Data](https://github.com/all-of-us/ukb-cross-analysis-demo-project).

# Setup

<div class="alert alert-block alert-warning">
    <b>Cloud Environment</b>: This notebook was written for use on the <i>All of Us</i> Workbench.
    <ul>
        <li>Use "Recommended Environment" <kbd><b>General Analysis</b></kbd> which creates compute type <kbd><b>Standard VM</b></kbd> with reasonable defaults for CPU, RAM, and disk.</li>
        <li>This notebook takes several minutes to run interactively. You can also run it in the background via <kbd>run_notebook_in_the_background</kbd> for the sake of provenance and reproducibility.</li>
    </ul>
</div>

In [None]:
# Install any required packages that are missing from this environment.
#
# installed.packages() returns a matrix with one row per package, so the
# required names are compared against its row names only. Matching against the
# whole matrix would also match a metadata cell that happens to equal a package
# name (for example another package's Imports field), and the package would
# then wrongly be treated as already installed.
invisible(local({
    required_packages <- c('ggExtra', 'hexbin', 'hrbrthemes', 'skimr', 'qqman', 'tidyjson', 'viridis')
    missing_packages <- setdiff(required_packages, rownames(installed.packages()))
    if (length(missing_packages) > 0) {
        install.packages(missing_packages)
    }
}))

In [None]:
library(ggExtra)
library(grid)
library(gridExtra)
library(hrbrthemes)
library(lubridate)
library(qqman)
library(readxl)
library(scales)
library(skimr)
library(tidyjson)
library(tidyverse)
library(viridis)

In [None]:
# Set some visualization defaults: theme_ipsum with base_size 16 becomes the default theme for plots.
theme_set(new = theme_ipsum(base_size = 16))

#' Build the position and text of a count annotation for a ggplot boxplot.
#'
#' Intended for use as the `fun.data` argument of `ggplot2::stat_summary()`.
#'
#' @param df The values to summarize. Despite its name, `max()` and `length()`
#'   are applied to it directly, as they would be to a plain vector.
#' @return A data frame with column `y` holding the maximum of `df` and column
#'   `label` holding the text 'N = <length of df>'.
get_boxplot_fun_data <- function(df) {
  data.frame(y = max(df), label = paste0('N = ', length(df)))
}

## Define constants

In [None]:
# Papermill parameters. See https://papermill.readthedocs.io/en/latest/usage-parameterize.html

#---[ Inputs ]---
# Created via notebook aou_workbench_pooled_analyses/13_annotate_significant_gwas_results.ipynb
ANNOTATED_SIGNIFICANT_LIPIDS_GWAS_RESULTS <- 'gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/results/20220603/significant_lipids_gwas_results_gnomad_annotated-chr1-chr22.tsv'
ANNOTATED_SIGNIFICANT_BATCH_GWAS_RESULTS <- 'gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/results/20220603/significant_batch_gwas_results_gnomad_annotated-chr1-chr22.tsv'

# Caption text attached to the plots that are saved to disk.
PLOT_SUBTITLE <- 'Source: All of Us v5 alpha3 and UK Biobank data'

# Chromosome labels in both spellings: 'chr1'..'chr22' followed by '1'..'22'.
AUTOSOMES <- c(paste0('chr', 1:22), as.character(1:22))

#---[ Outputs ]---
# Create a timestamp for a folder of results generated today.
DATESTAMP <- strftime(now(), '%Y%m%d')
DESTINATION <- str_glue('{Sys.getenv("WORKSPACE_BUCKET")}/data/results/{DATESTAMP}/')

# Read in annotated significant lipids GWAS results

In [None]:
# Stream the TSV from Cloud Storage through `gsutil cat` and parse it directly.
annotated_significant_lipids_gwas_results <-
    str_glue('gsutil cat {ANNOTATED_SIGNIFICANT_LIPIDS_GWAS_RESULTS}') %>%
    pipe() %>%
    read_tsv()

In [None]:
# Print a skimr summary of the table that was just loaded.
skim(annotated_significant_lipids_gwas_results)

In [None]:
# Show the first rows of the lipids results.
annotated_significant_lipids_gwas_results %>% head()

In [None]:
# List the columns available in the lipids results.
names(annotated_significant_lipids_gwas_results)

In [None]:
# Peek at the raw popmax values before they are parsed as JSON.
annotated_significant_lipids_gwas_results$popmax %>% head()

In [None]:
# Parse the `popmax` column as JSON, spread its fields into columns, and prefix
# the listed fields with 'gnomad_popmax_'.
annotated_significant_lipids_gwas_results <- annotated_significant_lipids_gwas_results %>%
    # Rows with a missing popmax get an empty JSON object.
    mutate(popmax = ifelse(is.na(popmax), '{}', popmax)) %>%
    as.tbl_json(json.column = 'popmax') %>%
    spread_all() %>%
    rename_at(
        vars(AC, AF, AN, homozygote_count, pop, faf95),
        function(column_name) paste0('gnomad_popmax_', column_name)
    )

names(annotated_significant_lipids_gwas_results)

# Plot pooled and gnomAD popmax

In [None]:
# Set the rendered plot size for the notebook.
options(repr.plot.width = 16, repr.plot.height = 8)

# Scatter A1FREQ_pooled against gnomad_popmax_AF for rows flagged
# significant_pooled, colored by gnomad_popmax_pop. The y = x reference line
# marks where the two frequencies are equal.
annotated_significant_lipids_gwas_results %>%
    filter(significant_pooled == TRUE) %>%
    ggplot(aes(x = A1FREQ_pooled, y = gnomad_popmax_AF, color = gnomad_popmax_pop)) +
    geom_point() +
    geom_abline(intercept = 0, slope = 1) +
    ggtitle('gnomAD popmax allele frequencies for significant pooled GWAS results') +
    theme(
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14),
        strip.text.x = element_text(size = 16)
    )


# Plot METAL and gnomAD popmax

In [None]:
# Set the rendered plot size for the notebook.
options(repr.plot.width = 16, repr.plot.height = 8)

# Scatter Freq1_metal against gnomad_popmax_AF for rows flagged
# significant_metal, colored by gnomad_popmax_pop. The y = x reference line
# marks where the two frequencies are equal.
annotated_significant_lipids_gwas_results %>%
    filter(significant_metal == TRUE) %>%
    ggplot(aes(x = Freq1_metal, y = gnomad_popmax_AF, color = gnomad_popmax_pop)) +
    geom_point() +
    geom_abline(intercept = 0, slope = 1) +
    ggtitle('gnomAD popmax allele frequencies for significant METAL GWAS results') +
    theme(
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14),
        strip.text.x = element_text(size = 16)
    )


## Flip METAL results

In [None]:
# List the columns, including the expanded gnomad_popmax_* fields.
names(annotated_significant_lipids_gwas_results)

In [None]:
# Add METAL effect and frequency columns that are flipped wherever `ref`
# equals the upper-cased Allele2_metal.
# NOTE(review): confirm that matching `ref` against Allele2 (rather than
# Allele1) is the intended flip condition for comparison with gnomad_popmax_AF.
annotated_significant_lipids_gwas_results <- annotated_significant_lipids_gwas_results %>%
    mutate(is_flipped_metal = ref == str_to_upper(Allele2_metal)) %>%
    mutate(
        # Flipping negates the effect and takes the complement of the frequency.
        flipped_Effect_metal = ifelse(is_flipped_metal, -Effect_metal, Effect_metal),
        flipped_Freq1_metal = ifelse(is_flipped_metal, 1 - Freq1_metal, Freq1_metal)
    )

dim(annotated_significant_lipids_gwas_results)

In [None]:
# Set the rendered plot size for the notebook.
options(repr.plot.width = 16, repr.plot.height = 8)

# Same comparison as the METAL plot, but using flipped_Freq1_metal on the
# x axis. The y = x reference line marks where the two frequencies are equal.
annotated_significant_lipids_gwas_results %>%
    filter(significant_metal == TRUE) %>%
    ggplot(aes(x = flipped_Freq1_metal, y = gnomad_popmax_AF, color = gnomad_popmax_pop)) +
    geom_point() +
    geom_abline(intercept = 0, slope = 1) +
    ggtitle('gnomAD popmax allele frequencies for significant METAL GWAS results') +
    theme(
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14),
        strip.text.x = element_text(size = 16)
    )


# Summarize by popmax

In [None]:
# Count rows for each combination of lipid type, significance flags,
# single-cohort flag, and gnomAD popmax population.
annotated_significant_lipids_gwas_result_summary <- annotated_significant_lipids_gwas_results %>%
    group_by(lipid_type, significant_pooled, significant_metal, single_cohort_result_metal, gnomad_popmax_pop) %>%
    # `.groups = 'drop_last'` spells out summarize()'s default for this call:
    # the result stays grouped by all but the last grouping column. Stating it
    # explicitly documents the grouping and avoids the regrouping message.
    summarize(count = n(), .groups = 'drop_last')

dim(annotated_significant_lipids_gwas_result_summary)

In [None]:
# Show the first rows of the per-group counts.
annotated_significant_lipids_gwas_result_summary %>% head()

In [None]:
# LDL counts for results that are significant in the pooled analysis but not
# significant (or absent) in METAL.
#
# The conditions are passed to filter() as separate arguments, which combines
# them elementwise. The scalar `&&` operator must not be used here: it does not
# operate row by row and raises an error on vectors longer than one in
# R >= 4.3.
annotated_significant_lipids_gwas_result_summary %>%
    filter(
        lipid_type == 'LDL',
        significant_pooled == TRUE,
        is.na(significant_metal) | significant_metal == FALSE
    )

In [None]:
# LDL counts for results that are significant in METAL but not significant (or
# absent) in the pooled analysis.
#
# The conditions are passed to filter() as separate arguments, which combines
# them elementwise. The scalar `&&` operator must not be used here: it does not
# operate row by row and raises an error on vectors longer than one in
# R >= 4.3.
annotated_significant_lipids_gwas_result_summary %>%
    filter(
        lipid_type == 'LDL',
        is.na(significant_pooled) | significant_pooled == FALSE,
        significant_metal == TRUE
    )

# Read in annotated significant batch GWAS results

In [None]:
# Stream the TSV from Cloud Storage through `gsutil cat` and parse it directly.
annotated_significant_batch_gwas_results <-
    str_glue('gsutil cat {ANNOTATED_SIGNIFICANT_BATCH_GWAS_RESULTS}') %>%
    pipe() %>%
    read_tsv()

In [None]:
# Print a skimr summary of the table that was just loaded.
skim(annotated_significant_batch_gwas_results)

In [None]:
# List the columns available in the batch results.
names(annotated_significant_batch_gwas_results)

In [None]:
# Peek at the raw info values before they are parsed.
annotated_significant_batch_gwas_results$info %>% head()

In [None]:
# Tabulate which keys occur in the `info` JSON objects and the JSON type of
# each, to see which fields are available for extraction.
annotated_significant_batch_gwas_results %>%
    # Rows with a missing info get an empty JSON object.
    mutate(info = ifelse(is.na(info), '{}', info)) %>%
    as.tbl_json(json.column = 'info') %>%
    gather_object() %>%
    json_types() %>%
    count(name, type)

In [None]:
# Add a Chromosome factor plus selected allele-specific (AS_*) fields and a
# gene symbol pulled out of the raw `info` and `vep` text columns.
annotated_significant_batch_gwas_results <- annotated_significant_batch_gwas_results %>%
    mutate(
        # Rows with a missing info get an empty JSON object.
        info = ifelse(is.na(info), '{}', info),
        Chromosome = parse_factor(chr, levels = AUTOSOMES)
    ) %>%
    # Each pattern captures the text between '"<key>":' and the next comma.
    # The patterns require a comma before the key, so a key that is the first
    # entry of its object is not matched and yields NA.
    extract(info, 'AS_FS', regex = ',"AS_FS":([^,]+)', remove = FALSE, convert = TRUE) %>%
    extract(info, 'AS_MQ', regex = ',"AS_MQ":([^,]+)', remove = FALSE, convert = TRUE) %>%
    extract(info, 'AS_QD', regex = ',"AS_QD":([^,]+)', remove = FALSE, convert = TRUE) %>%
    extract(info, 'AS_VarDP', regex = ',"AS_VarDP":([^,]+)', remove = FALSE, convert = TRUE) %>%
    extract(info, 'AS_VQSLOD', regex = ',"AS_VQSLOD":([^,]+)', remove = FALSE, convert = TRUE) %>%
    # The VEP field has many rows, just extract the gene symbol from the first row.
    extract(vep, 'gene', regex = ',"gene_symbol":"([^"]+)",', remove = FALSE, convert = TRUE) %>%
    mutate(
        # Coerce to numeric first, then test for NaN with is.nan() rather than
        # comparing against the string 'NaN'. NaN values become NA.
        AS_FS = as.numeric(AS_FS),
        AS_FS = if_else(is.nan(AS_FS), NA_real_, AS_FS)
    )

dim(annotated_significant_batch_gwas_results)

In [None]:
# Print a skimr summary of just the extracted AS_* columns.
skim(annotated_significant_batch_gwas_results %>% select(starts_with('AS_')))

In [None]:
# Set the rendered plot size for the notebook.
options(repr.plot.width = 16, repr.plot.height = 8)

# Boxplots of AS_MQ per chromosome for the LDL rows. Each box is annotated with
# the 'N = ...' label produced by get_boxplot_fun_data.
annotated_significant_batch_gwas_results %>%
    filter(lipid_type == 'LDL') %>%
    ggplot(aes(x = Chromosome, y = AS_MQ)) +
    geom_boxplot() +
    stat_summary(
        fun.data = get_boxplot_fun_data,
        geom = 'text',
        size = 5,
        position = position_dodge(width = 0.9),
        vjust = -0.8
    ) +
    labs(
        title = 'gnomAD AS_MQ scores for significant batch variants',
        caption = PLOT_SUBTITLE
    ) +
    theme(
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14)
    )

# No plot is passed to ggsave(), so it saves ggplot2's most recent plot.
ggsave(
    'gnomAD_AS_MQ_scores_for_significant_batch_variants.png',
    device = 'png', width = 18, height = 8, units = 'in'
)

In [None]:
# Set the rendered plot size for the notebook.
options(repr.plot.width = 16, repr.plot.height = 8)

# Boxplots of AS_FS per chromosome for the LDL rows. Each box is annotated with
# the 'N = ...' label produced by get_boxplot_fun_data.
annotated_significant_batch_gwas_results %>%
    filter(lipid_type == 'LDL') %>%
    ggplot(aes(x = Chromosome, y = AS_FS)) +
    geom_boxplot() +
    stat_summary(
        fun.data = get_boxplot_fun_data,
        geom = 'text',
        size = 5,
        position = position_dodge(width = 0.9),
        vjust = -0.8
    ) +
    labs(
        title = 'gnomAD AS_FS scores for significant batch variants',
        caption = PLOT_SUBTITLE
    ) +
    theme(
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14)
    )

# No plot is passed to ggsave(), so it saves ggplot2's most recent plot.
ggsave(
    'gnomAD_AS_FS_scores_for_significant_batch_variants.png',
    device = 'png', width = 18, height = 8, units = 'in'
)

In [None]:
# Set the rendered plot size for the notebook.
options(repr.plot.width = 16, repr.plot.height = 8)

# Boxplots of AS_QD per chromosome for the LDL rows. Each box is annotated with
# the 'N = ...' label produced by get_boxplot_fun_data.
annotated_significant_batch_gwas_results %>%
    filter(lipid_type == 'LDL') %>%
    ggplot(aes(x = Chromosome, y = AS_QD)) +
    geom_boxplot() +
    stat_summary(
        fun.data = get_boxplot_fun_data,
        geom = 'text',
        size = 5,
        position = position_dodge(width = 0.9),
        vjust = -0.8
    ) +
    labs(
        title = 'gnomAD AS_QD scores for significant batch variants',
        caption = PLOT_SUBTITLE
    ) +
    theme(
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14)
    )

# No plot is passed to ggsave(), so it saves ggplot2's most recent plot.
ggsave(
    'gnomAD_AS_QD_scores_for_significant_batch_variants.png',
    device = 'png', width = 18, height = 8, units = 'in'
)

In [None]:
# Set the rendered plot size for the notebook.
options(repr.plot.width = 16, repr.plot.height = 8)

# Boxplots of AS_VarDP per chromosome for the LDL rows. Each box is annotated
# with the 'N = ...' label produced by get_boxplot_fun_data.
annotated_significant_batch_gwas_results %>%
    filter(lipid_type == 'LDL') %>%
    ggplot(aes(x = Chromosome, y = AS_VarDP)) +
    geom_boxplot() +
    stat_summary(
        fun.data = get_boxplot_fun_data,
        geom = 'text',
        size = 5,
        position = position_dodge(width = 0.9),
        vjust = -0.8
    ) +
    labs(
        title = 'gnomAD AS_VarDP scores for significant batch variants',
        caption = PLOT_SUBTITLE
    ) +
    theme(
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14)
    )

# No plot is passed to ggsave(), so it saves ggplot2's most recent plot.
ggsave(
    'gnomAD_AS_VarDP_scores_for_significant_batch_variants.png',
    device = 'png', width = 18, height = 8, units = 'in'
)

In [None]:
# Set the rendered plot size for the notebook.
options(repr.plot.width = 16, repr.plot.height = 8)

# Boxplots of AS_VQSLOD per chromosome for the LDL rows. Each box is annotated
# with the 'N = ...' label produced by get_boxplot_fun_data.
annotated_significant_batch_gwas_results %>%
    filter(lipid_type == 'LDL') %>%
    ggplot(aes(x = Chromosome, y = AS_VQSLOD)) +
    geom_boxplot() +
    stat_summary(
        fun.data = get_boxplot_fun_data,
        geom = 'text',
        size = 5,
        position = position_dodge(width = 0.9),
        vjust = -0.8
    ) +
    labs(
        title = 'gnomAD AS_VQSLOD scores for significant batch variants',
        caption = PLOT_SUBTITLE
    ) +
    theme(
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14)
    )

# No plot is passed to ggsave(), so it saves ggplot2's most recent plot.
ggsave(
    'gnomAD_AS_VQSLOD_scores_for_significant_batch_variants.png',
    device = 'png', width = 18, height = 8, units = 'in'
)

In [None]:
# Count the LDL rows per chromosome and gene, listed from fewest to most.
annotated_significant_batch_gwas_results %>%
    filter(lipid_type == 'LDL') %>%
    group_by(Chromosome, gene) %>%
    # Drop the grouping once the counts are computed, so the displayed table is
    # a plain ungrouped result and no regrouping message is emitted.
    summarize(count = n(), .groups = 'drop') %>%
    arrange(count)

# Provenance 

In [None]:
# Record the R session details and loaded package versions for reproducibility.
devtools::session_info()