# Compare GWAS results

In this notebook we review and explore the differences and similarities between the GWAS results from the pooled analysis and from the METAL meta-analysis of *All of Us* and UK Biobank data.

Note that this work is part of a larger project to [Demonstrate the Potential for Pooled Analysis of All of Us and UK Biobank Genomic Data](https://github.com/all-of-us/ukb-cross-analysis-demo-project).

# Setup

<div class="alert alert-block alert-warning">
    <b>Cloud Environment</b>: This notebook was written for use on the <i>All of Us</i> Workbench.
    <ul>
        <li>Use "Recommended Environment" <kbd><b>General Analysis</b></kbd> which creates compute type <kbd><b>Standard VM</b></kbd> with reasonable defaults for CPU, RAM, and disk.</li>
        <li>This notebook takes several minutes to run interactively. You can also run it in the background via <kbd>run_notebook_in_the_background</kbd> for the sake of provenance and reproducibility.</li>
    </ul>
</div>

In [None]:
# Install any of the required CRAN packages that are not yet available.
# Package names are compared against the row names of installed.packages()
# (the package-name index) rather than against every cell of that matrix, so a
# name appearing only in another package's metadata is not mistaken for an
# installed package. Wrapped in local() so no helper variables leak into the
# notebook's global environment.
local({
  required <- c('ggExtra', 'hexbin', 'hrbrthemes', 'skimr', 'qqman', 'viridis')
  missing <- setdiff(required, rownames(installed.packages()))
  if (length(missing) > 0) {
    install.packages(missing)
  }
})

In [None]:
library(ggExtra)
library(grid)
library(gridExtra)
library(hrbrthemes)
library(lubridate)
library(qqman)
library(readxl)
library(scales)
library(skimr)
library(tidyverse)
library(viridis)

In [None]:
# Set some visualization defaults.
theme_set(theme_ipsum(base_size = 16)) # Default theme for plots.

#' Build the annotation row used to label a ggplot boxplot with its sample size.
#'
#' Intended to be passed as `fun.data` to `ggplot2::stat_summary()`, as done by
#' the boxplots in this notebook.
#'
#' @param df The values to summarise. Despite the name (kept for backward
#'   compatibility), this is used as a plain vector: its maximum and its length
#'   are taken.
#' @return A one-row data frame with column `y` holding `max(df)` and column
#'   `label` holding the text 'N = <length of df>'.
get_boxplot_fun_data <- function(df) {
  data.frame(y = max(df), label = paste0('N = ', length(df)))
}

## Define constants

In [None]:
# Papermill parameters. See https://papermill.readthedocs.io/en/latest/usage-parameterize.html

#---[ Inputs ]---
# Created via notebook aou_workbench_pooled_analyses/09_pooled_regenie_gwas.ipynb
# Named vector: lipid name -> Cloud Storage path of that lipid's REGENIE result file.
POOLED_LIPIDS_REGENIE_RESULTS <- c(
    HDL='gs://fc-secure-814555c0-7d23-4cf6-bab9-3782e1dcb32f/data/pooled/regenie/20230403/aou_alpha3_ukb_lipids_regenie_step2_HDL_norm.regenie',
    LDL='gs://fc-secure-814555c0-7d23-4cf6-bab9-3782e1dcb32f/data/pooled/regenie/20230403/aou_alpha3_ukb_lipids_regenie_step2_LDL_adjusted_norm.regenie',
    TC='gs://fc-secure-814555c0-7d23-4cf6-bab9-3782e1dcb32f/data/pooled/regenie/20230403/aou_alpha3_ukb_lipids_regenie_step2_TC_adjusted_norm.regenie',
    TG='gs://fc-secure-814555c0-7d23-4cf6-bab9-3782e1dcb32f/data/pooled/regenie/20230403/aou_alpha3_ukb_lipids_regenie_step2_TG_adjusted_norm.regenie'
)
# Lipid names taken from the pooled results vector; used to iterate over both input sets.
LIPIDS <- names(POOLED_LIPIDS_REGENIE_RESULTS)

# Created via notebook aou_workbench_siloed_analyses/09_metal_meta_analysis.ipynb
# Named vector: lipid name -> Cloud Storage path of that lipid's METAL result table.
METAL_META_ANALYSIS_RESULTS <- c(
    HDL='gs://fc-secure-e5c31994-13bb-4e3e-b7d7-db6effadc54f/data/metaanalysis/20230410/METAANALYSIS_HDL_1.tbl',
    LDL='gs://fc-secure-e5c31994-13bb-4e3e-b7d7-db6effadc54f/data/metaanalysis/20230410/METAANALYSIS_LDL_1.tbl',
    TC='gs://fc-secure-e5c31994-13bb-4e3e-b7d7-db6effadc54f/data/metaanalysis/20230410/METAANALYSIS_TC_1.tbl',
    TG='gs://fc-secure-e5c31994-13bb-4e3e-b7d7-db6effadc54f/data/metaanalysis/20230410/METAANALYSIS_TG_1.tbl'
)

# Text shown as the caption of the plots in this notebook.
PLOT_SUBTITLE <- 'Source: All of Us v5 alpha3 and UK Biobank data'

#---[ Outputs ]---
# Create a timestamp for a folder of results generated today.
DATESTAMP <- strftime(now(), '%Y%m%d')
# Dated results folder under the bucket named by the WORKSPACE_BUCKET environment variable.
DESTINATION <- str_glue('{Sys.getenv("WORKSPACE_BUCKET")}/data/results-ac-6/{DATESTAMP}/')
# Local file names of the TSV outputs written later in this notebook.
GWAS_RESULT_SUMMARY_FILENAME <- 'gwas_result_summary.tsv'
SIGNIFICANT_LIPIDS_GWAS_RESULTS_FILENAME <- 'CONTAINS_AC_6_significant_lipids_gwas_results.tsv'

# Load pooled lipids GWAS results

In [None]:
# Stream each lipid's REGENIE result file from Cloud Storage, stack the
# per-lipid tables into one data frame tagged with `lipid_type`, and derive the
# columns used for comparison with the meta-analysis results.
pooled_lipids_regenie_results <- bind_rows(
    lapply(LIPIDS, function(lipid) {
        # `[[` extracts the bare path string for the shell command.
        file <- POOLED_LIPIDS_REGENIE_RESULTS[[lipid]]
        read_delim(pipe(str_glue('gsutil cat {file}')), delim = ' ') %>%
            mutate(lipid_type = lipid)
    })) %>%
    mutate(
        # TRUE when the variant ID is not 'chr{CHROM}_{GENPOS}_{ALLELE0}_{ALLELE1}'.
        is_flipped = ID != str_glue('chr{CHROM}_{GENPOS}_{ALLELE0}_{ALLELE1}'),
        # NOTE(review): assumes N is the per-variant sample count, so 2 * N is
        # the allele number - confirm against the REGENIE output definition.
        AN = 2 * N,
        AC_alt = round(A1FREQ * AN),
        AC_ref = round((1 - A1FREQ) * AN),
        # Inclusive threshold, matching the meta-analysis rule `P-value` <= 5e-08.
        significant = LOG10P >= -log10(5e-08),
        p_value = 10 ^ (-1 * LOG10P)
    ) %>%
    # Suffix every non-key column so its origin stays clear after joining.
    rename_with(~ paste0(., '_pooled'), .cols = -c(lipid_type, ID, CHROM, GENPOS))

dim(pooled_lipids_regenie_results)

In [None]:
# Preview the first rows of the pooled results.
pooled_lipids_regenie_results %>%
    head()

# Load meta-analysis lipids GWAS results

In [None]:
# Stream each lipid's METAL result table from Cloud Storage, stack the tables
# into one data frame tagged with `lipid_type`, and derive the columns used for
# comparison with the pooled results.
metal_meta_analysis_results <- LIPIDS %>%
    lapply(function(lipid_name) {
        metal_path <- METAL_META_ANALYSIS_RESULTS[lipid_name]
        metal_table <- read_table(pipe(str_glue('gsutil cat {metal_path}')))
        mutate(metal_table, lipid_type = lipid_name)
    }) %>%
    bind_rows() %>%
    # MarkerName has four '_'-separated parts; keep the first two as columns.
    separate(
        MarkerName,
        sep = '_',
        into = c('CHROM_ID', 'GENPOS', NA, NA),
        convert = TRUE,
        remove = FALSE
    ) %>%
    mutate(
        ID = MarkerName,
        CHROM = parse_number(CHROM_ID),
        # Flag rows whose MarkerName differs from
        # CHROM_ID_GENPOS_ALLELE1_ALLELE2 (alleles upper-cased), then negate the
        # effect and complement the frequency for those rows.
        is_flipped = MarkerName != str_c(
            CHROM_ID, GENPOS, str_to_upper(Allele1), str_to_upper(Allele2),
            sep='_'
        ),
        fixed_beta = ifelse(is_flipped, -Effect, Effect),
        fixed_freq1 = ifelse(is_flipped, 1 - Freq1, Freq1),
        LOG10P = -log10(`P-value`),
        significant = `P-value` <= 5e-08,
        # A '?' in Direction marks a result that is missing for one cohort.
        single_cohort_result = str_detect(Direction, '\\?'),
        cohorts = case_when(
            Direction %in% c('++', '--') ~ 'both cohorts',
            Direction %in% c('+-', '-+') ~ 'both cohorts with conflicting direction of effect',
            Direction %in% c('+?', '-?') ~ 'AoU only',
            Direction %in% c('?+', '?-') ~ 'UKB only'
        )
    ) %>%
    # Suffix every non-key column so its origin stays clear after joining.
    rename_at(vars(-lipid_type, -ID, -CHROM, -GENPOS), ~ paste0(., '_metal'))

dim(metal_meta_analysis_results)

In [None]:
# Preview the first rows of the meta-analysis results.
metal_meta_analysis_results %>%
    head()

In [None]:
# List the column names of the meta-analysis results.
metal_meta_analysis_results %>%
    colnames()

In [None]:
# Count meta-analysis results per METAL direction string.
metal_meta_analysis_results %>%
    group_by(Direction_metal) %>%
    summarise(count = n())

# Check pooled vs. meta analysis lipids results

## Pooled versus Metal - '?' results flagged

In [None]:
# Full outer join, so variants present in only one approach are retained.
joined_pooled_and_all_metal_results <- full_join(
    metal_meta_analysis_results,
    pooled_lipids_regenie_results,
    by = c('lipid_type', 'ID', 'CHROM', 'GENPOS')
)

dim(joined_pooled_and_all_metal_results)

In [None]:
# List the column names of the joined table.
joined_pooled_and_all_metal_results %>%
    colnames()

### Check the join

In [None]:
# The joined table must contain exactly the distinct variant IDs found across
# the two inputs. Show both counts, then fail the notebook if they disagree.
length(unique(joined_pooled_and_all_metal_results$ID))
length(union(metal_meta_analysis_results$ID, pooled_lipids_regenie_results$ID))
stopifnot(
    length(unique(joined_pooled_and_all_metal_results$ID)) ==
        length(union(metal_meta_analysis_results$ID,
                     pooled_lipids_regenie_results$ID))
)

In [None]:
# Cross-tabulate the allele-flip flags of the two approaches for LDL.
filter(joined_pooled_and_all_metal_results, lipid_type == 'LDL') %>%
    group_by(is_flipped_pooled, is_flipped_metal) %>%
    summarise(count = n())

### Compare LOG10P values

In [None]:
# Summary statistics of LOG10P for LDL, split by significance in each approach.
# Missing values (variants absent from one approach) are ignored.
filter(joined_pooled_and_all_metal_results, lipid_type == 'LDL') %>%
    group_by(lipid_type, significant_pooled, significant_metal) %>%
    summarise(
        count = n(),
        min_LOG10P_pooled = round(min(LOG10P_pooled, na.rm = TRUE), digits = 3),
        min_LOG10P_metal = round(min(LOG10P_metal, na.rm = TRUE), digits = 3),
        median_LOG10P_pooled = round(median(LOG10P_pooled, na.rm = TRUE), digits = 3),
        median_LOG10P_metal = round(median(LOG10P_metal, na.rm = TRUE), digits = 3),
        max_LOG10P_pooled = round(max(LOG10P_pooled, na.rm = TRUE), digits = 3),
        max_LOG10P_metal = round(max(LOG10P_metal, na.rm = TRUE), digits = 3),
        mean_LOG10P_pooled = round(mean(LOG10P_pooled, na.rm = TRUE), digits = 3),
        mean_LOG10P_metal = round(mean(LOG10P_metal, na.rm = TRUE), digits = 3),
        sd_LOG10P_pooled = sd(LOG10P_pooled, na.rm = TRUE),
        sd_LOG10P_metal = sd(LOG10P_metal, na.rm = TRUE)
    )

In [None]:
# Same LOG10P summary for LDL, additionally split by which cohorts contributed
# to the meta-analysis result. Missing values are ignored.
filter(joined_pooled_and_all_metal_results, lipid_type == 'LDL') %>%
    group_by(lipid_type, significant_pooled, significant_metal, cohorts_metal) %>%
    summarise(
        count = n(),
        min_LOG10P_pooled = round(min(LOG10P_pooled, na.rm = TRUE), digits = 3),
        min_LOG10P_metal = round(min(LOG10P_metal, na.rm = TRUE), digits = 3),
        median_LOG10P_pooled = round(median(LOG10P_pooled, na.rm = TRUE), digits = 3),
        median_LOG10P_metal = round(median(LOG10P_metal, na.rm = TRUE), digits = 3),
        max_LOG10P_pooled = round(max(LOG10P_pooled, na.rm = TRUE), digits = 3),
        max_LOG10P_metal = round(max(LOG10P_metal, na.rm = TRUE), digits = 3),
        mean_LOG10P_pooled = round(mean(LOG10P_pooled, na.rm = TRUE), digits = 3),
        mean_LOG10P_metal = round(mean(LOG10P_metal, na.rm = TRUE), digits = 3),
        sd_LOG10P_pooled = sd(LOG10P_pooled, na.rm = TRUE),
        sd_LOG10P_metal = sd(LOG10P_metal, na.rm = TRUE)
    )

### Check summary counts

In [None]:
# Count LDL results by significance in each approach and single-cohort status.
filter(joined_pooled_and_all_metal_results, lipid_type == 'LDL') %>%
    group_by(
        lipid_type,
        significant_pooled,
        significant_metal,
        single_cohort_result_metal
    ) %>%
    summarise(count = n())

In [None]:
# Result counts for every lipid, by significance in each approach and
# single-cohort status.
gwas_result_summary <- summarise(
    group_by(
        joined_pooled_and_all_metal_results,
        lipid_type, significant_pooled, significant_metal, single_cohort_result_metal
    ),
    count = n()
)

In [None]:
# Show the LDL rows of the summary.
filter(gwas_result_summary, lipid_type == 'LDL')

#### Create a TSV with the summary counts for all lipids.

In [None]:
# Write the summary counts to a local TSV file.
gwas_result_summary %>%
    write_tsv(GWAS_RESULT_SUMMARY_FILENAME)

In [None]:
# Copy the file to the workspace bucket.
# (Disabled: the command below is commented out, so the file stays local unless it is re-enabled.)
#system(str_glue('gsutil cp {GWAS_RESULT_SUMMARY_FILENAME} {DESTINATION}'), intern = TRUE)

### Create a TSV with the significant results for all lipids.

In [None]:
# Keep every result that reaches genome-wide significance in at least one
# approach, and lay the columns out so that the pooled and meta-analysis values
# of each statistic sit side by side.
#
# The raw METAL direction string (Direction_metal) is deliberately not
# exported; `cohorts_metal` carries the same information in readable form. The
# earlier quoting of Direction_metal was dropped because that column never
# reached the output. If Direction_metal is ever added to the selection, quote
# it first (e.g. "'+-'") so spreadsheet tools do not read it as a formula.
significant_lipids_gwas_results <- joined_pooled_and_all_metal_results %>%
    filter(significant_pooled == TRUE
           | significant_metal == TRUE) %>%
    select(
        lipid_type, ID,
        significant_pooled, significant_metal, single_cohort_result_metal, cohorts_metal,
        p_value_pooled, `P-value_metal`,
        BETA_pooled, Effect_metal,
        SE_pooled, StdErr_metal,
        LOG10P_pooled, CHISQ_pooled,
        A1FREQ_pooled, N_pooled, Freq1_metal, FreqSE_metal, MinFreq_metal, MaxFreq_metal,
        CHROM, GENPOS,
        ALLELE0_pooled, ALLELE1_pooled, Allele1_metal, Allele2_metal
    ) %>%
    arrange(lipid_type, CHROM, GENPOS)

In [None]:
# Preview the first rows of the significant results.
significant_lipids_gwas_results %>%
    head()

In [None]:
# Count the significant LDL results by the approach(es) that flagged them.
filter(significant_lipids_gwas_results, lipid_type == 'LDL') %>%
    group_by(significant_pooled, significant_metal) %>%
    summarise(count = n())

In [None]:
# Write the significant results to a local TSV file.
significant_lipids_gwas_results %>%
    write_tsv(SIGNIFICANT_LIPIDS_GWAS_RESULTS_FILENAME)

In [None]:
# Copy the file to the workspace bucket.
# (Disabled: the command below is commented out, so the file stays local unless it is re-enabled.)
#system(str_glue('gsutil cp {SIGNIFICANT_LIPIDS_GWAS_RESULTS_FILENAME} {DESTINATION}'), intern = TRUE)

### Examine Metal direction

In [None]:
# For LDL results significant in either approach, count rows per raw METAL
# direction string.
joined_pooled_and_all_metal_results %>%
    filter(
        lipid_type == 'LDL',
        significant_pooled | significant_metal
    ) %>%
    group_by(significant_pooled, significant_metal, Direction_metal) %>%
    summarise(count = n())

In [None]:
# For LDL results significant in either approach and present in the
# meta-analysis, collapse the METAL direction into three categories and count
# rows per category and significance combination.
joined_pooled_and_all_metal_results %>%
    filter(
        lipid_type == 'LDL',
        significant_pooled | significant_metal,
        !is.na(Direction_metal)
    ) %>%
    mutate(
        direction = case_when(
            Direction_metal %in% c('--', '++') ~ 'agreement',
            Direction_metal %in% c('?-', '-?', '?+', '+?') ~ 'single cohort',
            Direction_metal %in% c('-+', '+-') ~ 'conflicting'
        )
    ) %>%
    group_by(significant_pooled, significant_metal, direction) %>%
    summarise(count = n())

In [None]:
# Same three direction categories as above, counted without splitting by
# significance.
joined_pooled_and_all_metal_results %>%
    filter(
        lipid_type == 'LDL',
        significant_pooled | significant_metal,
        !is.na(Direction_metal)
    ) %>%
    mutate(
        direction = case_when(
            Direction_metal %in% c('--', '++') ~ 'agreement',
            Direction_metal %in% c('?-', '-?', '?+', '+?') ~ 'single cohort',
            Direction_metal %in% c('-+', '+-') ~ 'conflicting'
        )
    ) %>%
    group_by(direction) %>%
    summarise(count = n())

In [None]:
# Per direction category, summarise the standard errors reported by each
# approach for significant LDL results. Missing values are ignored.
joined_pooled_and_all_metal_results %>%
    filter(
        lipid_type == 'LDL',
        significant_pooled | significant_metal,
        !is.na(Direction_metal)
    ) %>%
    mutate(
        direction = case_when(
            Direction_metal %in% c('--', '++') ~ 'agreement',
            Direction_metal %in% c('?-', '-?', '?+', '+?') ~ 'single cohort',
            Direction_metal %in% c('-+', '+-') ~ 'conflicting'
        )
    ) %>%
    group_by(direction) %>%
    summarise(
        count = n(),
        median_pooled_se = median(SE_pooled, na.rm = TRUE),
        median_metal_se = median(StdErr_metal, na.rm = TRUE),
        mean_pooled_se = mean(SE_pooled, na.rm = TRUE),
        mean_metal_se = mean(StdErr_metal, na.rm = TRUE),
        sd_pooled_se = sd(SE_pooled, na.rm = TRUE),
        sd_metal_se = sd(StdErr_metal, na.rm = TRUE)
    )

In [None]:
# Size of the rendered plot in the notebook.
options(repr.plot.width = 16, repr.plot.height = 8)

# Boxplots of the standard errors from each approach for significant LDL
# results, faceted by the METAL direction category.
joined_pooled_and_all_metal_results %>%
    filter(lipid_type == 'LDL') %>%
    filter(significant_pooled == TRUE
           | significant_metal == TRUE) %>%
    mutate(
        direction = case_when(
            Direction_metal %in% c('--', '++') ~ 'agreement [++, --]',
            Direction_metal %in% c('?-', '-?', '?+', '+?') ~ 'single cohort [+?, ?+, -?, ?-]',
            # Label lists the actual conflicting direction strings.
            Direction_metal %in% c('-+', '+-') ~ 'conflicting [+-, -+]',
            # Anything else, including rows with no METAL direction at all.
            TRUE ~ 'pooled-only result'
        ),
        # Show the pooled-only facet last.
        direction = fct_relevel(direction, 'pooled-only result', after = Inf)
    ) %>%
    # One row per (variant, approach); `approach` is the column-name suffix.
    pivot_longer(
        cols = c(SE_pooled, StdErr_metal),
        names_to = 'approach',
        names_pattern = '.*_(.*)',
        values_to = 'stderr'
    ) %>%
    ggplot(aes(x=approach, y=stderr)) +
        geom_boxplot() +
        # Annotate each box with the number of values it summarises.
        stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 5,
                     position = position_dodge(width = 0.9), vjust = -0.8) +
        facet_wrap(~ direction, ncol = 4) +
        scale_y_log10() +
        theme(
            axis.title.x = element_text(size=14),
            axis.title.y = element_text(size=14),
        ) +
        xlab('analysis approach') +
        ylab('standard error [log scale]') +
        labs(title = 'Standard error values for significant LDL results by cohort "direction" from METAL.',
             caption = PLOT_SUBTITLE)

In [None]:
# Size of the rendered plot in the notebook.
options(repr.plot.width = 16, repr.plot.height = 8)

# Boxplots of the p-values from each approach for significant LDL results,
# faceted by the METAL direction category.
joined_pooled_and_all_metal_results %>%
    filter(lipid_type == 'LDL') %>%
    filter(significant_pooled == TRUE
           | significant_metal == TRUE) %>%
    mutate(
        direction = case_when(
            Direction_metal %in% c('--', '++') ~ 'agreement [++, --]',
            Direction_metal %in% c('?-', '-?', '?+', '+?') ~ 'single cohort [+?, ?+, -?, ?-]',
            # Label lists the actual conflicting direction strings.
            Direction_metal %in% c('-+', '+-') ~ 'conflicting [+-, -+]',
            # Anything else, including rows with no METAL direction at all.
            TRUE ~ 'pooled-only result'
        ),
        # Show the pooled-only facet last.
        direction = fct_relevel(direction, 'pooled-only result', after = Inf)
    ) %>%
    # One row per (variant, approach); `approach` is the column-name suffix.
    pivot_longer(
        cols = c(p_value_pooled, `P-value_metal`),
        names_to = 'approach',
        names_pattern = '.*_(.*)$',
        values_to = 'p_value'
    ) %>%
    ggplot(aes(x=approach, y=p_value)) +
        geom_boxplot() +
        # Annotate each box with the number of values it summarises.
        stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 5,
                     position = position_dodge(width = 0.9), vjust = -0.8) +
        facet_wrap(~ direction, ncol = 4) +
        scale_y_log10() +
        theme(
            axis.title.x = element_text(size=14),
            axis.title.y = element_text(size=14),
        ) +
        xlab('analysis approach') +
        ylab('p-value [log scale]') +
        labs(title = 'p-values for significant LDL results by cohort "direction" from METAL.',
             caption = PLOT_SUBTITLE)

## Pooled versus Metal - ‘?’ Metal results removed

In [None]:
# Drop meta-analysis rows whose direction contains '?', then full-join with
# the pooled results.
# The join uses the same four key columns as the join that keeps the '?' rows,
# so CHROM and GENPOS remain single columns instead of being duplicated with
# .x/.y suffixes.
joined_pooled_and_metal_results <- metal_meta_analysis_results %>%
    filter(str_detect(Direction_metal, '\\?', negate = TRUE)) %>%
    full_join(pooled_lipids_regenie_results, by = c('lipid_type', 'ID', 'CHROM', 'GENPOS'))

dim(joined_pooled_and_metal_results)

In [None]:
# Count LDL results by significance in each approach.
filter(joined_pooled_and_metal_results, lipid_type == 'LDL') %>%
    group_by(lipid_type, significant_pooled, significant_metal) %>%
    summarise(count = n())

In [None]:
# Label every LDL result with (a) which approach(es) found it genome-wide
# significant and (b) readable per-approach status text for the plots below.
# An NA significance flag means the variant is absent from that approach.
categorized_ldl_pooled_and_metal_results <-
    filter(joined_pooled_and_metal_results, lipid_type == 'LDL') %>%
    mutate(
        `meets genome-wide\nsignificance level` = case_when(
            significant_metal & significant_pooled ~ 'both',
            !significant_metal & significant_pooled ~ 'pooled',
            is.na(significant_metal) & significant_pooled ~ 'pooled only',
            significant_metal & is.na(significant_pooled) ~ 'meta-analysis only',
            significant_metal & !significant_pooled ~ 'meta-analysis',
            TRUE ~ 'not significant'
        ),
        `Meta-analysis` = case_when(
            is.na(significant_metal) ~ 'not in meta-analysis\n(single-cohort variant due to AC<6\nor filtered during variant QC)',
            significant_metal ~ 'significant in meta-analysis',
            !significant_metal ~ 'not significant in meta-analysis'
        ),
        `Pooled analysis` = case_when(
            is.na(significant_pooled) ~ 'not in pooled\n(filtered during variant QC)',
            significant_pooled ~ 'significant in pooled',
            !significant_pooled ~ 'not significant in pooled'
        ),
        # Reverse the level order of this factor for display.
        `Pooled analysis` = fct_rev(`Pooled analysis`)
    )

# Row count per significance category.
categorized_ldl_pooled_and_metal_results %>%
    group_by(`meets genome-wide\nsignificance level`) %>%
    summarise(count = n())

In [None]:
# Size of the rendered plot in the notebook.
options(repr.plot.width = 10, repr.plot.height = 8)

# Scatter plot of LOG10P from the two approaches, restricted to results that
# are present in both but significant in only one.
categorized_ldl_pooled_and_metal_results %>%
    filter(
        `meets genome-wide\nsignificance level` %in% c('pooled', 'meta-analysis')
    ) %>%
    ggplot(
        aes(
            x = LOG10P_pooled,
            y = LOG10P_metal,
            color = `meets genome-wide\nsignificance level`
        )
    ) +
    geom_point(size = 3) +
    theme(
        axis.title.x = element_text(size=14),
        axis.title.y = element_text(size=14),
    ) +
    labs(
        title = 'LDL results in both the meta-analysis and the pooled analysis,\nbut found to be genome-wide significant in only one approach',
        caption = PLOT_SUBTITLE
    )

In [None]:
# Count LDL results for every combination of per-approach status.
summarise(
    group_by(categorized_ldl_pooled_and_metal_results, `Meta-analysis`, `Pooled analysis`),
    count = n()
)

In [None]:
# Size of the rendered plot in the notebook.
options(repr.plot.width = 16, repr.plot.height = 8)

# Tile plot of the status-combination counts, with each count printed on its
# tile.
categorized_ldl_pooled_and_metal_results %>%
    group_by(`Meta-analysis`, `Pooled analysis`) %>%
    summarise(count = n()) %>%
    ggplot(
        aes(
            x = `Pooled analysis`,
            y = `Meta-analysis`,
            fill = count,
            label = count
        )
    ) +
    geom_tile() +
    geom_text(color = 'white', size = 8) +
    scale_fill_continuous(trans = 'reverse', guide = 'none') +
    theme(
        axis.title.x = element_text(size = 22),
        axis.title.y = element_text(size = 22),
        axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = 18),
        axis.text.y = element_text(size = 18)
    ) +
    labs(
        title = 'Counts of LDL GWAS results categorized by significance and presence\nin each approach',
        caption = PLOT_SUBTITLE
    )

In [None]:
# Using the table that still contains the single-cohort ('?') rows, blank out
# their meta-analysis significance flag and count the LDL results.
joined_pooled_and_all_metal_results %>%
    filter(lipid_type == 'LDL') %>%
    mutate(
        significant_metal = ifelse(single_cohort_result_metal, NA, significant_metal)
    ) %>%
    group_by(
        lipid_type,
        significant_pooled,
        significant_metal,
        single_cohort_result_metal
    ) %>%
    summarise(count = n())

In [None]:
# Count LDL results by significance in each approach and METAL direction,
# using the table from which the '?' rows were removed.
filter(joined_pooled_and_metal_results, lipid_type == 'LDL') %>%
    group_by(lipid_type, significant_pooled, significant_metal, Direction_metal) %>%
    summarise(count = n())

# Provenance 

In [None]:
# Record the R session details for provenance.
devtools::session_info()