# Plot results for METAL meta-analysis

In this notebook we review and explore the results of the meta-analysis of aggregated *All of Us* and UK Biobank GWAS results.

Note that this work is part of a larger project to [Demonstrate the Potential for Pooled Analysis of All of Us and UK Biobank Genomic Data](https://github.com/all-of-us/ukb-cross-analysis-demo-project). Specifically, this notebook belongs to the portion of the project that is the meta-analysis of **siloed** GWAS results.

# Setup

<div class="alert alert-block alert-warning">
    <b>Cloud Environment</b>: This notebook was written for use on the <i>All of Us</i> Workbench.
    <ul>
        <li>Use "Recommended Environment" <kbd><b>General Analysis</b></kbd> which creates compute type <kbd><b>Standard VM</b></kbd> with reasonable defaults for CPU, RAM, and disk.</li>
        <li>This notebook takes several minutes to run interactively. You can also run it in the background via <kbd>run_notebook_in_the_background</kbd> for the sake of provenance and reproducibility.</li>
    </ul>
</div>

In [None]:
# Install any of the required CRAN packages that are not yet available.
# installed.packages() returns a matrix whose row names are the package names,
# so the comparison is made against rownames() rather than against every cell
# of that matrix (versions, paths, dependency strings, ...).
missing_pkgs <- setdiff(c('hrbrthemes', 'skimr', 'qqman'), rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
    install.packages(missing_pkgs)
}

In [None]:
library(grid)
library(gridExtra)
library(hrbrthemes)
library(qqman)
library(readxl)
library(scales)
library(skimr)
library(tidyverse)

In [None]:
# Set some visualization defaults.
theme_set(theme_ipsum(base_size = 16)) # Default theme for plots.

#' Returns a data frame with a y position and a label, for use annotating ggplot boxplots.
#'
#' @param df The values to summarise. Despite its name, the body hands this
#'   straight to `max()` and `length()`, i.e. it is treated like a vector of
#'   values rather than a multi-column table.
#' @return A one-row data frame with column `y` (the maximum of `df`) and
#'   column `label` (the text 'N = ' followed by the length of `df`).
get_boxplot_fun_data <- function(df) {
  data.frame(y = max(df), label = paste0('N = ', length(df)))
}

## Define constants

In [None]:
#---[ Inputs ]---
# Created via notebook aou_workbench_siloed_analyses/09_metal_meta_analysis.ipynb
# Named character vector: each name is a lipid label and each value is the
# gs:// path of the METAL output table (.tbl) for that lipid.
METAL_META_ANALYSIS_RESULTS <- c(
    HDL='gs://fc-secure-e5c31994-13bb-4e3e-b7d7-db6effadc54f/data/metaanalysis/20230410/METAANALYSIS_HDL_1.tbl',
    LDL='gs://fc-secure-e5c31994-13bb-4e3e-b7d7-db6effadc54f/data/metaanalysis/20230410/METAANALYSIS_LDL_1.tbl',
    TC='gs://fc-secure-e5c31994-13bb-4e3e-b7d7-db6effadc54f/data/metaanalysis/20230410/METAANALYSIS_TC_1.tbl',
    TG='gs://fc-secure-e5c31994-13bb-4e3e-b7d7-db6effadc54f/data/metaanalysis/20230410/METAANALYSIS_TG_1.tbl'
)

# Lipid labels, taken from the names of the vector above; the cells below
# iterate over these.
LIPIDS <- names(METAL_META_ANALYSIS_RESULTS)

# Text passed as the subtitle / caption of the plots drawn in this notebook.
PLOT_SUBTITLE <- 'Source: All of Us v5 alpha3 and UK Biobank data'

# Load the METAL results

In [None]:
# Stream each lipid's METAL table through `gsutil cat`, tag its rows with the
# lipid label, and stack the per-lipid tables into a single data frame.
metal_meta_analysis_results <- LIPIDS %>%
    lapply(function(lipid) {
        tbl_path <- METAL_META_ANALYSIS_RESULTS[lipid]
        tbl_stream <- pipe(str_glue('gsutil cat {tbl_path}'))
        mutate(read_table(tbl_stream), lipid_type = lipid)
    }) %>%
    bind_rows()

dim(metal_meta_analysis_results)

In [None]:
# Preview the first rows of the combined METAL results.
metal_meta_analysis_results %>%
    head()

In [None]:
# Print the contents of the file found at the LDL table path with '.info'
# appended, one element per line of output.
ldl_info_cmd <- str_glue('gsutil cat {METAL_META_ANALYSIS_RESULTS["LDL"]}.info')
print(system(ldl_info_cmd, intern=TRUE))

## Add derived fields

In [None]:
# Add columns derived from METAL's P-value and Direction fields.
metal_meta_analysis_results <- mutate(
    metal_meta_analysis_results,
    LOG10P_metal = -log10(`P-value`),
    # Flag used by the rest of the notebook to pick out significant results.
    significant_metal = `P-value` <= 5e-08,
    # A '?' anywhere in Direction marks a result that lacks one of the cohorts.
    single_cohort_metal_result = str_detect(Direction, '\\?'),
    # Per the labels below, the first Direction character is AoU and the
    # second is UKB. Any Direction value not listed here gets NA, because
    # case_when() has no fallback branch.
    cohorts = case_when(
        Direction %in% c('++', '--') ~ 'both cohorts',
        Direction %in% c('+-', '-+') ~ 'both cohorts with conflicting direction of effect',
        Direction %in% c('+?', '-?') ~ 'AoU only',
        Direction %in% c('?+', '?-') ~ 'UKB only'
    )
)

In [None]:
# Preview the first rows again to inspect the current set of columns.
metal_meta_analysis_results %>%
    head()

## Fix values when Allele2 is the reference allele

In [None]:
# Split MarkerName (CHROM_ID, GENPOS, REF and ALT joined by '_') into separate
# columns. The split is done on a copy named ID so MarkerName itself is kept.
metal_meta_analysis_results <- metal_meta_analysis_results %>%
    mutate(ID = MarkerName) %>%
    separate(ID, into = c('CHROM_ID', 'GENPOS', 'REF', 'ALT'), sep = '_', convert = TRUE) %>%
    # Numeric chromosome parsed out of CHROM_ID.
    mutate(CHROM = parse_number(CHROM_ID)) %>%
    mutate(
        # A row counts as flipped when rebuilding the marker from METAL's
        # upper-cased Allele1/Allele2 does not reproduce MarkerName.
        is_flipped = str_c(CHROM_ID, GENPOS, str_to_upper(Allele1), str_to_upper(Allele2), sep='_') != MarkerName,
        # For flipped rows, negate the effect and take the complementary frequency.
        fixed_beta = ifelse(is_flipped, -Effect, Effect),
        fixed_freq1 = ifelse(is_flipped, 1 - Freq1, Freq1)
    )

dim(metal_meta_analysis_results)

In [None]:
# Show the parsed marker fields next to METAL's allele columns and the flip flag.
head(
    select(metal_meta_analysis_results,
           MarkerName, CHROM, GENPOS, REF, ALT, Allele1, Allele2, is_flipped)
)

# Counts by significance threshold

In [None]:
# Number of results per lipid, split by the significance flag.
summarise(
    group_by(metal_meta_analysis_results, lipid_type, significant_metal),
    count = n()
)

Show only results from both cohorts.

In [None]:
# Same tally, after dropping results flagged as coming from a single cohort.
metal_meta_analysis_results %>%
    filter(!single_cohort_metal_result) %>%
    group_by(lipid_type, significant_metal) %>%
    summarise(count = n())

# Significant LDL results by chromosome

In [None]:
# Significant LDL results per chromosome, largest count first.
metal_meta_analysis_results %>%
    filter(lipid_type == 'LDL', significant_metal) %>%
    group_by(CHROM) %>%
    summarise(count = n()) %>%
    arrange(desc(count))

Show only results from both cohorts.

In [None]:
# Significant LDL results per chromosome, excluding single-cohort results,
# largest count first.
metal_meta_analysis_results %>%
    filter(!single_cohort_metal_result, lipid_type == 'LDL', significant_metal) %>%
    group_by(CHROM) %>%
    summarise(count = n()) %>%
    arrange(desc(count))

# Top LDL results

In [None]:
# The first rows of the significant LDL results, ordered by decreasing -log10(p).
metal_meta_analysis_results %>%
    filter(lipid_type == 'LDL', significant_metal) %>%
    arrange(desc(LOG10P_metal)) %>%
    head()

Show only results from both cohorts.

In [None]:
# The first rows of the significant LDL results, excluding single-cohort
# results, ordered by decreasing -log10(p).
metal_meta_analysis_results %>%
    filter(!single_cohort_metal_result, lipid_type == 'LDL', significant_metal) %>%
    arrange(desc(LOG10P_metal)) %>%
    head()

# METAL direction field counts

In [None]:
# LDL results per raw METAL Direction value, smallest count first.
ldl_by_direction <- group_by(filter(metal_meta_analysis_results, lipid_type == 'LDL'), Direction)
ldl_by_direction %>%
    summarise(count = n()) %>%
    arrange(count)

In [None]:
# LDL results per derived cohorts label, smallest count first.
ldl_by_cohorts <- group_by(filter(metal_meta_analysis_results, lipid_type == 'LDL'), cohorts)
ldl_by_cohorts %>%
    summarise(count = n()) %>%
    arrange(count)

In [None]:
# Significant LDL results per derived cohorts label, smallest count first.
metal_meta_analysis_results %>%
    filter(lipid_type == 'LDL', significant_metal) %>%
    group_by(cohorts) %>%
    summarise(count = n()) %>%
    arrange(count)

# METAL standard error

In [None]:
# Summary statistics of METAL's StdErr column for LDL, per cohorts label,
# ordered by increasing group size.
ldl_results_by_cohorts <- group_by(filter(metal_meta_analysis_results, lipid_type == 'LDL'), cohorts)
ldl_results_by_cohorts %>%
    summarise(
        count = n(),
        min_StdErr = min(StdErr),
        max_StdErr = max(StdErr),
        mean_StdErr = mean(StdErr),
        median_StdErr = median(StdErr),
        sd_StdErr = sd(StdErr)
    ) %>%
    arrange(count)

Show only significant results.

In [None]:
# Summary statistics of METAL's StdErr column for the significant LDL results,
# per cohorts label, ordered by increasing group size.
metal_meta_analysis_results %>%
    filter(lipid_type == 'LDL', significant_metal) %>%
    group_by(cohorts) %>%
    summarise(
        count = n(),
        min_StdErr = min(StdErr),
        max_StdErr = max(StdErr),
        mean_StdErr = mean(StdErr),
        median_StdErr = median(StdErr),
        sd_StdErr = sd(StdErr)
    ) %>%
    arrange(count)

# METAL allele frequency standard error

In [None]:
# Summary statistics of METAL's FreqSE column for LDL, per cohorts label,
# ordered by increasing group size.
ldl_results_by_cohorts <- group_by(filter(metal_meta_analysis_results, lipid_type == 'LDL'), cohorts)
ldl_results_by_cohorts %>%
    summarise(
        count = n(),
        min_FreqSE = min(FreqSE),
        max_FreqSE = max(FreqSE),
        mean_FreqSE = mean(FreqSE),
        median_FreqSE = median(FreqSE),
        sd_FreqSE = sd(FreqSE)
    ) %>%
    arrange(count)

Show only significant results.

In [None]:
# Summary statistics of METAL's FreqSE column for the significant LDL results,
# per cohorts label, ordered by increasing group size.
metal_meta_analysis_results %>%
    filter(lipid_type == 'LDL', significant_metal) %>%
    group_by(cohorts) %>%
    summarise(
        count = n(),
        min_FreqSE = min(FreqSE),
        max_FreqSE = max(FreqSE),
        mean_FreqSE = mean(FreqSE),
        median_FreqSE = median(FreqSE),
        sd_FreqSE = sd(FreqSE)
    ) %>%
    arrange(count)

# QQ and manhattan plots for METAL results

In [None]:
#' Draw a Manhattan plot followed by a QQ plot for one set of METAL results.
#'
#' Sets the notebook plot size options (repr.plot.width / repr.plot.height)
#' to 10 x 10 as a side effect; they are not restored afterwards.
#'
#' @param metal_results A data frame with the columns read below: CHROM,
#'   GENPOS, MarkerName and `P-value`.
#' @param manhattan_title Main title for the Manhattan plot.
#' @param qq_title Main title for the QQ plot.
#' @return Whatever the final qq() call returns; the function is called for
#'   the plots it draws.
plot_manhattan_and_qq <- function(metal_results, manhattan_title, qq_title) {
    options(repr.plot.width = 10, repr.plot.height = 10)

    # One magnification factor shared by the points, axis text and axis labels.
    text_scale <- 1.25

    # The y axis is fixed to the range 0-200 for every plot.
    manhattan(
        metal_results,
        chr = 'CHROM', bp = 'GENPOS', snp = 'MarkerName', p = 'P-value',
        logp = TRUE,
        annotateTop = FALSE,
        ylim = c(0, 200),
        cex = text_scale, cex.axis = text_scale, cex.lab = text_scale,
        main = manhattan_title,
        sub = PLOT_SUBTITLE
    )

    qq(
        metal_results$`P-value`,
        cex = text_scale, cex.axis = text_scale, cex.lab = text_scale,
        main = qq_title,
        sub = PLOT_SUBTITLE
    )
}

## All METAL results

In [None]:
# For each lipid: report the table size and GC score, then draw the Manhattan
# and QQ plots using every METAL result for that lipid.
# walk() is used because this cell is run only for its messages and plots.
walk(LIPIDS, function(lipid) {
    results <- metal_meta_analysis_results %>% filter(lipid_type == lipid)
    file <- METAL_META_ANALYSIS_RESULTS[lipid]

    # Median observed 1-df chi-square statistic divided by the median expected
    # one. The upper tail is requested directly (lower.tail = FALSE) instead of
    # computing `1 - p`, which rounds to exactly 1 for very small p-values.
    gc_score <- median(qchisq(results$`P-value`, 1, lower.tail = FALSE)) /
        qchisq(0.5, 1, lower.tail = FALSE)

    message(str_glue('nrow: {nrow(results)} ncol: {ncol(results)} in {file}'))
    message(str_glue('GC: {round(gc_score, 3)}'))

    plot_manhattan_and_qq(
        results,
        manhattan_title = str_glue('Results from one or more cohorts {basename(file)}\n{dirname(file)}'),
        qq_title = str_glue('Results from one or more cohorts {basename(file)}\n{dirname(file)}\n GC: {round(gc_score, 3)}')
    )
})

## Show only results found in both cohorts

In [None]:
# For each lipid: keep only results that are not flagged as single-cohort,
# report the table size and GC score, then draw the Manhattan and QQ plots.
# walk() is used because this cell is run only for its messages and plots.
walk(LIPIDS, function(lipid) {
    # single_cohort_metal_result is a logical column, so it is negated directly
    # rather than compared against the string 'FALSE'.
    results <- metal_meta_analysis_results %>%
        filter(lipid_type == lipid) %>%
        filter(!single_cohort_metal_result)
    file <- METAL_META_ANALYSIS_RESULTS[lipid]

    # Median observed 1-df chi-square statistic divided by the median expected
    # one. The upper tail is requested directly (lower.tail = FALSE) instead of
    # computing `1 - p`, which rounds to exactly 1 for very small p-values.
    gc_score <- median(qchisq(results$`P-value`, 1, lower.tail = FALSE)) /
        qchisq(0.5, 1, lower.tail = FALSE)

    message(str_glue('nrow: {nrow(results)} ncol: {ncol(results)} in {file}'))
    message(str_glue('GC: {round(gc_score, 3)}'))

    plot_manhattan_and_qq(
        results,
        manhattan_title = str_glue('Using only variants present in both cohorts {basename(file)}\n{dirname(file)}'),
        qq_title = str_glue('Using only variants present in both cohorts {basename(file)}\n{dirname(file)}\n GC: {round(gc_score, 3)}')
    )
})

# Comparisons against other lipids studies

## Comparison with UKB published GWAS summary

##### Rare coding variants in 35 genes associate with circulating lipid levels – a multi-ancestry analysis of 170,000 exomes. [Hindy et al 2021](https://www.biorxiv.org/content/10.1101/2020.12.22.423783v1.supplementary-material?versioned=true)

In [None]:
# Download the supplementary workbook to the working directory as hindy.xlsx.
# An .xlsx file is binary, so binary mode is requested explicitly.
download.file(
    url = 'https://www.biorxiv.org/content/biorxiv/early/2021/09/01/2020.12.22.423783/DC2/embed/media-2.xlsx?download=true',
    destfile = 'hindy.xlsx',
    mode = 'wb'
)

Bring the Hindy results into a single dataframe with a lipid type column.

In [None]:
# Read sheet 'Table_S11' of the downloaded workbook (skipping its first row),
# keep only the 'Overall' ancestry rows, and add the two columns used later.
combined_hindy_results <- read_xlsx('hindy.xlsx', sheet = 'Table_S11', skip = 1, na = 'NA') %>%
    filter(Ancestry == 'Overall') %>%
    # Join key: 'chr' followed by RSID with every ':' replaced by '_'.
    mutate(ID = str_c('chr', str_replace_all(RSID, ':', '_'))) %>%
    # Map trait names onto this notebook's lipid labels; any trait not listed
    # here keeps its original name.
    mutate(
        lipid_type = case_when(
            Trait == 'LDL_ADJ' ~ 'LDL',
            Trait == 'TOTAL_ADJ' ~ 'TC',
            TRUE ~ Trait
        )
    )

dim(combined_hindy_results)

In [None]:
# Preview the first rows of the Hindy results.
combined_hindy_results %>%
    head()

In [None]:
# For each lipid, scatter the METAL effect sizes (fixed_beta) against the Hindy
# effect sizes (BETA_FE) for the variants present in both, annotated with the
# number of shared variants, the squared correlation and the correlation-test
# p-value. map() is kept so the resulting ggplot objects are returned.
map(LIPIDS, function(lipid) {
    hindy_results <- combined_hindy_results %>%
        filter(lipid_type == lipid) %>%
        select(ID, beta_Hindy = BETA_FE)

    # Only METAL results that are not flagged as single-cohort are compared.
    metal_results <- metal_meta_analysis_results %>%
        filter(lipid_type == lipid) %>%
        filter(!single_cohort_metal_result) %>%
        select(ID = MarkerName, beta_metal = fixed_beta)

    # ID is the only column the two tables share; name it explicitly so the
    # join does not rely on (and announce) an inferred key.
    in_common_results <- inner_join(hindy_results, metal_results, by = 'ID')

    num_hindy_results <- nrow(hindy_results)
    num_in_common_results <- nrow(in_common_results)
    result_cor <- round(cor(in_common_results$beta_metal, in_common_results$beta_Hindy)^2, digits = 2)
    result_cor_test <- cor.test(in_common_results$beta_metal, in_common_results$beta_Hindy)
    result_cor_test_p <- scientific(result_cor_test$p.value, digits = 2)

    options(repr.plot.width = 8, repr.plot.height = 8)

    in_common_results %>%
    ggplot(aes(x = beta_Hindy, y = beta_metal)) +
        geom_point(alpha = .5) +
        # Anchor the annotation at the bottom-right corner of the data.
        annotate(geom = 'text',
                 x = max(in_common_results$beta_Hindy),
                 y = min(in_common_results$beta_metal),
                 hjust = 'right',
                 vjust = -1,
                 color = 'dark blue',
                 size = 6,
                 label = c(str_glue('N = {num_in_common_results}\nR-square: {result_cor}\nP-value= {result_cor_test_p}'))) +
        geom_abline() +
        theme(
            axis.title.x = element_text(size = 14),
            axis.title.y = element_text(size = 14)
        ) +
        # The space after the count was missing, fusing it with 'significant'.
        labs(title = str_glue('{lipid} meta-analysis GWAS result comparison to\n{num_hindy_results} significant RSID from Hindy et al. 2021'),
             caption = PLOT_SUBTITLE)

})

## Comparison with TOPMed (Freeze8) Lipid GWAS

##### Whole genome sequence analysis of blood lipid levels in >66,000 individuals. [Selvaraj et al 2021](https://www.biorxiv.org/content/10.1101/2021.10.11.463514v1.supplementary-material)

In [None]:
# Download the supplementary workbook to the working directory as selvaraj.xlsx.
# An .xlsx file is binary, so binary mode is requested explicitly.
download.file(
    url = 'https://www.biorxiv.org/content/biorxiv/early/2021/10/12/2021.10.11.463514/DC1/embed/media-1.xlsx?download=true',
    destfile = 'selvaraj.xlsx',
    mode = 'wb'
)

In [None]:
# Cell range, within sheet 'Supplementary Table 3', of the table for each lipid.
selvaraj_tables <- c(HDL = 'A4:L361', LDL = 'A363:L701', TC = 'A703:L1027', TG = 'A1029:L1318')
LIPIDS <- c("HDL", "LDL", "TC", "TG")
combined_selvaraj_results <- bind_rows(
    lapply(LIPIDS, function(lipid) {
        cell_range <- selvaraj_tables[[lipid]]

        # Print some metadata for an eyeball check that we are associating the data with the correct lipid type:
        # the lipid and its range, then the cell just above the range together with the range's first cell.
        print(str_glue('{lipid} {cell_range}'))
        first_row <- as.integer(str_extract(cell_range, '\\d+'))
        print(read_xlsx('selvaraj.xlsx', sheet = 'Supplementary Table 3', range = str_glue('A{first_row - 1}:A{first_row}')))

        # Retrieve the data once and reuse it for both the row-count check and the result.
        lipid_table <- read_xlsx('selvaraj.xlsx', sheet = 'Supplementary Table 3', range = cell_range)
        print(nrow(lipid_table))

        lipid_table %>%
        mutate(
            # Work around a bad entry in the data causing the p.value column to be of type character.
            p.value = as.numeric(p.value),
            ID = str_glue('chr{CHR}_{POS}_{Allele1}_{Allele2}'),
            lipid_type = lipid
        )
    }))

dim(combined_selvaraj_results)

In [None]:
# Preview the first rows of the Selvaraj results.
combined_selvaraj_results %>%
    head()

In [None]:
# For each lipid, scatter the METAL effect sizes (fixed_beta) against the
# Selvaraj effect sizes (BETA) for the variants present in both, annotated with
# the number of shared variants, the squared correlation and the
# correlation-test p-value. map() is kept so the ggplot objects are returned.
map(LIPIDS, function(lipid) {
    selvaraj_results <- combined_selvaraj_results %>%
        filter(lipid_type == lipid) %>%
        select(ID, beta_selvaraj = BETA)

    # Only METAL results that are not flagged as single-cohort are compared.
    metal_results <- metal_meta_analysis_results %>%
        filter(lipid_type == lipid) %>%
        filter(!single_cohort_metal_result) %>%
        select(ID = MarkerName, beta_metal = fixed_beta)

    # ID is the only column the two tables share; name it explicitly so the
    # join does not rely on (and announce) an inferred key.
    in_common_results <- inner_join(selvaraj_results, metal_results, by = 'ID')

    num_selvaraj_results <- nrow(selvaraj_results)
    num_in_common_results <- nrow(in_common_results)
    result_cor <- round(cor(in_common_results$beta_metal, in_common_results$beta_selvaraj)^2, digits = 2)
    result_cor_test <- cor.test(in_common_results$beta_metal, in_common_results$beta_selvaraj)
    result_cor_test_p <- scientific(result_cor_test$p.value, digits = 2)

    options(repr.plot.width = 8, repr.plot.height = 8)

    in_common_results %>%
    ggplot(aes(x = beta_selvaraj, y = beta_metal)) +
        geom_point(alpha = .5) +
        # Anchor the annotation at the bottom-right corner of the data.
        annotate(geom = 'text',
                 x = max(in_common_results$beta_selvaraj),
                 y = min(in_common_results$beta_metal),
                 hjust = 'right',
                 vjust = -1,
                 color = 'dark blue',
                 size = 6,
                 label = c(str_glue('N = {num_in_common_results}\nR-square: {result_cor}\nP-value= {result_cor_test_p}'))) +
        geom_abline() +
        theme(
            axis.title.x = element_text(size = 14),
            axis.title.y = element_text(size = 14)
        ) +
        labs(title = str_glue('{lipid} meta-analysis GWAS comparison to {num_selvaraj_results}\nsignificant RSID from Selvaraj et al. 2021'),
             caption = PLOT_SUBTITLE)

})

# Provenance

In [None]:
# Record the R session details (via devtools) as provenance for this run.
devtools::session_info()