# Compare GWAS results

In this notebook we review and explore the differences and similarities between the pooled, METAL, and batch *All of Us* and UK Biobank data GWAS results.

Note that this work is part of a larger project to [Demonstrate the Potential for Pooled Analysis of All of Us and UK Biobank Genomic Data](https://github.com/all-of-us/ukb-cross-analysis-demo-project).

# Setup

<div class="alert alert-block alert-warning">
    <b>Cloud Environment</b>: This notebook was written for use on the <i>All of Us</i> Workbench.
    <ul>
        <li>Use "Recommended Environment" <kbd><b>General Analysis</b></kbd> which creates compute type <kbd><b>Standard VM</b></kbd> with reasonable defaults for CPU, RAM, and disk.</li>
        <li>This notebook takes several minutes to run interactively. You can also run it in the background via <kbd>run_notebook_in_the_background</kbd> for the sake of provenance and reproducibility.</li>
    </ul>
</div>

<div class="alert alert-block alert-warning">
    <b>Note</b>: This notebook assumes you've already used a plink notebook, such as <kbd>06_pooled_variant_qc.ipynb</kbd>, to install plink2 and transfer the bgen and sample files locally.
</div>

In [None]:
# Install any of the required plotting/summary packages that are not yet installed.
# Compare against the row names of installed.packages() (the package names) rather
# than the whole matrix, whose other columns (Depends, Imports, ...) could also
# contain a matching string.
required_packages <- c('ggExtra', 'hexbin', 'hrbrthemes', 'skimr', 'qqman', 'viridis')
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
    install.packages(missing_packages)
}

In [None]:
library(ggExtra)
library(grid)
library(gridExtra)
library(hrbrthemes)
library(lubridate)
library(qqman)
library(readxl)
library(scales)
library(skimr)
library(tidyverse)
library(viridis)

In [None]:
# Set some visualization defaults.
theme_set(theme_ipsum(base_size = 16)) # Default theme for plots.

#' Returns a one-row data frame with a y position and a label, for use annotating ggplot boxplots.
#'
#' In this notebook it is passed as `fun.data` to `stat_summary()`, so it receives the
#' values plotted in one box and labels that box with its number of observations.
#'
#' @param df The values for one box. Despite the name, the code treats it as a vector:
#'   it takes `max(df)` and `length(df)`.
#' @return A data frame with column `y` set to `max(df)` and column `label` set to
#'   `'N = '` followed by `length(df)`.
get_boxplot_fun_data <- function(df) {
  data.frame(y = max(df), label = paste0('N = ', length(df)))
}

## Define constants

In [None]:
# Papermill parameters. See https://papermill.readthedocs.io/en/latest/usage-parameterize.html

#---[ Inputs ]---
# Created via notebook aou_workbench_pooled_analyses/compare_siloed_and_pooled_results/11a_batch_regenie_gwas.ipynb
# Named vector: each name is a batch (cohort indicator) label, each value the Cloud
# Storage path of that indicator's regenie step 2 output.
BATCH_REGENIE_RESULTS <- c(
    IS_AOU='gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/pooled/regenie/20220531/aou_alpha3_ukb_batch_regenie_step2_is_aou.regenie',
    IS_UKB='gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/pooled/regenie/20220531/aou_alpha3_ukb_batch_regenie_step2_is_ukb.regenie'
)
# The batch labels, used to iterate over the files above.
BATCHES <- names(BATCH_REGENIE_RESULTS)

# Created via notebook aou_workbench_pooled_analyses/09_pooled_regenie_gwas.ipynb
# Named vector: lipid type -> Cloud Storage path of the pooled regenie step 2 output.
POOLED_LIPIDS_REGENIE_RESULTS <- c(
    HDL='gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/pooled/regenie/20220315/aou_alpha3_ukb_lipids_regenie_step2_HDL_norm.regenie',
    LDL='gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/pooled/regenie/20220315/aou_alpha3_ukb_lipids_regenie_step2_LDL_adjusted_norm.regenie',
    TC='gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/pooled/regenie/20220315/aou_alpha3_ukb_lipids_regenie_step2_TC_adjusted_norm.regenie',
    TG='gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/pooled/regenie/20220315/aou_alpha3_ukb_lipids_regenie_step2_TG_adjusted_norm.regenie'
)
# The lipid labels; also used to index METAL_META_ANALYSIS_RESULTS, which has the same names.
LIPIDS <- names(POOLED_LIPIDS_REGENIE_RESULTS)

# Created via notebook aou_workbench_siloed_analyses/09_metal_meta_analysis.ipynb
# Named vector: lipid type -> Cloud Storage path of the METAL meta-analysis table.
METAL_META_ANALYSIS_RESULTS <- c(
    HDL='gs://fc-secure-471c1068-cd3d-4b43-9b5d-a618c85ceea5/data/aou/regenie/20220323/METAANALYSIS_HDL_1.tbl',
    LDL='gs://fc-secure-471c1068-cd3d-4b43-9b5d-a618c85ceea5/data/aou/regenie/20220323/METAANALYSIS_LDL_1.tbl',
    TC='gs://fc-secure-471c1068-cd3d-4b43-9b5d-a618c85ceea5/data/aou/regenie/20220323/METAANALYSIS_TC_1.tbl',
    TG='gs://fc-secure-471c1068-cd3d-4b43-9b5d-a618c85ceea5/data/aou/regenie/20220323/METAANALYSIS_TG_1.tbl'
)

# Siloed (single-cohort) LDL GWAS results, one aggregated TSV per cohort.
# Created via aou_workbench_siloed_analyses/08_aggregate_gwas_results.ipynb
AOU_LDL_GWAS_RESULTS <- 'gs://fc-secure-471c1068-cd3d-4b43-9b5d-a618c85ceea5/data/aou/pheno/20220323/fc-secure-471c1068-cd3d-4b43-9b5d-a618c85ceea5_data_aou_regenie_20220318_aou_alpha3_lipids_regenie_step2_LDL_adjusted_norm_aggregated.tsv'
# Created via ukb_rap_siloed_analyses/11_aggregate_gwas_results.ipynb
UKB_LDL_GWAS_RESULTS <- 'gs://fc-secure-471c1068-cd3d-4b43-9b5d-a618c85ceea5/data/aou/pheno/20220323/ukb_lipids_regenie_step2_LDL_adjusted_norm_aggregated.tsv'

# Created via notebook aou_workbench_pooled_analyses/06_pooled_variant_qc.ipynb
# Sample ID list and variant ID list produced by the pooled step 2 variant QC.
STEP2_VARIANT_QC_ID <- 'gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/pooled/variant-qc/20220311/aou_alpha3_ukb_lipids_step2QC_plink.id'
STEP2_VARIANT_QC_SNPLIST <- 'gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/pooled/variant-qc/20220311/aou_alpha3_ukb_lipids_step2QC_plink.snplist'

# Shared subtitle/caption text for the plots in this notebook.
PLOT_SUBTITLE <- 'Source: All of Us v5 alpha3 and UK Biobank data'

#---[ Outputs ]---
# Create a timestamp for a folder of results generated today.
DATESTAMP <- strftime(now(), '%Y%m%d')
# Destination folder in the workspace bucket. The bucket comes from the
# WORKSPACE_BUCKET environment variable; if it is unset, Sys.getenv() returns ''
# and the path starts with '/data/results/'.
DESTINATION <- str_glue('{Sys.getenv("WORKSPACE_BUCKET")}/data/results/{DATESTAMP}/')
# Local filenames of the TSVs written by this notebook before they are copied to DESTINATION.
SIGNIFICANT_BATCH_VARIANTS_FILENAME <- 'significant_regenie_batch_variants.tsv'
OVERLAPPING_SIGNIFICANT_BATCH_VARIANTS_FILENAME <- 'significant_regenie_batch_variants_overlapping_lipids.tsv'
GWAS_RESULT_SUMMARY_FILENAME <- 'gwas_result_summary.tsv'
SIGNIFICANT_LIPIDS_GWAS_RESULTS_FILENAME <- 'significant_lipids_gwas_results.tsv'

# Load the regenie batch GWAS results

Bring our results into a single dataframe with a cohort_indicator type column.

In [None]:
# Load the regenie step 2 output for every batch indicator into one table.
# Each file is streamed from Cloud Storage with `gsutil cat`, and its rows are
# labelled with the indicator name they were read for.
batch_regenie_results <- BATCHES %>%
    lapply(function(batch_name) {
        file <- BATCH_REGENIE_RESULTS[batch_name]
        regenie_table <- read_delim(pipe(str_glue('gsutil cat {file}')), delim = ' ')
        mutate(regenie_table, phenotype = batch_name)
    }) %>%
    bind_rows() %>%
    # Derive allele number and allele counts from the sample size and allele frequency.
    mutate(
        AN = 2 * N,
        AC_alt = round(A1FREQ * AN),
        AC_ref = round((1 - A1FREQ) * AN)
    ) %>%
    # Flag genome-wide significance (p < 5e-08) and variants whose alt or ref
    # allele count is below 40; recover the p-value from LOG10P.
    mutate(
        significant = LOG10P > -log10(5e-08),
        group_size_threshold = ifelse(AC_alt < 40 | AC_ref < 40,
                                      'below minimum group size threshold',
                                      'meets group size threshold'),
        p_value = 10 ^ (-1 * LOG10P)
    ) %>%
    # Suffix every column except the variant keys so its source stays clear after joins.
    rename_at(vars(-ID, -CHROM, -GENPOS), ~ paste0(., '_batch'))

dim(batch_regenie_results)

In [None]:
# Preview the first rows of the combined batch results.
head(batch_regenie_results)

In [None]:
# Per batch indicator: number of results, number of significant results, and the
# ranges of p-value, LOG10P, allele frequency, and sample size.
batch_regenie_results %>%
    group_by(phenotype_batch) %>%
    summarize(
        num_results = n(),
        num_significant_batch_results = sum(significant_batch),
        min_p_value = min(p_value_batch),
        max_p_value = max(p_value_batch),
        min_LOG10P = min(LOG10P_batch),
        max_LOG10P = max(LOG10P_batch),
        min_A1FREQ = min(A1FREQ_batch),
        max_A1FREQ = max(A1FREQ_batch),
        min_N = min(N_batch),
        max_N = max(N_batch),
    )

In [None]:
# Count results by batch indicator, significance, and group size threshold.
batch_regenie_results %>%
    group_by(phenotype_batch, significant_batch, group_size_threshold_batch) %>%
    summarize(count = n())

In [None]:
# Number of IS_AOU results per chromosome.
batch_regenie_results %>%
    filter(phenotype_batch == 'IS_AOU') %>%
    group_by(CHROM) %>%
    summarize(count = n()) %>%
    arrange(CHROM, count)

In [None]:
# Number of significant IS_AOU results per chromosome and group size threshold.
# significant_batch is logical, so filter on it directly rather than comparing
# it to the string 'TRUE'.
batch_regenie_results %>%
    filter(phenotype_batch == 'IS_AOU') %>%
    filter(significant_batch) %>%
    group_by(CHROM, group_size_threshold_batch) %>%
    summarize(count = n()) %>%
    arrange(CHROM, count)

## Plot regenie results

In [None]:
#' Draw a Manhattan plot followed by a Q-Q plot for one set of batch regenie results.
#'
#' Both plots read the `p_value_batch` column, so the input must use the `_batch`
#' column suffix. The Manhattan plot's y axis is limited to 0-20.
#'
#' @param regenie_results Data frame with columns `CHROM`, `GENPOS`, `ID`, and `p_value_batch`.
#' @param manhattan_title Title for the Manhattan plot.
#' @param qq_title Title for the Q-Q plot.
plot_manhattan_and_qq <- function(regenie_results, manhattan_title, qq_title) {
    # Sets the notebook plot size; this option change persists after the function returns.
    options(repr.plot.width = 10, repr.plot.height = 10)
    manhattan(regenie_results,
              chr='CHROM',
              bp='GENPOS',
              snp='ID',
              p='p_value_batch',
              logp=TRUE,
              annotateTop = FALSE,
              ylim = c(0, 20),
              cex = 1.25,
              cex.axis = 1.25,
              cex.lab = 1.25,
              main = manhattan_title,
              sub = PLOT_SUBTITLE
             )

    qq(regenie_results$p_value_batch,
       cex = 1.25,
       cex.axis = 1.25,
       cex.lab = 1.25,
       main = qq_title,
       sub = PLOT_SUBTITLE)
}

In [None]:
# For each batch indicator, plot the results that meet the group size threshold and
# report the genomic control (GC) score. walk() is used because the messages and
# plots are side effects; map() would also print a list of the return values.
walk(BATCHES, function(cohort_indicator) {
    regenie_results <- batch_regenie_results %>%
        filter(phenotype_batch == cohort_indicator) %>%
        filter(group_size_threshold_batch == 'meets group size threshold')

    file <- BATCH_REGENIE_RESULTS[cohort_indicator]

    # GC score: median observed chi-square statistic divided by the median of a
    # chi-square distribution with 1 degree of freedom.
    gc_score <- median(regenie_results$CHISQ_batch) / qchisq(0.5, 1, lower.tail=FALSE)

    message(str_glue('nrow: {nrow(regenie_results)} ncol: {ncol(regenie_results)} in {file}'))
    message(str_glue('GC: {round(gc_score, 3)}'))

    plot_manhattan_and_qq(
        regenie_results,
        manhattan_title = str_glue('{basename(file)} results from\n{dirname(file)}'),
        qq_title = str_glue('{basename(file)} results\nfrom {dirname(file)}\n GC: {round(gc_score, 3)}')
    )
})

# Check the intersection with pooled lipids GWAS results

In [None]:
# Load the pooled regenie step 2 output for every lipid into one table, labelling
# each row with the lipid it was read for.
pooled_lipids_regenie_results <- LIPIDS %>%
    lapply(function(lipid_name) {
        file <- POOLED_LIPIDS_REGENIE_RESULTS[lipid_name]
        regenie_table <- read_delim(pipe(str_glue('gsutil cat {file}')), delim = ' ')
        mutate(regenie_table, lipid_type = lipid_name)
    }) %>%
    bind_rows() %>%
    # Flag rows whose ID is not 'chr<CHROM>_<GENPOS>_<ALLELE0>_<ALLELE1>', then derive
    # allele number and allele counts from the sample size and allele frequency.
    mutate(
        is_flipped = ID != str_glue('chr{CHROM}_{GENPOS}_{ALLELE0}_{ALLELE1}'),
        AN = 2 * N,
        AC_alt = round(A1FREQ * AN),
        AC_ref = round((1 - A1FREQ) * AN)
    ) %>%
    # Flag genome-wide significance (p < 5e-08) and variants whose alt or ref
    # allele count is below 40; recover the p-value from LOG10P.
    mutate(
        significant = LOG10P > -log10(5e-08),
        group_size_threshold = ifelse(AC_alt < 40 | AC_ref < 40,
                                      'below minimum group size threshold',
                                      'meets group size threshold'),
        p_value = 10 ^ (-1 * LOG10P)
    ) %>%
    # Suffix every column except the join keys so its source stays clear after joins.
    rename_at(vars(-lipid_type, -ID, -CHROM, -GENPOS), ~ paste0(., '_pooled'))

dim(pooled_lipids_regenie_results)

In [None]:
# Preview the first rows of the combined pooled lipids results.
head(pooled_lipids_regenie_results)

In [None]:
# Keep the significant IS_AOU batch variants and attach every pooled lipids result
# for the same variant (one row per matching lipid type).
batch_overlap_pooled <- inner_join(
    batch_regenie_results %>%
        filter(phenotype_batch == 'IS_AOU', significant_batch),
    pooled_lipids_regenie_results,
    by = c('ID', 'CHROM', 'GENPOS'),
    suffix = c('_batch', '_pooled')
)

In [None]:
# Check the join. These values should be similar.
# The factor of 4 is the number of lipid types in POOLED_LIPIDS_REGENIE_RESULTS: a
# variant present in every pooled result set contributes one joined row per lipid.
nrow(batch_overlap_pooled)
4 * nrow(batch_regenie_results %>% filter(phenotype_batch == 'IS_AOU') %>% filter(significant_batch))

In [None]:
# Per lipid and significance combination: row count and the largest LOG10P and BETA
# from the batch and pooled results.
batch_overlap_pooled %>%
    group_by(lipid_type, significant_batch, significant_pooled) %>%
    summarize(
        count = n(),
        max_LOG10P_batch = max(LOG10P_batch),
        max_LOG10P_pooled = max(LOG10P_pooled),
        max_BETA_batch = max(BETA_batch),
        max_BETA_pooled = max(BETA_pooled),
    )

In [None]:
# The same summary, additionally broken down by chromosome.
batch_overlap_pooled %>%
    group_by(lipid_type, CHROM, significant_batch, significant_pooled) %>%
    summarize(
        count = n(),
        max_LOG10P_batch = max(LOG10P_batch),
        max_LOG10P_pooled = max(LOG10P_pooled),
        max_BETA_batch = max(BETA_batch),
        max_BETA_pooled = max(BETA_pooled),
    )

# Check the intersection with meta analysis lipids GWAS results

In [None]:
# Load the METAL meta-analysis table for every lipid into one table, labelling each
# row with the lipid it was read for, then derive join keys and comparison columns.
metal_meta_analysis_results <- bind_rows(
    lapply(LIPIDS, function(lipid) {
        file <- METAL_META_ANALYSIS_RESULTS[lipid]
        read_table(pipe(str_glue('gsutil cat {file}'))) %>%
        mutate(lipid_type = lipid)
    })) %>%
    # MarkerName is split on ':'; the first two pieces become CHROM and GENPOS and the
    # remaining two pieces are discarded (NA in `into`). MarkerName itself is kept.
    separate(MarkerName, sep = ':', into = c('CHROM', 'GENPOS', NA, NA), convert = TRUE, remove = FALSE) %>%
    mutate(
        # Build an ID in the same 'chr<...>_<...>' form used by the regenie results.
        ID = str_c('chr', str_replace_all(MarkerName, ':', '_')),
        # Fix values when Allele2 is the reference allele.
        # A row is "flipped" when MarkerName differs from CHROM:GENPOS:ALLELE1:ALLELE2
        # built from the upper-cased METAL alleles.
        is_flipped = MarkerName != str_c(CHROM, GENPOS, str_to_upper(Allele1), str_to_upper(Allele2), sep=':'),
        fixed_beta = ifelse(is_flipped, -Effect, Effect),
        fixed_freq1 =  ifelse(is_flipped, 1 - Freq1, Freq1),
        LOG10P = -log10(`P-value`),
        significant = `P-value` <= 5e-08,
        # A '?' in Direction marks a result that came from only one cohort.
        single_cohort_result = str_detect(Direction, '\\?'),
        # Human-readable label for Direction; any value not listed below becomes NA.
        cohorts = case_when(
            Direction %in% c('++', '--') ~ 'both cohorts', 
            Direction %in% c('+-', '-+') ~ 'both cohorts with conflicting direction of effect', 
            Direction %in% c('+?', '-?') ~ 'AoU only', 
            Direction %in% c('?+', '?-') ~ 'UKB only'
        )
    ) %>%
    # Suffix every column except the join keys so its source stays clear after joins.
    rename_at(vars(-lipid_type, -ID, -CHROM, -GENPOS), ~ paste0(., '_metal'))

dim(metal_meta_analysis_results)

In [None]:
# Preview the first rows of the combined METAL results.
head(metal_meta_analysis_results)

In [None]:
# Attach the METAL result for the same lipid and variant, where one exists, to each
# batch/pooled overlap row; rows without a METAL match keep NA in the _metal columns.
batch_overlap_both <- left_join(
    batch_overlap_pooled,
    metal_meta_analysis_results,
    by = c('lipid_type', 'ID', 'CHROM', 'GENPOS')
)

dim(batch_overlap_both)

In [None]:
# Check the join. These numbers should be similar.
# The first count covers IDs in batch_overlap_both; the second covers the distinct
# IDs of all significant batch results, across every batch indicator.
length(unique(batch_overlap_both$ID))
nrow(batch_regenie_results %>% filter(significant_batch) %>% select(ID) %>% distinct)
# NOTE(review): the strict `<` makes this fail when the two counts are equal; confirm
# whether `<=` was intended.
stopifnot(length(unique(batch_overlap_both$ID)) < nrow(batch_regenie_results %>% filter(significant_batch) %>% select(ID) %>% distinct))

In [None]:
# List the columns available after both joins.
colnames(batch_overlap_both)

In [None]:
# Per lipid and batch/pooled/METAL significance combination: row count and the
# largest LOG10P and BETA from the batch and pooled results.
batch_overlap_both %>%
    group_by(lipid_type, significant_batch, significant_pooled, significant_metal) %>%
    summarize(
        count = n(),
        max_LOG10P_batch = max(LOG10P_batch),
        max_LOG10P_pooled = max(LOG10P_pooled),
        max_BETA_batch = max(BETA_batch),
        max_BETA_pooled = max(BETA_pooled),
    )

## Create a TSV with the significant batch variants that overlap significant lipids results

In [None]:
# Write the batch-significant variants that are also significant in the pooled or
# METAL lipids results to a local TSV, then display them sorted.
# write_tsv() returns its input invisibly, so the ordinary pipe can continue past it.
# This avoids magrittr's tee operator (%T>%); magrittr is not among the packages
# loaded in Setup.
batch_overlap_both %>%
    filter(phenotype_batch == 'IS_AOU') %>%
    filter(group_size_threshold_batch == 'meets group size threshold') %>%
    filter(significant_pooled | significant_metal) %>%
    select(lipid_type, ID,
           significant_batch, significant_pooled, significant_metal,
           LOG10P_batch, LOG10P_pooled, LOG10P_metal,
           BETA_batch, BETA_pooled, Effect_metal,
           SE_batch, SE_pooled, StdErr_metal,
           CHROM, GENPOS,
           ALLELE0_batch, ALLELE1_batch, ALLELE0_pooled, ALLELE1_pooled, Allele1_metal, Allele2_metal
          ) %>%
    write_tsv(OVERLAPPING_SIGNIFICANT_BATCH_VARIANTS_FILENAME) %>%
    arrange(lipid_type, CHROM, GENPOS)

In [None]:
# Copy the file to the workspace bucket.
# Use TRUE rather than T, which is an ordinary variable that can be reassigned.
system(str_glue('gsutil cp {OVERLAPPING_SIGNIFICANT_BATCH_VARIANTS_FILENAME} {DESTINATION}'), intern = TRUE)

## Create a TSV with the significant batch variants

In [None]:
# Write all batch-significant variants that meet the group size threshold to a local
# TSV and report how many rows were written.
# write_tsv() returns its input invisibly, so the ordinary pipe can continue past it.
# This avoids magrittr's tee operator (%T>%); magrittr is not among the packages
# loaded in Setup.
batch_overlap_both %>%
    filter(phenotype_batch == 'IS_AOU') %>%
    filter(group_size_threshold_batch == 'meets group size threshold') %>%
    select(lipid_type, ID,
           significant_batch, significant_pooled, significant_metal,
           LOG10P_batch, LOG10P_pooled, LOG10P_metal,
           BETA_batch, BETA_pooled, Effect_metal,
           SE_batch, SE_pooled, StdErr_metal,
           CHROM, GENPOS,
           ALLELE0_batch, ALLELE1_batch, ALLELE0_pooled, ALLELE1_pooled, Allele1_metal, Allele2_metal
          ) %>%
    write_tsv(SIGNIFICANT_BATCH_VARIANTS_FILENAME) %>%
    nrow()

In [None]:
# Copy the file to the workspace bucket.
# Use TRUE rather than T, which is an ordinary variable that can be reassigned.
system(str_glue('gsutil cp {SIGNIFICANT_BATCH_VARIANTS_FILENAME} {DESTINATION}'), intern = TRUE)

# Check pooled vs. meta analysis lipids results

## Pooled versus Metal - '?' results flagged

In [None]:
# Combine METAL and pooled results per lipid and variant, keeping rows found in
# only one of the two (their columns from the other side are NA).
joined_pooled_and_all_metal_results <- full_join(
    metal_meta_analysis_results,
    pooled_lipids_regenie_results,
    by = c('lipid_type', 'ID', 'CHROM', 'GENPOS')
)

dim(joined_pooled_and_all_metal_results)

In [None]:
# List the columns available after the full join.
colnames(joined_pooled_and_all_metal_results)

### Check the join

In [None]:
# The full join must contain exactly the union of the variant IDs from both inputs.
length(unique(joined_pooled_and_all_metal_results$ID))
length(unique(c(metal_meta_analysis_results$ID, pooled_lipids_regenie_results$ID)))
stopifnot(length(unique(joined_pooled_and_all_metal_results$ID))
          == length(unique(c(metal_meta_analysis_results$ID,
                             pooled_lipids_regenie_results$ID))))

In [None]:
# For LDL, cross-tabulate the allele-flip flags from the pooled and METAL results.
joined_pooled_and_all_metal_results %>%
    filter(lipid_type == 'LDL') %>%
    group_by(is_flipped_pooled, is_flipped_metal) %>%
    summarize(count = n())

### Compare LOG10P values

In [None]:
# LOG10P summary statistics for LDL, grouped by pooled and METAL significance.
# na.rm = TRUE is used because the full join leaves NA on whichever side has no result.
joined_pooled_and_all_metal_results %>%
    filter(lipid_type == 'LDL') %>%
    group_by(lipid_type, significant_pooled, significant_metal) %>%
    summarize(
        count = n(),
        min_LOG10P_pooled = round(min(LOG10P_pooled, na.rm = TRUE), 3),
        min_LOG10P_metal = round(min(LOG10P_metal, na.rm = TRUE), 3),
        median_LOG10P_pooled = round(median(LOG10P_pooled, na.rm = TRUE), 3),
        median_LOG10P_metal = round(median(LOG10P_metal, na.rm = TRUE), 3),
        max_LOG10P_pooled = round(max(LOG10P_pooled, na.rm = TRUE), 3),
        max_LOG10P_metal = round(max(LOG10P_metal, na.rm = TRUE), 3),
        mean_LOG10P_pooled = round(mean(LOG10P_pooled, na.rm = TRUE), 3),
        mean_LOG10P_metal = round(mean(LOG10P_metal, na.rm = TRUE), 3),
        sd_LOG10P_pooled = sd(LOG10P_pooled, na.rm = TRUE),
        sd_LOG10P_metal = sd(LOG10P_metal, na.rm = TRUE),
        )

In [None]:
# The same LOG10P summary for LDL, additionally grouped by the METAL cohorts label.
joined_pooled_and_all_metal_results %>%
    filter(lipid_type == 'LDL') %>%
    group_by(lipid_type, significant_pooled, significant_metal, cohorts_metal) %>%
    summarize(
        count = n(),
        min_LOG10P_pooled = round(min(LOG10P_pooled, na.rm = TRUE), 3),
        min_LOG10P_metal = round(min(LOG10P_metal, na.rm = TRUE), 3),
        median_LOG10P_pooled = round(median(LOG10P_pooled, na.rm = TRUE), 3),
        median_LOG10P_metal = round(median(LOG10P_metal, na.rm = TRUE), 3),
        max_LOG10P_pooled = round(max(LOG10P_pooled, na.rm = TRUE), 3),
        max_LOG10P_metal = round(max(LOG10P_metal, na.rm = TRUE), 3),
        mean_LOG10P_pooled = round(mean(LOG10P_pooled, na.rm = TRUE), 3),
        mean_LOG10P_metal = round(mean(LOG10P_metal, na.rm = TRUE), 3),
        sd_LOG10P_pooled = sd(LOG10P_pooled, na.rm = TRUE),
        sd_LOG10P_metal = sd(LOG10P_metal, na.rm = TRUE),
)

In [None]:
# Boxplot of pooled p-values (log scale) for LDL variants that are significant in
# METAL, split by pooled significance.
joined_pooled_and_all_metal_results %>%
    filter(lipid_type == 'LDL') %>%
    filter(significant_metal == TRUE) %>%
    ggplot(aes(y = p_value_pooled, x = significant_pooled)) +
    scale_y_log10() +
    geom_boxplot()

In [None]:
# Count results for every lipid by pooled and METAL significance, pooled group size
# threshold, and whether METAL reported a single-cohort result.
gwas_result_summary <- joined_pooled_and_all_metal_results %>%
    group_by(lipid_type, significant_pooled, significant_metal,
             group_size_threshold_pooled, single_cohort_result_metal) %>%
    summarize(count = n())

In [None]:
# Show the LDL rows of the summary.
gwas_result_summary %>%
    filter(lipid_type == 'LDL')

### Create a TSV with the summary counts for all lipids.

In [None]:
# Write the summary counts for all lipids to a local TSV.
write_tsv(gwas_result_summary, GWAS_RESULT_SUMMARY_FILENAME)

In [None]:
# Copy the file to the workspace bucket.
# Use TRUE rather than T, which is an ordinary variable that can be reassigned.
system(str_glue('gsutil cp {GWAS_RESULT_SUMMARY_FILENAME} {DESTINATION}'), intern = TRUE)

### Create a TSV with the significant results for all lipids.

In [None]:
# Results for all lipids that are significant in the pooled or the METAL analysis,
# sorted by lipid and genomic position.
significant_lipids_gwas_results <- joined_pooled_and_all_metal_results %>%
    filter(significant_pooled == TRUE
           | significant_metal == TRUE) %>%
    # Keep METAL-only rows (no pooled threshold value) and drop pooled rows that are
    # below the minimum group size threshold.
    filter(is.na(group_size_threshold_pooled)
           | group_size_threshold_pooled != 'below minimum group size threshold') %>%
    # Prevent GoogleSheets from interpreting this as a formula.
    mutate(Direction_metal = paste0("'", Direction_metal, "'")) %>%
    select(
        lipid_type, ID,
        significant_pooled, significant_metal, single_cohort_result_metal, cohorts_metal,
        p_value_pooled, `P-value_metal`,
        BETA_pooled, Effect_metal, 
        SE_pooled, StdErr_metal,
        LOG10P_pooled, CHISQ_pooled,
        A1FREQ_pooled, N_pooled, Freq1_metal, FreqSE_metal, MinFreq_metal, MaxFreq_metal,
        CHROM, GENPOS,
        ALLELE0_pooled, ALLELE1_pooled, Allele1_metal, Allele2_metal
    ) %>%
    arrange(lipid_type, CHROM, GENPOS)

In [None]:
# Preview the first rows of the significant lipids results.
head(significant_lipids_gwas_results)

In [None]:
# Count the significant LDL results by pooled and METAL significance.
significant_lipids_gwas_results %>%
    filter(lipid_type == 'LDL') %>%
    group_by(significant_pooled, significant_metal) %>%
    summarize(count = n())

In [None]:
# Write the significant results for all lipids to a local TSV.
write_tsv(significant_lipids_gwas_results, SIGNIFICANT_LIPIDS_GWAS_RESULTS_FILENAME)

In [None]:
# Copy the file to the workspace bucket.
# Use TRUE rather than T, which is an ordinary variable that can be reassigned.
system(str_glue('gsutil cp {SIGNIFICANT_LIPIDS_GWAS_RESULTS_FILENAME} {DESTINATION}'), intern = TRUE)

### Examine Metal direction

In [None]:
# For significant LDL results, count rows by pooled/METAL significance and the raw
# METAL Direction value.
joined_pooled_and_all_metal_results %>%
    filter(lipid_type == 'LDL') %>%
    filter(significant_pooled == TRUE
           | significant_metal == TRUE) %>%
    group_by(significant_pooled, significant_metal, Direction_metal) %>%
    summarize(count = n())

In [None]:
# For significant LDL results that have a METAL Direction, collapse Direction into
# agreement / single cohort / conflicting and count by significance combination.
joined_pooled_and_all_metal_results %>%
    filter(lipid_type == 'LDL') %>%
    filter(significant_pooled == TRUE
           | significant_metal == TRUE) %>%
    filter(!is.na(Direction_metal)) %>%
    mutate(
        direction = case_when(
            Direction_metal %in% c('--', '++') ~ 'agreement',
            Direction_metal %in% c('?-', '-?', '?+', '+?') ~ 'single cohort',
            Direction_metal %in% c('-+', '+-') ~ 'conflicting',            
        )
    ) %>%
    group_by(significant_pooled, significant_metal, direction) %>%
    summarize(count = n())

In [None]:
# The same Direction categories for significant LDL results, counted overall.
joined_pooled_and_all_metal_results %>%
    filter(lipid_type == 'LDL') %>%
    filter(significant_pooled == TRUE
           | significant_metal == TRUE) %>%
    filter(!is.na(Direction_metal)) %>%
    mutate(
        direction = case_when(
            Direction_metal %in% c('--', '++') ~ 'agreement',
            Direction_metal %in% c('?-', '-?', '?+', '+?') ~ 'single cohort',
            Direction_metal %in% c('-+', '+-') ~ 'conflicting',            
        )
    ) %>%
    group_by(direction) %>%
    summarize(count = n())

In [None]:
# For each Direction category of significant LDL results, summarize the pooled and
# METAL standard errors (median, mean, and standard deviation).
joined_pooled_and_all_metal_results %>%
    filter(lipid_type == 'LDL') %>%
    filter(significant_pooled == TRUE
           | significant_metal == TRUE) %>%
    filter(!is.na(Direction_metal)) %>%
    mutate(
        direction = case_when(
            Direction_metal %in% c('--', '++') ~ 'agreement',
            Direction_metal %in% c('?-', '-?', '?+', '+?') ~ 'single cohort',
            Direction_metal %in% c('-+', '+-') ~ 'conflicting',            
        )
    ) %>%
    group_by(direction) %>%
    summarize(
        count = n(),
        median_pooled_se = median(SE_pooled, na.rm = TRUE),
        median_metal_se = median(StdErr_metal, na.rm = TRUE),
        mean_pooled_se = mean(SE_pooled, na.rm = TRUE),
        mean_metal_se = mean(StdErr_metal, na.rm = TRUE),
        sd_pooled_se = sd(SE_pooled, na.rm = TRUE),
        sd_metal_se = sd(StdErr_metal, na.rm = TRUE)
    )

In [None]:
# Boxplots of pooled vs. METAL standard errors for significant LDL results, one
# facet per METAL Direction category.
options(repr.plot.width = 16, repr.plot.height = 8)

joined_pooled_and_all_metal_results %>%
    filter(lipid_type == 'LDL') %>%
    filter(significant_pooled == TRUE
           | significant_metal == TRUE) %>%
    mutate(
        # Rows with no METAL Direction (NA) fall through to 'pooled-only result'.
        direction = case_when(
            Direction_metal %in% c('--', '++') ~ 'agreement [++, --]',
            Direction_metal %in% c('?-', '-?', '?+', '+?') ~ 'single cohort [+?, ?+, -?, ?-]',
            Direction_metal %in% c('-+', '+-') ~ 'conflicting [+-, -+]',
            TRUE ~ 'pooled-only result'
        ),
        # Show the pooled-only facet last.
        direction = fct_relevel(direction, 'pooled-only result', after = Inf)
    ) %>%
    # One row per (variant, approach); the approach is the column name's suffix
    # after its last underscore ('pooled' or 'metal').
    pivot_longer(
        cols = c(SE_pooled, StdErr_metal),
        names_to = 'approach',
        names_pattern = '.*_(.*)',
        values_to = 'stderr'
    ) %>%
    ggplot(aes(x=approach, y=stderr)) +
        geom_boxplot() +
        # Label each box with its number of observations.
        stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 5,
                     position = position_dodge(width = 0.9), vjust = -0.8) +
        facet_wrap(~ direction, ncol = 4) +
        scale_y_log10() +
        theme(
            axis.title.x=element_text(size=14),
            axis.title.y=element_text(size=14),
        ) +
        xlab('analysis approach') +
        ylab('standard error [log scale]') +
        labs(title = str_glue('Standard error values for significant LDL results by cohort "direction" from METAL.'),
             caption = PLOT_SUBTITLE)

In [None]:
# Boxplots of pooled vs. METAL p-values for significant LDL results, one facet per
# METAL Direction category.
options(repr.plot.width = 16, repr.plot.height = 8)

joined_pooled_and_all_metal_results %>%
    filter(lipid_type == 'LDL') %>%
    filter(significant_pooled == TRUE
           | significant_metal == TRUE) %>%
    mutate(
        # Rows with no METAL Direction (NA) fall through to 'pooled-only result'.
        direction = case_when(
            Direction_metal %in% c('--', '++') ~ 'agreement [++, --]',
            Direction_metal %in% c('?-', '-?', '?+', '+?') ~ 'single cohort [+?, ?+, -?, ?-]',
            Direction_metal %in% c('-+', '+-') ~ 'conflicting [+-, -+]',
            TRUE ~ 'pooled-only result'
        ),
        # Show the pooled-only facet last.
        direction = fct_relevel(direction, 'pooled-only result', after = Inf)
    ) %>%
    # One row per (variant, approach); the approach is the column name's suffix
    # after its last underscore ('pooled' or 'metal').
    pivot_longer(
        cols = c(p_value_pooled, `P-value_metal`),
        names_to = 'approach',
        names_pattern = '.*_(.*)$',
        values_to = 'p_value'
    ) %>%
    ggplot(aes(x=approach, y=p_value)) +
        geom_boxplot() +
        # Label each box with its number of observations.
        stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 5,
                     position = position_dodge(width = 0.9), vjust = -0.8) +
        facet_wrap(~ direction, ncol = 4) +
        scale_y_log10() +
        theme(
            axis.title.x=element_text(size=14),
            axis.title.y=element_text(size=14),
        ) +
        xlab('analysis approach') +
        ylab('p-value [log scale]') +
        labs(title = str_glue('p-values for significant LDL results by cohort "direction" from METAL.'),
             caption = PLOT_SUBTITLE)

## Pooled versus Metal - ‘?’ Metal results removed

In [None]:
# Combine METAL and pooled results after dropping METAL rows whose Direction
# contains '?'. The join uses all four shared key columns, the same keys as
# joined_pooled_and_all_metal_results, so CHROM and GENPOS are not duplicated into
# .x/.y columns.
joined_pooled_and_metal_results <- metal_meta_analysis_results %>%
    filter(str_detect(Direction_metal, '\\?', negate = TRUE)) %>%
    full_join(pooled_lipids_regenie_results, by = c('lipid_type', 'ID', 'CHROM', 'GENPOS'))

dim(joined_pooled_and_metal_results)

In [None]:
# LDL counts by significance and pooled group size threshold, with '?' METAL rows removed.
joined_pooled_and_metal_results %>%
    filter(lipid_type == 'LDL') %>%
    group_by(lipid_type, significant_pooled, significant_metal, group_size_threshold_pooled) %>%
    summarize(count = n())

In [None]:
# The same LDL counts, additionally broken down by METAL Direction.
joined_pooled_and_metal_results %>%
    filter(lipid_type == 'LDL') %>%
    group_by(lipid_type, significant_pooled, significant_metal, group_size_threshold_pooled, Direction_metal) %>%
    summarize(count = n())

# Examine mysteries regarding '?' METAL results

In [None]:
# Read the IDs of the pooled variants that passed step 2 variant QC (one per line, no header).
# str_glue() is required to interpolate the path; without it the literal text
# '{STEP2_VARIANT_QC_SNPLIST}' is passed to gsutil.
pooled_variants_passing_QC <- read_tsv(pipe(str_glue('gsutil cat {STEP2_VARIANT_QC_SNPLIST}')),
                                       col_names = c('ID'))

dim(pooled_variants_passing_QC)

In [None]:
# Preview the first QC-passing variant IDs.
head(pooled_variants_passing_QC)

In [None]:
# Have plink2 list every variant ID in the local 'plink2' pfile set; the list is
# written with the output prefix all_aou_ukb_variants.
system('/tmp/plink2/plink2 --pfile plink2 --write-snplist --out all_aou_ukb_variants',
       intern = TRUE)

In [None]:
# Read the variant ID list written by plink2 (one ID per line, no header).
all_pooled_variants <- read_tsv('all_aou_ukb_variants.snplist', col_names = c('ID'))

dim(all_pooled_variants)

In [None]:
# Preview the first pooled variant IDs.
head(all_pooled_variants)

In [None]:
# Load the AoU-only (siloed) LDL GWAS results. IDs are rewritten from ':'-separated
# to the 'chr..._...' form used elsewhere in this notebook, and every non-key column
# gets an '_aou_siloed' suffix.
aou_siloed_ldl_gwas_results <- str_glue('gsutil cat {AOU_LDL_GWAS_RESULTS}') %>%
    pipe() %>%
    read_tsv() %>%
    mutate(ID = str_c('chr', str_replace_all(ID, ':', '_'))) %>%
    rename_at(vars(-ID, -CHROM, -GENPOS), ~ paste0(., '_aou_siloed'))

dim(aou_siloed_ldl_gwas_results)

In [None]:
# Preview the first rows of the AoU siloed LDL results.
head(aou_siloed_ldl_gwas_results)

In [None]:
# Load the UKB-only (siloed) LDL GWAS results. IDs are rewritten from ':'-separated
# to the 'chr..._...' form used elsewhere in this notebook, and every non-key column
# gets an '_ukb_siloed' suffix.
ukb_siloed_ldl_gwas_results <- str_glue('gsutil cat {UKB_LDL_GWAS_RESULTS}') %>%
    pipe() %>%
    read_tsv() %>%
    mutate(ID = str_c('chr', str_replace_all(ID, ':', '_'))) %>%
    rename_at(vars(-ID, -CHROM, -GENPOS), ~ paste0(., '_ukb_siloed'))

dim(ukb_siloed_ldl_gwas_results)

In [None]:
# Preview the first rows of the UKB siloed LDL results.
head(ukb_siloed_ldl_gwas_results)

## Why does metal report results for variants found in both cohorts, but those variants are not in pooled? 

hypothesis: when the alleles were pooled, they no longer met the variant QC thresholds

In [None]:
# LDL summary rows where METAL used both cohorts but there is no pooled result
# (significant_pooled is NA).
gwas_result_summary %>%
    filter(lipid_type == 'LDL') %>%
    filter(is.na(significant_pooled)) %>%
    filter(single_cohort_result_metal == FALSE)

In [None]:
# IDs of the LDL variants with a two-cohort METAL result and no pooled result.
metal_two_cohort_na_pooled <- joined_pooled_and_all_metal_results %>%
    filter(lipid_type == 'LDL') %>%
    filter(is.na(significant_pooled)) %>%
    filter(single_cohort_result_metal == FALSE) %>%
    select(ID)

dim(metal_two_cohort_na_pooled)

In [None]:
# Preview the first of those IDs.
head(metal_two_cohort_na_pooled)

In [None]:
# How many of those variants are present in the full pooled variant list?
length(intersect(metal_two_cohort_na_pooled$ID, all_pooled_variants$ID))

In [None]:
# How many of those variants are in the step 2 QC-passing variant list?
length(intersect(metal_two_cohort_na_pooled$ID, pooled_variants_passing_QC$ID))

In [None]:
# How many of those variants are absent from the full pooled variant list?
length(setdiff(metal_two_cohort_na_pooled$ID, all_pooled_variants$ID))

In [None]:
# Variants METAL reports for both cohorts that are not in the pooled variant list at
# all, so the variant-QC hypothesis does not account for them.
unexplained <- setdiff(metal_two_cohort_na_pooled$ID, all_pooled_variants$ID)

unexplained

In [None]:
# How many unexplained variants appear in the AoU siloed results?
length(intersect(aou_siloed_ldl_gwas_results$ID, unexplained))

In [None]:
# How many unexplained variants appear in the UKB siloed results?
length(intersect(ukb_siloed_ldl_gwas_results$ID, unexplained))

In [None]:
# Spot check: IDs at chr19 position 49406888 among the METAL two-cohort variants
# that lack a pooled result.
metal_two_cohort_na_pooled %>% 
    filter(str_starts(ID, 'chr19_49406888')) %>%
    select(ID)           

In [None]:
# Spot check: IDs at the same chr19 position in the full pooled variant list.
all_pooled_variants %>% 
    filter(str_starts(ID, 'chr19_49406888')) %>%
    select(ID)           

In [None]:
# Spot check: IDs at chr2 position 241809208 among the METAL two-cohort variants
# that lack a pooled result.
metal_two_cohort_na_pooled %>% 
    filter(str_starts(ID, 'chr2_241809208')) %>%
    select(ID)           

In [None]:
# Spot check: IDs at the same chr2 position in the full pooled variant list.
all_pooled_variants %>% 
    filter(str_starts(ID, 'chr2_241809208')) %>%
    select(ID)           

## Why does metal return '?' for so many variants also found in pooled?

hypothesis: they are present in both cohorts, but only AC >= 40 in one cohort.

In [None]:
# LDL summary rows where METAL reported a single-cohort result and a pooled result
# also exists.
gwas_result_summary %>%
    filter(lipid_type == 'LDL') %>%
    filter(!is.na(significant_pooled)) %>%
    filter(single_cohort_result_metal == TRUE)

In [None]:
# IDs of the LDL variants with a single-cohort METAL result that also have a pooled result.
metal_single_cohort_also_in_pooled <- joined_pooled_and_all_metal_results %>%
    filter(lipid_type == 'LDL') %>%
    filter(!is.na(significant_pooled)) %>%
    filter(single_cohort_result_metal == TRUE) %>%
    select(ID)

dim(metal_single_cohort_also_in_pooled)

In [None]:
# How many of those variants appear in the AoU siloed results?
length(intersect(aou_siloed_ldl_gwas_results$ID, metal_single_cohort_also_in_pooled$ID))

In [None]:
# How many of those variants appear in the UKB siloed results?
length(intersect(ukb_siloed_ldl_gwas_results$ID, metal_single_cohort_also_in_pooled$ID))

In [None]:
# Sum of the two per-cohort counts; a variant present in both siloed result sets is
# counted twice here.
length(intersect(aou_siloed_ldl_gwas_results$ID, metal_single_cohort_also_in_pooled$ID)) +
length(intersect(ukb_siloed_ldl_gwas_results$ID, metal_single_cohort_also_in_pooled$ID))

In [None]:
# Number of distinct variants found in at least one of the siloed result sets.
length(unique(c(intersect(aou_siloed_ldl_gwas_results$ID, metal_single_cohort_also_in_pooled$ID),
                intersect(ukb_siloed_ldl_gwas_results$ID, metal_single_cohort_also_in_pooled$ID))))

In [None]:
# The same distinct count, computed with union() as a cross-check.
length(union(intersect(aou_siloed_ldl_gwas_results$ID, metal_single_cohort_also_in_pooled$ID),
             intersect(ukb_siloed_ldl_gwas_results$ID, metal_single_cohort_also_in_pooled$ID)))

In [None]:
# Number of those variants present in both the AoU and the UKB siloed results.
metal_single_cohort_also_in_pooled %>%
    filter(ID %in% aou_siloed_ldl_gwas_results$ID) %>%
    filter(ID %in% ukb_siloed_ldl_gwas_results$ID) %>%
    nrow()

## Why do we have some metal results that are '+-' or '-+'?

hypothesis: stderrs are high for those

In [None]:
# Count the LDL results whose METAL direction of effect conflicts between cohorts,
# by significance and pooled group size threshold.
joined_pooled_and_all_metal_results %>%
    filter(lipid_type == 'LDL') %>%
    filter(cohorts_metal == 'both cohorts with conflicting direction of effect') %>%
    group_by(lipid_type, significant_pooled, significant_metal, cohorts_metal, group_size_threshold_pooled) %>%
    summarize(count = n())

In [None]:
# For the conflicting-direction LDL results, attach the AoU and UKB siloed results
# for the same variant so the standard errors of all four analyses can be compared.
metal_with_silos <- joined_pooled_and_all_metal_results %>%
    filter(lipid_type == 'LDL') %>%
    filter(cohorts_metal == 'both cohorts with conflicting direction of effect') %>%
    left_join(aou_siloed_ldl_gwas_results, by = c('CHROM', 'GENPOS', 'ID')) %>%
    left_join(ukb_siloed_ldl_gwas_results, by = c('CHROM', 'GENPOS', 'ID'))

dim(metal_with_silos)

In [None]:
# List the columns available after attaching the siloed results.
colnames(metal_with_silos)

In [None]:
# Boxplots of the standard errors from the pooled, METAL, AoU siloed, and UKB siloed
# analyses for conflicting-direction LDL results that are significant in the pooled
# or METAL analysis.
options(repr.plot.width = 16, repr.plot.height = 8)

metal_with_silos %>%
    filter(lipid_type == 'LDL') %>%
    filter(significant_pooled == TRUE
           | significant_metal == TRUE) %>%
    # One row per (variant, approach). names_pattern is intended to keep the part of
    # each column name after its first underscore, matching the levels listed in
    # fct_relevel() below.
    pivot_longer(
        cols = c(SE_pooled, StdErr_metal, SE_aou_siloed, SE_ukb_siloed),
        names_to = 'approach',
        names_pattern = '[^_]?_(.*)',
        values_to = 'stderr'
    ) %>%
    mutate(approach = fct_relevel(approach, c('pooled', 'metal', 'aou_siloed', 'ukb_siloed'))) %>%
    ggplot(aes(x=approach, y=stderr)) +
        geom_boxplot() +
        # Label each box with its number of observations.
        stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 5,
                     position = position_dodge(width = 0.9), vjust = -0.8) +
        scale_y_log10() +
        theme(
            axis.title.x=element_text(size=14),
            axis.title.y=element_text(size=14),
        ) +
        xlab('analysis approach') +
        ylab('standard error [log scale]') +
        labs(title = str_glue('Standard error values for significant LDL results with METAL "conflicting cohorts".'),
             caption = PLOT_SUBTITLE)

In [None]:
# Figure size for the notebook renderer.
options(repr.plot.width = 16, repr.plot.height = 8)

# Boxplots of the standard error reported by each analysis approach, for
# conflicting-direction LDL variants that are not significant in METAL.
# NOTE(review): only METAL significance is tested here, so variants that are
# significant in the pooled analysis are still included -- confirm this matches
# the "non-significant" wording in the title.
metal_with_silos %>%
    filter(lipid_type == 'LDL',
           significant_metal == FALSE) %>%
    # One row per (variant, approach); the pattern keeps the text after the
    # first underscore, e.g. 'SE_pooled' -> 'pooled'.
    pivot_longer(
        cols = c(SE_pooled, StdErr_metal, SE_aou_siloed, SE_ukb_siloed),
        names_pattern = '[^_]?_(.*)',
        names_to = 'approach',
        values_to = 'stderr'
    ) %>%
    # Fix the left-to-right order of the boxes.
    mutate(approach = fct_relevel(approach, 'pooled', 'metal', 'aou_siloed', 'ukb_siloed')) %>%
    ggplot(mapping = aes(x = approach, y = stderr)) +
        geom_boxplot() +
        # Annotate each box with its sample size, placed at the box maximum.
        stat_summary(geom = 'text', fun.data = get_boxplot_fun_data, size = 5,
                     vjust = -0.8, position = position_dodge(width = 0.9)) +
        scale_y_log10() +
        labs(x = 'analysis approach',
             y = 'standard error [log scale]',
             title = str_glue('Standard error values for non-significant LDL results with METAL "conflicting cohorts".'),
             caption = PLOT_SUBTITLE) +
        theme(axis.title.x = element_text(size = 14),
              axis.title.y = element_text(size = 14))

# Appendix

## Check the significant batch variants with PLINK

In [None]:
# Write the IDs of variants significantly associated with the IS_AOU batch
# phenotype to 'batch.snplist', which the PLINK commands below pass to --extract.
batch_regenie_results %>%
    filter(phenotype_batch == 'IS_AOU', significant_batch) %>%
    select(ID) %>%
    write_tsv('batch.snplist')

In [None]:
# Stream the sample ID file from Cloud Storage and keep only the lines
# containing 'AOU'; the result is written to aou.ids for PLINK's --keep.
system(
    command = str_glue('gsutil cat {STEP2_VARIANT_QC_ID} | grep AOU > aou.ids'),
    intern = TRUE
)

In [None]:
# Stream the sample ID file from Cloud Storage and keep only the lines
# containing 'UKB'; the result is written to ukb.ids for PLINK's --keep.
system(
    command = str_glue('gsutil cat {STEP2_VARIANT_QC_ID} | grep UKB > ukb.ids'),
    intern = TRUE
)

Use the regenie notebook to install plink2 and transfer the bgen and sample files locally.

In [None]:
# Convert the merged BGEN + sample files to PLINK 2 binary format, skipping the
# conversion when 'plink2.pgen' already exists.
#
# No --out is given; the existence check above and the later `--pfile plink2`
# calls rely on the output files being named with the 'plink2' prefix.
#
# stderr is redirected to stdout (2>&1), as in the other plink2 calls in this
# notebook, so that PLINK errors are captured by `intern = TRUE` and shown in
# the cell output.
if (!file.exists('plink2.pgen')) {
    system(paste('/tmp/plink2/plink2 --bgen aou-alpha3-ukb-chr1-chr22.bgen ref-first',
                 '--sample aou-alpha3-ukb-chr1-chr22.sample --make-pgen 2>&1'),
           intern = TRUE)
}

In [None]:
# Run plink2 on the samples listed in aou.ids, restricted to the variants in
# batch.snplist, writing outputs with the 'aou_counts' prefix. stderr is merged
# into the captured output.
system(
    command = paste(
        c('/tmp/plink2/plink2 --pfile plink2 --keep aou.ids --extract batch.snplist ',
          '--freq --missing --geno-counts --sample-counts --out aou_counts 2>&1'),
        collapse = ' '
    ),
    intern = TRUE
)

In [None]:
# Run plink2 on the samples listed in ukb.ids, restricted to the variants in
# batch.snplist, writing outputs with the 'ukb_counts' prefix. stderr is merged
# into the captured output.
system(
    command = paste(
        c('/tmp/plink2/plink2 --pfile plink2 --keep ukb.ids --extract batch.snplist ',
          '--freq --missing --geno-counts --sample-counts --out ukb_counts 2>&1'),
        collapse = ' '
    ),
    intern = TRUE
)

In [None]:
# Pair up the per-cohort genotype counts variant by variant. Non-key columns
# are suffixed '_ukb' / '_aou' to tell the cohorts apart.
gcounts <- read_tsv('ukb_counts.gcount') %>%
    inner_join(read_tsv('aou_counts.gcount'),
               by = c('#CHROM', 'ID', 'REF', 'ALT'),
               suffix = c('_ukb', '_aou'))

gcounts

In [None]:
# Show each genotype-count column for AoU next to its UKB counterpart.
select(
    gcounts,
    ID,
    HOM_REF_CT_aou,       HOM_REF_CT_ukb,
    HET_REF_ALT_CTS_aou,  HET_REF_ALT_CTS_ukb,
    TWO_ALT_GENO_CTS_aou, TWO_ALT_GENO_CTS_ukb,
    HAP_REF_CT_aou,       HAP_REF_CT_ukb,
    HAP_ALT_CTS_aou,      HAP_ALT_CTS_ukb,
    MISSING_CT_aou,       MISSING_CT_ukb
)

In [None]:
# Pair up the per-cohort allele frequency reports variant by variant. Non-key
# columns are suffixed '_ukb' / '_aou' to tell the cohorts apart.
afreq <- read_tsv('ukb_counts.afreq') %>%
    inner_join(read_tsv('aou_counts.afreq'),
               by = c('#CHROM', 'ID', 'REF', 'ALT'),
               suffix = c('_ukb', '_aou'))

afreq

In [None]:
# Pair up the per-cohort variant missingness reports by chromosome and variant
# ID. Non-key columns are suffixed '_ukb' / '_aou' to tell the cohorts apart.
vmiss <- read_tsv('ukb_counts.vmiss') %>%
    inner_join(read_tsv('aou_counts.vmiss'),
               by = c('#CHROM', 'ID'),
               suffix = c('_ukb', '_aou'))

vmiss

In [None]:
# Columns available in the joined genotype-count table.
names(gcounts)

In [None]:
# Columns available in the joined allele-frequency table.
names(afreq)

In [None]:
# Columns available in the joined variant-missingness table.
names(vmiss)

In [None]:
# Combine frequency, genotype-count, and missingness data into one table with
# the AoU and UKB values side by side.
#
# Both joins omit `by`, so they match on every column name the two tables share.
# The OBS_CT columns are removed from vmiss before joining -- presumably so they
# do not become join keys against the OBS_CT columns of afreq; TODO confirm.
afreq %>%
    inner_join(gcounts) %>%
    inner_join(select(vmiss, -OBS_CT_aou, -OBS_CT_ukb)) %>%
    select(
        ID,
        ALT_FREQS_aou, ALT_FREQS_ukb,
        F_MISS_aou, F_MISS_ukb,
        # Not shown: OBS_CT_aou, OBS_CT_ukb
        TWO_ALT_GENO_CTS_aou, TWO_ALT_GENO_CTS_ukb,
        HET_REF_ALT_CTS_aou, HET_REF_ALT_CTS_ukb,
        HOM_REF_CT_aou, HOM_REF_CT_ukb,
        # Not shown: HAP_REF_CT_aou, HAP_REF_CT_ukb, HAP_ALT_CTS_aou, HAP_ALT_CTS_ukb
        MISSING_CT_aou, MISSING_CT_ukb
    )

# Provenance 

In [None]:
# Print the R session details (R version, platform, and attached/loaded package
# versions) so the results in this notebook can be traced to the exact
# software environment used.
devtools::session_info()