# Aggregate GWAS results

In this notebook we filter and aggregate our GWAS results so that they comply with the data dissemination rules and may be exported from the *All of Us* Researcher Workbench.

Note that this work is part of a larger project to [Demonstrate the Potential for Pooled Analysis of All of Us and UK Biobank Genomic Data](https://docs.google.com/document/d/19ZS0z_-7FEM37pNDAXaWaqBSLnqyd9MZEkiOmtF3n_0/edit#).

# Setup

<div class="alert alert-block alert-warning">
    <b>Cloud Environment</b>: This notebook was written for use on the <i>All of Us</i> Workbench.
    <ul>
        <li>Use "Recommended Environment" <kbd><b>General Analysis</b></kbd> which creates compute type <kbd><b>Standard VM</b></kbd> with reasonable defaults for CPU, RAM, and disk.</li>
        <li>This notebook only takes a few minutes to run interactively. You can also run it in the background via <kbd>run_notebook_in_the_background</kbd> for the sake of provenance and reproducibility.</li>
    </ul>
</div>

In [None]:
library(tidyverse)

## Constants

In [None]:
# Papermill parameters. See https://papermill.readthedocs.io/en/latest/usage-parameterize.html

# Cloud Storage paths of the regenie step 2 result files, keyed by a short
# analysis name.
# Created via notebook aou_workbench_siloed_analyses/06_aou_regenie_gwas.ipynb
REGENIE_RESULTS <- c(
    HDL = "gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/pooled/regenie/20220315/aou_alpha3_ukb_lipids_regenie_step2_HDL_norm.regenie",
    LDL = "gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/pooled/regenie/20220315/aou_alpha3_ukb_lipids_regenie_step2_LDL_adjusted_norm.regenie",
    TC = "gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/pooled/regenie/20220315/aou_alpha3_ukb_lipids_regenie_step2_TC_adjusted_norm.regenie",
    TG = "gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/pooled/regenie/20220315/aou_alpha3_ukb_lipids_regenie_step2_TG_adjusted_norm.regenie",
    BATCH = "gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/pooled/regenie/20220507/aou_alpha3_ukb_batch_regenie_step2_is_aou.regenie"
)

# Names used to iterate over the result files. Despite the variable name, this
# also includes the BATCH entry alongside the four lipid entries.
LIPIDS <- names(REGENIE_RESULTS)

# Filename suffix of the raw inputs, and the suffix given to the aggregated
# outputs written by this notebook.
RAW_FILE_SUFFIX <- ".regenie"
AGGREGATE_FILE_SUFFIX <- "_aggregated.tsv"

# Load the regenie GWAS results

Bring our results into a single dataframe with a lipid type column.

In [None]:
# Read each regenie results file, tag its rows with the name of the analysis it
# came from, stack everything into one dataframe, and derive allele counts.
combined_regenie_results <- LIPIDS %>%
    map(function(lipid_name) {
        gcs_path <- REGENIE_RESULTS[[lipid_name]]
        # Stream the file contents through `gsutil cat` and parse them as
        # space-delimited text.
        per_lipid <- read_delim(pipe(str_glue('gsutil cat {gcs_path}')), delim = ' ')
        mutate(per_lipid, lipid_type = lipid_name)
    }) %>%
    bind_rows() %>%
    mutate(
        # Allele number: two alleles per sample counted in N
        # (assumes diploid genotypes -- TODO confirm).
        AN = 2 * N,
        # Rounded allele counts derived from the A1FREQ column and its
        # complement.
        AC_alt = round(A1FREQ * AN),
        AC_ref = round((1 - A1FREQ) * AN)
    )

# Show the number of rows and columns as a quick sanity check.
dim(combined_regenie_results)

In [None]:
# Preview the first few rows of the combined results.
combined_regenie_results %>% head()

In [None]:
# Per-analysis overview of the unfiltered results: the row count plus the
# minimum and maximum of each statistic of interest. The `.names` pattern
# yields columns named min_<col>, max_<col> in the order the columns are listed.
combined_regenie_results %>%
    group_by(lipid_type) %>%
    summarize(
        count = n(),
        across(
            c(LOG10P, A1FREQ, N, AC_alt, AC_ref),
            list(min = min, max = max),
            .names = '{.fn}_{.col}'
        )
    )

## How many significant results will be removed from the aggregate?

In [None]:
# Tally, per analysis, how many results are significant (LOG10P above
# -log10(5e-08)) and how many fall on each side of the allele-count threshold
# of 40.
combined_regenie_results %>%
    mutate(
        significant = LOG10P > -log10(5e-08),
        # A result meets the threshold only when BOTH allele counts reach 40.
        group_size_threshold = ifelse(AC_alt >= 40 & AC_ref >= 40,
                                      'meets group size threshold',
                                      'below minimum group size threshold')
    ) %>%
    group_by(lipid_type, significant, group_size_threshold) %>%
    summarize(count = n())

# Filter to ensure at least 20 individuals carry each allele (allele count of at least 40)

In [None]:
# Keep only results where both the alt and the ref allele count are at least
# 40. Each sample contributes two alleles to AN, so a count of 40 implies at
# least 20 carriers.
aggregate_regenie_results <- filter(
    combined_regenie_results,
    AC_alt >= 40,
    AC_ref >= 40
)

In [None]:
# Per-analysis overview of the filtered results: the row count plus the minimum
# and maximum of each statistic of interest. The `.names` pattern yields
# columns named min_<col>, max_<col> in the order the columns are listed.
aggregate_regenie_results %>%
    group_by(lipid_type) %>%
    summarize(
        count = n(),
        across(
            c(LOG10P, A1FREQ, N, AC_alt, AC_ref),
            list(min = min, max = max),
            .names = '{.fn}_{.col}'
        )
    )

# Write out the aggregate data to local disk

In [None]:
# Write the filtered results to local disk, one TSV per analysis.
#
# The local filename is the Cloud Storage path with the 'gs://' scheme removed
# and every '/' replaced by '_'. The raw suffix is stripped if present, and the
# aggregate suffix is always appended so the outputs are recognisable as
# aggregated files.
for (lipid in LIPIDS) {
    # `[[` drops the element name so only the bare path string is used below.
    input_file <- REGENIE_RESULTS[[lipid]]
    output_stem <- input_file %>%
        str_replace('gs://', '') %>%
        str_replace_all('/', '_')
    # Compare the suffix literally rather than as a regular expression: the '.'
    # in '.regenie' would otherwise match any character.
    if (endsWith(output_stem, RAW_FILE_SUFFIX)) {
        output_stem <- str_sub(output_stem, end = -(str_length(RAW_FILE_SUFFIX) + 1))
    }
    # Always append the aggregate suffix, even when the raw suffix was absent,
    # so an output file never keeps a raw-looking name.
    output_file <- str_c(output_stem, AGGREGATE_FILE_SUFFIX)
    message(str_glue('Aggregating results from {input_file} to {output_file}'))
    stopifnot('output filename must be different from input filename' =
              output_file != input_file)
    write_tsv(aggregate_regenie_results %>% filter(lipid_type == lipid), output_file)
}

# Now you can download these files!

**Be sure to download the aggregated TSV files**, not the .regenie files with the raw results.

# Provenance 

In [None]:
# Print details of the current R session for the provenance record.
devtools::session_info()