# Annotate significant GWAS results with gnomAD

In this notebook, we use [Hail](https://hail.is/) to annotate the significant GWAS results with gnomAD.

Note that this work is part of a larger project to [Demonstrate the Potential for Pooled Analysis of All of Us and UK Biobank Genomic Data](https://github.com/all-of-us/ukb-cross-analysis-demo-project).

# Setup 

<div class="alert alert-block alert-warning">
    <b>Cloud Environment</b>: This notebook was written for use on the <i>All of Us</i> Workbench.
    <ul>
        <li>Use "Recommended Environment" <kbd><b>Hail Genomics Analysis</b></kbd> which creates compute type <kbd>Dataproc Cluster</kbd> with reasonable defaults for CPU, RAM, disk, and number of workers. If you like, you can increase the number of workers to make this job complete faster.</li>
        <li>This notebook can take a while to run. We recommend running it in the background via <kbd>run_notebook_in_the_background</kbd>.</li>
        <ul>
            <li><b>TODO(deflaux) add runtime details</b></li>
        </ul>
    </ul>
</div>

In [None]:
from datetime import datetime
import hail as hl
import os
import time

## Define constants

In [None]:
# Papermill parameters. See https://papermill.readthedocs.io/en/latest/usage-parameterize.html

#---[ Inputs ]---
# The gnomAD v3.1.2 data set contains 76,156 whole genomes (and no exomes), all mapped to the GRCh38 reference sequence.
# See also https://gnomad.broadinstitute.org/downloads
GNOMAD_TAB = 'gs://gcp-public-data--gnomad/release/3.1.2/ht/genomes/gnomad.genomes.v3.1.2.sites.ht'
# Created via notebook aou_workbench_pooled_analyses/12_examine_scientific_differences.ipynb
LIPIDS_GWAS_RESULTS = 'gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/results/20220603/significant_lipids_gwas_results.tsv'
BATCH_GWAS_RESULTS = 'gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/results/20220603/significant_regenie_batch_variants.tsv'

# BED file of exome capture regions. This file is from https://biobank.ndph.ox.ac.uk/ukb/refer.cgi?id=3803.
# The path is pinned to the same bucket as the GWAS results above. To read a copy from the current
# workspace instead, use f'{os.getenv("WORKSPACE_BUCKET")}/data/ukb/xgen_plus_spikein.GRCh38.bed'.
EXOME_REGIONS = 'gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/ukb/xgen_plus_spikein.GRCh38.bed'

# Genomic intervals to annotate; each entry is later parsed with hl.parse_locus_interval.
INTERVALS_TO_EXAMINE = ['chr1-chr22']
# Label used in output file names: intervals joined by '_' with ':' replaced by 'range'.
INTERVALS_TO_EXAMINE_NAME = '_'.join(INTERVALS_TO_EXAMINE).replace(':', 'range')

#---[ Outputs ]---
# Create a timestamp for a folder of results generated today.
# Read the clock once so DATESTAMP and TIMESTAMP can never disagree (e.g. when run across midnight).
_NOW = time.localtime()
DATESTAMP = time.strftime('%Y%m%d', _NOW)
TIMESTAMP = time.strftime('%Y%m%d_%H%M%S', _NOW)
# Current working directory, kept as a one-element list so that `WORK_DIR[0]` keeps working.
WORK_DIR = [os.getcwd()]

RESULTS_DIR = f'{os.getenv("WORKSPACE_BUCKET")}/data/results/{DATESTAMP}/'
ANNOTATED_LIPIDS_GWAS_RESULTS = f'significant_lipids_gwas_results_gnomad_annotated-{INTERVALS_TO_EXAMINE_NAME}.tsv'
ANNOTATED_BATCH_GWAS_RESULTS = f'significant_batch_gwas_results_gnomad_annotated-{INTERVALS_TO_EXAMINE_NAME}.tsv'
# The Hail log is written locally first and copied to HAIL_LOG_DIR_FOR_PROVENANCE at the end of the notebook.
HAIL_LOG = f'{WORK_DIR[0]}/hail-examine-significant-gwas-results-{TIMESTAMP}.log'
HAIL_LOG_DIR_FOR_PROVENANCE = f'{os.getenv("WORKSPACE_BUCKET")}/hail-logs/{DATESTAMP}/'

## Check access

In [None]:
# List the lipids GWAS results object to confirm that this environment can see it.
!gsutil ls {LIPIDS_GWAS_RESULTS}

In [None]:
# List the batch GWAS results object to confirm that this environment can see it.
!gsutil ls {BATCH_GWAS_RESULTS}

## Start Hail 

In [None]:
# Spark overrides passed to hl.init(). Every value is a string; the Spark default is noted beside each key.
# Background reading:
#   https://towardsdatascience.com/fetch-failed-exception-in-apache-spark-decrypting-the-most-common-causes-b8dff21075c
#   https://spark.apache.org/docs/2.4.7/configuration.html
EXTRA_SPARK_CONFIG = {
    # ---- Speculative execution ----
    # When "true", tasks that are running slowly in a stage are re-launched. (Spark default: false)
    'spark.speculation': 'true',
    # Fraction of a stage's tasks that must be complete before speculation is enabled for that stage.
    # (Spark default: 0.75)
    'spark.speculation.quantile': '0.95',

    # ---- Network ----
    # Default timeout for all network interactions. Used in place of
    # spark.core.connection.ack.wait.timeout, spark.storage.blockManagerSlaveTimeoutMs,
    # spark.shuffle.io.connectionTimeout, spark.rpc.askTimeout or spark.rpc.lookupTimeout
    # if they are not configured. (Spark default: 120s)
    'spark.network.timeout': '180s',

    # ---- Shuffle fetch retries (Netty only) ----
    # Fetches that fail due to IO-related exceptions are automatically retried when this is non-zero.
    # This helps stabilize large shuffles in the face of long GC pauses or transient network
    # connectivity issues. (Spark default: 3)
    'spark.shuffle.io.maxRetries': '10',
    # How long to wait between fetch retries. The maximum delay caused by retrying is
    # maxRetries * retryWait, which is 15 seconds with the Spark defaults. (Spark default: 5s)
    'spark.shuffle.io.retryWait': '15s',

    # ---- Task and stage retries ----
    # Number of failures of any particular task before giving up on the job. Failures spread across
    # different tasks do not fail the job; one particular task has to fail this many attempts.
    # Must be >= 1; number of allowed retries = this value - 1. (Spark default: 4)
    'spark.task.maxFailures': '10',
    # Number of consecutive stage attempts allowed before a stage is aborted. (Spark default: 4)
    'spark.stage.maxConsecutiveAttempts': '10',
}

In [None]:
# Start Hail with the extra Spark settings from EXTRA_SPARK_CONFIG, GRCh38 as the default
# reference genome, and the timestamped log file path from HAIL_LOG.
# NOTE(review): min_block_size is presumably a size in MB — confirm against the hl.init docs.
hl.init(spark_conf=EXTRA_SPARK_CONFIG,
        min_block_size=50,
        default_reference='GRCh38',
        log=HAIL_LOG)

Check the configuration.

In [None]:
# Display the Spark configuration, sorted by key, so the settings passed to hl.init() can be checked.
# Uses the public SparkContext.getConf() accessor instead of the private `_conf` attribute.
sc = hl.spark_context()
config = sorted(sc.getConf().getAll())
config

In [None]:
# Record the wall-clock start time; the Provenance section subtracts it to report the elapsed time.
start_all = datetime.now()
print(start_all)

# Read the lipids GWAS results table

In [None]:
# Load the significant lipids GWAS results TSV as a Hail Table. impute=True asks Hail to infer
# column types from the data; min_partitions=50 requests at least 50 partitions.
lipids_gwas_results = hl.import_table(LIPIDS_GWAS_RESULTS, min_partitions=50, impute=True)

In [None]:
# Print the schema of the imported table.
lipids_gwas_results.describe()

In [None]:
# Preview the first few rows of the imported table.
lipids_gwas_results.show()

In [None]:
# Split the underscore-delimited ID field into separate chr, pos, ref and alt fields.
lipids_gwas_results = hl.experimental.separate(
    lipids_gwas_results,
    field='ID',
    into=['chr', 'pos', 'ref', 'alt'],
    delim='_')

In [None]:
# Confirm that the schema now includes the chr, pos, ref and alt fields.
lipids_gwas_results.describe()

In [None]:
# Preview the first few rows after splitting the ID field.
lipids_gwas_results.show()

In [None]:
# Add a `locus` field built from the chr field and the pos field converted to an integer.
lipids_gwas_results = lipids_gwas_results.annotate(
    locus=hl.locus(lipids_gwas_results['chr'], hl.int(lipids_gwas_results['pos'])))

In [None]:
# Add an `alleles` field: a two-element array holding the ref allele followed by the alt allele.
lipids_gwas_results = lipids_gwas_results.annotate(
    alleles=hl.array([lipids_gwas_results['ref'], lipids_gwas_results['alt']]))

In [None]:
# Confirm that the new locus and alleles fields are present in the schema.
lipids_gwas_results.describe()

In [None]:
# Key the table by (locus, alleles) so that it can be joined on the variant.
lipids_gwas_results = lipids_gwas_results.key_by('locus', 'alleles')

In [None]:
# Confirm that the table is now keyed by locus and alleles.
lipids_gwas_results.describe()

# Read the batch GWAS results table

In [None]:
# Load the significant batch GWAS results TSV as a Hail Table. impute=True asks Hail to infer
# column types from the data; min_partitions=50 requests at least 50 partitions.
batch_gwas_results = hl.import_table(BATCH_GWAS_RESULTS, min_partitions=50, impute=True)

In [None]:
# Print the schema of the imported table.
batch_gwas_results.describe()

In [None]:
# Preview the first few rows of the imported table.
batch_gwas_results.show()

In [None]:
# Split the underscore-delimited ID field into separate chr, pos, ref and alt fields.
batch_gwas_results = hl.experimental.separate(
    batch_gwas_results,
    field='ID',
    into=['chr', 'pos', 'ref', 'alt'],
    delim='_')

In [None]:
# Confirm that the schema now includes the chr, pos, ref and alt fields.
batch_gwas_results.describe()

In [None]:
# Preview the first few rows after splitting the ID field.
batch_gwas_results.show()

In [None]:
# Add a `locus` field built from the chr field and the pos field converted to an integer.
batch_gwas_results = batch_gwas_results.annotate(
    locus=hl.locus(batch_gwas_results['chr'], hl.int(batch_gwas_results['pos'])))

In [None]:
# Add an `alleles` field: a two-element array holding the ref allele followed by the alt allele.
batch_gwas_results = batch_gwas_results.annotate(
    alleles=hl.array([batch_gwas_results['ref'], batch_gwas_results['alt']]))

In [None]:
# Confirm that the new locus and alleles fields are present in the schema.
batch_gwas_results.describe()

In [None]:
# Key the table by (locus, alleles) so that it can be joined on the variant.
batch_gwas_results = batch_gwas_results.key_by('locus', 'alleles')

In [None]:
# Confirm that the table is now keyed by locus and alleles.
batch_gwas_results.describe()

# Read the UKB exome capture regions

In [None]:
# Load the UKB exome capture regions BED file as a Hail Table.
# NOTE(review): no reference_genome is passed, so the default configured in hl.init (GRCh38)
# presumably applies — confirm against the hl.import_bed docs.
ukb_exome_capture_regions = hl.import_bed(EXOME_REGIONS)

In [None]:
# Print the schema of the capture regions table.
ukb_exome_capture_regions.describe()

# Read the gnomAD variant annotation table

In [None]:
# Open the public gnomAD v3.1.2 genomes sites Hail Table.
gnomad = hl.read_table(GNOMAD_TAB)

In [None]:
# Print the gnomAD table schema to see which annotation fields are available.
gnomad.describe()

## Filter to include only our genomic intervals of interest

In [None]:
# Keep only the gnomAD rows that fall inside the intervals listed in INTERVALS_TO_EXAMINE.
# Each interval string is parsed into a Hail interval first.
gnomad = hl.filter_intervals(
    gnomad,
    [hl.parse_locus_interval(x) for x in INTERVALS_TO_EXAMINE],
    keep=True)

## Filter gnomAD to just the variants within the UKB exome capture regions

In [None]:
# Keep only the gnomAD rows that fall inside the UKB exome capture intervals.
# collect() gathers the `interval` field of the capture regions table into a local list,
# which is then passed to filter_intervals.
gnomad = hl.filter_intervals(
    gnomad,
    ukb_exome_capture_regions['interval'].collect(),
    keep=True)

In [None]:
# Preview the rsid field for the first few rows of the filtered gnomAD table.
gnomad.rsid.show()

In [None]:
# Preview the first few rows of the filtered gnomAD table.
gnomad.show()

# Annotate significant lipids GWAS results with gnomAD

In [None]:
# Left join: keep every significant lipids GWAS row and attach the gnomAD fields for its
# (locus, alleles) key; rows without a matching gnomAD row get missing annotations.
annotated_lipids_gwas_results = lipids_gwas_results.join(gnomad, how='left')

In [None]:
# Print the schema of the joined table.
annotated_lipids_gwas_results.describe()

## Write annotated lipids GWAS results to TSV

In [None]:
# Export the annotated lipids results as a TSV file under RESULTS_DIR.
annotated_lipids_gwas_results.export(os.path.join(RESULTS_DIR, ANNOTATED_LIPIDS_GWAS_RESULTS))

# Annotate significant batch GWAS results with gnomAD

In [None]:
# Left join: keep every significant batch GWAS row and attach the gnomAD fields for its
# (locus, alleles) key; rows without a matching gnomAD row get missing annotations.
annotated_batch_gwas_results = batch_gwas_results.join(gnomad, how='left')

In [None]:
# Print the schema of the joined table.
annotated_batch_gwas_results.describe()

## Write annotated batch GWAS results to TSV

In [None]:
# Export the annotated batch results as a TSV file under RESULTS_DIR.
annotated_batch_gwas_results.export(os.path.join(RESULTS_DIR, ANNOTATED_BATCH_GWAS_RESULTS))

In [None]:
# List the results folder to confirm that the exported TSV files are present.
!gsutil ls {RESULTS_DIR}

# Provenance

In [None]:
# Report the finish time and, on the next line, the elapsed time since start_all was recorded.
end_all = datetime.now()
print(end_all, end_all - start_all, sep='\n')

In [None]:
# Copy the Hail log to the workspace bucket so that we can retain it.
# gzip --keep writes a compressed copy next to the original log; the .gz file is then uploaded.
!gzip --keep {HAIL_LOG}
!gsutil cp -v {HAIL_LOG}.gz {HAIL_LOG_DIR_FOR_PROVENANCE}

In [None]:
# Print the current time as a provenance record.
print(datetime.now())

In [None]:
!pip3 freeze