# Write filtered AoU BGEN file

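In this notebook, we filter the All of Us (AoU) matrix table to autosomal, exonic variants that have no filter flags, and then write the result to a BGEN file for use with other tools such as [PLINK2](https://www.cog-genomics.org/plink/2.0/) and [regenie](https://rgcgithub.github.io/regenie/).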

# Setup 

<div class="alert alert-block alert-warning">
    <b>Cloud Environment</b>: This notebook was written for use on the All of Us Workbench.
    <ul>
        <li>Use compute type 'Dataproc cluster' with default CPU, RAM, and number of workers.</li>
        <li>This notebook can take a while to run. We recommend running it in the background via <kbd>run_notebook_in_the_background</kbd>.</li>
    </ul>
</div>

In [None]:
from datetime import datetime
import hail as hl
import os
import time

## Define constants

In [None]:
# Cloud Storage path of the All of Us matrix table that this notebook reads,
# filters, and exports. The value below is a placeholder; replace it with the
# real location before running.
AOU_MT = 'gs://fc-secure-.../PATH/TO/AOU/ALPHA3/MATRIX/TABLE.mt'

In [None]:
# Genomic intervals to keep, written in the syntax passed to hl.parse_locus_interval.
INTERVALS_TO_EXAMINE = ['chr1-chr22']  # Only include autosomes.
# Label used in output file names: intervals joined by '_', with any ':' spelled out as 'range'.
INTERVALS_TO_EXAMINE_NAME = '_'.join(
    interval.replace(':', 'range') for interval in INTERVALS_TO_EXAMINE
)

In [None]:
# BED file of exome capture regions, located under this workspace's bucket
# (the bucket is taken from the WORKSPACE_BUCKET environment variable).
EXOME_REGIONS = '{}/data/ukb-public/xgen_plus_spikein.GRCh38.bed'.format(os.getenv("WORKSPACE_BUCKET"))

In [None]:
RESULT_BUCKET = os.getenv("WORKSPACE_BUCKET")
DATESTAMP = time.strftime('%Y%m%d')
TIMESTAMP = time.strftime('%Y%m%d_%H%M%S')
WORK_DIR = !pwd

# Outputs
OUTPUT_BGEN = f'{os.getenv("WORKSPACE_BUCKET")}/data/aou/{DATESTAMP}/aou-alpha3-{INTERVALS_TO_EXAMINE_NAME}' # Hail will add the .bgen suffix.
HAIL_LOG = f'{WORK_DIR[0]}/hail-write-filtered-bgen-{TIMESTAMP}.log'
HAIL_LOG_DIR_FOR_PROVENANCE = f'{os.getenv("WORKSPACE_BUCKET")}/hail-logs/{DATESTAMP}/'

In [None]:
# Display the output path root so it is recorded in the notebook output.
OUTPUT_BGEN

## Check access

In [None]:
# List the matrix table path with gsutil to confirm that this environment can access it
# before starting Hail.
!gsutil ls {AOU_MT}

## Start Hail 

In [None]:
# Spark overrides passed to hl.init() to make long jobs more tolerant of slow
# tasks and transient failures.
# See also https://towardsdatascience.com/fetch-failed-exception-in-apache-spark-decrypting-the-most-common-causes-b8dff21075c
# See https://spark.apache.org/docs/2.4.7/configuration.html
EXTRA_SPARK_CONFIG = dict([
    # Speculative execution: tasks running slowly within a stage are re-launched.
    # Default is false.
    ('spark.speculation', 'true'),

    # Fraction of a stage's tasks that must finish before speculation kicks in
    # for that stage. Default is 0.75.
    ('spark.speculation.quantile', '0.95'),

    # Default timeout for all network interactions. Used in place of
    # spark.core.connection.ack.wait.timeout, spark.storage.blockManagerSlaveTimeoutMs,
    # spark.shuffle.io.connectionTimeout, spark.rpc.askTimeout or spark.rpc.lookupTimeout
    # when those are not configured. Default is 120s.
    ('spark.network.timeout', '180s'),

    # (Netty only) Number of automatic retries for fetches that fail with IO-related
    # exceptions. Helps stabilize large shuffles in the face of long GC pauses or
    # transient network connectivity issues. Default is 3.
    ('spark.shuffle.io.maxRetries', '10'),

    # (Netty only) Wait between fetch retries. The maximum delay caused by retrying
    # is maxRetries * retryWait (15 seconds with Spark's defaults). Default is 5s.
    ('spark.shuffle.io.retryWait', '15s'),

    # Failures of any one task tolerated before the job is abandoned. Failures spread
    # across different tasks do not count together; a single task has to fail this many
    # attempts. Must be >= 1; allowed retries = this value - 1. Default is 4.
    ('spark.task.maxFailures', '10'),

    # Consecutive stage attempts allowed before a stage is aborted. Default is 4.
    ('spark.stage.maxConsecutiveAttempts', '10'),
])

In [None]:
# Start Hail with the Spark overrides defined above, GRCh38 as the default
# reference genome, and a run-specific log file (HAIL_LOG) that is copied to the
# workspace bucket in the Provenance section.
# NOTE(review): min_block_size is documented by Hail as a size in MB - confirm 50
# is still the intended value for this dataset.
hl.init(spark_conf=EXTRA_SPARK_CONFIG,
        min_block_size=50,
        default_reference='GRCh38',
        log=HAIL_LOG)

Check the configuration.

In [None]:
# Show the effective Spark configuration entries in sorted order so the
# overrides requested at init time can be verified.
sc = hl.spark_context()
config = sorted(sc._conf.getAll())
config

# Load exome capture regions

In [None]:
# Import the exome capture regions BED file as a Hail table; the table's
# `interval` field is used below for inspection and for filtering variants.
ukb_exome_capture_regions = hl.import_bed(EXOME_REGIONS)

In [None]:
# Print the schema of the capture-regions table.
ukb_exome_capture_regions.describe()

In [None]:
# Count capture intervals per contig, using the contig of each interval's start locus.
ukb_exome_capture_regions.aggregate(hl.agg.counter(ukb_exome_capture_regions.interval.start.contig))

In [None]:
# Preview the first 5 rows of the capture-regions table.
ukb_exome_capture_regions.show(5)

# Read the matrix table

In [None]:
# Read the All of Us matrix table from the path configured in AOU_MT.
# hl.read_matrix_table is Hail's reader for native matrix tables (.mt).
aou_mt = hl.read_matrix_table(AOU_MT)

In [None]:
# Print the matrix table's schema so the fields used below (locus, alleles,
# filters, GT) can be checked.
aou_mt.describe()

## Filter to our intervals of interest

In [None]:
# Restrict the matrix table to the configured intervals. An empty list means
# "no interval restriction", so the table is left untouched in that case.
if INTERVALS_TO_EXAMINE:
    aou_mt = hl.filter_intervals(
        aou_mt,
        list(map(hl.parse_locus_interval, INTERVALS_TO_EXAMINE)),
        keep=True)

## Filter to include only exonic variants

In [None]:
# Keep only variants whose locus has a defined match in the exome capture
# regions table; rows with no matching capture interval are dropped.
aou_mt = aou_mt.filter_rows(hl.is_defined(ukb_exome_capture_regions[aou_mt.locus]))

## Omit variants with filter flags

# Before filtering: count variants per distinct value of the `filters` row field.
aou_mt_rows = aou_mt.rows()
aou_mt_rows.group_by(aou_mt_rows.filters).aggregate(n = hl.agg.count()).show()

# Keep only variants whose `filters` field is missing.
# NOTE(review): this treats "missing" as "no filter flags". If this matrix table
# can also encode passing variants as an empty set, those rows would be dropped
# here - confirm against the counts shown above.
aou_mt = aou_mt.filter_rows(hl.is_missing(aou_mt.filters))

# After filtering: repeat the count to confirm only unflagged variants remain.
aou_mt_rows = aou_mt.rows()
aou_mt_rows.group_by(aou_mt_rows.filters).aggregate(n = hl.agg.count()).show()

## Create an rsid

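PLINK needs a variant identifier, so we construct one of the form `contig_position_allele1_allele2` from each variant's locus and its first two alleles.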

In [None]:
# Build a variant identifier of the form contig_position_allele0_allele1 and
# store it in the `rsid` row field, which is passed to export_bgen below.
# NOTE(review): only alleles[0] and alleles[1] are used, so this presumably
# assumes biallelic rows - confirm for this matrix table.
aou_mt = aou_mt.annotate_rows(
    rsid = aou_mt.locus.contig + '_' + hl.str(aou_mt.locus.position)
            + '_' + aou_mt.alleles[0] + '_' + aou_mt.alleles[1])

# Write the matrix table to BGEN

See the Hail documentation for `export_bgen`: https://hail.is/docs/0.2/methods/impex.html#hail.methods.export_bgen

In [None]:
# Record and print the wall-clock start time of the BGEN export.
start = datetime.now()
print(start)

In [None]:
# The export below takes genotype probabilities (GP), so derive them from the
# hard calls as one-hot triples ordered [hom-ref, het, hom-var].
homref_gp = hl.literal([1.0, 0.0, 0.0])
het_gp = hl.literal([0.0, 1.0, 0.0])
homvar_gp = hl.literal([0.0, 0.0, 1.0])

# Any call that is neither hom-ref nor het falls through to the hom-var default.
# NOTE(review): confirm how a missing GT propagates through hl.case() here, so
# that missing calls are not exported as hom-var.
aou_mt = aou_mt.annotate_entries(
    GP = hl.case()
        .when(aou_mt.GT.is_hom_ref(), homref_gp)
        .when(aou_mt.GT.is_het(), het_gp)
        .default(homvar_gp)
)

In [None]:
# Export the filtered matrix table to BGEN, using the derived GP entry field as
# genotype probabilities and the constructed `rsid` row field as the variant ID.
# OUTPUT_BGEN is a path root: Hail adds the .bgen suffix. parallel=None requests
# a single output file rather than per-partition shards.
hl.methods.export_bgen(mt=aou_mt, output=OUTPUT_BGEN, gp=aou_mt.GP, rsid=aou_mt.rsid, parallel=None)

In [None]:
# Print the finish time and the elapsed wall-clock time of the BGEN export.
end = datetime.now()
print(end)
print(end - start)

In [None]:
# Record and print the wall-clock start time of the BGEN indexing step.
start = datetime.now()
print(start)

In [None]:
# Index the BGEN file that was just written. The '.bgen' suffix is appended here
# because OUTPUT_BGEN holds only the path root that was given to export_bgen.
hl.methods.index_bgen(OUTPUT_BGEN + '.bgen')

In [None]:
# Print the finish time and the elapsed wall-clock time of the indexing step.
end = datetime.now()
print(end)
print(end - start)

# Provenance

In [None]:
# Copy the Hail log to the workspace bucket so that we can retain it.
!gzip --keep {HAIL_LOG}
!gsutil cp {HAIL_LOG}.gz {HAIL_LOG_DIR_FOR_PROVENANCE}

In [None]:
# Record when the notebook finished running.
print(datetime.now())

In [None]:
# Record the installed Python package versions for reproducibility.
!pip3 freeze