# Write merged AoU + UKB BGEN file

In this notebook, we use [Hail](https://hail.is/) to write the merged matrix table to a BGEN file for use with other tools such as [PLINK2](https://www.cog-genomics.org/plink/2.0/) and [regenie](https://rgcgithub.github.io/regenie/).

Note that this work is part of a larger project to [Demonstrate the Potential for Pooled Analysis of All of Us and UK Biobank Genomic Data](https://docs.google.com/document/d/19ZS0z_-7FEM37pNDAXaWaqBSLnqyd9MZEkiOmtF3n_0/edit#). Specifically this is for the portion of the project that is the **pooled** analysis.

# Setup

<div class="alert alert-block alert-warning">
    <b>Cloud Environment</b>: This notebook was written for use on the <i>All of Us</i> Workbench.
    <ul>
        <li>Use "Recommended Environment" <kbd><b>Hail Genomics Analysis</b></kbd> which creates compute type <kbd>Dataproc Cluster</kbd> with reasonable defaults for CPU, RAM, disk, and number of workers. If you like, you can increase the number of workers to make this job complete faster.</li>
        <li>This notebook can take several hours to run. We recommend running it in the background via <kbd>run_notebook_in_the_background</kbd>.</li>
        <ul>
            <li>chr1 - chr22 <b>TODO(deflaux) add these details</b></li>
        </ul>
    </ul>
</div>

In [None]:
from datetime import datetime
import hail as hl
import os
import time

## Define constants

In [None]:
# Papermill parameters. See https://papermill.readthedocs.io/en/latest/usage-parameterize.html

#---[ Inputs ]---
# This matrix table was created via notebook `aou_workbench_pooled_analyses/03_merge_variants.ipynb`. 
MERGED_MT = [
    'gs://fc-secure-e53e4a44-7fe2-42b7-89b7-01aae1e399f7/data/pooled/geno/20220209/merged-aou3-ukb-filtered-chr1-chr22.mt'
]
INTERVALS_TO_EXAMINE = ['chr1-chr22']
INTERVALS_TO_EXAMINE_NAME = '_'.join(INTERVALS_TO_EXAMINE).replace(':', 'range')

#---[ Outputs ]---
# Create a timestamp for a folder of results generated today.
DATESTAMP = time.strftime('%Y%m%d')
TIMESTAMP = time.strftime('%Y%m%d_%H%M%S')
WORK_DIR = !pwd

OUTPUT_BGEN = f'{os.getenv("WORKSPACE_BUCKET")}/data/pooled/geno/{DATESTAMP}/aou-alpha3-ukb-{INTERVALS_TO_EXAMINE_NAME}' # Hail will add the .bgen suffix.
HAIL_LOG = f'{WORK_DIR[0]}/hail-write-bgen-{TIMESTAMP}.log'
HAIL_LOG_DIR_FOR_PROVENANCE = f'{os.getenv("WORKSPACE_BUCKET")}/hail-logs/{DATESTAMP}/'

In [None]:
# Display the resolved output path (without the .bgen suffix) for the run record.
OUTPUT_BGEN

## Check access

In [None]:
for mt in MERGED_MT:
    !gsutil ls {mt}
    print('\n')

## Start Hail 

In [None]:
# Spark settings layered on top of the cluster defaults when Hail is initialised.
# They make the job more tolerant of slow tasks, network hiccups and task/stage failures.
# See also https://towardsdatascience.com/fetch-failed-exception-in-apache-spark-decrypting-the-most-common-causes-b8dff21075c
# See https://spark.apache.org/docs/2.4.7/configuration.html

EXTRA_SPARK_CONFIG = {
    # If set to "true", performs speculative execution of tasks. This means if one or more tasks are running
    # slowly in a stage, they will be re-launched.
    'spark.speculation': 'true', # Default is false.
    
    # Fraction of tasks which must be complete before speculation is enabled for a particular stage.
    'spark.speculation.quantile': '0.95', # Default is 0.75

    # Default timeout for all network interactions. This config will be used in place of 
    # spark.core.connection.ack.wait.timeout, spark.storage.blockManagerSlaveTimeoutMs, 
    # spark.shuffle.io.connectionTimeout, spark.rpc.askTimeout or spark.rpc.lookupTimeout if they are not configured.
    'spark.network.timeout': '180s', # Default is 120s
        
    # (Netty only) Fetches that fail due to IO-related exceptions are automatically retried if this is set to a
    # non-zero value. This retry logic helps stabilize large shuffles in the face of long GC pauses or transient
    # network connectivity issues.
    'spark.shuffle.io.maxRetries': '10',  # Default is 3
    
    # (Netty only) How long to wait between retries of fetches. The maximum delay caused by retrying is 15 seconds
    # by default, calculated as maxRetries * retryWait.
    # With the values set here the maximum delay becomes 10 * 15s = 150s.
    'spark.shuffle.io.retryWait': '15s',  # Default is 5s
    
    # Number of failures of any particular task before giving up on the job. The total number of failures spread
    # across different tasks will not cause the job to fail; a particular task has to fail this number of attempts.
    # Should be greater than or equal to 1. Number of allowed retries = this value - 1.
    'spark.task.maxFailures': '10', # Default is 4.

    # Number of consecutive stage attempts allowed before a stage is aborted.
    'spark.stage.maxConsecutiveAttempts': '10' # Default is 4.
}

In [None]:
# Start Hail with the extra Spark settings and GRCh38 as the default reference genome.
# The log goes to the timestamped local file so it can be copied to the bucket at the end.
hl.init(spark_conf=EXTRA_SPARK_CONFIG,
        min_block_size=50,
        default_reference='GRCh38',
        log=HAIL_LOG)

Check the configuration.

In [None]:
# Display the effective Spark settings as (key, value) pairs sorted by key,
# so the overrides passed to hl.init can be verified.
sc = hl.spark_context()
# Use the public SparkContext.getConf() accessor instead of the private `_conf` attribute.
config = sorted(sc.getConf().getAll())
config

# Read the matrix table

In [None]:
# Print the partition count next to the path of every input matrix table.
for mt_path in MERGED_MT:
    partition_count = hl.read_matrix_table(mt_path).n_partitions()
    print(f'{partition_count} {mt_path}')

In [None]:
# Begin with the first table, then append the rows of each remaining table in order.
merged = hl.read_matrix_table(MERGED_MT[0])

for extra_path in MERGED_MT[1:]:
    extra_mt = hl.read_matrix_table(extra_path)
    merged = merged.union_rows(extra_mt)

In [None]:
# Print the schema of the combined matrix table as read from storage.
merged.describe()

## Filter to our intervals of interest

In [None]:
# Keep only the rows that fall inside the requested intervals.
# Nothing is filtered when no intervals were requested.
if len(INTERVALS_TO_EXAMINE) > 0:
    parsed_intervals = [
        hl.parse_locus_interval(interval_text)
        for interval_text in INTERVALS_TO_EXAMINE
    ]
    merged = hl.filter_intervals(merged, parsed_intervals, keep=True)

## Create a single value to hold both parts of the column key

In [None]:
# Add a column field `cohort_key` of the form '<s>_<cohort>'.
merged = merged.annotate_cols(cohort_key = merged.s + '_' + merged.cohort)

In [None]:
# Re-key the columns by the combined `cohort_key` field alone.
merged = merged.key_cols_by('cohort_key')

In [None]:
# Print the schema again to confirm the new column field and column key.
merged.describe()

## Create an rsid

This is needed by PLINK. We build the ID from the contig, position, and the first two alleles of each variant.

In [None]:
# Build a per-variant identifier '<contig>_<position>_<alleles[0]>_<alleles[1]>'
# and store it in the row field `rsid`.
variant_label = (
    merged.locus.contig
    + '_' + hl.str(merged.locus.position)
    + '_' + merged.alleles[0]
    + '_' + merged.alleles[1]
)
merged = merged.annotate_rows(rsid=variant_label)

# Write the matrix table to BGEN

See the Hail documentation for [`export_bgen`](https://hail.is/docs/0.2/methods/impex.html#hail.methods.export_bgen).

In [None]:
# Note the wall-clock start time; the elapsed time is printed after the export cell.
start = datetime.now()
print(start)

In [None]:
# Derive a three-value GP entry from the hard call in GT.
# The vectors are ordered [hom-ref, het, hom-var], with 1.0 in the slot matching the call.
hard_call = merged.GT

gp_from_call = (
    hl.case()
    .when(hard_call.is_hom_ref(), hl.literal([1.0, 0.0, 0.0]))
    .when(hard_call.is_het(), hl.literal([0.0, 1.0, 0.0]))
    # Calls matched by neither condition above receive the hom-var vector.
    .default(hl.literal([0.0, 0.0, 1.0]))
)

merged = merged.annotate_entries(GP=gp_from_call)

In [None]:
# Write the matrix table to BGEN using the GP entries and the generated rsid.
hl.export_bgen(
    mt=merged,
    output=OUTPUT_BGEN,
    gp=merged.GP,
    rsid=merged.rsid,
    parallel=None,
)

In [None]:
# Print the finish time and the elapsed time since `start` was last set.
end = datetime.now()
print(end)
print(end - start)

In [None]:
# Reset the start time so the indexing step is timed separately.
start = datetime.now()
print(start)

In [None]:
# Index the BGEN file that was just written, at the output path plus '.bgen'.
hl.index_bgen(f'{OUTPUT_BGEN}.bgen')

In [None]:
# Print the finish time and the elapsed time since `start` was last set.
end = datetime.now()
print(end)
print(end - start)

# Provenance

In [None]:
# Copy the Hail log to the workspace bucket so that we can retain it.
# `--keep` leaves the uncompressed log in place next to the .gz copy that is uploaded.
!gzip --keep {HAIL_LOG}
!gsutil cp {HAIL_LOG}.gz {HAIL_LOG_DIR_FOR_PROVENANCE}

In [None]:
# Record the wall-clock time at which this cell ran.
print(datetime.now())

In [None]:
!pip3 freeze