# Create matrix tables 

In this notebook, we prepare small matrix tables for use in analysis testing.

# Setup 

In [None]:
from datetime import datetime
import hail as hl
import os
import time

## Retrieve exome region file

In [None]:
!wget -nd biobank.ndph.ox.ac.uk/ukb/ukb/auxdata/xgen_plus_spikein.GRCh38.bed

In [None]:
# Copy the downloaded BED file to data/ukb/exomes/ in the workspace bucket.
# NOTE(review): ${WORKSPACE_BUCKET} is presumably expanded by the shell from the
# environment; confirm no Python variable named WORKSPACE_BUCKET is defined, since
# IPython also interpolates {...} in `!` lines.
!gsutil cp xgen_plus_spikein.GRCh38.bed ${WORKSPACE_BUCKET}/data/ukb/exomes/

## Define constants

In [None]:
# Input and output locations. Paths inside the workspace are rooted at the
# WORKSPACE_BUCKET environment variable.

# --- All of Us: source VCF glob and matrix table path ---
AOU_VCFS = 'gs://fc-aou-preprod-datasets-controlled/5/wgs/vcf/merged/*.vcf.gz'
AOU_MT = f'{os.getenv("WORKSPACE_BUCKET")}/data/aou/alpha1/cohort.mt'

# --- UK Biobank exomes: one VCF glob per chromosome number in the range
# (range(21, 22) yields only 21) and the output matrix table path ---
UKB_VCFS = [
    f'gs://fc-7130e767-a885-4678-95ed-7c966c79e2d0/200K/pvcf/ukb23156_c{chrom}*.vcf.gz'
    for chrom in range(21, 22)
]
UKB_MT = f'{os.getenv("WORKSPACE_BUCKET")}/data/ukb/exomes/chr21.mt'

# --- Exome capture regions (BED file in the workspace bucket) ---
EXOME_REGIONS = f'{os.getenv("WORKSPACE_BUCKET")}/data/ukb/exomes/xgen_plus_spikein.GRCh38.bed'

# --- Region the tables are limited to ---
INTERVAL_TO_EXAMINE = 'chr21'

In [None]:
# Values scoped to this run: the workspace bucket plus date and time stamps taken
# when this cell executes.
RESULT_BUCKET = os.getenv("WORKSPACE_BUCKET")
DATESTAMP = time.strftime('%Y%m%d')
TIMESTAMP = time.strftime('%Y%m%d_%H%M%S')
# `!pwd` output is captured as a list of lines, which is why WORK_DIR[0] is used below.
WORK_DIR = !pwd

# Timestamped Hail log file placed in the notebook's working directory.
HAIL_LOG = f'{WORK_DIR[0]}/hail-make-mt-{TIMESTAMP}.log'

## Check access

In [None]:
# List objects matching the AoU VCF glob (first lines only) to confirm read access.
!gsutil ls {AOU_VCFS} | head

In [None]:
# List objects matching the first UKB VCF glob to confirm read access.
!gsutil ls {UKB_VCFS[0]}

# Start Hail 

In [None]:
# Additional Spark properties passed to hl.init() as `spark_conf`.
# Built from (property, value) pairs; all values are strings.
EXTRA_SPARK_CONFIG = dict((
    # Driver settings
    ('spark.driver.extraJavaOptions', '-Xss4M'),
    ('spark.driver.maxResultSize', '50G'),
    ('spark.driver.memory', '90G'),
    # Executor settings
    ('spark.executor.extraJavaOptions', '-Xss4M'),
    # Serialization, memory and networking
    ('spark.kryoserializer.buffer.max', '1G'),
    ('spark.memory.fraction', '0.33'),
    ('spark.network.timeout', '300'),
    # Speculative execution and task retries
    ('spark.speculation', 'true'),
    ('spark.speculation.quantile', '0.95'),
    ('spark.task.maxFailures', '20'),
))

In [None]:
# Start Hail: GRCh38 becomes the default reference genome, the extra Spark
# properties are applied, and the log goes to the timestamped HAIL_LOG path.
hl.init(
    log=HAIL_LOG,
    default_reference='GRCh38',
    min_block_size=50,
    spark_conf=EXTRA_SPARK_CONFIG,
)

# Load exome capture regions

In [None]:
# Read the exome capture-region BED file from the workspace bucket into a Hail table.
ukb_exome_capture_regions = hl.import_bed(EXOME_REGIONS)

In [None]:
# Show the schema of the capture-region table.
ukb_exome_capture_regions.describe()

In [None]:
# Preview the first 5 rows of the capture-region table.
ukb_exome_capture_regions.show(5)


## Temporary - limit to chr21

In [None]:
# Parse the configured region string (INTERVAL_TO_EXAMINE) into a locus interval.
chr21_interval = hl.parse_locus_interval(INTERVAL_TO_EXAMINE)

In [None]:
# Keep only the capture regions whose interval overlaps the interval under examination.
# Note: this rebinds ukb_exome_capture_regions to the filtered table.
ukb_exome_capture_regions = ukb_exome_capture_regions.filter(
    chr21_interval.overlaps(ukb_exome_capture_regions.interval))

In [None]:
# Preview the first 5 rows again, now limited to the interval under examination.
ukb_exome_capture_regions.show(5)

# Create AoU matrix table

<div class="alert alert-block alert-info">
<b>Note:</b> The AoU matrix table for the alpha1 release was created via the notebook 'Hail Demo' and then moved to <code>data/aou/alpha1/cohort.mt</code> within the workspace bucket (see <code>AOU_MT</code>).
</div>

In [None]:
# Confirm the AoU matrix table exists at its expected path.
!gsutil ls {AOU_MT}

# Create UKB exomes matrix table

In [None]:
# Import the UKB exome VCF shards matched by UKB_VCFS as a single matrix table.
# (Passing drop_samples=True here would import a sites-only dataset instead.)
ukb_exomes = hl.import_vcf(
    UKB_VCFS,
    force_bgz=True,                 # read the .vcf.gz inputs as block-gzipped files
    array_elements_required=False,  # allow missing elements inside array fields
)

In [None]:
# Show the schema of the imported UKB exome matrix table.
ukb_exomes.describe()

In [None]:
# Keep only variants whose locus falls inside one of the retained capture regions:
# indexing the interval table by locus is defined only where a region contains it.
ukb_exomes = ukb_exomes.filter_rows(
    hl.is_defined(ukb_exome_capture_regions[ukb_exomes.locus]))

In [None]:
# Record and display the wall-clock time just before the matrix table is written.
start = datetime.now()
print(start)

In [None]:
# Write the filtered UKB exome matrix table to the workspace bucket.
# NOTE(review): with the default overwrite=False this is expected to fail if UKB_MT
# already exists, so re-running the notebook needs the old table removed first - confirm.
ukb_exomes.write(UKB_MT)

In [None]:
# Capture the finish time, then report it together with the elapsed duration
# since `start` (one value per line).
end = datetime.now()
print(end, end - start, sep='\n')

In [None]:
# List the contents of the newly written UKB matrix table to confirm it exists.
!gsutil ls {UKB_MT}

# Provenance

In [None]:
# Record when this notebook run finished.
print(datetime.now())

In [None]:
!pip3 freeze