# Examine variant merge results

In this notebook, we examine the results of merging a small number of AoU (All of Us) and UKB (UK Biobank) variants.

# Setup 

In [None]:
from datetime import datetime
import hail as hl
import os
import time

In [None]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
# Configure bokeh so that plots passed to show() render inline in this notebook.
output_notebook()

## Define constants

<div class="alert alert-block alert-info">
<b>Note:</b> The AoU matrix table for the alpha1 release was created via notebook 'Hail Demo' and then moved to a better place within the workspace bucket. It contains all samples and variants for the alpha1 release.
</div>

In [None]:
# Path to the AoU alpha1 cohort matrix table in the workspace bucket.
AOU_MT = 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/aou/alpha1/cohort.mt'

<div class="alert alert-block alert-info">
<b>Note:</b> The UKB matrix table was created via notebook 'create_matrix_tables'. It contains data for all samples within region <kbd>chr21:10M-chr21:20M</kbd>.
</div>

In [None]:
# Path to the UKB exomes chr21 cohort matrix table in the workspace bucket.
UKB_MT = 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/ukb/exomes/cohort_chr21.mt'

<div class="alert alert-block alert-info">
<b>Note:</b> The merged matrix table and the AoU-only and UKB-only tables below were created via notebook 'merge_variants'.
</div>

In [None]:
# All three merge outputs live in the same dated directory and cover the same region,
# so the shared pieces are spelled out once and the paths are assembled from them.
_MERGED_DIR = 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210527'
_MERGED_REGION = 'chr21_10M-chr21_20M'

# Variants merged across both cohorts (Hail matrix table).
MERGED_MT = f'{_MERGED_DIR}/merged-{_MERGED_REGION}.mt'
# Variants left unmerged from each cohort (Hail tables).
AOU_ONLY_TAB = f'{_MERGED_DIR}/aou_only-{_MERGED_REGION}.tab'
UKB_ONLY_TAB = f'{_MERGED_DIR}/ukb_only-{_MERGED_REGION}.tab'

In [None]:
# Display today's date in the YYYYMMDD format used for DATESTAMP.
time.strftime('%Y%m%d')

In [None]:
# Value of the WORKSPACE_BUCKET environment variable; os.getenv returns None if it is unset.
RESULT_BUCKET = os.getenv("WORKSPACE_BUCKET")

# Read the clock once so DATESTAMP and TIMESTAMP always describe the same instant
# (two independent strftime() calls could straddle midnight and disagree on the date).
_RUN_STARTED = time.localtime()
DATESTAMP = time.strftime('%Y%m%d', _RUN_STARTED)
TIMESTAMP = time.strftime('%Y%m%d_%H%M%S', _RUN_STARTED)

# Current working directory, obtained without spawning a shell. It is kept as a
# one-element list so that WORK_DIR[0] lookups behave as they did with `!pwd`.
WORK_DIR = [os.getcwd()]

# Outputs
# Per-run Hail log file, written to the working directory.
HAIL_LOG = f'{WORK_DIR[0]}/hail-examine-merged-variants-{TIMESTAMP}.log'

## Check access

In [None]:
# List the merged matrix table path to confirm it is readable from this environment.
!gsutil ls {MERGED_MT}

In [None]:
# List the AoU-only table path to confirm it is readable from this environment.
!gsutil ls {AOU_ONLY_TAB}

In [None]:
# List the UKB-only table path to confirm it is readable from this environment.
!gsutil ls {UKB_ONLY_TAB}

## Start Hail 

In [None]:
# The driver and executor JVMs are given the same extra Java options.
_JVM_EXTRA_OPTS = '-Xss4M'

# Spark properties handed to hl.init() on top of the cluster defaults.
# Listed as (property, value) pairs, grouped by what they configure.
EXTRA_SPARK_CONFIG = dict([
    # Driver
    ('spark.driver.extraJavaOptions', _JVM_EXTRA_OPTS),
    ('spark.driver.maxResultSize', '50G'),
    ('spark.driver.memory', '90G'),
    # Executors
    ('spark.executor.extraJavaOptions', _JVM_EXTRA_OPTS),
    # Serialization, memory and networking
    ('spark.kryoserializer.buffer.max', '1G'),
    ('spark.memory.fraction', '0.33'),
    ('spark.network.timeout', '300'),
    # Speculative execution and task retries
    ('spark.speculation', 'true'),
    ('spark.speculation.quantile', '0.95'),
    ('spark.task.maxFailures', '20'),
])

In [None]:
# Start Hail with the extra Spark properties, GRCh38 as the default reference
# genome, and a per-run log file.
hail_init_options = dict(
    spark_conf=EXTRA_SPARK_CONFIG,
    min_block_size=50,
    default_reference='GRCh38',
    log=HAIL_LOG,
)
hl.init(**hail_init_options)

# Read merged matrix table

In [None]:
# Open the merged AoU + UKB matrix table.
merged = hl.read_matrix_table(MERGED_MT)

In [None]:
# Show the schema of the merged matrix table.
merged.describe()

In [None]:
# Summarize the variants present in the merged matrix table.
hl.summarize_variants(merged)

# Read AoU-only table

In [None]:
# Open the table of AoU-only variants.
aou_only = hl.read_table(AOU_ONLY_TAB)

In [None]:
# Show the schema of the AoU-only table.
aou_only.describe()

In [None]:
# Summarize the variants present in the AoU-only table.
hl.summarize_variants(aou_only)

# Read UKB-only table

In [None]:
# Open the table of UKB-only variants.
ukb_only = hl.read_table(UKB_ONLY_TAB)

In [None]:
# Show the schema of the UKB-only table.
ukb_only.describe()

In [None]:
# Summarize the variants present in the UKB-only table.
hl.summarize_variants(ukb_only)

# Examine the data

## Are the unmerged variants mostly rare?


**Answer**: yes for UKB, somewhat for AoU

In [None]:
# Allele frequency of each AoU-only variant: AF is indexed by aou_a_index - 1
# (presumably because aou_a_index is 1-based -- confirm against 'merge_variants').
aou_only_af = aou_only.aou_info.AF[aou_only.aou_a_index - 1]

# Quantile points at which to report the approximate AF distribution.
af_quantile_points = [0, 0.25, 0.5, 0.75, 0.90, 0.99, 1]

aou_only.aggregate(hl.agg.approx_quantiles(aou_only_af, af_quantile_points))

In [None]:
# Allele frequency of each UKB-only variant: AF is indexed by ukb_a_index - 1
# (presumably because ukb_a_index is 1-based -- confirm against 'merge_variants').
ukb_only_af = ukb_only.ukb_info.AF[ukb_only.ukb_a_index - 1]

# Quantile points at which to report the approximate AF distribution.
af_quantile_points = [0, 0.25, 0.5, 0.75, 0.90, 0.99, 1]

ukb_only.aggregate(hl.agg.approx_quantiles(ukb_only_af, af_quantile_points))

In [None]:
# Histogram of allele frequency across the AoU-only variants.
aou_only_af = aou_only.aou_info.AF[aou_only.aou_a_index - 1]
aou_only_af_p = hl.plot.histogram(aou_only_af)
show(aou_only_af_p)

In [None]:
# Histogram of allele frequency across the UKB-only variants.
ukb_only_af = ukb_only.ukb_info.AF[ukb_only.ukb_a_index - 1]
ukb_only_af_p = hl.plot.histogram(ukb_only_af)
show(ukb_only_af_p)

## What are the common unmerged variants?

**Answer**: _TODO: summarize the findings from the cells below._

In [None]:
# Keep only the AoU-only variants whose allele frequency is above 5%.
aou_only_af = aou_only.aou_info.AF[aou_only.aou_a_index - 1]
is_common = aou_only_af > 0.05
aou_only_common = aou_only.filter(is_common)

In [None]:
# Number of common AoU-only variants.
aou_only_common.count()

In [None]:
# Show the 20 common AoU-only variants with the highest allele frequency.
aou_only_common_af = aou_only_common.aou_info.AF[aou_only_common.aou_a_index - 1]
aou_only_common.order_by(hl.desc(aou_only_common_af)).show(20)

In [None]:
# Collect the loci of the 20 common AoU-only variants with the highest allele frequency.
aou_only_common_af = aou_only_common.aou_info.AF[aou_only_common.aou_a_index - 1]
aou_only_common_by_af = aou_only_common.order_by(hl.desc(aou_only_common_af))
common_loci = aou_only_common_by_af.locus.take(20)

In [None]:
# Display the collected loci.
common_loci

In [None]:
# Open the UKB exomes matrix table so the common AoU-only loci can be looked up in it.
ukb_exomes = hl.read_matrix_table(UKB_MT)

In [None]:
# Show UKB exome rows located exactly at one of the common AoU-only loci.
is_common_aou_locus = hl.literal(common_loci).contains(ukb_exomes.locus)
ukb_exomes.filter_rows(is_common_aou_locus).show(20)

In [None]:
# Half-width, in base pairs, of the window searched around each common AoU-only locus.
NEARBY_WINDOW_BP = 10


def _nearby_interval(locus, window_bp=NEARBY_WINDOW_BP):
    """Build a closed interval of +/- ``window_bp`` around a Python ``Locus``.

    The endpoints are clamped to [1, contig length] so that a locus close to
    either end of its contig cannot produce an invalid position.
    """
    contig_end = locus.reference_genome.contig_length(locus.contig)
    first = max(1, locus.position - window_bp)
    last = min(contig_end, locus.position + window_bp)
    return hl.interval(hl.locus(locus.contig, first),
                       hl.locus(locus.contig, last),
                       includes_start=True, includes_end=True)


# UKB exome rows lying within the window around any of the common AoU-only loci.
ukb_nearby_common_aou = hl.filter_intervals(
    ukb_exomes,
    [_nearby_interval(x) for x in common_loci])

In [None]:
# Display the UKB exome rows found near the common AoU-only loci.
ukb_nearby_common_aou.show()

# Appendix

Extra cells that are useful for timing a long-running step: run the first cell before the step and the second cell after it.

In [None]:
# Record and print the wall-clock time at which a timed step begins.
start = datetime.now()
print(start)

In [None]:
# Record the finish time, then print it followed by the elapsed time since `start`.
end = datetime.now()
elapsed = end - start
print(end, elapsed, sep='\n')

# Provenance

In [None]:
# Record when this notebook run finished.
print(datetime.now())

In [None]:
!pip3 freeze