# Examine variant merge results

In this notebook, we examine the results of the merge of AoU and UKB variants.

# Setup 

In [None]:
from datetime import datetime
import hail as hl
import os
import time

In [None]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
output_notebook()

## Define constants

<div class="alert alert-block alert-info">
<b>Note:</b> The AoU matrix table for the alpha1 release was created via notebook 'Hail Demo' and then moved to a better place within the workspace bucket. It contains all samples and variants for the alpha1 release.
</div>

In [None]:
AOU_MT = 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/aou/alpha1/cohort.mt'

<div class="alert alert-block alert-info">
<b>Note:</b> The UKB matrix table was created via notebook 'create_matrix_tables' and then repartitioned via notebook 'redo_partitions'.
</div>

In [None]:
UKB_MT = 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/ukb/exomes/full_dataset_fewer_partitions.mt'

<div class="alert alert-block alert-info">
<b>Note:</b> These matrix tables were created via notebook 'merge_variants'. Any variants with a filter flag were omitted.
</div>

In [None]:
# Merged matrix tables, read and row-unioned below into `merged`.
# Most paths hold a single chromosome; chr6 - chr20 are stored together in one matrix table.
MERGED_MT = [
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210609/merged-filtered-chr1.mt',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210621/merged-filtered-chr2.mt',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210622/merged-filtered-chr3.mt',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210624/merged-filtered-chr4.mt',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210624/merged-filtered-chr5.mt',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210628/merged-filtered-chr6_chr7_chr8_chr9_chr10_chr11_chr12_chr13_chr14_chr15_chr16_chr17_chr18_chr19_chr20.mt',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210603/merged-filtered-chr21.mt',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210624/merged-filtered-chr22.mt',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210628/merged-filtered-chrX.mt',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210624/merged-filtered-chrY.mt'
]
# AoU-only variant tables, read and unioned below into `aou_only`.
# Coverage is incomplete: there is no table for chr6 - chr20 (see the comment in the list).
AOU_ONLY_TAB = [
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210609/aou_only-filtered-chr1.tab',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210621/aou_only-filtered-chr2.tab',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210622/aou_only-filtered-chr3.tab',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210624/aou_only-filtered-chr4.tab',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210624/aou_only-filtered-chr5.tab',
    # No entry for chr6 - chr20: this stage failed for them. We could re-run it.
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210603/aou_only-filtered-chr21.tab',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210624/aou_only-filtered-chr22.tab',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210628/aou_only-filtered-chrX.tab',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210624/aou_only-filtered-chrY.tab'
]
# UKB-only variant tables, read and unioned below into `ukb_only`.
# Coverage is incomplete: there is no table for chr2 or for chr6 - chr20 (see the comments in the list).
UKB_ONLY_TAB = [
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210609/ukb_only-filtered-chr1.tab',
    # No entry for chr2: this stage failed for it. We could re-run it.
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210622/ukb_only-filtered-chr3.tab',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210624/ukb_only-filtered-chr4.tab',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210624/ukb_only-filtered-chr5.tab',
    # No entry for chr6 - chr20: this stage failed for them. We could re-run it.
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210603/ukb_only-filtered-chr21.tab',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210624/ukb_only-filtered-chr22.tab',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210628/ukb_only-filtered-chrX.tab',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210624/ukb_only-filtered-chrY.tab'
]

In [None]:
time.strftime('%Y%m%d')

In [None]:
RESULT_BUCKET = os.getenv("WORKSPACE_BUCKET")
DATESTAMP = time.strftime('%Y%m%d')
TIMESTAMP = time.strftime('%Y%m%d_%H%M%S')
WORK_DIR = !pwd

# Outputs
HAIL_LOG = f'{WORK_DIR[0]}/hail-examine-merged-variants-{TIMESTAMP}.log'
HAIL_LOG_DIR_FOR_PROVENANCE = f'{os.getenv("WORKSPACE_BUCKET")}/hail-logs/{DATESTAMP}/'

## Check access

In [None]:
for mt in MERGED_MT:
    !gsutil ls {mt}
    print('\n')

In [None]:
for mt in AOU_ONLY_TAB:
    !gsutil ls {mt}
    print('\n')

In [None]:
for mt in UKB_ONLY_TAB:
    !gsutil ls {mt}
    print('\n')

## Start Hail 

In [None]:
# Spark overrides passed to hl.init(spark_conf=...) below. Every value is raised above the
# Spark default quoted next to it. The descriptions are taken from the Spark configuration docs.
# See also https://towardsdatascience.com/fetch-failed-exception-in-apache-spark-decrypting-the-most-common-causes-b8dff21075c
# See https://spark.apache.org/docs/2.4.7/configuration.html

EXTRA_SPARK_CONFIG = {
    # If set to "true", performs speculative execution of tasks. This means if one or more tasks are running
    # slowly in a stage, they will be re-launched.
    'spark.speculation': 'true', # Default is false.
    
    # Fraction of tasks which must be complete before speculation is enabled for a particular stage.
    'spark.speculation.quantile': '0.95', # Default is 0.75

    # Default timeout for all network interactions. This config will be used in place of 
    # spark.core.connection.ack.wait.timeout, spark.storage.blockManagerSlaveTimeoutMs, 
    # spark.shuffle.io.connectionTimeout, spark.rpc.askTimeout or spark.rpc.lookupTimeout if they are not configured.
    'spark.network.timeout': '180s', # Default is 120s
        
    # (Netty only) Fetches that fail due to IO-related exceptions are automatically retried if this is set to a
    # non-zero value. This retry logic helps stabilize large shuffles in the face of long GC pauses or transient
    # network connectivity issues.
    'spark.shuffle.io.maxRetries': '10',  # Default is 3
    
    # (Netty only) How long to wait between retries of fetches. The maximum delay caused by retrying is 15 seconds
    # by default, calculated as maxRetries * retryWait.
    # With the values set in this dict, that maximum becomes 10 * 15s = 150s.
    'spark.shuffle.io.retryWait': '15s',  # Default is 5s
    
    # Number of failures of any particular task before giving up on the job. The total number of failures spread
    # across different tasks will not cause the job to fail; a particular task has to fail this number of attempts.
    # Should be greater than or equal to 1. Number of allowed retries = this value - 1.
    # With the value set here, that is 9 retries per task.
    'spark.task.maxFailures': '10', # Default is 4.

    # Number of consecutive stage attempts allowed before a stage is aborted.
    'spark.stage.maxConsecutiveAttempts': '10' # Default is 4.
}

In [None]:
# Start Hail with the Spark overrides defined above, GRCh38 as the default reference
# genome, and the log written to the timestamped local path in HAIL_LOG.
# NOTE(review): min_block_size=50 is presumably in MB - confirm against the Hail `init` docs.
hl.init(spark_conf=EXTRA_SPARK_CONFIG,
        min_block_size=50,
        default_reference='GRCh38',
        log=HAIL_LOG)

Check the configuration.

In [None]:
# Show the effective Spark settings, sorted by key, to verify the overrides took effect.
# Uses the public SparkContext.getConf() accessor instead of the private `_conf` attribute.
sc = hl.spark_context()
config = sorted(sc.getConf().getAll())
config

# Read merged matrix table

In [None]:
# Read every merged matrix table listed in MERGED_MT and combine their rows with a single
# n-ary union rather than a chain of pairwise unions.
merged = hl.MatrixTable.union_rows(
    *[hl.read_matrix_table(merged_path) for merged_path in MERGED_MT])

In [None]:
merged.count()

In [None]:
merged.describe()

In [None]:
hl.summarize_variants(merged)

# Read AoU-only table

In [None]:
# Read every AoU-only table listed in AOU_ONLY_TAB and combine them with a single n-ary
# union rather than a chain of pairwise unions.
first_aou_only, *other_aou_only = [hl.read_table(tab_path) for tab_path in AOU_ONLY_TAB]
aou_only = first_aou_only.union(*other_aou_only)

In [None]:
aou_only.describe()

In [None]:
hl.summarize_variants(aou_only)

# Read UKB-only table

In [None]:
# Read every UKB-only table listed in UKB_ONLY_TAB and combine them with a single n-ary
# union rather than a chain of pairwise unions.
first_ukb_only, *other_ukb_only = [hl.read_table(tab_path) for tab_path in UKB_ONLY_TAB]
ukb_only = first_ukb_only.union(*other_ukb_only)

In [None]:
ukb_only.describe()

In [None]:
hl.summarize_variants(ukb_only)

# Also read the source data

For comparison purposes.

In [None]:
aou_wgs = hl.read_matrix_table(AOU_MT)

In [None]:
ukb_exomes = hl.read_matrix_table(UKB_MT)

# Examine the data

## Are the unmerged variants mostly rare?


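**Answer**: yes for UKB, somewhat for AoU. Note that the AoU-only and UKB-only tables read above do not cover chr6 - chr20 (nor chr2 for UKB), so this answer reflects only the remaining chromosomes.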

Allele frequencies at different quantiles.

In [None]:
# Per-variant allele frequency: the AF array is indexed with aou_a_index - 1
# (presumably because aou_a_index is 1-based - TODO confirm).
aou_only_af = aou_only.aou_info.AF[aou_only.aou_a_index - 1]
aou_af_quantile_points = [0, 0.25, 0.5, 0.75, .90, .99, .999, 1]

aou_only.aggregate(hl.agg.approx_quantiles(aou_only_af, aou_af_quantile_points))

In [None]:
# Per-variant allele frequency: the AF array is indexed with ukb_a_index - 1
# (presumably because ukb_a_index is 1-based - TODO confirm).
ukb_only_af = ukb_only.ukb_info.AF[ukb_only.ukb_a_index - 1]
ukb_af_quantile_points = [0, 0.25, 0.5, 0.75, .90, .99, .999, 1]

ukb_only.aggregate(hl.agg.approx_quantiles(ukb_only_af, ukb_af_quantile_points))

Plots of those allele frequencies.

**TODO(deflaux)** filter the data so that `log=True` will succeed for these plots.

In [None]:
# Histogram of allele frequency for the AoU-only variants. The title and x-axis label make
# the figure distinguishable from the otherwise identical UKB plot.
aou_only_af_p = hl.plot.histogram(
    aou_only.aou_info.AF[aou_only.aou_a_index - 1],
    legend='Allele frequency',
    title='AoU-only variants: allele frequency')
show(aou_only_af_p)

In [None]:
# Histogram of allele frequency for the UKB-only variants. The title and x-axis label make
# the figure distinguishable from the otherwise identical AoU plot.
ukb_only_af_p = hl.plot.histogram(
    ukb_only.ukb_info.AF[ukb_only.ukb_a_index - 1],
    legend='Allele frequency',
    title='UKB-only variants: allele frequency')
show(ukb_only_af_p)

## How many unmerged variants are common?

**Answer**: a few for each

In [None]:
# Keep only the AoU-only variants whose allele frequency is above 0.01; these are what
# the rest of the notebook calls "common".
aou_only_common = aou_only.filter(aou_only.aou_info.AF[aou_only.aou_a_index - 1] > 0.01)

In [None]:
aou_only_common.count()

In [None]:
hl.summarize_variants(aou_only_common)

In [None]:
# Keep only the UKB-only variants whose allele frequency is above 0.01; these are what
# the rest of the notebook calls "common".
ukb_only_common = ukb_only.filter(ukb_only.ukb_info.AF[ukb_only.ukb_a_index - 1] > 0.01)

In [None]:
ukb_only_common.count()

In [None]:
hl.summarize_variants(ukb_only_common)

## TODO: look up the common unmerged SNPs in dbSNP or gnomAD

## For the common unmerged AoU variants, what do the overlapping UKB variants look like?

**Answer**: see details below

### Show common unmerged from AoU

<div class="alert alert-block alert-success">
Sorted by AF, descending.</div>


In [None]:
# Show the 20 common AoU-only variants with the highest allele frequency.
aou_common_af_for_display = aou_only_common.aou_info.AF[aou_only_common.aou_a_index - 1]
aou_only_common.order_by(hl.desc(aou_common_af_for_display)).show(20)

In [None]:
# Collect, as a local Python list, the loci of the 20 common AoU-only rows with the
# highest allele frequency.
aou_common_by_af_desc = aou_only_common.order_by(
    hl.desc(aou_only_common.aou_info.AF[aou_only_common.aou_a_index - 1]))
aou_only_common_top20_loci = aou_common_by_af_desc.locus.take(20)

<div class="alert alert-block alert-success">
Of those top 20 by AF, now sorted by position, ascending.</div>

In [None]:
# Restrict the common AoU-only variants to the top-20 loci, keep just the split flag,
# the filters and the per-variant AF, then display them in locus order.
aou_common_at_top20_loci = aou_only_common.filter(
    hl.literal(aou_only_common_top20_loci).contains(aou_only_common.locus))
aou_only_common_top20 = aou_common_at_top20_loci.select(
    'aou_was_split',
    'aou_filters',
    AF=aou_common_at_top20_loci.aou_info.AF[aou_common_at_top20_loci.aou_a_index - 1])

aou_only_common_top20.order_by('locus').show(50)

<div class="alert alert-block alert-success">
    Show UKB <b>source data</b> locus matches.
    </div>

In [None]:
# Rows of the UKB source data at the top-20 common AoU-only loci, reduced to the
# filters and AF fields and displayed in locus order.
ukb_rows_at_aou_top20_loci = ukb_exomes.filter_rows(
    hl.literal(aou_only_common_top20_loci).contains(ukb_exomes.locus)).rows()
ukb_locus_matches = ukb_rows_at_aou_top20_loci.select(
    'filters',
    ukb_rows_at_aou_top20_loci.info.AF)

ukb_locus_matches.order_by('locus').show(50)

<div class="alert alert-block alert-success">
    Show UKB<b>-only</b> locus matches.
    </div>

In [None]:
# UKB-only variants at the top-20 common AoU-only loci, reduced to the split flag and
# the per-variant AF and displayed in locus order.
ukb_only_at_aou_top20_loci = ukb_only.filter(
    hl.literal(aou_only_common_top20_loci).contains(ukb_only.locus))
ukb_only_locus_matches = ukb_only_at_aou_top20_loci.select(
    'ukb_was_split',
    AF=ukb_only_at_aou_top20_loci.ukb_info.AF[ukb_only_at_aou_top20_loci.ukb_a_index - 1])

ukb_only_locus_matches.order_by('locus').show(50)

<div class="alert alert-block alert-success">
    Show AoU <b>source data</b> locus matches.
    </div>

In [None]:
# Rows of the AoU source data at the top-20 common AoU-only loci, reduced to the AF
# field and displayed in locus order.
aou_rows_at_aou_top20_loci = aou_wgs.filter_rows(
    hl.literal(aou_only_common_top20_loci).contains(aou_wgs.locus)).rows()
aou_locus_matches = aou_rows_at_aou_top20_loci.select(aou_rows_at_aou_top20_loci.info.AF)

aou_locus_matches.order_by('locus').show(50)

### Examine a common AoU SNP

In [None]:
# Number the top-20 loci so that one can be picked by index in the next cell.
list(enumerate(aou_only_common_top20_loci))

In [None]:
# Pick one locus by its index in the numbered listing from the previous cell. It stays
# wrapped in a list because the cells below pass it to hl.literal(...).contains() and
# iterate over it to build intervals.
one_aou_variant = [aou_only_common_top20_loci[4]]

one_aou_variant

In [None]:
# Show the UKB source rows located exactly at the selected AoU locus.
ukb_row_is_at_selected_locus = hl.literal(one_aou_variant).contains(ukb_exomes.locus)
ukb_exomes.filter_rows(ukb_row_is_at_selected_locus).show(20)

In [None]:
# UKB source variants within `flank_bp` bases on either side of the selected AoU locus
# (both ends inclusive). The window is clamped to [1, contig length] so that a locus
# close to either end of its contig cannot produce an invalid position.
flank_bp = 20

ukb_nearby_common_aou = hl.filter_intervals(
     ukb_exomes,
     [hl.interval(
          hl.locus(x.contig, max(1, x.position - flank_bp)),
          hl.locus(x.contig,
                   min(x.reference_genome.contig_length(x.contig), x.position + flank_bp)),
          includes_start=True, includes_end=True) for x in one_aou_variant])

In [None]:
ukb_nearby_common_aou.show(20)

## For the common unmerged UKB variants, what do the overlapping AoU variants look like?

**Answer**: see details below

### Show common unmerged from UKB

<div class="alert alert-block alert-success">
Sorted by AF, descending.</div>


In [None]:
# Show the 20 common UKB-only variants with the highest allele frequency.
ukb_common_af_for_display = ukb_only_common.ukb_info.AF[ukb_only_common.ukb_a_index - 1]
ukb_only_common.order_by(hl.desc(ukb_common_af_for_display)).show(20)

In [None]:
# Collect, as a local Python list, the loci of the 20 common UKB-only rows with the
# highest allele frequency.
ukb_common_by_af_desc = ukb_only_common.order_by(
    hl.desc(ukb_only_common.ukb_info.AF[ukb_only_common.ukb_a_index - 1]))
ukb_only_common_top20_loci = ukb_common_by_af_desc.locus.take(20)

<div class="alert alert-block alert-success">
Of those top 20 by AF, now sorted by position, ascending.</div>

In [None]:
# Restrict the common UKB-only variants to the top-20 loci, keep just the split flag,
# the filters and the per-variant AF, then display them in locus order.
ukb_common_at_top20_loci = ukb_only_common.filter(
    hl.literal(ukb_only_common_top20_loci).contains(ukb_only_common.locus))
ukb_only_common_top20 = ukb_common_at_top20_loci.select(
    'ukb_was_split',
    'ukb_filters',
    AF=ukb_common_at_top20_loci.ukb_info.AF[ukb_common_at_top20_loci.ukb_a_index - 1])

ukb_only_common_top20.order_by('locus').show(50)

<div class="alert alert-block alert-success">
    Show AoU <b>source data</b> locus matches.
    </div>

In [None]:
# Rows of the AoU source data at the top-20 common UKB-only loci, reduced to the
# filters and AF fields and displayed in locus order.
aou_rows_at_ukb_top20_loci = aou_wgs.filter_rows(
    hl.literal(ukb_only_common_top20_loci).contains(aou_wgs.locus)).rows()
aou_locus_matches = aou_rows_at_ukb_top20_loci.select(
    'filters',
    aou_rows_at_ukb_top20_loci.info.AF)

aou_locus_matches.order_by('locus').show(50)

<div class="alert alert-block alert-success">
    Show AoU<b>-only</b> locus matches.
    </div>

In [None]:
# AoU-only variants at the top-20 common UKB-only loci, reduced to the split flag and
# the per-variant AF and displayed in locus order.
aou_only_at_ukb_top20_loci = aou_only.filter(
    hl.literal(ukb_only_common_top20_loci).contains(aou_only.locus))
aou_only_locus_matches = aou_only_at_ukb_top20_loci.select(
    'aou_was_split',
    AF=aou_only_at_ukb_top20_loci.aou_info.AF[aou_only_at_ukb_top20_loci.aou_a_index - 1])

aou_only_locus_matches.order_by('locus').show(50)

<div class="alert alert-block alert-success">
    Show UKB <b>source data</b> locus matches.
    </div>

In [None]:
# Rows of the UKB source data at the top-20 common UKB-only loci, reduced to the AF
# field and displayed in locus order.
ukb_rows_at_ukb_top20_loci = ukb_exomes.filter_rows(
    hl.literal(ukb_only_common_top20_loci).contains(ukb_exomes.locus)).rows()
ukb_locus_matches = ukb_rows_at_ukb_top20_loci.select(ukb_rows_at_ukb_top20_loci.info.AF)

ukb_locus_matches.order_by('locus').show(50)

### Examine a common UKB SNP

In [None]:
# Number the top-20 loci so that one can be picked by index in the next cell.
list(enumerate(ukb_only_common_top20_loci))

In [None]:
# Pick one locus by its index in the numbered listing from the previous cell. It stays
# wrapped in a list because the cells below pass it to hl.literal(...).contains() and
# iterate over it to build intervals.
one_ukb_variant = [ukb_only_common_top20_loci[0]]

one_ukb_variant

In [None]:
# Show the AoU source rows located exactly at the selected UKB locus.
aou_row_is_at_selected_locus = hl.literal(one_ukb_variant).contains(aou_wgs.locus)
aou_wgs.filter_rows(aou_row_is_at_selected_locus).show(20)

In [None]:
# AoU source variants within `flank_bp` bases on either side of the selected UKB locus
# (both ends inclusive). The window is clamped to [1, contig length] so that a locus
# close to either end of its contig cannot produce an invalid position.
flank_bp = 20

aou_nearby_common_ukb = hl.filter_intervals(
     aou_wgs,
     [hl.interval(
          hl.locus(x.contig, max(1, x.position - flank_bp)),
          hl.locus(x.contig,
                   min(x.reference_genome.contig_length(x.contig), x.position + flank_bp)),
          includes_start=True, includes_end=True) for x in one_ukb_variant])

In [None]:
aou_nearby_common_ukb.show(20)

## TODO characterize common unmerged variants by ancestry

# Provenance

In [None]:
# Copy the Hail log to the workspace bucket so that we can retain it.
!gzip --keep {HAIL_LOG}
!gsutil cp {HAIL_LOG}.gz {HAIL_LOG_DIR_FOR_PROVENANCE}

In [None]:
print(datetime.now())

In [None]:
!pip3 freeze