# Examine variant merge results

In this notebook, we examine the results of the merge of AoU and UKB variants.

# Setup 

In [None]:
from datetime import datetime
import hail as hl
import os
import time

In [None]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
output_notebook()

## Define constants

In [None]:
# Papermill parameters. See https://papermill.readthedocs.io/en/latest/usage-parameterize.html

#---[ Inputs ]---
# AoU WGS matrix table. It was provided by AoU.
AOU_MT = 'gs://fc-secure-4adb21f6-46f4-4a79-99f9-afd63890c6d0/data/beta/beta_wgs_98622.mt'
# UKB matrix table. It was created from UKB 200k exome release VCFs.
# Note: The UKB matrix table was created via notebook 'create_matrix_tables' and then repartitioned via notebook 'redo_partitions'.
UKB_MT = 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/ukb/exomes/full_dataset_fewer_partitions.mt'
# Partition the two matrix tables with the same NUM_MT_READ_PARTITIONS partitions from the UKB matrix table.
# NOTE(review): no visible cell in this notebook references NUM_MT_READ_PARTITIONS -- confirm whether it is still needed.
NUM_MT_READ_PARTITIONS = 5000
# Capture regions used for AoU WGS data (interval list, imported below with hl.import_locus_intervals).
WGS_CAPTURE_REGIONS = 'gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list'
# These matrix tables were created via notebook 'merge_variants'.
# Each list below holds one path for chr1 and one path for chr2-chr22; the notebook reads every
# element and unions them into a single dataset.
MERGED_MT = [
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210902/merged-filtered-chr1.mt',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210905/merged-aou2-ukb-filtered-chr2_chr3_chr4_chr5_chr6_chr7_chr8_chr9_chr10_chr11_chr12_chr13_chr14_chr15_chr16_chr17_chr18_chr19_chr20_chr21_chr22.mt'
]
# Hail tables of the AoU variants that did not merge with UKB.
# NOTE(review): presumably also written by notebook 'merge_variants' -- confirm.
AOU_ONLY_TAB = [
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210902/aou_only-filtered-chr1.tab',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210905/aou2_without_ukb-filtered-chr2_chr3_chr4_chr5_chr6_chr7_chr8_chr9_chr10_chr11_chr12_chr13_chr14_chr15_chr16_chr17_chr18_chr19_chr20_chr21_chr22.tab'
]
# Hail tables of the UKB variants that did not merge with AoU.
# NOTE(review): presumably also written by notebook 'merge_variants' -- confirm.
UKB_ONLY_TAB = [
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210902/ukb_only-filtered-chr1.tab',
    'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210905/ukb_without_aou2-filtered-chr2_chr3_chr4_chr5_chr6_chr7_chr8_chr9_chr10_chr11_chr12_chr13_chr14_chr15_chr16_chr17_chr18_chr19_chr20_chr21_chr22.tab'
]

#---[ Outputs ]---
# Create a timestamp for a folder of results generated today.
DATESTAMP = time.strftime('%Y%m%d')
TIMESTAMP = time.strftime('%Y%m%d_%H%M%S')
WORK_DIR = !pwd

#RESULTS_DIR = f'{os.getenv("WORKSPACE_BUCKET")}/data/merged/{DATESTAMP}/'
HAIL_LOG = f'{WORK_DIR[0]}/hail-examine-merged-variants-{TIMESTAMP}.log'
HAIL_LOG_DIR_FOR_PROVENANCE = f'{os.getenv("WORKSPACE_BUCKET")}/hail-logs/{DATESTAMP}/'

## Check access

In [None]:
for mt in MERGED_MT:
    !gsutil ls {mt}
    print('\n')

In [None]:
for mt in AOU_ONLY_TAB:
    !gsutil ls {mt}
    print('\n')

In [None]:
for mt in UKB_ONLY_TAB:
    !gsutil ls {mt}
    print('\n')

## Start Hail 

In [None]:
# Spark overrides passed to hl.init() below. They make the job more tolerant of slow tasks,
# network timeouts, and shuffle fetch failures than the Spark defaults.
# See also https://towardsdatascience.com/fetch-failed-exception-in-apache-spark-decrypting-the-most-common-causes-b8dff21075c
# See https://spark.apache.org/docs/2.4.7/configuration.html

EXTRA_SPARK_CONFIG = {
    # If set to "true", performs speculative execution of tasks. This means if one or more tasks are running
    # slowly in a stage, they will be re-launched.
    'spark.speculation': 'true', # Default is false.

    # Fraction of tasks which must be complete before speculation is enabled for a particular stage.
    'spark.speculation.quantile': '0.95', # Default is 0.75

    # Default timeout for all network interactions. This config will be used in place of 
    # spark.core.connection.ack.wait.timeout, spark.storage.blockManagerSlaveTimeoutMs, 
    # spark.shuffle.io.connectionTimeout, spark.rpc.askTimeout or spark.rpc.lookupTimeout if they are not configured.
    'spark.network.timeout': '180s', # Default is 120s

    # (Netty only) Fetches that fail due to IO-related exceptions are automatically retried if this is set to a
    # non-zero value. This retry logic helps stabilize large shuffles in the face of long GC pauses or transient
    # network connectivity issues.
    'spark.shuffle.io.maxRetries': '10',  # Default is 3

    # (Netty only) How long to wait between retries of fetches. The maximum delay caused by retrying is
    # calculated as maxRetries * retryWait: 15 seconds with the Spark defaults (3 * 5s), and
    # 150 seconds with the values set here (10 * 15s).
    'spark.shuffle.io.retryWait': '15s',  # Default is 5s

    # Number of failures of any particular task before giving up on the job. The total number of failures spread
    # across different tasks will not cause the job to fail; a particular task has to fail this number of attempts.
    # Should be greater than or equal to 1. Number of allowed retries = this value - 1.
    'spark.task.maxFailures': '10', # Default is 4.

    # Number of consecutive stage attempts allowed before a stage is aborted.
    'spark.stage.maxConsecutiveAttempts': '10' # Default is 4.
}

In [None]:
# Start Hail with the Spark overrides defined above, GRCh38 as the default reference genome,
# and a per-run log file (HAIL_LOG) that is copied to the bucket in the Provenance section.
# NOTE(review): Hail documents min_block_size as a minimum file block size in MB -- confirm 50 is intended.
hl.init(spark_conf=EXTRA_SPARK_CONFIG,
        min_block_size=50,
        default_reference='GRCh38',
        log=HAIL_LOG)

Check the configuration.

In [None]:
# Display every Spark setting in effect, sorted, so the overrides passed to hl.init() can be verified.
sc = hl.spark_context()
config = sorted(sc._conf.getAll())
config

# Read merged matrix table

In [None]:
# Read every merged matrix table listed in MERGED_MT and union their rows, in list order,
# into one matrix table.
merged_parts = [hl.read_matrix_table(merged_mt_path) for merged_mt_path in MERGED_MT]

merged = merged_parts[0]
for merged_part in merged_parts[1:]:
    merged = merged.union_rows(merged_part)

In [None]:
merged.count()

In [None]:
merged.describe()

In [None]:
merged.aggregate_cols(hl.agg.counter(merged.cohort))

In [None]:
hl.summarize_variants(merged)

# Read AoU-only table

In [None]:
# Read every AoU-only table listed in AOU_ONLY_TAB and union them, in list order, into one table.
aou_only_parts = [hl.read_table(aou_only_tab_path) for aou_only_tab_path in AOU_ONLY_TAB]

aou_only = aou_only_parts[0]
for aou_only_part in aou_only_parts[1:]:
    aou_only = aou_only.union(aou_only_part)

In [None]:
aou_only.describe()

In [None]:
hl.summarize_variants(aou_only)

# Read UKB-only table

In [None]:
# Read every UKB-only table listed in UKB_ONLY_TAB and union them, in list order, into one table.
ukb_only_parts = [hl.read_table(ukb_only_tab_path) for ukb_only_tab_path in UKB_ONLY_TAB]

ukb_only = ukb_only_parts[0]
for ukb_only_part in ukb_only_parts[1:]:
    ukb_only = ukb_only.union(ukb_only_part)

In [None]:
ukb_only.describe()

In [None]:
hl.summarize_variants(ukb_only)

# Also read the source data

For comparison purposes.

In [None]:
aou_wgs = hl.read_matrix_table(AOU_MT)

In [None]:
aou_wgs.describe()

In [None]:
aou_wgs.rows().show(5)

In [None]:
ukb_exomes = hl.read_matrix_table(UKB_MT)

In [None]:
ukb_exomes.describe()

In [None]:
ukb_exomes.rows().show(5)

# Examine the data

## Are the unmerged variants mostly rare?


**Answer**: yes for UKB, somewhat for AoU

Allele frequencies at different quantiles.

In [None]:
aou_only.aggregate(hl.agg.approx_quantiles(
    aou_only.aou_info.AF[aou_only.aou_a_index - 1],
    [0, 0.25, 0.5, 0.75, .90, .99, .999, 1]
))

In [None]:
aou_only.aggregate(hl.agg.approx_quantiles(
    aou_only.aou_info.AC[aou_only.aou_a_index - 1],
    [0, 0.25, 0.5, 0.75, .90, .99, .999, 1]
))

In [None]:
aou_only.aggregate(hl.agg.approx_quantiles(
    aou_only.aou_info.AN,
    [0, 0.25, 0.5, 0.75, .90, .99, .999, 1]
))

In [None]:
ukb_only.aggregate(hl.agg.approx_quantiles(
    ukb_only.ukb_info.AF[ukb_only.ukb_a_index - 1],
    [0, 0.25, 0.5, 0.75, .90, .99, .999, 1]
))

<div class="alert alert-block alert-info">
<b>Note:</b> The original UKB VCFs list info fields for AC and AN but their values are not populated.
</div>

In [None]:
ukb_only.aggregate(hl.agg.approx_quantiles(
    ukb_only.ukb_info.AC[ukb_only.ukb_a_index - 1],
    [0, 0.25, 0.5, 0.75, .90, .99, .999, 1]
))

In [None]:
ukb_only.aggregate(hl.agg.approx_quantiles(
    ukb_only.ukb_info.AN,
    [0, 0.25, 0.5, 0.75, .90, .99, .999, 1]
))

Plots of those allele frequencies.

**TODO(deflaux)** filter the data so that `log=True` will succeed for these plots.

In [None]:
# Histogram of the AF entry selected by (aou_a_index - 1) for every AoU-only variant.
aou_only_af = aou_only.aou_info.AF[aou_only.aou_a_index - 1]
aou_only_af_p = hl.plot.histogram(aou_only_af)
show(aou_only_af_p)

In [None]:
# Histogram of the AF entry selected by (ukb_a_index - 1) for every UKB-only variant.
ukb_only_af = ukb_only.ukb_info.AF[ukb_only.ukb_a_index - 1]
ukb_only_af_p = hl.plot.histogram(ukb_only_af)
show(ukb_only_af_p)

### Use Hail to compute the alternate allele frequency.

What about for the merged data?

In [None]:
merged = merged.annotate_rows(alt_allele_freq=hl.agg.call_stats(merged.GT, merged.alleles).AF[1])

In [None]:
merged.describe()

In [None]:
merged.aggregate_rows(hl.agg.approx_quantiles(
    merged.alt_allele_freq,
    [0, 0.25, 0.5, 0.75, .90, .99, .999, 1]
))

### Extract merged frequencies to a CSV

In [None]:
# Move the Hail-computed alternate allele frequency of every merged variant into pandas,
# then display the frame's dimensions.
merged_rows = merged.rows()
merged_af_table = merged_rows.select(AF=merged_rows.alt_allele_freq)
merged_allele_freq_df = merged_af_table.to_pandas()
merged_allele_freq_df.shape

In [None]:
merged_allele_freq_df.to_csv('merged_allele_freq.csv', index=False)

In [None]:
!gsutil cp -v merged_allele_freq.csv {RESULTS_DIR}

## How many unmerged variants are common?

**Answer**: a few for each

In [None]:
# "Common" here means the variant's AF lies strictly between 0.01 and 0.99.
aou_only_af = aou_only.aou_info.AF[aou_only.aou_a_index - 1]
aou_only_common = aou_only.filter((aou_only_af > 0.01) & (aou_only_af < 0.99))

In [None]:
aou_only_common.count()

In [None]:
hl.summarize_variants(aou_only_common)

In [None]:
# "Common" here means the variant's AF lies strictly between 0.01 and 0.99.
ukb_only_af = ukb_only.ukb_info.AF[ukb_only.ukb_a_index - 1]
ukb_only_common = ukb_only.filter((ukb_only_af > 0.01) & (ukb_only_af < 0.99))

In [None]:
ukb_only_common.count()

In [None]:
hl.summarize_variants(ukb_only_common)

In [None]:
# Keep merged variants whose Hail-computed alternate allele frequency lies strictly between 0.01 and 0.99.
merged_common = merged.filter_rows(
    (merged.alt_allele_freq > 0.01) & (merged.alt_allele_freq < 0.99))

In [None]:
merged_common.count()

Use plink to quickly compute MAF on the BGEN.

In [None]:
# The merged BGEN lives in the workspace bucket; the local copy keeps the same file name.
REMOTE_MERGED_BGEN = 'gs://fc-secure-fd6786bf-6c28-4f33-ac30-3860fbeee5bb/data/merged/20210906/ukb-aou-alpha2-chr1-chr22.bgen'
_, LOCAL_MERGED_BGEN = os.path.split(REMOTE_MERGED_BGEN)

In [None]:
# Copy the merged BGEN into the working directory. `-n` is gsutil's no-clobber flag, so an
# existing local copy is not downloaded again.
!gsutil cp -v -n {REMOTE_MERGED_BGEN} .

# Write the IDs of the chr1-22 variants that pass plink2's `--maf 0.01` filter to
# merged_bgen_common_variants_plink.snplist, which the next cell counts with `wc -l`.
# NOTE(review): assumes a plink2 binary already exists at /tmp/plink2/plink2; no visible cell installs it.
# NOTE(review): plink2's --maf threshold is documented as inclusive, whereas the Hail filters in this
# notebook use strict > 0.01 and < 0.99 -- confirm the counts are meant to be compared directly.
!/tmp/plink2/plink2 \
  --bgen {LOCAL_MERGED_BGEN} ref-first \
  --chr 1-22 \
  --maf 0.01 \
  --write-snplist \
  --no-id-header \
  --out merged_bgen_common_variants_plink

In [None]:
!wc -l merged_bgen_common_variants_plink.snplist

In [None]:
hl.summarize_variants(merged_common)

## Are the WGS capture regions yielding any unmerged UKB variants?

**Answer**: none!

In [None]:
aou_wgs_capture_regions = hl.import_locus_intervals(WGS_CAPTURE_REGIONS)

In [None]:
aou_wgs_capture_regions.describe()

In [None]:
aou_wgs_capture_regions.show(5)

In [None]:
aou_wgs_capture_regions.aggregate(hl.agg.counter(aou_wgs_capture_regions.interval.start.contig))

In [None]:
# Keep only the UKB-only variants whose locus has no matching AoU WGS capture interval.
matching_capture_interval = aou_wgs_capture_regions[ukb_only.locus]
ukb_only_outside_wgs_capture_region = ukb_only.filter(hl.is_missing(matching_capture_interval))

In [None]:
ukb_only_outside_wgs_capture_region.count()

## How many unmerged variants are due to filter flags?

### UKB-only variants filtered from AoU

In [None]:
aou_wgs_rows = aou_wgs.rows()

In [None]:
aou_wgs_rows.group_by(aou_wgs_rows.filters).aggregate(n = hl.agg.count()).show()

In [None]:
aou_wgs_has_filter_flag = aou_wgs_rows.filter(hl.is_defined(aou_wgs_rows.filters), keep = True)

In [None]:
aou_wgs_has_filter_flag.count()

In [None]:
# NOTE: this cell reads `aou_wgs_has_filter_flag` and then overwrites it with the split result.
# Re-run the cell that defines it before re-running this one.

# Biallelic rows skip the (more expensive) split; they only need the two fields that
# split_multi_hts would otherwise add.
aou_wgs_has_filter_flag_bi = aou_wgs_has_filter_flag.filter(
    hl.len(aou_wgs_has_filter_flag.alleles) == 2
).annotate(
    a_index=1,
    was_split=False,
)

# Rows with more than two alleles are split into biallelic rows.
aou_wgs_has_filter_flag_multi = aou_wgs_has_filter_flag.filter(
    hl.len(aou_wgs_has_filter_flag.alleles) > 2
)
aou_wgs_has_filter_flag_split = hl.split_multi_hts(
    aou_wgs_has_filter_flag_multi,
    keep_star=False,
    left_aligned=False,
    vep_root='vep',
    permit_shuffle=False,
)

# Recombine both subsets and show the resulting schema.
aou_wgs_has_filter_flag = aou_wgs_has_filter_flag_split.union(aou_wgs_has_filter_flag_bi)
aou_wgs_has_filter_flag.describe()

In [None]:
aou_wgs_has_filter_flag.count()

In [None]:
# Look up each UKB-only variant, by its key, among the flagged AoU rows and attach that row's
# `filters` value as `aou_filter`.
matching_flagged_aou_row = aou_wgs_has_filter_flag[ukb_only.key]
ukb_only_aou_flags = ukb_only.annotate(aou_filter=matching_flagged_aou_row.filters)
ukb_only_aou_flags.describe()

In [None]:
ukb_only_aou_flags.group_by(ukb_only_aou_flags.aou_filter).aggregate(n = hl.agg.count()).show()

#### Extract UKB-only allele frequencies to a CSV

In [None]:
# UKB-only variants with no `aou_filter` value attached, and how many there are.
has_no_aou_filter = hl.is_missing(ukb_only_aou_flags.aou_filter)
ukb_only_without_aou_filters = ukb_only_aou_flags.filter(has_no_aou_filter)
ukb_only_without_aou_filters.count()

In [None]:
ukb_only_without_aou_filters.describe()

In [None]:
ukb_only_without_aou_filters.ukb_info.AF[ukb_only_without_aou_filters.ukb_a_index - 1].show()

In [None]:
# Move the AF of every UKB-only variant without an AoU filter flag into pandas,
# then display the frame's dimensions.
ukb_only_unflagged_af = ukb_only_without_aou_filters.ukb_info.AF[
    ukb_only_without_aou_filters.ukb_a_index - 1]
ukb_only_unflagged_af_table = ukb_only_without_aou_filters.select(AF=ukb_only_unflagged_af)
ukb_only_allele_freq_df = ukb_only_unflagged_af_table.to_pandas()
ukb_only_allele_freq_df.shape

In [None]:
ukb_only_allele_freq_df.to_csv('ukb_only_allele_freq.csv', index=False)

In [None]:
!gsutil cp -v ukb_only_allele_freq.csv {RESULTS_DIR}

#### Common UKB-only 

In [None]:
# Annotated UKB-only variants whose AF lies strictly between 0.01 and 0.99.
ukb_only_flagged_af = ukb_only_aou_flags.ukb_info.AF[ukb_only_aou_flags.ukb_a_index - 1]
ukb_only_aou_flags_common = ukb_only_aou_flags.filter(
    (ukb_only_flagged_af > 0.01) & (ukb_only_flagged_af < 0.99))

In [None]:
ukb_only_aou_flags_common.group_by(ukb_only_aou_flags_common.aou_filter).aggregate(n = hl.agg.count()).show()

### AoU-only variants filtered from UKB

In [None]:
ukb_exomes_rows = ukb_exomes.rows()

In [None]:
ukb_exomes_rows.group_by(ukb_exomes_rows.filters).aggregate(n = hl.agg.count()).show()

In [None]:
ukb_exomes_has_filter_flag = ukb_exomes_rows.filter(hl.is_defined(ukb_exomes_rows.filters), keep = True)

In [None]:
ukb_exomes_has_filter_flag.count()

In [None]:
# NOTE: this cell reads `ukb_exomes_has_filter_flag` and then overwrites it with the split result.
# Re-run the cell that defines it before re-running this one.

# Biallelic rows skip the (more expensive) split; they only need the two fields that
# split_multi_hts would otherwise add.
ukb_exomes_has_filter_flag_bi = ukb_exomes_has_filter_flag.filter(
    hl.len(ukb_exomes_has_filter_flag.alleles) == 2
).annotate(
    a_index=1,
    was_split=False,
)

# Rows with more than two alleles are split into biallelic rows.
ukb_exomes_has_filter_flag_multi = ukb_exomes_has_filter_flag.filter(
    hl.len(ukb_exomes_has_filter_flag.alleles) > 2
)
ukb_exomes_has_filter_flag_split = hl.split_multi_hts(
    ukb_exomes_has_filter_flag_multi,
    keep_star=False,
    left_aligned=False,
    vep_root='vep',
    permit_shuffle=False,
)

# Recombine both subsets and show the resulting schema.
ukb_exomes_has_filter_flag = ukb_exomes_has_filter_flag_split.union(ukb_exomes_has_filter_flag_bi)
ukb_exomes_has_filter_flag.describe()

In [None]:
ukb_exomes_has_filter_flag.count()

In [None]:
# Look up each AoU-only variant, by its key, among the flagged UKB rows and attach that row's
# `filters` value as `ukb_filter`.
matching_flagged_ukb_row = ukb_exomes_has_filter_flag[aou_only.key]
aou_only_ukb_flags = aou_only.annotate(ukb_filter=matching_flagged_ukb_row.filters)
aou_only_ukb_flags.describe()

In [None]:
aou_only_ukb_flags.group_by(aou_only_ukb_flags.ukb_filter).aggregate(n = hl.agg.count()).show()

#### Extract AoU-only allele frequencies to a CSV

In [None]:
aou_only_without_ukb_filters = aou_only_ukb_flags.filter(hl.is_missing(aou_only_ukb_flags.ukb_filter))

In [None]:
aou_only_without_ukb_filters.describe()

In [None]:
# Move the AF of every AoU-only variant without a UKB filter flag into pandas,
# then display the frame's dimensions.
aou_only_unflagged_af = aou_only_without_ukb_filters.aou_info.AF[
    aou_only_without_ukb_filters.aou_a_index - 1]
aou_only_unflagged_af_table = aou_only_without_ukb_filters.select(AF=aou_only_unflagged_af)
aou_only_allele_freq_df = aou_only_unflagged_af_table.to_pandas()
aou_only_allele_freq_df.shape

In [None]:
aou_only_allele_freq_df.to_csv('aou_only_allele_freq.csv', index=False)

In [None]:
!gsutil cp -v aou_only_allele_freq.csv {RESULTS_DIR}

#### Common AoU-only

In [None]:
# Annotated AoU-only variants whose AF lies strictly between 0.01 and 0.99.
aou_only_flagged_af = aou_only_ukb_flags.aou_info.AF[aou_only_ukb_flags.aou_a_index - 1]
aou_only_ukb_flags_common = aou_only_ukb_flags.filter(
    (aou_only_flagged_af > 0.01) & (aou_only_flagged_af < 0.99))

In [None]:
aou_only_ukb_flags_common.group_by(aou_only_ukb_flags_common.ukb_filter).aggregate(n = hl.agg.count()).show()

### TODO write out VCFs using annotated 'only' variants

add 'aou' or 'ukb' to the filter flag values to differentiate

## TODO gnomAD for the common variants

## TODO characterize common unmerged variants by ancestry

## For the common unmerged AoU variants, what do the overlapping UKB variants look like?

**Answer**: see details below

### Show common unmerged from AoU

<div class="alert alert-block alert-success">
Sorted by AF, descending.</div>


In [None]:
aou_only_common.order_by(
    hl.desc(aou_only_common.aou_info.AF[aou_only_common.aou_a_index - 1])).show(20)

In [None]:
aou_only_common_top20_loci = aou_only_common.order_by(
    hl.desc(aou_only_common.aou_info.AF[aou_only_common.aou_a_index - 1])).locus.take(20)

<div class="alert alert-block alert-success">
Of those top 20 by AF, now sorted by position, ascending.</div>

In [None]:
# Restrict the common AoU-only variants to the top-20 loci, keep the split flag, filters and AF,
# and show them ordered by locus.
aou_top20_loci_expr = hl.literal(aou_only_common_top20_loci)
aou_only_common_at_top20 = aou_only_common.filter(
    aou_top20_loci_expr.contains(aou_only_common.locus))
aou_only_common_top20 = aou_only_common_at_top20.select(
    aou_only_common_at_top20.aou_was_split,
    aou_only_common_at_top20.aou_filters,
    AF=aou_only_common_at_top20.aou_info.AF[aou_only_common_at_top20.aou_a_index - 1])

aou_only_common_top20.order_by(aou_only_common_top20.locus).show(50)

<div class="alert alert-block alert-success">
    Show UKB <b>source data</b> locus matches.
    </div>

In [None]:
# UKB source rows located at any of the top-20 AoU-only loci: keep filters and AF, show by locus.
aou_top20_loci_expr = hl.literal(aou_only_common_top20_loci)
ukb_rows_at_aou_top20 = ukb_exomes.filter_rows(
    aou_top20_loci_expr.contains(ukb_exomes.locus)).rows()
ukb_locus_matches = ukb_rows_at_aou_top20.select(
    ukb_rows_at_aou_top20.filters,
    ukb_rows_at_aou_top20.info.AF)

ukb_locus_matches.order_by(ukb_locus_matches.locus).show(50)

<div class="alert alert-block alert-success">
    Show UKB<b>-only</b> locus matches.
    </div>

In [None]:
# UKB-only variants located at any of the top-20 AoU-only loci: keep the split flag and AF, show by locus.
aou_top20_loci_expr = hl.literal(aou_only_common_top20_loci)
ukb_only_at_aou_top20 = ukb_only.filter(aou_top20_loci_expr.contains(ukb_only.locus))
ukb_only_locus_matches = ukb_only_at_aou_top20.select(
    ukb_only_at_aou_top20.ukb_was_split,
    AF=ukb_only_at_aou_top20.ukb_info.AF[ukb_only_at_aou_top20.ukb_a_index - 1])

ukb_only_locus_matches.order_by(ukb_only_locus_matches.locus).show(50)

<div class="alert alert-block alert-success">
    Show AoU <b>source data</b> locus matches.
    </div>

In [None]:
# AoU source rows located at any of the top-20 AoU-only loci: keep AF only, show by locus.
aou_top20_loci_expr = hl.literal(aou_only_common_top20_loci)
aou_rows_at_aou_top20 = aou_wgs.filter_rows(
    aou_top20_loci_expr.contains(aou_wgs.locus)).rows()
aou_locus_matches = aou_rows_at_aou_top20.select(aou_rows_at_aou_top20.info.AF)

aou_locus_matches.order_by(aou_locus_matches.locus).show(50)

### Examine a common AoU SNP

In [None]:
[(x, aou_only_common_top20_loci[x]) for x in range(0, len(aou_only_common_top20_loci))]

In [None]:
# Pick one locus to inspect. Index 4 refers to the (index, locus) listing displayed by the
# previous cell. It is kept in a one-element list because the following cells pass it to
# hl.literal(...) and iterate over it.
one_aou_variant = [aou_only_common_top20_loci[4]]

one_aou_variant

In [None]:
ukb_exomes.filter_rows(
    hl.literal(one_aou_variant).contains(ukb_exomes.locus)).show(20)

In [None]:
# UKB source data within 20 positions (inclusive) on either side of the selected AoU locus.
aou_variant_windows = []
for variant_locus in one_aou_variant:
    window_start = hl.locus(variant_locus.contig, variant_locus.position - 20)
    window_end = hl.locus(variant_locus.contig, variant_locus.position + 20)
    aou_variant_windows.append(
        hl.interval(window_start, window_end, includes_start=True, includes_end=True))

ukb_nearby_common_aou = hl.filter_intervals(ukb_exomes, aou_variant_windows)

In [None]:
ukb_nearby_common_aou.show(20)

## For the common unmerged UKB variants, what do the overlapping AoU variants look like?

**Answer**: see details below

### Show common unmerged from UKB

<div class="alert alert-block alert-success">
Sorted by AF, descending.</div>


In [None]:
ukb_only_common.order_by(
    hl.desc(ukb_only_common.ukb_info.AF[ukb_only_common.ukb_a_index - 1])).show(20)

In [None]:
ukb_only_common_top20_loci = ukb_only_common.order_by(
    hl.desc(ukb_only_common.ukb_info.AF[ukb_only_common.ukb_a_index - 1])).locus.take(20)

<div class="alert alert-block alert-success">
Of those top 20 by AF, now sorted by position, ascending.</div>

In [None]:
# Restrict the common UKB-only variants to the top-20 loci, keep the split flag, filters and AF,
# and show them ordered by locus.
ukb_top20_loci_expr = hl.literal(ukb_only_common_top20_loci)
ukb_only_common_at_top20 = ukb_only_common.filter(
    ukb_top20_loci_expr.contains(ukb_only_common.locus))
ukb_only_common_top20 = ukb_only_common_at_top20.select(
    ukb_only_common_at_top20.ukb_was_split,
    ukb_only_common_at_top20.ukb_filters,
    AF=ukb_only_common_at_top20.ukb_info.AF[ukb_only_common_at_top20.ukb_a_index - 1])

ukb_only_common_top20.order_by(ukb_only_common_top20.locus).show(50)

<div class="alert alert-block alert-success">
    Show AoU <b>source data</b> locus matches.
    </div>

In [None]:
# AoU source rows located at any of the top-20 UKB-only loci: keep filters and AF, show by locus.
ukb_top20_loci_expr = hl.literal(ukb_only_common_top20_loci)
aou_rows_at_ukb_top20 = aou_wgs.filter_rows(
    ukb_top20_loci_expr.contains(aou_wgs.locus)).rows()
aou_locus_matches = aou_rows_at_ukb_top20.select(
    aou_rows_at_ukb_top20.filters,
    aou_rows_at_ukb_top20.info.AF)

aou_locus_matches.order_by(aou_locus_matches.locus).show(50)

<div class="alert alert-block alert-success">
    Show AoU<b>-only</b> locus matches.
    </div>

In [None]:
# AoU-only variants located at any of the top-20 UKB-only loci: keep the split flag and AF, show by locus.
ukb_top20_loci_expr = hl.literal(ukb_only_common_top20_loci)
aou_only_at_ukb_top20 = aou_only.filter(ukb_top20_loci_expr.contains(aou_only.locus))
aou_only_locus_matches = aou_only_at_ukb_top20.select(
    aou_only_at_ukb_top20.aou_was_split,
    AF=aou_only_at_ukb_top20.aou_info.AF[aou_only_at_ukb_top20.aou_a_index - 1])

aou_only_locus_matches.order_by(aou_only_locus_matches.locus).show(50)

<div class="alert alert-block alert-success">
    Show UKB <b>source data</b> locus matches.
    </div>

In [None]:
# UKB source rows located at any of the top-20 UKB-only loci: keep AF only, show by locus.
ukb_top20_loci_expr = hl.literal(ukb_only_common_top20_loci)
ukb_rows_at_ukb_top20 = ukb_exomes.filter_rows(
    ukb_top20_loci_expr.contains(ukb_exomes.locus)).rows()
ukb_locus_matches = ukb_rows_at_ukb_top20.select(ukb_rows_at_ukb_top20.info.AF)

ukb_locus_matches.order_by(ukb_locus_matches.locus).show(50)

### Examine a common UKB SNP

In [None]:
[(x, ukb_only_common_top20_loci[x]) for x in range(0, len(ukb_only_common_top20_loci))]

In [None]:
# Pick one locus to inspect. Index 0 refers to the (index, locus) listing displayed by the
# previous cell. It is kept in a one-element list because the following cells pass it to
# hl.literal(...) and iterate over it.
one_ukb_variant = [ukb_only_common_top20_loci[0]]

one_ukb_variant

In [None]:
aou_wgs.filter_rows(
    hl.literal(one_ukb_variant).contains(aou_wgs.locus)).show(20)

In [None]:
# AoU source data within 20 positions (inclusive) on either side of the selected UKB locus.
ukb_variant_windows = []
for variant_locus in one_ukb_variant:
    window_start = hl.locus(variant_locus.contig, variant_locus.position - 20)
    window_end = hl.locus(variant_locus.contig, variant_locus.position + 20)
    ukb_variant_windows.append(
        hl.interval(window_start, window_end, includes_start=True, includes_end=True))

aou_nearby_common_ukb = hl.filter_intervals(aou_wgs, ukb_variant_windows)

In [None]:
aou_nearby_common_ukb.show(20)

# Provenance

In [None]:
# Copy the Hail log to the workspace bucket so that we can retain it.
!gzip --keep {HAIL_LOG}
!gsutil cp -v {HAIL_LOG}.gz {HAIL_LOG_DIR_FOR_PROVENANCE}

In [None]:
print(datetime.now())

In [None]:
!pip3 freeze